# Anti-CSV

In this Jupyter notebook all CSV files from the data directory are converted to a more efficient format (such as .parquet, or if this isn't enough .feather). This should allow for faster loading of data later down the line.
Additionally, a first data clean-up takes place: missing values are replaced with default (placeholder) values and some columns are rescaled to be closer to 1, in order to minimize data loss.

In [None]:
# import all necessary packages
import os
from pathlib import Path

import pandas as pd
import numpy as np
from numpy.ma.extras import column_stack

from tqdm.notebook import tqdm

In [None]:
# Global constants.
# Root of the data tree, given as a relative path; the raw / preprocessed
# locations used in the sections below are all built from it.
data_dir = Path('../data/')

In [None]:
# some helper functions down the line
conversion_factor = np.pi / 180.0   # multiplying a value in degrees by this gives radians


def convert_to_rad(_df, _key):
    """Multiply column ``_key`` of ``_df`` by ``conversion_factor`` (pi / 180).

    The frame is modified in place (the column keeps its name) and the very
    same frame object is returned.
    """
    in_radians = _df[_key] * conversion_factor
    _df[_key] = in_radians
    return _df

def scale_km(_df, _key):
    """Divide column ``_key`` of ``_df`` by 1000.

    The frame is modified in place (the column keeps its name) and the very
    same frame object is returned.
    """
    in_thousands = _df[_key] / 1_000.0
    _df[_key] = in_thousands
    return _df

## GOES data

First of all, the GOES data is copied from `./data/raw/GOES/*.csv` to `./data/preprocessed/GOES/*.parquet`.

In [None]:
# Input / output locations of the GOES data, all below ``data_dir``.
# (``data_dir`` is already a ``Path``, so the ``/`` operator yields a ``Path``.)
goes_raw_data_dir = data_dir / 'raw/GOES/'
goes_out_dir = data_dir / 'preprocessed/GOES/'
goes_aggregated_file = data_dir / 'preprocessed/aggregated/goes.feather'

### First tests

Before creating the more compact data sets used during training, a first analysis takes place. This allows choosing the correct data type for each column. In this instance the precision of `float64` is probably not needed, hence all data is converted down to `float32`. This massively reduces the memory needed for this data.

In [None]:
# Load one exemplary GOES csv file and show its structure.
# This cell expects the csv files one folder level below ``goes_raw_data_dir``
# (i.e. raw/GOES/<folder>/<file>); the first entry returned by ``os.listdir``
# is taken at both levels.
folder_names = os.listdir(goes_raw_data_dir)
folder = next(iter(folder_names))
file_names = os.listdir(goes_raw_data_dir / folder)
file = next(iter(file_names))

sample_csv = goes_raw_data_dir / folder / file
df = pd.read_csv(sample_csv, sep=',', index_col='Timestamp', parse_dates=True)
print(df.info())
df.describe()

In [None]:
# Rescale the roll angle (factor pi / 180) so that its values are closer to 1.
convert_to_rad(df, 'roll_angle')

# Columns grouped by the dtype they are cast to at the end of this cell.
float32_cols = [
    'quad_diode',
    'xrsa_flux', 'xrsa_flux_observed', 'xrsa_flux_electrons',
    'xrsb_flux', 'xrsb_flux_observed', 'xrsb_flux_electrons',
    'au_factor', 'corrected_current_xrsb2', 'roll_angle',
    'xrsa1_flux', 'xrsa1_flux_observed', 'xrsa1_flux_electrons',
    'xrsa2_flux', 'xrsa2_flux_observed', 'xrsa2_flux_electrons',
    'xrsb1_flux', 'xrsb1_flux_observed', 'xrsb1_flux_electrons',
    'xrsb2_flux', 'xrsb2_flux_observed', 'xrsb2_flux_electrons',
]
uint16_cols = [
    'xrsa_flag', 'xrsb_flag', 'xrsa_flag_excluded', 'xrsb_flag_excluded',
    'xrs_primary_chan',
    'xrsa1_flag_excluded', 'xrsa2_flag_excluded', 'xrsb1_flag_excluded', 'xrsb2_flag_excluded',
]
int32_cols = ['xrsa_num', 'xrsb_num', 'xrsa1_num', 'xrsa2_num', 'xrsb1_num', 'xrsb2_num']
uint8_cols = ['xrsa1_flag', 'xrsa2_flag', 'xrsb1_flag', 'xrsb2_flag', 'yaw_flip_flag']

# Placeholder written into missing (NaN) entries, per column:
#   float columns -> -1.0 (roll_angle -> -4.0), uint16 -> 65535, uint8 -> 255,
#   counters (*_num) -> -1.
nan_fill = {name: -1.0 for name in float32_cols}
nan_fill['roll_angle'] = -4.0
nan_fill.update(dict.fromkeys(uint16_cols, 65535))
nan_fill.update(dict.fromkeys(uint8_cols, 255))
nan_fill.update({'xrsa_num': -1.0, 'xrsb_num': -1.0})
nan_fill.update(dict.fromkeys(['xrsa1_num', 'xrsa2_num', 'xrsb1_num', 'xrsb2_num'], -1))

for column, placeholder in nan_fill.items():
    df[column] = df[column].fillna(placeholder)

# Cast every column to its compact target dtype.
dtype_map = {
    **dict.fromkeys(float32_cols, np.float32),
    **dict.fromkeys(uint16_cols, np.uint16),
    **dict.fromkeys(int32_cols, np.int32),
    **dict.fromkeys(uint8_cols, np.uint8),
}
df = df.astype(dtype_map)
print(df.info())

### Conversion

Now that it is clear how the data must be transformed, the transformations can be applied to all csv files.

In [None]:
# Convert every raw GOES csv file to a parquet file.

# Columns grouped by the dtype they are cast to.
_GOES_FLOAT32_COLS = [
    'quad_diode',
    'xrsa_flux', 'xrsa_flux_observed', 'xrsa_flux_electrons',
    'xrsb_flux', 'xrsb_flux_observed', 'xrsb_flux_electrons',
    'au_factor', 'corrected_current_xrsb2', 'roll_angle',
    'xrsa1_flux', 'xrsa1_flux_observed', 'xrsa1_flux_electrons',
    'xrsa2_flux', 'xrsa2_flux_observed', 'xrsa2_flux_electrons',
    'xrsb1_flux', 'xrsb1_flux_observed', 'xrsb1_flux_electrons',
    'xrsb2_flux', 'xrsb2_flux_observed', 'xrsb2_flux_electrons',
]
_GOES_UINT16_COLS = [
    'xrsa_flag', 'xrsb_flag', 'xrsa_flag_excluded', 'xrsb_flag_excluded',
    'xrs_primary_chan',
    'xrsa1_flag_excluded', 'xrsa2_flag_excluded', 'xrsb1_flag_excluded', 'xrsb2_flag_excluded',
]
_GOES_INT32_COLS = ['xrsa_num', 'xrsb_num', 'xrsa1_num', 'xrsa2_num', 'xrsb1_num', 'xrsb2_num']
_GOES_UINT8_COLS = ['xrsa1_flag', 'xrsa2_flag', 'xrsb1_flag', 'xrsb2_flag', 'yaw_flip_flag']

_GOES_DTYPES = {
    **dict.fromkeys(_GOES_FLOAT32_COLS, np.float32),
    **dict.fromkeys(_GOES_UINT16_COLS, np.uint16),
    **dict.fromkeys(_GOES_INT32_COLS, np.int32),
    **dict.fromkeys(_GOES_UINT8_COLS, np.uint8),
}

# Placeholder written into missing (NaN) entries, per column. The later
# 'roll_angle' entry overrides the generic float placeholder.
_GOES_NAN_FILL = {
    **dict.fromkeys(_GOES_FLOAT32_COLS, -1.0),
    **dict.fromkeys(_GOES_INT32_COLS, -1.0),
    **dict.fromkeys(_GOES_UINT16_COLS, 65535),
    **dict.fromkeys(_GOES_UINT8_COLS, 255),
    'roll_angle': -4.0,
}


def _preprocess_goes(_df):
    """Rescale, fill and down-cast one raw GOES frame.

    ``_df`` is modified in place (roll angle rescaled by pi / 180, NaN entries
    replaced by the placeholders in ``_GOES_NAN_FILL``); the returned frame is
    the result of casting it to ``_GOES_DTYPES``.

    Raises ``KeyError`` if one of the expected columns is missing.
    """
    convert_to_rad(_df, 'roll_angle')
    for column, placeholder in _GOES_NAN_FILL.items():
        _df[column] = _df[column].fillna(placeholder)
    return _df.astype(_GOES_DTYPES)


# ``rglob`` picks up csv files lying directly in the raw directory as well as
# csv files inside sub-folders, and skips any non-csv entry. Sorting makes the
# processing order reproducible.
goes_csv_files = sorted(goes_raw_data_dir.rglob('*.csv'))

for csv_file in tqdm(goes_csv_files):
    df = pd.read_csv(
        csv_file,
        sep=',',
        index_col='Timestamp',
        parse_dates=True,
    )
    df = _preprocess_goes(df)

    # Mirror the file's location relative to the raw directory below the
    # output directory; for a file directly in the raw directory this is
    # simply ``goes_out_dir / '<stem>.parquet'``.
    out_file = goes_out_dir / csv_file.relative_to(goes_raw_data_dir).with_suffix('.parquet')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_file)

## OMNI data

First of all, the OMNI data is copied from `./data/raw/OMNI2/*.csv` to `./data/preprocessed/OMNI2/*.parquet`.

In [None]:
# Input / output locations of the OMNI2 data, both below ``data_dir``.
omni_raw_data_dir = data_dir / 'raw/OMNI2/'
omni_out_dir = data_dir / 'preprocessed/OMNI2/'
# omni_aggregated_file = Path(data_dir / 'preprocessed/aggregated/omni2.feather')

In [None]:
# helper functions
def _convert_nt_to_ut(_df, _key):
    _df[_key] = _df[_key] * 1e-3
    _df.rename(columns={_key: _key.replace('_nT', '_uT')}, inplace=True)

def _convert_T_to_10_5_T(_df, _key):
    _df[_key] = _df[_key] / 1e5
    _df.rename(columns={_key: _key.replace('_K', '_10_5K')}, inplace=True)

def _convert_km_s_to_km_ms(_df, _key):
    _df[_key] = _df[_key] / 1e3
    _df.rename(columns={_key: _key.replace('_km_s', '_km_ms')}, inplace=True)

### First tests

Before creating the more compact data sets used during training, a first analysis takes place. This allows choosing the correct data type for each column. In this instance the precision of `float64` is probably not needed, hence all data is converted down to `float32`. This massively reduces the memory needed for this data.
(see [this](https://i.sstatic.net/V7kvk.png) for more details of how the correct data type has been chosen)

In [None]:
# Load one exemplary OMNI2 csv file (the first entry returned by
# ``os.listdir``) and show its structure.
file = next(iter(os.listdir(omni_raw_data_dir)))
sample_csv = omni_raw_data_dir / file
df = pd.read_csv(sample_csv, sep=',', index_col='Timestamp', parse_dates=True)
print(df.info())
df.describe()

In [None]:
# Magnetic-field style columns: nT -> uT (the columns are renamed accordingly).
nt_cols = [
    'Scalar_B_nT', 'Vector_B_Magnitude_nT',
    'BX_nT_GSE_GSM', 'BY_nT_GSE', 'BZ_nT_GSE', 'BY_nT_GSM', 'BZ_nT_GSM',
    'RMS_magnitude_nT', 'RMS_field_vector_nT', 'RMS_BX_GSE_nT', 'RMS_BY_GSE_nT', 'RMS_BZ_GSE_nT',
    'Dst_index_nT', 'ap_index_nT',
    'AE_index_nT', 'AL_index_nT', 'AU_index_nT',
]
for key in nt_cols:
    _convert_nt_to_ut(df, key)

# Angle columns: rescale by pi / 180 (names are kept); afterwards everything
# above 7 is treated as an outlier and replaced by the placeholder -4.0.
angle_cols = [
    'Lat_Angle_of_B_GSE', 'Long_Angle_of_B_GSE',
    'SW_Plasma_flow_long_angle', 'SW_Plasma_flow_lat_angle',
    'sigma_phi_V_degrees', 'sigma_theta_V_degrees',
]
for key in angle_cols:
    convert_to_rad(df, key)
    is_outlier = df[key] > 7
    df.loc[is_outlier, key] = -4.0

# Temperatures: divide by 1e5; speeds: divide by 1e3 (both renamed).
temperature_cols = ['SW_Plasma_Temperature_K', 'sigma_T_K']
for key in temperature_cols:
    _convert_T_to_10_5_T(df, key)

speed_cols = ['SW_Plasma_Speed_km_s', 'sigma_V_km_s']
for key in speed_cols:
    _convert_km_s_to_km_ms(df, key)

# Ratios above 1.0 are replaced by the placeholder -1.0.
for key in ['Alpha_Prot_ratio', 'sigma_ratio']:
    df.loc[df[key] > 1.0, key] = -1.0

# Proton fluxes: divide by 1e3, replace scaled values above 9e2 by -1.0 and
# append the scale to the column name.
for i in [1, 2, 4, 10, 30, 60]:
    key = f'Proton_flux_>{i}_Mev'
    df[key] = df[key] / 1e3
    is_outlier = df[key] > 9e2
    df.loc[is_outlier, key] = -1.0
    df.rename(columns={key: f'Proton_flux_>{i}_Mev_10^-3'}, inplace=True)

# Cast every column to its compact target dtype.
uint16_cols = [
    'YEAR', 'DOY', 'Bartels_rotation_number',
    'ID_for_IMF_spacecraft', 'ID_for_SW_Plasma_spacecraft',
]
int16_cols = ['num_points_IMF_averages', 'num_points_Plasma_averages']
float32_cols = [
    'Scalar_B_uT', 'Vector_B_Magnitude_uT',
    'Lat_Angle_of_B_GSE', 'Long_Angle_of_B_GSE',
    'BX_uT_GSE_GSM', 'BY_uT_GSE', 'BZ_uT_GSE', 'BY_uT_GSM', 'BZ_uT_GSM',
    'RMS_magnitude_uT', 'RMS_field_vector_uT', 'RMS_BX_GSE_uT', 'RMS_BY_GSE_uT', 'RMS_BZ_GSE_uT',
    'SW_Plasma_Temperature_10_5K', 'SW_Proton_Density_N_cm3', 'SW_Plasma_Speed_km_ms',
    'SW_Plasma_flow_long_angle', 'SW_Plasma_flow_lat_angle',
    'Alpha_Prot_ratio',
    'sigma_T_10_5K', 'sigma_n_N_cm3', 'sigma_V_km_ms',
    'sigma_phi_V_degrees', 'sigma_theta_V_degrees', 'sigma_ratio',
    'Flow_pressure', 'E_electric_field', 'Plasma_Beta',
    'Alfen_mach_number', 'Magnetosonic_Mach_number', 'Quasy_Invariant',
    'Kp_index', 'R_Sunspot_No', 'Dst_index_uT', 'ap_index_uT', 'f10.7_index',
    'AE_index_uT', 'AL_index_uT', 'AU_index_uT', 'pc_index', 'Lyman_alpha',
    'Proton_flux_>1_Mev_10^-3', 'Proton_flux_>2_Mev_10^-3', 'Proton_flux_>4_Mev_10^-3',
    'Proton_flux_>10_Mev_10^-3', 'Proton_flux_>30_Mev_10^-3', 'Proton_flux_>60_Mev_10^-3',
]
dtype_map = {
    'Hour': np.uint8,
    'Flux_FLAG': np.int8,
    **dict.fromkeys(uint16_cols, np.uint16),
    **dict.fromkeys(int16_cols, np.int16),
    **dict.fromkeys(float32_cols, np.float32),
}
df = df.astype(dtype_map)
print(df.info())
df.describe()

In [None]:
# Convert every raw OMNI2 csv file to a parquet file.

# Columns given in nT; rescaled to uT and renamed ('_nT' -> '_uT').
_OMNI_NT_COLS = [
    'Scalar_B_nT', 'Vector_B_Magnitude_nT',
    'BX_nT_GSE_GSM', 'BY_nT_GSE', 'BZ_nT_GSE', 'BY_nT_GSM', 'BZ_nT_GSM',
    'RMS_magnitude_nT', 'RMS_field_vector_nT', 'RMS_BX_GSE_nT', 'RMS_BY_GSE_nT', 'RMS_BZ_GSE_nT',
    'Dst_index_nT', 'ap_index_nT',
    'AE_index_nT', 'AL_index_nT', 'AU_index_nT',
]
# Angle columns; rescaled by pi / 180, names are kept.
_OMNI_ANGLE_COLS = [
    'Lat_Angle_of_B_GSE', 'Long_Angle_of_B_GSE',
    'SW_Plasma_flow_long_angle', 'SW_Plasma_flow_lat_angle',
    'sigma_phi_V_degrees', 'sigma_theta_V_degrees',
]
# Lower energy bounds appearing in the 'Proton_flux_>{i}_Mev' column names.
_OMNI_PROTON_FLUX_LEVELS = [1, 2, 4, 10, 30, 60]

# Target dtype per (already renamed) column.
_OMNI_DTYPES = {
    'Hour': np.uint8,
    'Flux_FLAG': np.int8,
    **dict.fromkeys([
        'YEAR', 'DOY', 'Bartels_rotation_number',
        'ID_for_IMF_spacecraft', 'ID_for_SW_Plasma_spacecraft',
    ], np.uint16),
    **dict.fromkeys(['num_points_IMF_averages', 'num_points_Plasma_averages'], np.int16),
    **dict.fromkeys([
        'Scalar_B_uT', 'Vector_B_Magnitude_uT',
        'Lat_Angle_of_B_GSE', 'Long_Angle_of_B_GSE',
        'BX_uT_GSE_GSM', 'BY_uT_GSE', 'BZ_uT_GSE', 'BY_uT_GSM', 'BZ_uT_GSM',
        'RMS_magnitude_uT', 'RMS_field_vector_uT', 'RMS_BX_GSE_uT', 'RMS_BY_GSE_uT', 'RMS_BZ_GSE_uT',
        'SW_Plasma_Temperature_10_5K', 'SW_Proton_Density_N_cm3', 'SW_Plasma_Speed_km_ms',
        'SW_Plasma_flow_long_angle', 'SW_Plasma_flow_lat_angle',
        'Alpha_Prot_ratio',
        'sigma_T_10_5K', 'sigma_n_N_cm3', 'sigma_V_km_ms',
        'sigma_phi_V_degrees', 'sigma_theta_V_degrees', 'sigma_ratio',
        'Flow_pressure', 'E_electric_field', 'Plasma_Beta',
        'Alfen_mach_number', 'Magnetosonic_Mach_number', 'Quasy_Invariant',
        'Kp_index', 'R_Sunspot_No', 'Dst_index_uT', 'ap_index_uT', 'f10.7_index',
        'AE_index_uT', 'AL_index_uT', 'AU_index_uT', 'pc_index', 'Lyman_alpha',
        'Proton_flux_>1_Mev_10^-3', 'Proton_flux_>2_Mev_10^-3', 'Proton_flux_>4_Mev_10^-3',
        'Proton_flux_>10_Mev_10^-3', 'Proton_flux_>30_Mev_10^-3', 'Proton_flux_>60_Mev_10^-3',
    ], np.float32),
}


def _preprocess_omni(_df):
    """Rescale, clean and down-cast one raw OMNI2 frame.

    ``_df`` is modified in place (values rescaled, outliers replaced by
    placeholders, several columns renamed); the returned frame is the result
    of casting it to ``_OMNI_DTYPES``.

    Raises ``KeyError`` if one of the expected columns is missing.
    """
    for key in _OMNI_NT_COLS:
        _convert_nt_to_ut(_df, key)

    for key in _OMNI_ANGLE_COLS:
        convert_to_rad(_df, key)
        _df.loc[_df[key] > 7, key] = -4.0     # these are outliers

    for key in ['SW_Plasma_Temperature_K', 'sigma_T_K']:
        _convert_T_to_10_5_T(_df, key)

    for key in ['SW_Plasma_Speed_km_s', 'sigma_V_km_s']:
        _convert_km_s_to_km_ms(_df, key)

    # Ratios above 1.0 are replaced by the placeholder -1.0.
    for key in ['Alpha_Prot_ratio', 'sigma_ratio']:
        _df.loc[_df[key] > 1.0, key] = -1.0

    # Proton fluxes: divide by 1e3, replace scaled values above 9e2 by -1.0
    # and append the scale to the column name.
    for i in _OMNI_PROTON_FLUX_LEVELS:
        key = f'Proton_flux_>{i}_Mev'
        _df[key] = _df[key] / 1e3
        _df.loc[_df[key] > 9e2, key] = -1.0
        _df.rename(columns={key: f'Proton_flux_>{i}_Mev_10^-3'}, inplace=True)

    return _df.astype(_OMNI_DTYPES)


for csv_file in tqdm(os.listdir(omni_raw_data_dir)):
    # Read the file of the *current* iteration. Reading the sample ``file``
    # chosen in the exploratory cell would write the same data into every
    # output parquet file.
    df = pd.read_csv(
        omni_raw_data_dir / csv_file,
        sep=',',
        index_col='Timestamp',
        parse_dates=True,
    )
    df = _preprocess_omni(df)
    df.to_parquet(omni_out_dir / f'{Path(csv_file).stem}.parquet')

## SAT_DENSITY data

First of all, the SAT_DENSITY data is copied from `./data/raw/SAT_DENSITY/*.csv` to `./data/preprocessed/SAT_DENSITY/*.parquet`.

In [None]:
# Input / output locations of the SAT_DENSITY data, both below ``data_dir``.
sd_raw_data_dir = data_dir / 'raw/SAT_DENSITY/'
sd_out_dir = data_dir / 'preprocessed/SAT_DENSITY/'
# goes_aggregated_file = Path(data_dir / 'preprocessed/aggregated/sat_density.feather')

### First tests

Before creating the more compact data sets used during training, a first analysis takes place. This allows choosing the correct data type for each column. In this instance the precision of `float64` is probably not needed, hence all data is converted down to `float32`. This massively reduces the memory needed for this data.

In [None]:
# Load one exemplary SAT_DENSITY csv file (the first entry returned by
# ``os.listdir``) and show its structure.
file = next(iter(os.listdir(sd_raw_data_dir)))
sample_csv = sd_raw_data_dir / file
df = pd.read_csv(sample_csv, sep=',', index_col='Timestamp', parse_dates=True)
print(df.info())
df.describe()

In [None]:
# Replace implausible densities, rescale and down-cast.
density_col = 'Orbit Mean Density (kg/m^3)'

# Rows whose density exceeds 1e2 are overwritten with -1.0. The boolean mask
# selects whole rows, so *every* column of such a row receives the placeholder.
implausible = df[density_col] > 1e+2
df[implausible] = -1.0

# NOTE(review): the scaling is applied after the placeholder was written, so
# -1.0 entries in the density column end up as -1000.0 -- confirm this is
# intended.
df[density_col] = df[density_col] * 1e3

# NOTE(review): check that scaling and target dtype chosen here agree with the
# cell that writes the parquet files.
df = df.astype(np.float32)
print(df.info())
df.describe()

### Conversion

Now that it is clear how the data must be transformed, the transformations can be applied to all csv files.

In [None]:
# Convert every entry of the raw SAT_DENSITY directory to a parquet file.
density_col = 'Orbit Mean Density (kg/m^3)'

for csv_file in tqdm(os.listdir(sd_raw_data_dir)):
    source = sd_raw_data_dir / csv_file
    target = sd_out_dir / f'{Path(csv_file).stem}.parquet'

    df = pd.read_csv(source, sep=',', index_col='Timestamp', parse_dates=True)

    # Rows whose density exceeds 1e2 are overwritten with -1.0 (the boolean
    # mask selects whole rows, i.e. all columns of such a row).
    implausible = df[density_col] > 1e+2
    df[implausible] = -1.0

    # The data is stored as float64 here.
    df = df.astype(np.float64)
    df.to_parquet(target)

## Initial states data

First of all, the initial states data is copied from `./data/raw/*.csv` to `./data/preprocessed/*.parquet`.

In [None]:
# Input / output locations of the initial-states data, both below ``data_dir``.
is_raw_data_dir = data_dir / 'raw/'
is_out_dir = data_dir / 'preprocessed/'
# goes_aggregated_file = Path(data_dir / 'preprocessed/aggregated/sat_density.feather')

In [None]:
# helper functions with additional features
def _scale_km(_df, _key):
    """Apply ``scale_km`` to column ``_key`` and relabel its unit.

    Works in place on ``_df``: every '(km)' in the column name is replaced by
    '(1000 km)'. Returns ``None``.
    """
    scale_km(_df, _key)
    relabelled = _key.replace('(km)', '(1000 km)')
    _df.rename(columns={_key: relabelled}, inplace=True)

def _convert_to_rad(_df, _key):
    """Apply ``convert_to_rad`` to column ``_key`` and relabel its unit.

    Works in place on ``_df``: every '(deg)' in the column name is replaced by
    '(rad)'. Returns ``None``.
    """
    convert_to_rad(_df, _key)
    relabelled = _key.replace('(deg)', '(rad)')
    _df.rename(columns={_key: relabelled}, inplace=True)

### First tests

Before creating the more compact data sets used during training, a first analysis takes place. This allows choosing the correct data type for each column. In this instance the precision of `float64` is probably not needed, hence all data is converted down to `float32`. This massively reduces the memory needed for this data.

In [None]:
# Load one exemplary initial-states csv file (the first '.csv' entry returned
# by ``os.listdir``) and show its structure.
file = next(name for name in os.listdir(is_raw_data_dir) if name.endswith('.csv'))
sample_csv = is_raw_data_dir / file
df = pd.read_csv(sample_csv, sep=',')
print(df.info())
df.describe()

In [None]:
# Rescale all angle columns by pi / 180. The underscore-prefixed helper must be
# used here: besides rescaling it renames '... (deg)' to '... (rad)', and those
# new names are what the outlier handling and the dtype mapping below refer to
# (with the plain ``convert_to_rad`` the lookups below raise ``KeyError``).
for key in [
    'Inclination (deg)', 'RAAN (deg)', 'Argument of Perigee (deg)', 'True Anomaly (deg)', 'Latitude (deg)', 'Longitude (deg)'
]:
    _convert_to_rad(df, key)

# Divide by 1000 and rename '... (km)' to '... (1000 km)'.
_scale_km(df, 'Altitude (km)')
_scale_km(df, 'Semi-major Axis (km)')

# Catch obvious outliers: values above the threshold are replaced by a
# negative placeholder.
df.loc[df['Longitude (rad)'] > 4, 'Longitude (rad)'] = -4.0
df.loc[df['Latitude (rad)'] > 4, 'Latitude (rad)'] = -4.0
df.loc[df['Altitude (1000 km)'] > 1e+6, 'Altitude (1000 km)'] = -1.0

# change dtypes (semi-major axis and altitude are kept as float64)
df = df.astype({
    'File ID': np.int16,
    'Semi-major Axis (1000 km)': np.float64,
    'Eccentricity': np.float32,
    'Inclination (rad)': np.float32,
    'RAAN (rad)': np.float32,
    'Argument of Perigee (rad)': np.float32,
    'True Anomaly (rad)': np.float32,
    'Latitude (rad)': np.float32,
    'Longitude (rad)': np.float32,
    'Altitude (1000 km)': np.float64,
})

print(df.info())
df.describe()

### Conversion

Now that it is clear how the data must be transformed, the transformations can be applied to all csv files.

In [None]:
# Convert every csv file lying directly in the raw directory to parquet.
is_csv_names = [name for name in os.listdir(is_raw_data_dir) if name.endswith('.csv')]

# Angle columns as named in the raw files / after ``_convert_to_rad``.
is_deg_cols = [
    'Inclination (deg)', 'RAAN (deg)', 'Argument of Perigee (deg)',
    'True Anomaly (deg)', 'Latitude (deg)', 'Longitude (deg)',
]
is_rad_cols = [
    'Inclination (rad)', 'RAAN (rad)', 'Argument of Perigee (rad)',
    'True Anomaly (rad)', 'Latitude (rad)', 'Longitude (rad)',
]

# Target dtypes; semi-major axis and altitude are kept as float64.
is_dtype_map = {
    'File ID': np.int16,
    'Eccentricity': np.float32,
    **dict.fromkeys(is_rad_cols, np.float32),
    **dict.fromkeys(['Semi-major Axis (1000 km)', 'Altitude (1000 km)'], np.float64),
}

for csv_file in tqdm(is_csv_names):
    df = pd.read_csv(is_raw_data_dir / csv_file, sep=',')

    # Rescale by pi / 180 and rename '(deg)' -> '(rad)'.
    for key in is_deg_cols:
        _convert_to_rad(df, key)

    # Divide by 1000 and rename '(km)' -> '(1000 km)'.
    for key in ['Altitude (km)', 'Semi-major Axis (km)']:
        _scale_km(df, key)

    # Catch obvious outliers: values above the threshold are replaced by a
    # negative placeholder.
    for key, threshold, placeholder in [
        ('Longitude (rad)', 4, -4.0),
        ('Latitude (rad)', 4, -4.0),
        ('Altitude (1000 km)', 1e+6, -1.0),
    ]:
        df.loc[df[key] > threshold, key] = placeholder

    df = df.astype(is_dtype_map)

    # Save as parquet
    target = is_out_dir / f'{Path(csv_file).stem}.parquet'
    df.to_parquet(target)