# Dataset

> Scripts to build the different datasets used for modeling

In [2]:
#| default_exp dataset

In [3]:
#| export
#| hide
import os
import h5py
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict

from orbit_generation.processing import pad_and_convert_to_3d, segment_and_convert_to_3d, add_time_vector_to_orbits

## Read Data

In [4]:
#| export
def get_orbit_data_from_hdf5(file_path: str                   # Path to the HDF5 file.
                            ) -> Tuple[Dict[int, np.ndarray], # Dictionary of orbits with numerical keys.
                                    pd.DataFrame,             # DataFrame containing orbit features.
                                    Dict[str, float]]:        # Dictionary containing system features.
    """
    Read the orbits, the orbit feature table and the system features from an HDF5 file.

    The file is expected to contain the datasets `not_propagated_orbits`,
    `system_features`, `system_labels`, `orbit_features` and `orbit_labels`,
    plus one dataset per orbit whose name consists only of digits.

    Rows of the feature table listed in `not_propagated_orbits` are dropped, and
    the orbit datasets are re-keyed 0..n-1 following their ascending numeric name.
    """
    with h5py.File(file_path, 'r') as hdf:
        # Stored indices are shifted down by one before being used as row labels
        # (presumably a 1-based -> 0-based conversion; confirm against the file writer)
        rows_to_drop = []
        for stored_index in hdf['not_propagated_orbits'][0].tolist():
            rows_to_drop.append(stored_index - 1)

        # System features: each label is paired with the first element of its feature row
        system_values = hdf['system_features'][:]
        system_names = hdf['system_labels'][:].astype(str)
        system_dict = {}
        for name, feature_row in zip(system_names.flatten().tolist(), system_values):
            system_dict[name] = feature_row[0]

        # Orbit features: the stored matrix is transposed before being wrapped in a
        # DataFrame, with the flattened labels as column names
        feature_matrix = hdf['orbit_features'][:]
        column_names = hdf['orbit_labels'][:].astype(str).flatten().tolist()
        full_table = pd.DataFrame(feature_matrix.T, columns=column_names)

        # Discard the rows of the orbits flagged as not propagated and renumber the rest
        orbit_df = full_table.drop(rows_to_drop).reset_index(drop=True)

        # Collect every dataset whose name is purely numeric, keyed by its integer value
        orbits_by_number = {}
        for dataset_name in hdf.keys():
            if dataset_name.isdigit():
                orbits_by_number[int(dataset_name)] = hdf[dataset_name][:]

        # Re-key the orbits consecutively from 0, in ascending order of the original number
        orbits = {}
        for position, number in enumerate(sorted(orbits_by_number)):
            orbits[position] = orbits_by_number[number]

    return orbits, orbit_df, system_dict

In [5]:
#| export
def get_orbit_features_from_hdf5(file_path: str          # Path to the HDF5 file.
                                ) -> pd.DataFrame:       # DataFrame containing orbit features.
    """
    Read only the orbit feature table from an HDF5 file.

    The file is expected to contain the datasets `not_propagated_orbits`,
    `orbit_features` and `orbit_labels`. Rows listed in `not_propagated_orbits`
    are dropped and the remaining rows are renumbered from 0.
    """
    with h5py.File(file_path, 'r') as hdf:
        # Stored indices are shifted down by one before being used as row labels
        # (presumably a 1-based -> 0-based conversion; confirm against the file writer)
        rows_to_drop = []
        for stored_index in hdf['not_propagated_orbits'][0].tolist():
            rows_to_drop.append(stored_index - 1)

        # The stored matrix is transposed before being wrapped in a DataFrame,
        # with the flattened labels as column names
        feature_matrix = hdf['orbit_features'][:]
        column_names = hdf['orbit_labels'][:].astype(str).flatten().tolist()
        full_table = pd.DataFrame(feature_matrix.T, columns=column_names)

        # Discard the rows of the orbits flagged as not propagated and renumber the rest
        orbit_df = full_table.drop(rows_to_drop).reset_index(drop=True)

    return orbit_df

## Fixed-period datasets

In [6]:
#| export
def get_first_period_dataset(file_path: str                  # Path to the HDF5 file.
                            ) -> Tuple[np.ndarray,          # 3D numpy array of padded orbits.
                                       pd.DataFrame,        # DataFrame containing orbit features.
                                       Dict[str, float]]:   # Dictionary containing system features.
    """
    Load orbit data from an HDF5 file and, for 'N' files, pad it into a single array.

    The file name (up to its first dot) is split on underscores. When the second
    token is 'N', a time vector is added to every orbit and the orbits are padded
    using the fourth token, parsed as an integer, as the number of timesteps.
    For any other second token the orbits are returned exactly as loaded.
    """
    loaded_orbits, orbit_df, system_dict = get_orbit_data_from_hdf5(file_path)

    # Per-orbit values read from the feature table; they feed the time-vector helper
    propagated_per_orbit = orbit_df['propagated_periods'].tolist()
    period_per_orbit = orbit_df['period'].tolist()

    # File name up to its first dot, broken into underscore-separated tokens
    base_name = os.path.basename(file_path)
    name_tokens = base_name.split('.')[0].split('_')

    # Files whose second token is not 'N' are handed back unprocessed
    if name_tokens[1] != 'N':
        return loaded_orbits, orbit_df, system_dict

    timed_orbits = add_time_vector_to_orbits(loaded_orbits, propagated_per_orbit, period_per_orbit)

    # The fourth token of the file name is used as the number of timesteps for padding
    padded_orbits = pad_and_convert_to_3d(timed_orbits, int(name_tokens[3]))

    return padded_orbits, orbit_df, system_dict


## Fixed-step dataset

In [7]:
#| export
def get_segmented_dataset(file_path: str,                     # Path to the HDF5 file.
                          segment_length: int                 # Desired length of each segment.
                         ) -> Tuple[np.ndarray,               # 3D numpy array of segmented orbits.
                                    pd.DataFrame,             # DataFrame containing orbit features.
                                    List[int],                # List of IDs representing each new segment.
                                    Dict[str, float]]:        # Dictionary containing system features.
    """
    Load orbit data from an HDF5 file and segment each orbit into pieces of `segment_length`.

    Only files whose name has 'dt' as its second underscore-separated part are
    supported (a name of the form `<prefix>_dt[_...][.ext]`). Any other file name
    raises a `ValueError`; previously such a file failed with an
    `UnboundLocalError` on return because the segment IDs were never assigned.
    """
    # Validate the file name before opening the file so unsupported input fails fast
    name_parts = os.path.basename(file_path).split('_')

    # Ignore anything after a dot in the second part, so 'X_dt.h5' is recognised
    # in the same way as 'X_dt_more.h5'
    dataset_kind = name_parts[1].split('.')[0] if len(name_parts) > 1 else None

    if dataset_kind != 'dt':
        raise ValueError(
            f"Cannot segment '{file_path}': expected 'dt' as the second "
            f"underscore-separated part of the file name, got {dataset_kind!r}."
        )

    # Load the orbit data, features dataframe, and system dictionary from the HDF5 file
    orbits, orbit_df, system_dict = get_orbit_data_from_hdf5(file_path)

    # Segment the orbits and get the corresponding segment IDs
    orbits, orbits_ids = segment_and_convert_to_3d(orbits, segment_length)

    return orbits, orbit_df, orbits_ids, system_dict


In [8]:
#| hide
# Run nbdev's export step (writes the `#| export` cells out to the library module)
import nbdev; nbdev.nbdev_export()