# Dataset

> Scripts to build the different datasets used for modeling

In [1]:
#| default_exp dataset

In [2]:
#| export
#| hide
import os
import h5py
import numpy as np
import pandas as pd
from typing import Tuple, Optional, List, Dict
from orbit_generation.data import load_orbit_data, get_orbit_features

## Exact Periods with Time

### All 5 periods

In [3]:
#| export
def get_5p_em_dataset(data_directory: Optional[str] = '../data',
                      output_file_path: Optional[str] = '../data/5p_dataset_em.npy') -> Tuple[np.memmap, np.ndarray]:
    """
    Build (or reload) the dataset in which every orbit is split into 5 samples of 1500 steps.

    If `output_file_path` already exists, it is opened as a memory-mapped array and the
    labels are read from the sibling file `<output stem>_labels.npy`. Otherwise the raw
    orbits (`em_orbits.h5`) and features (`em_features.mat`) are read from
    `data_directory`, reshaped, saved to `output_file_path` / the labels file, and the
    saved data is returned as a memory-mapped array.

    Returns a `(data, labels)` tuple. `data` has shape `(n_orbits * 5, 7, 1500)` and
    `labels` holds the 'Orbit Family' value of each orbit repeated 5 times, so that
    sample `i` belongs to orbit `i // 5`.

    Raises `ValueError` for an unknown file extension and `NotImplementedError` for
    '.hdf5', which is accepted by the validation but has no reader/writer yet.
    """
    periods_per_orbit = 5
    steps_per_period = 1500
    n_channels = 7

    # Extract file extension to determine the type
    stem, file_extension = os.path.splitext(output_file_path)

    # Validate supported file types
    if file_extension not in ['.hdf5', '.npy']:
        raise ValueError("Unsupported file extension. Supported extensions are '.hdf5' or '.npy'.")

    # Fail fast: without this guard an '.hdf5' path either returned None as data
    # (cached case) or crashed with UnboundLocalError after loading everything.
    if file_extension != '.npy':
        raise NotImplementedError("HDF5 output is not implemented yet; use a '.npy' output path.")

    # Labels live next to the data file
    labels_path = f"{stem}_labels.npy"

    # Reuse the cached dataset when it is already on disk
    if os.path.exists(output_file_path):
        return np.load(output_file_path, mmap_mode='r+'), np.load(labels_path)

    # Paths to data files
    orbits_file_path = os.path.join(data_directory, "em_orbits.h5")
    features_file_path = os.path.join(data_directory, "em_features.mat")

    # One label per orbit, repeated once per period so it lines up with the split samples
    labels_df = get_orbit_features(features_file_path, variable_name='out_EM')
    labels = np.repeat(pd.Series(labels_df['Orbit Family']).to_numpy(), periods_per_orbit)

    # Split the time axis into periods; -1 lets numpy infer the number of orbits
    # instead of assuming exactly 36071 of them.
    orbit_data = load_orbit_data(orbits_file_path, dataset_path='/files/PERIODIC ORBITS')
    per_period = orbit_data.reshape(-1, n_channels, periods_per_orbit, steps_per_period)
    # Bring the period axis next to the orbit axis, then merge both into the sample axis
    orbit_data_final = per_period.transpose(0, 2, 1, 3).reshape(-1, n_channels, steps_per_period)

    # Persist, then hand back the on-disk copy as a memmap
    np.save(output_file_path, orbit_data_final)
    np.save(labels_path, labels)
    return np.load(output_file_path, mmap_mode='r+'), labels

### 1st period

In [4]:
#| export
def get_1p_em_dataset(data_directory: Optional[str] = '../data',
                      output_file_path: Optional[str] = '../data/1p_dataset_em.npy') -> Tuple[np.memmap, np.ndarray]:
    """
    Build (or reload) the dataset that keeps only the first 1500 time steps of every orbit.

    If `output_file_path` already exists, it is opened as a memory-mapped array and the
    labels are read from the sibling file `<output stem>_labels.npy`. Otherwise the raw
    orbits (`em_orbits.h5`) and features (`em_features.mat`) are read from
    `data_directory`, the last axis of the orbit array is cut to its first 1500 entries,
    both arrays are saved, and the saved data is returned as a memory-mapped array.

    Returns a `(data, labels)` tuple where `labels` is the 'Orbit Family' column.

    Raises `ValueError` for an unknown file extension and `NotImplementedError` for
    '.hdf5', which is accepted by the validation but has no reader/writer yet.
    """
    steps_per_period = 1500

    # Extract file extension to determine the type
    stem, file_extension = os.path.splitext(output_file_path)

    # Validate supported file types
    if file_extension not in ['.hdf5', '.npy']:
        raise ValueError("Unsupported file extension. Supported extensions are '.hdf5' or '.npy'.")

    # Fail fast: without this guard an '.hdf5' path either returned None as data
    # (cached case) or crashed with UnboundLocalError after loading everything.
    if file_extension != '.npy':
        raise NotImplementedError("HDF5 output is not implemented yet; use a '.npy' output path.")

    # Labels live next to the data file
    labels_path = f"{stem}_labels.npy"

    # Reuse the cached dataset when it is already on disk
    if os.path.exists(output_file_path):
        return np.load(output_file_path, mmap_mode='r+'), np.load(labels_path)

    # Paths to data files
    orbits_file_path = os.path.join(data_directory, "em_orbits.h5")
    features_file_path = os.path.join(data_directory, "em_features.mat")

    # Load orbit labels (one per orbit)
    labels_df = get_orbit_features(features_file_path, variable_name='out_EM')
    labels = labels_df['Orbit Family'].to_numpy()

    # Load orbit data and keep the leading block of the time axis
    orbit_data = load_orbit_data(orbits_file_path, dataset_path='/files/PERIODIC ORBITS')
    first_period = orbit_data[:, :, :steps_per_period]

    # Persist, then hand back the on-disk copy as a memmap
    np.save(output_file_path, first_period)
    np.save(labels_path, labels)
    return np.load(output_file_path, mmap_mode='r+'), labels

### Specific Period for each Orbit

In [5]:
#| export
def get_sp_em_dataset(data_directory: Optional[str] = '../data',
                      output_file_path: Optional[str] = '../data/sp_dataset_em.npy') -> Tuple[np.memmap, np.ndarray]:
    """
    Build (or reload) the dataset that keeps a per-orbit number of 1500-step periods.

    If `output_file_path` already exists, it is opened as a memory-mapped array and the
    labels are read from the sibling file `<output stem>_labels.npy`. Otherwise the
    function reads `em_orbits.h5`, `em_features.mat` and `em_periods.npy` from
    `data_directory`. For orbit `i` it emits `em_periods[i]` consecutive slices of 1500
    steps, each labelled with that orbit's 'Orbit Family'. The stacked slices and labels
    are saved and then returned from disk (data as a memory-mapped array).

    Raises `ValueError` for an unknown file extension and `NotImplementedError` for
    '.hdf5', which is accepted by the validation but has no reader/writer yet.
    """
    steps_per_period = 1500

    # Extract file extension to determine the type
    stem, file_extension = os.path.splitext(output_file_path)

    # Validate supported file types
    if file_extension not in ['.hdf5', '.npy']:
        raise ValueError("Unsupported file extension. Supported extensions are '.hdf5' or '.npy'.")

    # Fail fast: without this guard an '.hdf5' path either returned None as data
    # (cached case) or crashed with UnboundLocalError after loading everything.
    if file_extension != '.npy':
        raise NotImplementedError("HDF5 output is not implemented yet; use a '.npy' output path.")

    # Labels live next to the data file
    labels_path = f"{stem}_labels.npy"

    # Reuse the cached dataset when it is already on disk
    if os.path.exists(output_file_path):
        return np.load(output_file_path, mmap_mode='r+'), np.load(labels_path)

    # Paths to data files
    orbits_file_path = os.path.join(data_directory, "em_orbits.h5")
    features_file_path = os.path.join(data_directory, "em_features.mat")
    periods_file_path = os.path.join(data_directory, "em_periods.npy")

    # Load orbit labels (one per orbit)
    labels_df = get_orbit_features(features_file_path, variable_name='out_EM')
    labels = labels_df['Orbit Family'].to_numpy()

    # Load number of periods per orbit
    periods_per_orbit = np.load(periods_file_path)

    # Load orbit data
    with h5py.File(orbits_file_path, 'r') as file:
        orbit_data = np.array(file['/files/PERIODIC ORBITS'])

    orbit_slices = []
    final_labels = []
    for index, num_periods in enumerate(periods_per_orbit):
        # int(): the stored counts may be floating point, which range() rejects
        for period_idx in range(int(num_periods)):
            start_idx = steps_per_period * period_idx
            orbit_slices.append(orbit_data[index, :, start_idx:start_idx + steps_per_period])
            final_labels.append(labels[index])

    # Stack into one uniform 3D array (fails if any slice came out shorter than a full period)
    orbit_data_final = np.stack(orbit_slices)

    # Persist, then return what is actually on disk so both code paths hand back the same objects
    np.save(output_file_path, orbit_data_final)
    np.save(labels_path, final_labels)
    return np.load(output_file_path, mmap_mode='r+'), np.load(labels_path)

In [6]:
# data, labels = get_sp_em_dataset()

## Fixed time step

### Specific Period for each Orbit

TODO: include labels.

In [7]:
#| export
def get_sp_fixed_em_dataset(data_directory: Optional[str] = '../data',
                            output_data_path: Optional[str] = '../data/em_periods.npy',
                            output_periods_path: Optional[str] = '../data/em_periods.npy') -> Tuple[Dict[int, np.ndarray], np.ndarray]:
    """
    Load every dataset of `em_orbits_dt_0_01.h5` and optionally re-save the data and periods.

    All top-level datasets of the file are read into a dictionary. Names made only of
    digits become integer keys, every other name is kept as a string key. `periods` is
    the first row of the 'prop_periods' dataset, or `None` when the file has no such
    dataset.

    `output_data_path` / `output_periods_path` may end in '.npy' or '.h5'; any other
    extension (or a falsy path) writes nothing. The '.npy' data output is a pickled
    dictionary, so it must be read back with `np.load(..., allow_pickle=True).item()`.

    Returns the `(orbit_data, periods)` tuple.
    """
    # Define the default data directory based on the script's location if not provided
    if data_directory is None:
        data_directory = os.path.join(os.path.dirname(__file__), "Data")

    # Define the file path for the orbit data
    file_path = os.path.join(data_directory, "em_orbits_dt_0_01.h5")

    # Read every top-level dataset into memory
    orbit_data = {}
    with h5py.File(file_path, 'r') as file:
        for name in file:
            key = int(name) if name.isdigit() else name
            orbit_data[key] = np.array(file[name])

    # Extract the periods
    periods = orbit_data['prop_periods'][0] if 'prop_periods' in orbit_data else None

    # With the default arguments both outputs point at the same file, so the data dump
    # would be overwritten by the periods a moment later. Skip that wasted write.
    data_would_be_clobbered = output_data_path == output_periods_path

    # Save the entire orbit data if an output path is provided
    if output_data_path and not data_would_be_clobbered:
        if output_data_path.endswith('.npy'):
            np.save(output_data_path, orbit_data)
        elif output_data_path.endswith('.h5'):
            with h5py.File(output_data_path, 'w') as h5_file:
                for key, data in orbit_data.items():
                    # HDF5 object names must be strings; integer keys go back to their digit form
                    h5_file.create_dataset(str(key), data=data)

    # Save periods data if an output path is provided and there is something to save
    if output_periods_path and periods is not None:
        if output_periods_path.endswith('.npy'):
            np.save(output_periods_path, periods)
        elif output_periods_path.endswith('.h5'):
            with h5py.File(output_periods_path, 'w') as h5_file:
                h5_file.create_dataset('periods', data=periods)

    return orbit_data, periods

In [8]:
def get_orbit_data_from_hdf5(file_path: str                  # Path to the HDF5 file.
                         ) -> Tuple[Dict[int, np.ndarray],   # Dictionary of orbits with numerical keys.
                                    pd.DataFrame,            # DataFrame containing orbit features.
                                    Dict[str, float]]:       # Dictionary containing system features.
    """
    Read the orbits, the per-orbit feature table and the system constants from an HDF5 file.

    The orbit dictionary is re-keyed 0..n-1 in ascending order of the original numeric
    dataset names. Rows listed in 'not_propagated_orbits' (shifted down by one, presumably
    from 1-based indices) are removed from the feature table, which is then re-indexed
    from 0.
    """
    # Pull everything needed out of the file first; the arrays are in memory afterwards
    with h5py.File(file_path, 'r') as h5:
        skipped_raw = h5['not_propagated_orbits'][0].tolist()
        system_values = h5['system_features'][:]
        system_names = h5['system_labels'][:].astype(str)
        feature_values = h5['orbit_features'][:]
        feature_names = h5['orbit_labels'][:].astype(str)
        numbered = {int(name): h5[name][:] for name in h5.keys() if name.isdigit()}

    # Indices of the orbits that were not propagated, shifted by one
    skipped_rows = [raw_index - 1 for raw_index in skipped_raw]

    # System constants: label -> first entry of the matching feature row
    system_dict = {}
    for name, values in zip(system_names.flatten().tolist(), system_values):
        system_dict[name] = values[0]

    # Features are stored one orbit per column, hence the transpose
    orbit_df = pd.DataFrame(feature_values.T, columns=feature_names.flatten().tolist())
    orbit_df = orbit_df.drop(skipped_rows)
    orbit_df = orbit_df.reset_index(drop=True)

    # Renumber the orbits consecutively from 0, keeping their numeric order
    orbits = dict(enumerate(numbered[name] for name in sorted(numbered)))

    return orbits, orbit_df, system_dict

In [9]:
def add_time_vector_to_orbits(orbits: Dict[int, np.ndarray],  # Dictionary of orbits with numerical keys.
                              orbits_df: pd.DataFrame         # DataFrame containing orbit features.
                             ) -> Dict[int, np.ndarray]:      # Dictionary of updated orbits with time vectors added.
    """
    Return a new dictionary in which every orbit array has a time row stacked on top.

    The time row of an orbit runs linearly from 0 to `propagated_periods * period`
    (both taken from the row of `orbits_df` whose index label equals the orbit key) and
    has one entry per column of the orbit array. The input dictionary is not modified.
    """
    def _with_time(orbit_id, states):
        # Look up this orbit's features by index label
        features = orbits_df.loc[int(orbit_id)]
        total_time = features['propagated_periods'] * features['period']
        time_row = np.linspace(0, total_time, states.shape[1])
        # The time row becomes row 0; the original rows follow unchanged
        return np.vstack([time_row, states])

    return {orbit_id: _with_time(orbit_id, states) for orbit_id, states in orbits.items()}

In [10]:
def pad_and_convert_to_3d(orbits: Dict[int, np.ndarray],     # Dictionary of orbits with numerical keys.
                          timesteps: int                     # Desired number of timesteps.
                         ) -> np.ndarray:                    # 3D numpy array of padded orbits.
    """
    Bring every orbit to exactly `timesteps` columns and stack them into one 3D array.

    Orbits longer than `timesteps` are cut after their first `timesteps` columns;
    shorter ones are zero-padded at the end of the last axis. The stacking order follows
    the iteration order of the dictionary.
    """
    def _fit(orbit):
        # Columns that can be kept, and how many zero columns are still missing
        kept = min(timesteps, orbit.shape[1])
        missing = timesteps - kept
        return np.pad(orbit[:, :kept], ((0, 0), (0, missing)))

    return np.stack([_fit(orbit) for orbit in orbits.values()])

In [11]:
def segment_and_convert_to_3d(orbits: Dict[int, np.ndarray],  # Dictionary of orbits with numerical keys.
                              segment_length: int             # Desired length of each segment.
                             ) -> Tuple[np.ndarray,           # 3D numpy array of segments.
                                        List[int]]:           # List of IDs representing each new segment.
    """
    Cut every orbit into consecutive, non-overlapping segments and stack all of them.

    Only complete segments are kept: trailing columns that do not fill a whole segment
    are dropped. The returned ID list has one entry per segment, holding the dictionary
    key of the orbit that segment was cut from.
    """
    pieces = []
    owners = []

    for orbit_id, trajectory in orbits.items():
        # Number of leading columns covered by complete segments
        usable = (trajectory.shape[1] // segment_length) * segment_length

        for start in range(0, usable, segment_length):
            pieces.append(trajectory[:, start:start + segment_length])
            owners.append(orbit_id)

    return np.stack(pieces), owners


In [None]:
def get_first_period_dataset(file_path):
    """
    Build a fixed-length 3D dataset from an orbit HDF5 file whose name encodes its layout.

    The file name (without extension) is split on '_'. When its second part is 'N'
    (e.g. 'EM_N_fix_1500.h5'), the fourth part is read as the number of time steps:
    a time row is added to every orbit and each orbit is truncated / zero-padded to that
    many steps before stacking.

    Returns the stacked 3D array. Raises `NotImplementedError` for any other naming
    scheme (e.g. the fixed time-step 'dt' files), for which no processing is defined yet.
    """
    # Drop the extension first, otherwise the last part would read e.g. '1500.h5'
    stem = os.path.splitext(os.path.basename(file_path))[0]
    file_parts = stem.split('_')

    if len(file_parts) < 4 or file_parts[1] != 'N':
        raise NotImplementedError(
            f"No first-period processing is defined for files named like '{os.path.basename(file_path)}'."
        )

    # The step count must be an integer for truncation / padding
    timesteps = int(file_parts[3])

    orbits, orbit_df, _ = get_orbit_data_from_hdf5(file_path)
    orbits = add_time_vector_to_orbits(orbits, orbit_df)
    return pad_and_convert_to_3d(orbits, timesteps)

In [23]:
# Load the `EM_dt_fix_0_01.h5` file. The absolute path below is machine-specific;
# change it to point at your own copy of the .h5 file.
file_path = '/orbit-generation/data/orbits_dt_0_01/EM_dt_fix_0_01.h5'
orbits, orbit_df, system_dict = get_orbit_data_from_hdf5(file_path)


In [13]:
# Load the `EM_N_fix_1500.h5` file (machine-specific absolute path); this rebinds
# `orbits`, `orbit_df` and `system_dict` for the cells that follow.
file_path = '/orbit-generation/data/orbits_fix_1500/EM_N_fix_1500.h5'
orbits, orbit_df, system_dict = get_orbit_data_from_hdf5(file_path)

In [14]:
# Stack a time row on top of every loaded orbit; `orbits` itself is left untouched.
updated_orbits = add_time_vector_to_orbits(orbits, orbit_df)

In [15]:
len(orbits)

45211

In [21]:
45211*15

678165

In [16]:
# Pick a single orbit (dictionary key 1622) for inspection.
orbit=orbits[1622]

In [17]:
orbit.shape

(6, 7496)

In [19]:
# Cut every time-augmented orbit into complete 1499-step segments and check the
# shape of the stacked result; `orbits_ids` maps each segment back to its orbit key.
segmented_orbits, orbits_ids = segment_and_convert_to_3d(updated_orbits,1499)
segmented_orbits.shape

In [18]:
# NOTE: `orbit_labels` is a local variable of get_orbit_data_from_hdf5 and is not
# defined at notebook level; the same names are the column labels of `orbit_df`.
orbit_labels

NameError: name 'orbit_labels' is not defined

In [None]:
from orbit_generation.propagation import calculate_errors
from orbit_generation.constants import MU

In [None]:
# NOTE(review): `orbit` is a single 2-D array here, while the recorded error asks for
# a 3-D (n, 6, m) or (n, 7, m) input. Presumably a leading axis is needed
# (e.g. orbit[np.newaxis]); confirm against calculate_errors.
errors = calculate_errors(orbit, MU, time_step=0.01)

ValueError: Invalid orbit_data shape. Must be (n, 6, m) or (n, 7, m)

In [24]:
orbit_df.head()

Unnamed: 0,id_class,x_0,y_0,x_0.1,vx_0,vy_0,vz_0,jacobi,period,stability,propagated_periods
0,1.0,0.941,1.9004420000000002e-23,0.509474,2.968938e-13,-0.124968,-3.122717e-12,2.745412,11.555291,211.184678,3.0
1,1.0,0.941538,-8.592698e-24,0.508602,2.90234e-13,-0.125672,-3.252212e-12,2.746226,11.551622,210.329145,3.0
2,1.0,0.942076,6.956604e-24,0.507729,2.685527e-13,-0.126375,-2.86205e-12,2.747039,11.547936,209.473685,3.0
3,1.0,0.942613,1.763946e-24,0.506856,3.153637e-13,-0.127078,-3.791054e-12,2.74785,11.544233,208.618315,3.0
4,1.0,0.94315,9.441804e-24,0.505982,2.663322e-13,-0.127781,-2.98272e-12,2.74866,11.540511,207.763051,3.0


In [None]:
orbit_df.shape

(45211, 11)

In [None]:
len(orbits)

39817

In [None]:
system_dict

{'mu': 0.01215058560962404,
 'LU': 389703.0,
 'TU': 382981.0,
 'VU': 1.0175517845532807,
 'Lx1': 0.8369151257723573,
 'Ly1': 0.0,
 'Lx2': 1.1556821654448846,
 'Ly2': 0.0,
 'Lx3': -1.005062645810278,
 'Ly3': 0.0,
 'Lx4': 0.48784941439037594,
 'Ly4': 0.8660254037844386,
 'Lx5': 0.48784941439037594,
 'Ly5': -0.8660254037844386}

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()