# Data Utils

> Utilities to load, save, sample, and scale orbit data stored in different formats

In [None]:
#| default_exp data

In [None]:
#| hide
#| export
import h5py
from scipy.io import loadmat
import numpy as np
import os
import pandas as pd
from typing import Optional, Any, Union, List, Dict, Tuple
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from orbit_generation.path import get_data_path

In [None]:
#| hide
#| export
from unittest.mock import patch, MagicMock
from fastcore.test import test_eq

## Loading Data

In [None]:
#| export
def load_orbit_data(file_path: str,  # The path to the .mat, .h5/.hdf5, or .npy file.
                    variable_name: Optional[str] = None,  # Name of the variable in the .mat file, optional.
                    dataset_path: Optional[str] = None  # Path to the dataset in the HDF5 file, optional.
                   ) -> Any:  # The loaded orbit data.
    """
    Load orbit data from MATLAB .mat files, HDF5 .h5/.hdf5 files, or NumPy .npy files.

    The format is picked from the file-name suffix, compared case-insensitively.
    Raises `ValueError` when the suffix is unsupported, when the argument the format
    needs (`variable_name` for .mat, `dataset_path` for HDF5) is missing, or when the
    requested variable/dataset is not present in the file.
    """
    # Accept os.PathLike objects as well as plain strings.
    file_path = os.fspath(file_path)
    lowered = file_path.lower()

    if lowered.endswith('.mat'):
        if variable_name is None:
            raise ValueError("variable_name must be provided for .mat files")
        mat = loadmat(file_path)
        if variable_name not in mat:
            raise ValueError(f"{variable_name} not found in {file_path}")
        return mat[variable_name]

    # '.hdf5' is the extension save_data writes, so it has to be readable here too.
    if lowered.endswith(('.h5', '.hdf5')):
        # Check the arguments before opening the file.
        if dataset_path is None:
            raise ValueError("dataset_path must be provided for .h5 files")
        with h5py.File(file_path, 'r') as file:
            if dataset_path not in file:
                raise ValueError(f"{dataset_path} not found in {file_path}")
            # Copy the dataset into memory while the file is still open.
            return np.array(file[dataset_path])

    if lowered.endswith('.npy'):
        return np.load(file_path)

    raise ValueError("Unsupported file format. Please provide a .mat, .h5, or .npy file.")

In [None]:
#| test load_orbit_data
#| hide
# Fixtures handed back by the patched loaders, one per supported format.
mock_mat_data = {'Xarray': np.array([1, 2, 3])}
mock_h5_data = np.array([4, 5, 6])
mock_npy_data = np.array([7, 8, 9])

# Test for load_orbit_data with .mat file
# `loadmat` is patched under `__main__`, so the call returns the fixture dict instead of reading a file.
with patch('__main__.loadmat', return_value=mock_mat_data) as mock_loadmat:
    result = load_orbit_data('test_data.mat', variable_name='Xarray')
    assert (result == mock_mat_data['Xarray']).all(), "MAT file loading failed or data mismatch"
    mock_loadmat.assert_called_once_with('test_data.mat')

# Test for load_orbit_data with .h5 file
# The mocked context manager yields a plain dict that stands in for the open HDF5 file.
with patch('__main__.h5py.File') as mock_h5py:
    mock_file = MagicMock()
    mock_file.__enter__.return_value = {'/files/PERIODIC ORBITS': mock_h5_data}
    mock_h5py.return_value = mock_file
    result = load_orbit_data('test_data.h5', dataset_path='/files/PERIODIC ORBITS')
    assert (result == mock_h5_data).all(), "H5 file loading failed or data mismatch"

# Test for load_orbit_data with .npy file
# `numpy.load` is patched on the numpy module itself, which is what `np.load` resolves to.
with patch('numpy.load', return_value=mock_npy_data) as mock_load:
    result = load_orbit_data('test_data.npy')
    assert (result == mock_npy_data).all(), "NPY file loading failed or data mismatch"
    mock_load.assert_called_once_with('test_data.npy')

In [None]:
#| export
def load_memmap_array(file_path: str,  # Location of the .npy file on disk.
                      mode: str = 'c'  # Memory-map mode handed to numpy ('r', 'r+', 'w+', 'c').
                     ) -> np.memmap:   # The array, memory-mapped instead of read eagerly.
    """
    Open a .npy file as a memory-mapped array.

    Raises `FileNotFoundError` when nothing exists at `file_path`.
    """
    if os.path.exists(file_path):
        # Passing mmap_mode makes numpy map the file rather than load it into memory
        return np.load(file_path, mmap_mode=mode)

    raise FileNotFoundError(f"No file found at the specified path: {file_path}")

In [None]:
#| export
def get_orbit_features(file_path: str,  # Path to the feature file (.mat, .h5, or .npy).
                       variable_name: Optional[str] = None,  # Variable to read when the file is a .mat file.
                       dataset_path: Optional[str] = None  # Dataset to read when the file is an .h5 file.
                      ) -> pd.DataFrame:  # One row per orbit, one named column per feature.
    """
    Read the orbit feature table from disk and return it as a labelled DataFrame.
    """
    # File reading is delegated to load_orbit_data
    raw_features = load_orbit_data(file_path, variable_name=variable_name, dataset_path=dataset_path)

    # Column names, in the order the loaded table is expected to have them
    feature_names = [
        'Orbit Family',
        'Initial Position X', 'Initial Position Y', 'Initial Position Z',
        'Initial Velocity X', 'Initial Velocity Y', 'Initial Velocity Z',
        'Jacobi Constant', 'Period', 'Stability Index',
    ]

    return pd.DataFrame(raw_features, columns=feature_names)

In [None]:
#| test get_orbit_features
#| hide
def test_get_orbit_features():
    # Two synthetic orbits with ten feature values each, standing in for the loader output
    fake_features = np.array([
        [1, 0, 0, 0, 1, 0, 0, 3.0, 2.0, 1.0],
        [2, 1, 1, 1, 0, 1, 0, 2.5, 1.5, 0.5]
    ])

    # The DataFrame the function is expected to build from that array
    column_names = [
        'Orbit Family', 'Initial Position X', 'Initial Position Y', 'Initial Position Z',
        'Initial Velocity X', 'Initial Velocity Y', 'Initial Velocity Z',
        'Jacobi Constant', 'Period', 'Stability Index'
    ]
    reference_df = pd.DataFrame(fake_features, columns=column_names)

    # (path, keyword arguments given by the caller, keyword arguments the loader must receive)
    scenarios = [
        ('dummy_path.mat', {'variable_name': 'dummy_var'},
         {'variable_name': 'dummy_var', 'dataset_path': None}),
        ('dummy_path.h5', {'dataset_path': 'dummy_dataset'},
         {'variable_name': None, 'dataset_path': 'dummy_dataset'}),
        ('dummy_path.npy', {},
         {'variable_name': None, 'dataset_path': None}),
    ]

    # Replace the loader so that no file is touched
    with patch('__main__.load_orbit_data', return_value=fake_features) as loader_stub:
        for path, caller_kwargs, forwarded_kwargs in scenarios:
            # Forget the calls recorded by the previous scenario
            loader_stub.reset_mock()

            frame = get_orbit_features(path, **caller_kwargs)
            test_eq(frame.equals(reference_df), True)

            # The loader must have been called exactly once with the forwarded arguments
            loader_stub.assert_called_once_with(path, **forwarded_kwargs)

# Run the test when the cell executes
test_get_orbit_features()

## Save Data

In [None]:
#| export
def save_data(data: np.ndarray,  # Array to write to disk.
              file_name: str  # Destination file name; its extension selects the format.
             ) -> None:
    """
    Write `data` to `file_name`, choosing the format from the file extension.

    '.npy' goes through `np.save`; '.hdf5' creates an HDF5 file holding one
    gzip-compressed dataset named 'data'. Any other extension raises `ValueError`.
    """
    extension = os.path.splitext(file_name)[1]

    # Reject unknown formats before touching the filesystem
    if extension not in ('.hdf5', '.npy'):
        raise ValueError("Unsupported file extension. Supported extensions are '.hdf5' or '.npy'.")

    if extension == '.npy':
        np.save(file_name, data)
        return

    # Remaining case is '.hdf5': opening with 'w' creates or overwrites the file
    with h5py.File(file_name, 'w') as handle:
        handle.create_dataset('data', data=data, compression='gzip', compression_opts=9)

In [None]:
#| test save_data
#| hide
# NOTE(review): test_save_data_npy and test_save_data_hdf5 are defined in this cell but never
# called here; only test_save_data_invalid_type() is invoked at the bottom.

# Test for NPY saving functionality
def test_save_data_npy():
    data = np.random.rand(5, 5)
    file_name = 'test_data.npy'
    
    # numpy.save is replaced by a mock, so nothing is written to disk
    with patch('numpy.save', autospec=True) as mock_save:
        save_data(data, file_name)
        mock_save.assert_called_once_with(file_name, data)

# Test for HDF5 saving functionality
def test_save_data_hdf5():
    data = np.random.rand(5, 5)
    file_name = 'test_data.hdf5'
    
    # h5py.File is replaced by a mock; only the open call is checked
    with patch('h5py.File', autospec=True) as mock_file:
        save_data(data, file_name)
        mock_file.assert_called_once_with(file_name, 'w')

# Test for handling invalid file type
def test_save_data_invalid_type():
    data = np.random.rand(5, 5)
    file_name = 'test_data.unknown'
    
    try:
        save_data(data, file_name)
        # Reaching this line means no exception was raised
        assert False, "ValueError expected but not raised"
    except ValueError as e:
        assert str(e) == "Unsupported file extension. Supported extensions are '.hdf5' or '.npy'.", "Incorrect error message"

test_save_data_invalid_type()


## Get Example Data

In [None]:
#| export
def get_example_orbit_data(
    ) -> np.ndarray:  # Example orbits loaded from the packaged .npy file
    """
    Load the example orbits stored at `example_training_data/example_orbits.npy`
    under the package data path.
    """
    example_file = get_data_path() / "example_training_data" / "example_orbits.npy"

    # Hand the loader a plain string rather than a Path object
    return load_orbit_data(str(example_file))

In [None]:
# | test
# Load the example orbits; the bare `data.shape` expression is what the cell displays.
data = get_example_orbit_data()
data.shape

(400, 7, 100)

## Order labels and array given target

In [None]:
#| export
def order_labels_and_array_with_target(
    labels: np.ndarray,  # Array of labels to be ordered
    array: np.ndarray,  # Array to be ordered according to labels
    target_label: str,  # Label to order by
    place_at_end: bool = False,  # Whether to place target label at end
    ) -> tuple[np.ndarray, np.ndarray]:  # Returns ordered labels and array
    """
    Orders labels and array by placing entries with target_label either at start or end.

    The relative order inside the target group and inside the remaining group is kept.
    Both inputs are indexed along their first axis and may be plain Python sequences;
    the inputs themselves are not modified.
    """
    labels = np.asarray(labels)
    # `labels` is always converted, so plain sequences are converted for `array` too.
    # Objects that already support fancy indexing are left as they are.
    if isinstance(array, (list, tuple)):
        array = np.asarray(array)

    # broadcast_to guards against NumPy versions that collapse a comparison between
    # incompatible dtypes (e.g. numeric labels vs. a string target) to a single scalar.
    is_target = np.broadcast_to(np.asarray(labels == target_label, dtype=bool), labels.shape)
    target_idx = np.flatnonzero(is_target)
    other_idx = np.flatnonzero(~is_target)

    # flatnonzero returns ascending indices, which preserves the order within each group
    if place_at_end:
        order = np.concatenate((other_idx, target_idx))
    else:
        order = np.concatenate((target_idx, other_idx))

    return labels[order], array[order]

In [None]:
# Sample labels and a sample 3D array (one 2x2 block per label)
labels = np.array(['apple', 'banana', 'apple', 'orange', 'banana', 'grape'])
array = np.array([[[1, 2], [3, 4]], 
                  [[5, 6], [7, 8]], 
                  [[9, 10], [11, 12]], 
                  [[13, 14], [15, 16]], 
                  [[17, 18], [19, 20]], 
                  [[21, 22], [23, 24]]])
target_label = 'apple'

# Move every 'apple' entry to the front (place_at_end is left at its default, False)
ordered_labels, ordered_array = order_labels_and_array_with_target(labels, array, target_label)

# Show the reordered labels and the array reordered the same way
print(ordered_labels)
print(ordered_array)

['apple' 'apple' 'banana' 'orange' 'banana' 'grape']
[[[ 1  2]
  [ 3  4]]

 [[ 9 10]
  [11 12]]

 [[ 5  6]
  [ 7  8]]

 [[13 14]
  [15 16]]

 [[17 18]
  [19 20]]

 [[21 22]
  [23 24]]]


## Random Sampler

In [None]:
#| export
def sample_orbits(orbit_data: np.ndarray,  # Array of orbit data with shape (num_orbits, 6, num_time_points)
                  sample_spec: Union[dict, int],  # Number of samples per class (dict) or total samples (int)
                  labels: Optional[np.ndarray] = None,  # Array of labels for each orbit
                  ) -> tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Randomly sample orbits from the provided dataset.

    With an int `sample_spec`, that many orbits are drawn without replacement from the
    whole dataset. With a dict `sample_spec` ({label: count}), `count` orbits are drawn
    without replacement from each listed class, in the dict's iteration order; this
    requires `labels`.

    Returns the sampled orbits and the matching labels (None when no labels were given).
    Raises `ValueError` when a dict spec is given without labels or when a class has
    fewer orbits than requested.
    """
    if labels is not None:
        # With a plain list, `labels == label` would be a single bool instead of a mask.
        labels = np.asarray(labels)

    if isinstance(sample_spec, dict):
        if labels is None:
            raise ValueError("labels must be provided when sample_spec is a dict")

        # Sampling specified number of orbits for each class
        per_class = []
        for label, count in sample_spec.items():
            class_indices = np.where(labels == label)[0]
            if len(class_indices) < count:
                raise ValueError(f"Not enough samples for class {label}. Requested {count}, available {len(class_indices)}.")
            per_class.append(np.random.choice(class_indices, size=count, replace=False))

        # An empty spec must still give an integer index array; a float one cannot index.
        indices = np.concatenate(per_class) if per_class else np.empty(0, dtype=np.intp)
    else:
        # Random sampling without considering classes
        indices = np.random.choice(orbit_data.shape[0], size=sample_spec, replace=False)

    # Select the sampled data and labels
    sampled_data = orbit_data[indices]
    sampled_labels = labels[indices] if labels is not None else None

    return sampled_data, sampled_labels

In [None]:
#| hide
#| test sample_orbits_random_sampling_without_labels
def test_sample_orbits_random_sampling_without_labels():
    # 100 synthetic orbits, each with 6 values over 10 time points
    n_orbits, n_values, n_steps = 100, 6, 10
    n_requested = 10
    fake_orbits = np.random.rand(n_orbits, n_values, n_steps)

    # An int spec with no labels draws from the whole dataset
    picked, picked_labels = sample_orbits(fake_orbits, n_requested)

    assert picked.shape == (n_requested, n_values, n_steps), "Shape of sampled data should match the requested sample size"
    assert picked_labels is None, "Labels should be None when not provided"

#| test sample_orbits_class_specific_sampling
def test_sample_orbits_class_specific_sampling():
    # Synthetic orbits with a random label in {0, 1, 2} for each of them
    fake_orbits = np.random.rand(100, 6, 10)
    fake_labels = np.random.randint(0, 3, size=100)

    # Ask for five orbits from class 0 and five from class 1
    requested = {0: 5, 1: 5}
    total_requested = sum(requested.values())

    picked, picked_labels = sample_orbits(fake_orbits, requested, fake_labels)

    assert picked.shape == (total_requested, 6, 10), "Shape of sampled data should match the total requested sample size"
    assert len(picked_labels) == total_requested, "Number of labels should match the total requested sample size"
    assert all(label in requested for label in picked_labels), "All labels should be from requested classes"

#| test sample_orbits_insufficient_class_samples
def test_sample_orbits_insufficient_class_samples():
    # Every orbit is labelled 0 (randint's upper bound is exclusive), so class 1 is empty
    fake_orbits = np.random.rand(100, 6, 10)
    fake_labels = np.random.randint(0, 1, size=100)
    requested = {0: 50, 1: 50}

    # Class 0 can be served; the request for class 1 has to fail
    try:
        sample_orbits(fake_orbits, requested, fake_labels)
    except ValueError as err:
        assert str(err) == "Not enough samples for class 1. Requested 50, available 0.", "Error message should indicate insufficient samples for class 1"
    else:
        assert False, "Expected ValueError due to insufficient samples for class 1"

# Run the three sample_orbits tests defined above when the cell executes
test_sample_orbits_random_sampling_without_labels()
test_sample_orbits_class_specific_sampling()
test_sample_orbits_insufficient_class_samples()

## Random Discarder

In [None]:
#| export
def discard_random_labels(data: np.ndarray,  # Dataset to filter
                         labels: np.ndarray,  # Label of each entry in `data`
                         discard_labels: Union[List, Dict, int],  # Labels to drop (list, or dict keys) or how many to drop at random
                         ) -> Tuple[List, np.ndarray, np.ndarray]:
    """
    Remove every sample whose label is in a discarded set.

    A list names the labels to drop, a dict names them through its keys, and any
    other value is used as the number of distinct labels to pick at random.

    Returns tuple of (discarded labels, filtered data, filtered labels).
    """
    if isinstance(discard_labels, (list, dict)):
        # Nothing requested: hand the inputs back untouched
        if not discard_labels:
            return [], data, labels
        # Iterating a dict yields its keys, so both containers reduce to a list of labels
        dropped = np.array(list(discard_labels))
    else:
        # Draw the requested number of distinct labels without replacement
        dropped = np.random.choice(np.unique(labels), size=discard_labels, replace=False)

    # True for every sample whose label survives
    keep = np.isin(labels, dropped, invert=True)

    return dropped.tolist(), data[keep], labels[keep]


## Remove Duplicates preserve Order

In [None]:
#| export
def remove_duplicates_preserve_order(input_list: List,  # List that may hold repeated items
                                   ) -> List:  # New list keeping only the first occurrence of each item
    """
    Drop repeated items from a list, keeping the first occurrence of each in its original position.
    """
    # dict keys are unique and keep insertion order, so the keys are the de-duplicated items
    return list(dict.fromkeys(input_list))

## Dataloaders

In [None]:
#| export
def create_dataloaders(scaled_data: torch.Tensor,  # Input tensor of scaled data
                      val_split: float = 0.2,  # Fraction of data to use for validation
                      batch_size: int = 32,  # Batch size for dataloaders
                      num_workers: int = 4,  # Worker processes used by each dataloader
                      pin_memory: bool = True,  # Whether the dataloaders pin host memory
                      ) -> Tuple[DataLoader, Optional[DataLoader]]:  # Returns train and optional val dataloaders
    """
    Creates train and validation dataloaders from input tensor data.

    When `val_split` is greater than 0 the data is split once with a fixed seed
    (`random_state=42`) and two dataloaders are returned. Otherwise all data goes into
    the training dataloader and the second element of the result is None.
    """
    def _make_loader(tensor: torch.Tensor) -> DataLoader:
        # One place for the loader settings, so train and validation cannot drift apart.
        return DataLoader(TensorDataset(tensor), batch_size=batch_size,
                          num_workers=num_workers, pin_memory=pin_memory)

    if val_split > 0:
        X_train, X_val = train_test_split(
            scaled_data,
            test_size=val_split,
            shuffle=True,
            random_state=42
        )
        return _make_loader(X_train), _make_loader(X_val)

    return _make_loader(scaled_data), None

## Scaler

In [None]:
#| export
EPS = 1e-18  # Added to (max - min) in TSGlobalScaler so the scaling never divides by exactly zero

In [None]:
#| export
class TSFeatureWiseScaler():
    """
    Scales time series data feature-wise using PyTorch tensors.

    Every feature (index along axis 1) is min-max scaled on its own into `feature_range`.

    Parameters:
    -----------
    feature_range : tuple(float, float), optional
        Tuple representing the minimum and maximum feature values (default is (0, 1)).

    Attributes:
    -----------
    _min_v : float
        Minimum feature value.
    _max_v : float
        Maximum feature value.
    mins : torch.Tensor
        Per-feature minima found by `fit`; shape (F,), same dtype and device as the fitted data.
    maxs : torch.Tensor
        Per-feature maxima found by `fit`; shape (F,), same dtype and device as the fitted data.
    """
    def __init__(self, feature_range: tuple = (0, 1)) -> None:
        assert len(feature_range) == 2
        self._min_v, self._max_v = feature_range

    def fit(self, X: torch.Tensor) -> "TSFeatureWiseScaler":
        """
        Fits the scaler to the data.
        
        :param X: Input data. Shape: (N, F, T) where N is number of data points,
                  F is number of features, and T is number of time steps.
        :type X: torch.Tensor
        
        :returns: The fitted scaler object.
        :rtype: TSFeatureWiseScaler
        """
        # Reduce over every axis except the feature axis. Taking the statistics straight
        # from X keeps its dtype; a default float32 buffer would truncate float64 data.
        reduce_dims = tuple(d for d in range(X.dim()) if d != 1)
        self.mins = torch.amin(X, dim=reduce_dims)
        self.maxs = torch.amax(X, dim=reduce_dims)

        return self

    def transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Transforms the data.
        
        :param X: Input data. Shape: (N, F, T)
        :type X: torch.Tensor
        
        :returns: Scaled data.
        :rtype: torch.Tensor
        """
        # Work on a copy so the caller's tensor is left untouched.
        X_scaled = X.clone()
        for i in range(X.shape[1]):
            # The 1e-8 keeps the division finite when a feature is constant (max == min).
            X_scaled[:, i, :] = ((X[:, i, :] - self.mins[i]) / (self.maxs[i] - self.mins[i] + 1e-8)) * (self._max_v - self._min_v) + self._min_v
        return X_scaled

    def inverse_transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Inverse-transforms the data.
        
        :param X: Scaled data. Shape: (N, F, T)
        :type X: torch.Tensor
        
        :returns: Original data.
        :rtype: torch.Tensor
        """
        X_inv = X.clone()
        for i in range(X.shape[1]):
            # Undo the mapping into feature_range first ...
            X_inv[:, i, :] = (X[:, i, :] - self._min_v) / (self._max_v - self._min_v)
            # ... then undo the per-feature min-max scaling, with the same 1e-8 used in transform.
            X_inv[:, i, :] = X_inv[:, i, :] * (self.maxs[i] - self.mins[i] + 1e-8) + self.mins[i]
        return X_inv

    def fit_transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Fits the scaler to the data and transforms it.
        
        :param X: Input data. Shape: (N, F, T)
        :type X: torch.Tensor
        
        :returns: Scaled data.
        :rtype: torch.Tensor
        """
        self.fit(X)
        return self.transform(X)

In [None]:
#| export
class TSGlobalScaler():
    """
    Min-max scaler that uses one global minimum and maximum for the whole tensor.

    Attributes:
    -----------
    min : torch.Tensor
        Smallest value seen by `fit`.
    max : torch.Tensor
        Largest value seen by `fit`.
    """
    def fit(self, X: torch.Tensor) -> "TSGlobalScaler":
        """
        Record the global minimum and maximum of the data.
        
        :param X: Input data.
        :type X: torch.Tensor
        
        :returns: The fitted scaler object.
        :rtype: TSGlobalScaler
        """
        self.min, self.max = torch.min(X), torch.max(X)
        return self

    def transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Scale the data with the fitted minimum and maximum.
        
        :param X: Input data.
        :type X: torch.Tensor
        
        :returns: Scaled X.
        :rtype: torch.Tensor
        """
        # EPS keeps the denominator away from exactly zero
        value_range = self.max - self.min + EPS
        return (X - self.min) / value_range

    def inverse_transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Map scaled data back to the original value range.
        
        :param X: Scaled data.
        :type X: torch.Tensor
        
        :returns: Original data.
        :rtype: torch.Tensor
        """
        # Same range (including EPS) as in transform, applied in reverse
        return X * (self.max - self.min + EPS) + self.min

    def fit_transform(self, X: torch.Tensor) -> torch.Tensor:
        """
        Fit the scaler to the data, then scale that same data.
        
        :param X: Input data.
        :type X: torch.Tensor
        
        :returns: Scaled input data X.
        :rtype: torch.Tensor
        """
        return self.fit(X).transform(X)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()