# A simple notebook to split the dataset into training, validation, and test sets

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
import gc
import shutil
from tqdm.auto import tqdm

# Locate the project's `src` directory relative to the notebook's working
# directory: two levels up from the current directory, then into `src`.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
base_dir = os.path.dirname(parent_dir)
src_dir = os.path.join(base_dir, "src")

# Put `src` at the front of the import search path so the project modules can
# be imported. Guarded so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from data_analysis import data_processing, plot_positions, config

## Select which eMNS dataset to use

In [None]:
# ------------------------------------------------------------------
# eMNS selection: set `emns` to one of the supported system names.
# ------------------------------------------------------------------
emns = "octomag" # "octomag" or "navion"

# Path fragments per supported system, in the order:
#   (data directory appended to base_dir,
#    pickle file appended to data_dir,
#    split directory appended to data_dir)
_emns_layouts = {
    "octomag": ("/data/octomag_data", "/clean_data/clean_data.pkl", "/split_dataset"),
    "navion": ("/data/navion_data/clean_data", "/clean_data.pkl", "/../split_dataset"),
}

# The lookup is case-insensitive; anything else is rejected up front.
_layout = _emns_layouts.get(emns.lower())
if _layout is None:
    raise ValueError("emns must be either 'octomag' or 'navion'")

_data_suffix, _pickle_suffix, _split_suffix = _layout
data_dir = base_dir + _data_suffix
data_path = data_dir + _pickle_suffix
split_data_dir = data_dir + _split_suffix

# Output files for the three subsets, all located in the split directory.
training_data_file, test_data_file, validation_data_file = (
    split_data_dir + file_name
    for file_name in ("/training_data.pkl", "/test_data.pkl", "/validation_data.pkl")
)

# Echo the resolved paths so they can be checked before any file is touched.
print(
    f"Data path: {data_path}",
    f"Split data directory: {split_data_dir}",
    f"Training data file: {training_data_file}",
    f"Test data file: {test_data_file}",
    f"Validation data file: {validation_data_file}",
    sep="\n",
)

In [None]:
# Fractions of the full dataset assigned to the training, validation and
# test subsets, in that order.
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

## Load data

Use the compacted data, which already excludes malformed files. The DC offset still needs to be corrected.

In [None]:
# Load the dataset from the pickle file at `data_path` (set in the eMNS
# selection cell).
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
data = pd.read_pickle(data_path)

# Visual check of the full dataset before it is split.
# Presumably plot_positions draws the sample positions contained in `data`;
# confirm against its definition in data_analysis.
plot_positions(data, title="All Data Positions")

## Split and store

Splitting is done based solely on the position features.

We shuffle the data with a fixed seed for reproducibility.

In [None]:
# Partition the full dataset into training, validation and test subsets.
# Shuffling is enabled and seeded from the project configuration.
train, val, test = data_processing.split_dataset_positional(
    data,
    train_ratio,
    val_ratio,
    test_ratio,
    shuffle=True,
    random_state=config.SPLIT_SEED,
)

# Report the size of each subset next to the size of the full dataset.
print(
    f"Full dataset size: {len(data)}",
    f"Training set size: {len(train)}",
    f"Validation set size: {len(val)}",
    f"Test set size: {len(test)}",
    sep="\n",
)

In [None]:
# Plot training set positions
# (`train` is the first subset returned by the split cell.)
plot_positions(train, title="Training Set Positions")

In [None]:
# Plot validation set positions
# (`val` is the second subset returned by the split cell.)
plot_positions(val, title="Validation Set Positions")

In [None]:
# Plot test set positions
# (`test` is the third subset returned by the split cell.)
plot_positions(test, title="Test Set Positions")

In [None]:
# Persist each subset as a pickle file inside the split directory,
# creating that directory first if it does not exist yet.
os.makedirs(split_data_dir, exist_ok=True)
for subset, destination in (
    (train, training_data_file),
    (val, validation_data_file),
    (test, test_data_file),
):
    subset.to_pickle(destination)