In [None]:
"""
TUH EEG Dataset Loading with HDF5

This notebook demonstrates how to:
1. Process TUH EDF files to HDF5 format (one-time)
2. Load the HDF5 dataset (fast, braindecode-compatible)
3. Use with braindecode preprocessing and windowing
"""
import os
import sys
from pathlib import Path

# Add cerebro to path: the parent of the current working directory is put at
# the front of sys.path (presumably the repository root holding `cerebro` --
# confirm if the notebook is launched from somewhere else). The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Root of the EEG data tree. Set the EEG_DATA_DIR environment variable to use
# a different location; when it is unset or empty, the original cluster path
# is used, so existing runs behave exactly as before.
eeg_data = (
    os.environ.get("EEG_DATA_DIR")
    or "/projects/academic/wenyaoxu/anarghya/research/eeg-data"
)
# Directory of the TUH EEG corpus (v2.0.1) that Step 1 reads from.
tuh_dir = os.path.join(eeg_data, "tuh", "tueg", "v2.0.1")
# HDF5 file that Step 1 writes and Step 2 loads.
hdf5_path = os.path.join(eeg_data, "tuh_eeg_processed.h5")

# Echo the resolved locations so a wrong data root is obvious before loading.
print(f"TUH directory: {tuh_dir}")
print(f"HDF5 file: {hdf5_path}")
print(f"HDF5 exists: {os.path.exists(hdf5_path)}")

In [None]:
# Step 1: Process TUH to HDF5 (one-time operation - commented out)
# This will take several hours for the full 1.7TB dataset
# Uncomment to run:

# from scripts.process_tuh_to_hdf5 import create_hdf5_dataset
#
# create_hdf5_dataset(
#     tuh_dir=tuh_dir,
#     output_hdf5=hdf5_path,
#     n_jobs=16,  # Adjust based on your CPU cores
#     compression='gzip',
#     compression_level=4,  # Balance between speed and compression
#     resume=True,  # Can resume if interrupted
# )

# Step 2: Load HDF5 dataset (fast!)
# The import lives in this cell rather than the top import block because
# `cerebro` only becomes importable after the sys.path change made earlier.
from cerebro.data.tuh import TUHDataset

# Open the processed HDF5 file and report how many recordings it exposes.
dataset = TUHDataset(hdf5_path)
n_recordings = len(dataset)
print(f"Total recordings: {n_recordings}")

# Fetch the metadata table, then show its column names and first rows.
metadata = dataset.get_metadata()
metadata_columns = metadata.columns.tolist()
print(f"\nMetadata columns: {metadata_columns}")
print("\nSample metadata:")
metadata_preview = metadata.head()
print(metadata_preview)