## Convert raw GDF files into individual per-trial CSV files
1. Read each subject’s TRAINING file (A01T.gdf, etc.)
2. Extract events from the file
3. For every cue event (769–772):
- That timestamp = cue onset
- Extract 4 seconds of data starting at cue onset
- At the 250 Hz sampling rate, that equals 1000 samples (4 s × 250 Hz)
4. Keep only channels 0–21 (first 22 channels)
- Drop last 3 EOG channels.
5. Each extracted segment becomes one trial: Shape = (22, 1000)
6. Convert class labels:
769 → class 0
770 → class 1
771 → class 2
772 → class 3
7. Split trials into:
- 80% training
- 20% test
(stratified, so the train and test splits keep the same class proportions)
8. Save each trial as a CSV:
- 22 rows
- 1000 numeric columns (one per sample), preceded by one channel-name column
- First column = channel name (optional in general, but the classifier loader expects it for Original data)
- Remaining columns = numeric values


In [None]:
"""
PSEUDOCODE (Python-style) for the full pipeline:
1) Preprocess raw GDF into per-trial CSVs in Original/ directory
2) Train a per-(subject,class) VAE generator and write Generated/ directory
3) Train/evaluate classifier twice (original-trained vs generated-trained)

This is NOT runnable code. It is structured pseudocode to show exactly what to do.
"""

from pathlib import Path


# ---------------------------
# CONFIG / CONSTANTS
# ---------------------------

# Subject identifiers "A01" .. "A09".
SUBJECTS = [f"A{number:02d}" for number in range(1, 10)]

# Cue event codes (from dataset description), listed in class-index order:
#   769 -> 0 (left hand)    770 -> 1 (right hand)
#   771 -> 2 (feet)         772 -> 3 (tongue)
CUE_TO_CLASS = dict(zip((769, 770, 771, 772), range(4)))

# Sampling rate in Hz.
SFREQ = 250
# Seconds of data kept after each cue onset (2s->6s).
EPOCH_SECONDS = 4
# Samples per epoch: 4 s * 250 Hz = 1000.
EPOCH_SAMPLES = EPOCH_SECONDS * SFREQ

# Number of leading channels kept as EEG; the 3 EOG channels are dropped.
N_EEG_CHANNELS = 22

# Fraction of trials that goes to the training split (80/20 split).
TRAIN_SPLIT = 0.80
# Seed handed to the stratified split.
RANDOM_SEED = 0

# Output directory and its two sub-trees.
OUT_ROOT = Path("bci_iv_2a_data")
ORIGINAL_ROOT = OUT_ROOT.joinpath("Original")
GENERATED_ROOT = OUT_ROOT.joinpath("Generated")
# ============================================================
# PART 1: PREPROCESSING (GDF -> Original/<subject>/{train,test}/{0..3}/*.csv)
# ============================================================

def preprocess_all_subjects(gdf_dir: Path) -> None:
    """
    Convert every subject's training GDF into per-trial CSVs under Original/.

    For each subject in SUBJECTS:
      - Load <gdf_dir>/<subject>T.gdf (e.g., A01T.gdf)
      - Keep only cue events (the codes listed in CUE_TO_CLASS)
      - Cut EPOCH_SAMPLES samples starting at each cue onset
      - Keep the first N_EEG_CHANNELS channels
      - Skip (with a warning) any epoch that comes back shorter than
        EPOCH_SAMPLES, so every saved trial has the same width
      - Map cue code -> class index via CUE_TO_CLASS
      - Split epochs into train/test (stratified)
      - Save each epoch as CSV into the required folder structure

    Args:
        gdf_dir: directory that contains the <subject>T.gdf files.

    Returns:
        None. Files are written below ORIGINAL_ROOT/<subject>/{train,test}/.
    """
    import warnings

    for subj in SUBJECTS:
        gdf_path = gdf_dir / f"{subj}T.gdf"
        raw = read_gdf(gdf_path)  # returns continuous multichannel signal + metadata

        events = extract_events(raw)  # list of (sample_index, event_code)
        cue_events = filter_events(events, allowed_codes=set(CUE_TO_CLASS))

        trials, labels = [], []

        for (sample_index, event_code) in cue_events:
            # 1) Compute segment boundaries
            start = sample_index
            end = sample_index + EPOCH_SAMPLES

            # 2) Extract signal segment (channels x time)
            segment = raw_signal_slice(raw, start=start, end=end)

            # 3) Keep EEG only (drop EOG)
            eeg_segment = segment[:N_EEG_CHANNELS, :]

            # 4) Reject truncated epochs. A cue closer than EPOCH_SAMPLES to
            #    the end of the recording cannot yield a full window; saving
            #    it would produce a CSV with fewer columns than the others.
            #    NOTE(review): assumes the slice is array-like with a .shape
            #    attribute (the 2-D indexing above already implies this) and
            #    that raw_signal_slice returns a short slice rather than
            #    raising -- confirm against the real helper.
            if eeg_segment.shape[-1] != EPOCH_SAMPLES:
                warnings.warn(
                    f"{subj}: skipping cue at sample {sample_index} "
                    f"(got {eeg_segment.shape[-1]} samples, expected {EPOCH_SAMPLES})"
                )
                continue

            # 5) Store trial and mapped label
            trials.append(eeg_segment)
            labels.append(CUE_TO_CLASS[event_code])

        # 6) Split into train/test in a stratified way (keep class balance)
        X_train, X_test, y_train, y_test = stratified_split(
            X=trials, y=labels, train_ratio=TRAIN_SPLIT, seed=RANDOM_SEED
        )

        # 7) Create folders and write CSV files. The folders are created up
        #    front so every class directory exists even if a split ends up
        #    with no trial of that class.
        create_original_folders(subject=subj)

        write_trials_to_csv(
            X_train, y_train,
            out_dir=ORIGINAL_ROOT / subj / "train",
            include_channel_name_column=True,   # matches repo expectation for Original
        )

        write_trials_to_csv(
            X_test, y_test,
            out_dir=ORIGINAL_ROOT / subj / "test",
            include_channel_name_column=True,
        )


def create_original_folders(subject: str) -> None:
    """
    Make sure the per-class output folders for one subject exist:

      Original/<subject>/train/{0,1,2,3}/
      Original/<subject>/test/{0,1,2,3}/

    Existing folders are left untouched; missing parents are created.
    """
    subject_root = ORIGINAL_ROOT / subject

    # Flatten (split, class) into the list of leaf directories to create.
    targets = [
        subject_root / split_name / str(label)
        for split_name in ("train", "test")
        for label in range(4)
    ]

    for target in targets:
        target.mkdir(parents=True, exist_ok=True)


def write_trials_to_csv(X, y, out_dir: Path, include_channel_name_column: bool) -> None:
    """
    Save each trial as CSV into out_dir/<class>/trial_XXXXX.csv

    Trials are numbered per class, starting at 00000 on every call, in the
    order they appear in X. Any class label is accepted (not only 0..3); a
    class folder is created the first time its label is seen.

    Each trial is expected to be a matrix: 22 x 1000
    If include_channel_name_column is True:
      - first column is a channel name string (so the classifier loader can drop it)
      - remaining columns are numeric
    If False:
      - numeric only (for Generated data compatibility)

    Args:
        X: iterable of trials, paired element-wise with y.
        y: iterable of class labels; str(label) is used as the folder name.
        out_dir: split directory (e.g. Original/<subject>/train).
        include_channel_name_column: forwarded unchanged to write_csv_matrix.

    Returns:
        None.

    NOTE(review): because numbering restarts at 0 per call, writing twice
    into the same out_dir reuses file names -- whether that overwrites
    depends on write_csv_matrix; confirm if re-runs matter.
    """
    next_index = {}  # class label -> index of the next file for that class

    for trial, cls in zip(X, y):
        cls_dir = out_dir / str(cls)

        if cls not in next_index:
            # First trial of this class in this call: create its folder once
            # instead of re-issuing mkdir for every trial.
            cls_dir.mkdir(parents=True, exist_ok=True)
            next_index[cls] = 0

        idx = next_index[cls]
        next_index[cls] = idx + 1

        filename = cls_dir / f"trial_{idx:05d}.csv"

        # Pseudocode for writing:
        # if include_channel_name_column:
        #   csv rows: ["C3", v1, v2, ..., v1000]
        # else:
        #   csv rows: [v1, v2, ..., v1000]
        write_csv_matrix(filename, trial, include_channel_name_column=include_channel_name_column)

