In [14]:
!pip install mne



### Imports, config, and output folders

In [20]:
# === Imports & Config ===
import glob
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.signal import welch
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             roc_curve, auc)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib


# ---- Set where your EDF subject folders live ----
# Example structure:
#   DATA_ROOT/
#     S001/S001R01.edf, S001R02.edf, ...
#     S002/S002R01.edf, ...
DATA_ROOT  = r"C:\Users\Admin\Desktop\ALVIN\eeg-motor-movementimagery-dataset-1.0.0"  # <-- change to your EDF root
OUTPUT_DIR = r"C:\Users\Admin\Desktop\ALVIN\outputs_final_loso"  # separate from your CSV project

# ---- Analysis knobs ----
WINDOW_SEC = 5.0         # per-window length (seconds)
OVERLAP    = 0         # 50% overlap
DURATION_S = None        # None = use full file; or set (e.g., 60.0) to trim
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 19-channel target order (legacy 10/20 names you used before)
CHANNELS_19 = ['Fp1','Fp2','F3','F4','F7','F8','T3','T4','C3','C4',
               'T5','T6','P3','P4','O1','O2','Fz','Cz','Pz']

# Map modern MNE names to your legacy labels (so features align with your old pipeline)
CHANNEL_ALIAS = {'T7':'T3', 'T8':'T4', 'P7':'T5', 'P8':'T6'}

# EEG bands (same as before, stopping at 45 Hz to avoid 50 Hz powerline)
BANDS = {
    "Delta": (0.5, 4.0),
    "Theta": (4.0, 7.0),
    "Alpha": (8.0, 13.0),
    "Beta":  (13.0, 30.0),
    "Gamma": (30.0, 45.0),
}

# ---- Organized output folders (mirrors your preferred layout) ----
DIRS = {
    "corrected_csv": os.path.join(OUTPUT_DIR, "corrected_csv"),  # mostly unused for EDF, kept for parity
    "features":      os.path.join(OUTPUT_DIR, "features"),
    "models":        os.path.join(OUTPUT_DIR, "models"),
    "plots":         os.path.join(OUTPUT_DIR, "plots"),
    "similarity":    os.path.join(OUTPUT_DIR, "similarity"),

    # features subfolders
    "feat_sessions": os.path.join(OUTPUT_DIR, "features", "sessions"),  # per-session window features
    "feat_master":   os.path.join(OUTPUT_DIR, "features", "master"),    # concatenated master tables
    "feat_splits":   os.path.join(OUTPUT_DIR, "features", "splits"),    # train/val/test CSVs

    # plots subfolders
    "plots_psd":     os.path.join(OUTPUT_DIR, "plots", "psd"),
    "plots_band":    os.path.join(OUTPUT_DIR, "plots", "band"),
    "plots_cm":      os.path.join(OUTPUT_DIR, "plots", "cm"),
    "plots_roc":     os.path.join(OUTPUT_DIR, "plots", "roc"),

    # models subfolders
    "models_ckpt":   os.path.join(OUTPUT_DIR, "models", "checkpoints"),
    "models_final":  os.path.join(OUTPUT_DIR, "models", "final"),
}
for d in DIRS.values():
    os.makedirs(d, exist_ok=True)

print("Output tree:")
for k,v in DIRS.items():
    print(f" - {k}: {v}")


Output tree:
 - corrected_csv: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\corrected_csv
 - features: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features
 - models: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\models
 - plots: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\plots
 - similarity: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\similarity
 - feat_sessions: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\sessions
 - feat_master: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\master
 - feat_splits: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\splits
 - plots_psd: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\plots\psd
 - plots_band: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\plots\band
 - plots_cm: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\plots\cm
 - plots_roc: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\plots\roc
 - models_ckpt: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\models\checkpoints
 - models_final: C

### Discover all subjects & sessions

In [23]:
edf_paths = sorted(glob.glob(os.path.join(DATA_ROOT, "S???", "S???R??.edf")))
if not edf_paths:
    raise FileNotFoundError(f"No EDF files found under {DATA_ROOT}. Check the path and pattern.")

rows = []
pat = re.compile(r"(S\d{3})[\\/](S\d{3})R(\d{2})\.edf$", re.IGNORECASE)

for p in edf_paths:
    m = pat.search(p.replace("/", os.sep))
    if not m:
        # fallback: try filename only
        base = os.path.basename(p)
        m2 = re.match(r"(S\d{3})R(\d{2})\.edf$", base, re.IGNORECASE)
        if not m2:
            continue
        subj = base[:4]   # e.g., S109
        sess = m2.group(1)  # R##
    else:
        subj = m.group(2)     # S109 (second capture)
        sess = f"R{m.group(3)}"  # R01, R02, ...

    event_path = os.path.splitext(p)[0] + ".edf.event"
    rows.append({"Subject": subj.upper(), "Session": sess.upper(),
                 "edf_path": p, "event_path": event_path if os.path.exists(event_path) else None})

index_df = pd.DataFrame(rows).sort_values(["Subject","Session"]).reset_index(drop=True)

# Save the index for traceability
index_csv = os.path.join(DIRS["feat_master"], "session_index.csv")
index_df.to_csv(index_csv, index=False)

# Quick summaries
counts = index_df.groupby("Subject")["Session"].count().rename("n_sessions")
summary = counts.reset_index().sort_values("n_sessions", ascending=False)

print(f"Found {len(index_df)} sessions across {summary.shape[0]} subjects.")
display(index_df.head(10))
display(summary.head(10))


Found 1526 sessions across 109 subjects.


Unnamed: 0,Subject,Session,edf_path,event_path
0,S001,R01,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
1,S001,R02,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
2,S001,R03,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
3,S001,R04,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
4,S001,R05,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
5,S001,R06,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
6,S001,R07,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
7,S001,R08,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
8,S001,R09,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...
9,S001,R10,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...,C:\Users\Admin\Desktop\ALVIN\eeg-motor-movemen...


Unnamed: 0,Subject,n_sessions
0,S001,14
69,S070,14
80,S081,14
79,S080,14
78,S079,14
77,S078,14
76,S077,14
75,S076,14
74,S075,14
73,S074,14


### EDF loader + channel normalization (19-channel frame)

In [26]:
# === Cell 3A: Inspect raw channel names & normalization ===
import re
import mne

# Pick first EDF file
row = index_df.iloc[0]
edf_path = row["edf_path"]
raw = mne.io.read_raw_edf(edf_path, preload=False, verbose="ERROR")
orig = raw.ch_names

def rough_norm(name: str) -> str:
    n = name.upper().strip()
    n = n.replace("EEG ", "").replace("EEG_", "").replace("EEG", "")
    n = re.sub(r"[ \-_()]", "", n)
    for suf in ("REF","LE","RE","A1","M1","A2","M2","AVG","AVERAGE","AVGREF"):
        if n.endswith(suf):
            n = n[: -len(suf)]
    n = {"T7":"T3","T8":"T4","P7":"T5","P8":"T6"}.get(n, n)
    pretty = {
        "FP1":"Fp1","FP2":"Fp2","F3":"F3","F4":"F4","F7":"F7","F8":"F8",
        "T3":"T3","T4":"T4","C3":"C3","C4":"C4","T5":"T5","T6":"T6",
        "P3":"P3","P4":"P4","O1":"O1","O2":"O2","FZ":"Fz","CZ":"Cz","PZ":"Pz"
    }
    return pretty.get(n, n)

norm = [rough_norm(c) for c in orig]
df_names = pd.DataFrame({"original": orig, "normalized": norm})
print("First 40 channels:")
display(df_names.head(40))

targets = set(CHANNELS_19)
present = sorted(list(set(norm).intersection(targets)))
missing = sorted(list(targets.difference(present)))
print("Present targets:", present)
print("Missing targets:", missing)

First 40 channels:


Unnamed: 0,original,normalized
0,Fc5.,FC5.
1,Fc3.,FC3.
2,Fc1.,FC1.
3,Fcz.,FCZ.
4,Fc2.,FC2.
5,Fc4.,FC4.
6,Fc6.,FC6.
7,C5..,C5..
8,C3..,C3..
9,C1..,C1..


Present targets: []
Missing targets: ['C3', 'C4', 'Cz', 'F3', 'F4', 'F7', 'F8', 'Fp1', 'Fp2', 'Fz', 'O1', 'O2', 'P3', 'P4', 'Pz', 'T3', 'T4', 'T5', 'T6']


In [27]:
# === Cell 3B: Strict EDF loader & smoke test ===
import re
import numpy as np

def _normalize_eeg_name_strict(name: str) -> str:
    n = name.upper().strip()
    n = n.replace("EEG ", "").replace("EEG_", "").replace("EEG", "")
    n = re.sub(r"[^A-Z0-9]", "", n)
    for suf in ("REF","LE","RE","A1","M1","A2","M2","AVG","AVERAGE","AVGREF"):
        if n.endswith(suf):
            n = n[: -len(suf)]
    n = {"T7":"T3","T8":"T4","P7":"T5","P8":"T6"}.get(n, n)
    pretty = {
        "FP1":"Fp1","FP2":"Fp2","F3":"F3","F4":"F4","F7":"F7","F8":"F8",
        "T3":"T3","T4":"T4","C3":"C3","C4":"C4","T5":"T5","T6":"T6",
        "P3":"P3","P4":"P4","O1":"O1","O2":"O2","FZ":"Fz","CZ":"Cz","PZ":"Pz"
    }
    return pretty.get(n, n)

def load_edf_19(edf_path, duration_s=DURATION_S, target_channels=CHANNELS_19):
    raw = mne.io.read_raw_edf(edf_path, preload=True, verbose="ERROR")
    fs = float(raw.info["sfreq"])

    # Rename channels
    rename = {}
    for ch in raw.ch_names:
        nn = _normalize_eeg_name_strict(ch)
        if nn != ch:
            rename[ch] = nn
    if rename:
        raw.rename_channels(rename)

    # Crop to fixed duration
    if duration_s is not None:
        n_samp = int(duration_s * fs)
        raw.crop(tmax=(n_samp - 1) / fs)

    # Select available target channels
    present = [ch for ch in target_channels if ch in raw.ch_names]
    if not present:
        print("Normalized names (first 40):", raw.ch_names[:40])
        raise ValueError("No appropriate channels found after strict normalization.")

    data = raw.get_data(picks=present).T
    df = pd.DataFrame(index=np.arange(data.shape[0]), columns=target_channels, dtype=float)
    df[:] = np.nan
    present_df = pd.DataFrame(data, columns=present)
    for ch in present:
        df[ch] = present_df[ch].values

    print(f"Loaded {os.path.basename(edf_path)} | fs={fs:.2f} Hz | samples={df.shape[0]} | present {len(present)}/19")
    miss = [ch for ch in target_channels if ch not in present]
    if miss:
        print("Missing targets:", miss)
    return df, fs

# --- Smoke test on first file ---
row = index_df.iloc[0]
df_test, fs_test = load_edf_19(row["edf_path"])
print(row["Subject"], row["Session"], "fs=", fs_test, "shape=", df_test.shape)
display(df_test.head())

Loaded S001R01.edf | fs=160.00 Hz | samples=9760 | present 19/19
S001 R01 fs= 160.0 shape= (9760, 19)


Unnamed: 0,Fp1,Fp2,F3,F4,F7,F8,T3,T4,C3,C4,T5,T6,P3,P4,O1,O2,Fz,Cz,Pz
0,-4.9e-05,-2.9e-05,-1.6e-05,-6e-05,-3.9e-05,-2.5e-05,-7.1e-05,-3.2e-05,-2.6e-05,-2e-05,-5.6e-05,-3e-05,-1.2e-05,-3.7e-05,-5.3e-05,-1.1e-05,-3.8e-05,-4e-06,-3.1e-05
1,-2.8e-05,-3.9e-05,-1.5e-05,-6.3e-05,-4.7e-05,-2.4e-05,-6.5e-05,-2.1e-05,-5.5e-05,-2.8e-05,-3.7e-05,-2e-05,4e-06,-2.5e-05,-5.3e-05,1e-06,-4.5e-05,-2.6e-05,-2.1e-05
2,-5.2e-05,-4.6e-05,-1.7e-05,-6.6e-05,-3.9e-05,-1.8e-05,-4.9e-05,1.4e-05,-4.2e-05,-1.9e-05,-3.7e-05,-2e-05,2.4e-05,-1.9e-05,-4.5e-05,1.8e-05,-5e-05,-2.1e-05,-7e-06
3,-7.3e-05,-5.1e-05,-1e-05,-5.2e-05,-3.6e-05,-2.2e-05,-4.2e-05,-2e-06,-2.1e-05,-8e-06,-3.2e-05,-1.6e-05,4.5e-05,-1e-06,-2.9e-05,3.5e-05,-3.3e-05,4e-06,1.6e-05
4,-6.2e-05,-5.1e-05,3e-06,-4.5e-05,-3.7e-05,-3.3e-05,-4e-05,-5e-05,-1.2e-05,-4e-06,-3e-05,-9e-06,4.6e-05,8e-06,-1.3e-05,4e-05,-2.5e-05,2.6e-05,2.3e-05


### DSP helpers and per-session feature extraction

In [29]:
# === Cell 4: Windowing + Welch bandpowers (abs/rel) for ALL sessions ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import welch

# Assumes these already exist from earlier cells:
# - index_df (Cell 2)
# - load_edf_19 (Cell 3B)
# - DIRS, WINDOW_SEC, OVERLAP, DURATION_S, CHANNELS_19, BANDS (Cell 1 / your setup)

# ---- Helpers ----
def segment_windows(n_samples, fs, win_sec=WINDOW_SEC, overlap=OVERLAP):
    win_len = int(win_sec * fs)
    hop = max(1, int(win_len * (1 - overlap)))
    return [(s, s + win_len) for s in range(0, n_samples - win_len + 1, hop)]

def welch_psd(x, fs, nperseg, noverlap):
    return welch(x, fs=fs, nperseg=int(nperseg), noverlap=int(noverlap), detrend='constant')

def bandpower_integrate(f, Pxx, band):
    fmin, fmax = band
    idx = (f >= fmin) & (f < fmax)
    if not np.any(idx):
        return 0.0
    return float(np.trapz(Pxx[idx], f[idx]))

def compute_window_features_df(df_win, fs, channels=CHANNELS_19, bands=BANDS):
    """
    Given a windowed DataFrame (time x channels), compute abs & rel bandpower per channel.
    Returns: dict of features.
    """
    nperseg = max(4, int(2 * fs))       # conservative for 5 s windows
    noverlap = int(0.5 * nperseg)
    feats = {}
    for ch in channels:
        x = pd.to_numeric(df_win[ch], errors='coerce').fillna(0.0).values
        f, Pxx = welch_psd(x, fs, nperseg, noverlap)
        total = float(np.trapz(Pxx, f))
        for b, rng in bands.items():
            abs_p = bandpower_integrate(f, Pxx, rng)
            rel_p = abs_p / total if total > 0 else np.nan
            feats[f"{ch}_{b}"]     = abs_p
            feats[f"{ch}_{b}_Rel"] = rel_p
    return feats

def plot_psd_overlay(df_full, fs, channels, out_png, title):
    nperseg = int(4 * fs)
    noverlap = int(0.5 * nperseg)
    plt.figure(figsize=(12, 8))
    for ch in channels:
        x = pd.to_numeric(df_full[ch], errors='coerce').fillna(0.0).values
        f, Pxx = welch_psd(x, fs, nperseg, noverlap)
        plt.plot(f, 10*np.log10(Pxx), label=ch)
    plt.xlim(0, 50)
    plt.xlabel("Frequency (Hz)"); plt.ylabel("PSD (dB/Hz)")
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150); plt.close()

def plot_band_bars(wide_feat_row, channels, bands, out_abs, out_rel, title):
    xs = np.arange(len(channels)); width = 0.15; band_list = list(bands.keys())
    # Absolute
    plt.figure(figsize=(14, 6))
    for i, b in enumerate(band_list):
        vals = [wide_feat_row.get(f"{ch}_{b}", np.nan) for ch in channels]
        plt.bar(xs + i*width, vals, width, label=b)
    plt.xticks(xs + width*2, channels, rotation=45)
    plt.ylabel("Absolute Band Power")
    plt.title(title + " (Absolute)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_abs, dpi=150); plt.close()
    # Relative
    plt.figure(figsize=(14, 6))
    for i, b in enumerate(band_list):
        vals = [wide_feat_row.get(f"{ch}_{b}_Rel", np.nan) for ch in channels]
        plt.bar(xs + i*width, vals, width, label=b)
    plt.xticks(xs + width*2, channels, rotation=45)
    plt.ylabel("Relative Band Power")
    plt.title(title + " (Relative)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_rel, dpi=150); plt.close()

# Ensure required output subfolders exist (based on your DIRS)
for key in ["feat_sessions", "feat_master", "plots_psd", "plots_band"]:
    os.makedirs(DIRS[key], exist_ok=True)

# ---- Process all sessions in index_df ----
all_rows = []
per_session_counts = []

for i, row in index_df.iterrows():
    subj, sess, edf_path = row["Subject"], row["Session"], row["edf_path"]

    # Load 19-channel frame (uses your strict loader from Cell 3B)
    df_full, fs = load_edf_19(edf_path, duration_s=DURATION_S, target_channels=CHANNELS_19)

    # Windowing
    wins = segment_windows(len(df_full), fs, WINDOW_SEC, OVERLAP)
    sess_rows = []
    for (s, e) in wins:
        dfw = df_full.iloc[s:e]
        feats = compute_window_features_df(dfw, fs, channels=CHANNELS_19, bands=BANDS)
        feats["Subject"] = subj
        feats["Session"] = sess
        feats["StartSample"] = int(s)
        feats["Fs"] = float(fs)
        sess_rows.append(feats)
        all_rows.append(feats)

    # Save per-session features
    sess_df = pd.DataFrame(sess_rows)
    out_csv = os.path.join(DIRS["feat_sessions"], f"{subj}_{sess}_winfeat.csv")
    sess_df.to_csv(out_csv, index=False)

    # Wide summary over full recording for plots
    wide_feats = compute_window_features_df(df_full, fs, channels=CHANNELS_19, bands=BANDS)
    psd_png   = os.path.join(DIRS["plots_psd"],  f"psd_{subj}_{sess}.png")
    bandA_png = os.path.join(DIRS["plots_band"], f"band_abs_{subj}_{sess}.png")
    bandR_png = os.path.join(DIRS["plots_band"], f"band_rel_{subj}_{sess}.png")
    plot_psd_overlay(df_full, fs, CHANNELS_19, psd_png,  f"EEG PSD — {subj} {sess}")
    plot_band_bars(wide_feats, CHANNELS_19, BANDS, bandA_png, bandR_png, f"Band Powers — {subj} {sess}")

    per_session_counts.append({"Subject": subj, "Session": sess,
                               "n_windows": len(sess_rows), "fs": fs})

# Save master table and counts
master_df = pd.DataFrame(all_rows)
master_csv = os.path.join(DIRS["feat_master"], f"eeg_master_windows_{int(WINDOW_SEC)}s.csv")
master_df.to_csv(master_csv, index=False)

counts_df = pd.DataFrame(per_session_counts).sort_values(["Subject","Session"])
counts_csv = os.path.join(DIRS["feat_master"], "session_window_counts.csv")
counts_df.to_csv(counts_csv, index=False)

print(f"Saved per-session CSVs to: {DIRS['feat_sessions']}")
print(f"Saved master feature table: {master_csv}  (shape={master_df.shape})")
print(f"Saved session window counts: {counts_csv}")
display(counts_df.head())

Loaded S001R01.edf | fs=160.00 Hz | samples=9760 | present 19/19
Loaded S001R02.edf | fs=160.00 Hz | samples=9760 | present 19/19
Loaded S001R03.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R04.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R05.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R06.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R07.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R08.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R09.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R10.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R11.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R12.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R13.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S001R14.edf | fs=160.00 Hz | samples=20000 | present 19/19
Loaded S002R01.edf | fs=160.00 Hz | samples=9760 | present 19/19
Loaded S002R0

Unnamed: 0,Subject,Session,n_windows,fs
0,S001,R01,12,160.0
1,S001,R02,12,160.0
2,S001,R03,25,160.0
3,S001,R04,25,160.0
4,S001,R05,25,160.0


### Quick Review

In [31]:
# === Cell 5: Quick preview & sanity checks ===
print("Master features shape:", master_df.shape)
print("Columns (first 10):", master_df.columns[:10].tolist())
print("Subjects:", sorted(master_df['Subject'].unique().tolist())[:10], '...')

# Windows per subject
w_per_subj = master_df.groupby("Subject")["StartSample"].count().rename("n_windows").reset_index()
display(w_per_subj.sort_values("n_windows", ascending=False).head(10))

# Windows per (Subject, Session)
w_per_sess = master_df.groupby(["Subject","Session"])["StartSample"].count().rename("n_windows").reset_index()
display(w_per_sess.head(10))

Master features shape: (34263, 194)
Columns (first 10): ['Fp1_Delta', 'Fp1_Delta_Rel', 'Fp1_Theta', 'Fp1_Theta_Rel', 'Fp1_Alpha', 'Fp1_Alpha_Rel', 'Fp1_Beta', 'Fp1_Beta_Rel', 'Fp1_Gamma', 'Fp1_Gamma_Rel']
Subjects: ['S001', 'S002', 'S003', 'S004', 'S005', 'S006', 'S007', 'S008', 'S009', 'S010'] ...


Unnamed: 0,Subject,n_windows
0,S001,324
94,S095,324
60,S061,324
64,S065,324
65,S066,324
45,S046,324
70,S071,324
78,S079,324
34,S035,324
31,S032,324


Unnamed: 0,Subject,Session,n_windows
0,S001,R01,12
1,S001,R02,12
2,S001,R03,25
3,S001,R04,25
4,S001,R05,25
5,S001,R06,25
6,S001,R07,25
7,S001,R08,25
8,S001,R09,25
9,S001,R10,25


### Train Test Validation

In [35]:
# === Cell 6: Build SESSION-DISJOINT 60/20/20 splits per subject ===
import os
import pandas as pd

# Assumes you already ran Cell 4 and have the master feature CSV
FEAT_MASTER = os.path.join(OUTPUT_DIR, "features", "master")
master_csv  = os.path.join(FEAT_MASTER, "eeg_master_windows_5s.csv")

# Read just Subject/Session to build a split plan
df_index = pd.read_csv(master_csv, usecols=["Subject", "Session"]).drop_duplicates()

def sort_sessions(sess_list):
    """
    Sort session labels like ['R01','R02','R10'] correctly.
    If your labels are different, adjust this parser.
    """
    def key(s):
        # Expect 'R##' → numeric; fallback to string
        try:
            return int(str(s).strip().lstrip("R"))
        except:
            return s
    return sorted(sess_list, key=key)

def split_60_20_20(sess_list):
    sess = sort_sessions(sess_list)
    n = len(sess)
    if n == 1:
        return sess, [], []                 # all train if only 1
    if n == 2:
        return [sess[0]], [], [sess[1]]     # train, test
    # general case
    n_train = max(1, int(round(0.6 * n)))
    n_val   = max(1, int(round(0.2 * n)))
    # ensure at least 1 test
    if n_train + n_val >= n:
        n_val = max(1, n_val - 1)
    train = sess[:n_train]
    val   = sess[n_train:n_train+n_val]
    test  = sess[n_train+n_val:]
    if len(test) == 0:
        test = [sess[-1]]
        if val and val[-1] == test[0]:
            val = val[:-1]
    return train, val, test

plan_rows = []
for subj, g in df_index.groupby("Subject"):
    sess_list = g["Session"].unique().tolist()
    tr, va, te = split_60_20_20(sess_list)
    plan_rows.append({
        "Subject": subj,
        "Train": ",".join(tr),
        "Val":   ",".join(va),
        "Test":  ",".join(te),
        "n_sessions": len(sess_list)
    })

split_plan_df = pd.DataFrame(plan_rows).sort_values("Subject")

# Save plan
SPLIT_DIR = os.path.join(OUTPUT_DIR, "features", "splits")
os.makedirs(SPLIT_DIR, exist_ok=True)
split_csv = os.path.join(SPLIT_DIR, "session_split_plan_60_20_20.csv")
split_plan_df.to_csv(split_csv, index=False)

print("Saved session-wise split plan →", split_csv)
display(split_plan_df.head(12))


Saved session-wise split plan → C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\splits\session_split_plan_60_20_20.csv


Unnamed: 0,Subject,Train,Val,Test,n_sessions
0,S001,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
1,S002,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
2,S003,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
3,S004,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
4,S005,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
5,S006,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
6,S007,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
7,S008,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
8,S009,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14
9,S010,"R01,R02,R03,R04,R05,R06,R07,R08","R09,R10,R11","R12,R13,R14",14


In [None]:
# === Cell 7 (LOSO): Train on TRAIN sessions, tune with LOSO on TRAIN, pick by VAL, report on TEST ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut  # <-- new
# (Assumes all other sklearn imports were done in Cell 1)

# Paths
FEAT_MASTER = os.path.join(OUTPUT_DIR, "features", "master")
master_csv  = os.path.join(FEAT_MASTER, "eeg_master_windows_5s.csv")
SPLIT_DIR   = os.path.join(OUTPUT_DIR, "features", "splits")
split_csv   = os.path.join(SPLIT_DIR, "session_split_plan_60_20_20.csv")

# Load master features & split plan
dfm   = pd.read_csv(master_csv)
plan  = pd.read_csv(split_csv)

# Build lookup: subject → set(s) of sessions for each split
def plan_to_sets(plan_df):
    def to_set(x):
        s = str(x).strip()
        return set([t for t in s.split(",") if t]) if s and s.lower() != "nan" else set()
    train_map, val_map, test_map = {}, {}, {}
    for _, r in plan_df.iterrows():
        subj = r["Subject"]
        train_map[subj] = to_set(r["Train"])
        val_map[subj]   = to_set(r["Val"])
        test_map[subj]  = to_set(r["Test"])
    return train_map, val_map, test_map

train_map, val_map, test_map = plan_to_sets(plan)

def mask_for(split_map):
    return dfm.apply(lambda r: r["Session"] in split_map.get(r["Subject"], set()), axis=1)

m_train = mask_for(train_map)
m_val   = mask_for(val_map)
m_test  = mask_for(test_map)

df_train = dfm[m_train].copy()
df_val   = dfm[m_val].copy()
df_test  = dfm[m_test].copy()

print("Window counts (session-disjoint):")
print("  Train:", len(df_train), " Val:", len(df_val), " Test:", len(df_test))

# Features and labels
feat_cols = [c for c in dfm.columns if c not in ["Subject","Session","StartSample","Fs"]]
X_train, y_train = df_train[feat_cols].values, df_train["Subject"].values
X_val,   y_val   = df_val[feat_cols].values,   df_val["Subject"].values
X_test,  y_test  = df_test[feat_cols].values,  df_test["Subject"].values
classes = sorted(dfm["Subject"].unique().tolist())

# --- NEW: groups for LOSO over TRAIN sessions (one group per Subject+Session) ---
groups_train = (df_train["Subject"] + "_" + df_train["Session"]).values

# Models & grids (reuse imports from Cell 1)
rf_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", RandomForestClassifier(random_state=RANDOM_SEED))
])
rf_grid = {
    "clf__n_estimators": [300, 500],
    "clf__max_depth": [None, 12, 18],
    "clf__min_samples_leaf": [1, 2],
}

svm_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SVC(kernel="rbf", probability=True, random_state=RANDOM_SEED))
])
svm_grid = {
    "clf__C": [1, 5, 10],
    "clf__gamma": ["scale", 0.01, 0.001],
}

def train_and_select(name, pipe, grid, Xtr, ytr, Xva, yva, groups):
    """
    Hyperparameter tuning with Leave-One-Session-Out CV on TRAIN sessions.
    Each fold leaves out one entire TRAIN session (group) for validation.
    """
    logo = LeaveOneGroupOut()
    gs = GridSearchCV(
        pipe, grid,
        cv=logo.split(Xtr, ytr, groups),
        n_jobs=-1, scoring="accuracy", verbose=0
    )
    gs.fit(Xtr, ytr)
    best    = gs.best_estimator_
    cv_acc  = gs.best_score_              # LOSO-CV accuracy over TRAIN sessions
    val_acc = accuracy_score(yva, best.predict(Xva))  # held-out VAL sessions
    print(f"[{name}] LOSO-CV acc={cv_acc:.4f} | VAL acc={val_acc:.4f} | best={gs.best_params_}")
    return best, cv_acc, val_acc, gs.best_params_

# Train both models, pick by VAL accuracy
models = []
for name, base, grid in [
    ("RandomForest", rf_pipe, rf_grid),
    ("SVM_RBF",      svm_pipe, svm_grid),
]:
    best, cv_acc, val_acc, params = train_and_select(
        name, base, grid, X_train, y_train, X_val, y_val, groups_train
    )
    models.append((name, best, cv_acc, val_acc, params))

models.sort(key=lambda t: t[3], reverse=True)
sel_name, sel_model, sel_cv, sel_val, sel_params = models[0]
print(f"\nSelected model: {sel_name} | VAL acc={sel_val:.4f} | LOSO-CV={sel_cv:.4f} | params={sel_params}")

# Final TEST evaluation (session-disjoint)
yt_pred  = sel_model.predict(X_test)
test_acc = accuracy_score(y_test, yt_pred)
cm       = confusion_matrix(y_test, yt_pred, labels=classes)

# Output dirs
PLOTS_CM   = os.path.join(OUTPUT_DIR, "plots", "cm");    os.makedirs(PLOTS_CM, exist_ok=True)
PLOTS_ROC  = os.path.join(OUTPUT_DIR, "plots", "roc");   os.makedirs(PLOTS_ROC, exist_ok=True)
MODELS_OUT = os.path.join(OUTPUT_DIR, "models", "final"); os.makedirs(MODELS_OUT, exist_ok=True)

# Confusion matrix (TEST)
cm_png = os.path.join(PLOTS_CM, f"cm_test_{sel_name}_LOSO_sessiondisjoint.png")
plt.figure(figsize=(10, 9))
plt.imshow(cm, cmap="Blues")
plt.title(f"Confusion Matrix (TEST, LOSO) — {sel_name} | acc={test_acc:.4f}")
plt.xticks(np.arange(len(classes)), classes, rotation=90, fontsize=6)
plt.yticks(np.arange(len(classes)), classes, fontsize=6)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i, j]), ha='center', va='center', fontsize=5)
plt.colorbar(); plt.tight_layout(); plt.savefig(cm_png, dpi=150); plt.close()

# One-vs-rest ROC on TEST
roc_png = os.path.join(PLOTS_ROC, f"roc_test_{sel_name}_LOSO_sessiondisjoint.png")
y_prob  = sel_model.predict_proba(X_test)
Ybin    = label_binarize(y_test, classes=classes)
plt.figure(figsize=(8, 7))
macro_aucs = []
for i, cname in enumerate(classes):
    fpr, tpr, thr = roc_curve(Ybin[:, i], y_prob[:, i])
    roc_auc = auc(fpr, tpr)
    macro_aucs.append(roc_auc)
plt.plot([0,1],[0,1],'--', lw=1)
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title(f"OvR ROC (TEST, LOSO) — {sel_name} | Macro AUC={np.mean(macro_aucs):.3f}")
plt.tight_layout(); plt.savefig(roc_png, dpi=150); plt.close()

# Report & save model
rep     = classification_report(y_test, yt_pred, labels=classes, output_dict=True)
rep_df  = pd.DataFrame(rep).transpose()
rep_csv = os.path.join(MODELS_OUT, f"class_report_test_{sel_name}_LOSO_sessiondisjoint.csv")
rep_df.to_csv(rep_csv)

model_path = os.path.join(MODELS_OUT, f"chosen_{sel_name}_trainval_LOSO_sessiondisjoint.joblib")
joblib.dump(sel_model, model_path)

print(f"\nTEST acc (LOSO, session-disjoint) = {test_acc:.4f}")
print("Saved:",
      "\n - Confusion matrix:", cm_png,
      "\n - ROC (OvR):", roc_png,
      "\n - Test classification report:", rep_csv,
      "\n - Final model:", model_path)


Window counts (session-disjoint):
  Train: 18435  Val: 7914  Test: 7914
