In [17]:
# === Cell 1: Paths & Output Folders ===
import os

# Point these to the folders that CONTAIN your CSVs
MASTER_DIR = r"C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\master"
SPLIT_DIR  = r"C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\splits"

# Your requested output directory
OUTPUT_DIR = r"C:\Users\Admin\Desktop\ALVIN\output_spilt"

# Subfolders (auto-created)
PLOTS_CM    = os.path.join(OUTPUT_DIR, "plots", "cm")
PLOTS_ROC   = os.path.join(OUTPUT_DIR, "plots", "roc")
MODELS_OUT  = os.path.join(OUTPUT_DIR, "models")
REPORTS_OUT = os.path.join(OUTPUT_DIR, "reports")
SESSION_OUT = os.path.join(OUTPUT_DIR, "session_eval")
for d in [OUTPUT_DIR, PLOTS_CM, PLOTS_ROC, MODELS_OUT, REPORTS_OUT, SESSION_OUT]:
    os.makedirs(d, exist_ok=True)

print("OUTPUT_DIR:", OUTPUT_DIR)


OUTPUT_DIR: C:\Users\Admin\Desktop\ALVIN\output_spilt


In [19]:
# === Cell 2: Imports & Config ===
import glob, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib

# Keep runs stable on CPU-only Windows
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [21]:
# === Cell 3: Resolve MASTER_CSV and SPLIT_CSV from folders ===
def pick_file(base_dir, prefer_names, fallback="*.csv"):
    if not os.path.isdir(base_dir):
        raise FileNotFoundError(f"Dir not found: {base_dir}")
    for name in prefer_names:
        hits = glob.glob(os.path.join(base_dir, name))
        if hits:
            return hits[0]
    hits = glob.glob(os.path.join(base_dir, fallback))
    if not hits:
        raise FileNotFoundError(f"No CSV found in {base_dir}")
    return hits[0]

MASTER_CSV = pick_file(MASTER_DIR,
                       ["eeg_master_windows_5s.csv", "*master*.csv"])
SPLIT_CSV  = pick_file(SPLIT_DIR,
                       ["session_split_plan_60_20_20.csv", "*split*plan*.csv"])

print("MASTER_CSV:", MASTER_CSV)
print("SPLIT_CSV :", SPLIT_CSV)

# quick preview
print("\nMaster head:")
print(pd.read_csv(MASTER_CSV, nrows=2))
print("\nSplit head:")
print(pd.read_csv(SPLIT_CSV, nrows=2))

MASTER_CSV: C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\master\eeg_master_windows_5s.csv
SPLIT_CSV : C:\Users\Admin\Desktop\ALVIN\outputs_final_loso\features\splits\session_split_plan_60_20_20.csv

Master head:
      Fp1_Delta  Fp1_Delta_Rel     Fp1_Theta  Fp1_Theta_Rel     Fp1_Alpha  \
0  5.076068e-10       0.450822  1.239780e-10       0.110109  9.280993e-11   
1  1.403810e-09       0.635569  1.281276e-10       0.058009  8.273167e-11   

   Fp1_Alpha_Rel      Fp1_Beta  Fp1_Beta_Rel     Fp1_Gamma  Fp1_Gamma_Rel  \
0       0.082428  9.198859e-11      0.081698  5.382360e-11       0.047802   
1       0.037456  1.366813e-10      0.061882  7.975574e-11       0.036109   

   ...      Pz_Alpha  Pz_Alpha_Rel       Pz_Beta  Pz_Beta_Rel      Pz_Gamma  \
0  ...  1.318680e-10      0.102281  1.205842e-10     0.093529  2.268541e-11   
1  ...  8.921552e-11      0.060063  1.960278e-10     0.131973  2.698234e-11   

   Pz_Gamma_Rel  Subject  Session  StartSample     Fs  
0      0.017596   

In [23]:
# === Cell 4: Load master + plan; build session-disjoint Train/Val/Test ===
dfm = pd.read_csv(MASTER_CSV)
plan = pd.read_csv(SPLIT_CSV)

need = {"Subject","Session","StartSample","Fs"}
miss = need - set(dfm.columns)
if miss:
    raise ValueError(f"Master missing columns: {miss}")

# Support two split plan styles:
# A) row-aligned plan with "Split" column
# B) per-subject lists Train/Val/Test (comma-separated session IDs)
if ("Split" in plan.columns) and ("Subject" in plan.columns) and len(plan) == len(dfm):
    dfm["Split"] = plan["Split"].values
else:
    def _to_set(s):
        s = str(s).strip()
        return set(t for t in s.split(",") if t) if s and s.lower()!="nan" else set()
    tr, va, te = {}, {}, {}
    for _, r in plan.iterrows():
        subj = str(r["Subject"])
        tr[subj] = _to_set(r.get("Train",""))
        va[subj] = _to_set(r.get("Val",""))
        te[subj] = _to_set(r.get("Test",""))
    def _assign(row):
        s, sess = row["Subject"], row["Session"]
        if sess in tr.get(s,set()): return "Train"
        if sess in va.get(s,set()): return "Val"
        if sess in te.get(s,set()): return "Test"
        return "Unassigned"
    dfm["Split"] = dfm.apply(_assign, axis=1)

# Guard against unassigned rows
if (dfm["Split"]=="Unassigned").any():
    bad = dfm[dfm["Split"]=="Unassigned"][["Subject","Session"]].drop_duplicates().head(20)
    print("Unassigned examples:\n", bad)
    raise AssertionError("Some windows not assigned; fix your split CSV.")

print(dfm["Split"].value_counts())

df_train = dfm[dfm["Split"]=="Train"].copy()
df_val   = dfm[dfm["Split"]=="Val"].copy()
df_test  = dfm[dfm["Split"]=="Test"].copy()

print("\nWindow counts — Train:", len(df_train), " Val:", len(df_val), " Test:", len(df_test))

feat_cols = [c for c in dfm.columns if c not in ["Subject","Session","StartSample","Fs","Split"]]
X_train, y_train = df_train[feat_cols].values, df_train["Subject"].values
X_val,   y_val   = df_val[feat_cols].values,   df_val["Subject"].values
X_test,  y_test  = df_test[feat_cols].values,  df_test["Subject"].values
classes = sorted(dfm["Subject"].unique().tolist())

print("Shapes -> X_train", X_train.shape, "| X_val", X_val.shape, "| X_test", X_test.shape)
print("Subjects:", len(classes))


Split
Train    18435
Val       7914
Test      7914
Name: count, dtype: int64

Window counts — Train: 18435  Val: 7914  Test: 7914
Shapes -> X_train (18435, 190) | X_val (7914, 190) | X_test (7914, 190)
Subjects: 109


In [24]:
# === Cell 5: Define models & small grids (fast) ===
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

rf_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", RandomForestClassifier(random_state=RANDOM_SEED))
])
rf_grid = {
    "clf__n_estimators": [300],    # single strong value (fast)
    "clf__max_depth": [None, 18],  # small choice
    "clf__min_samples_leaf": [1],  # keep simple
}

svm_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SVC(kernel="rbf", probability=True, random_state=RANDOM_SEED))
])
svm_grid = {
    "clf__C": [1, 10],
    "clf__gamma": ["scale"],       # robust default
}


In [25]:
# === Cell 6: Train on TRAIN; choose best by VAL ===
results = []
for name, pipe, grid in [
    ("RandomForest", rf_pipe, rf_grid),
    ("SVM_RBF",      svm_pipe, svm_grid),
]:
    gs = GridSearchCV(pipe, grid, cv=3, n_jobs=-1, scoring="accuracy", verbose=1)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_
    val_acc = accuracy_score(y_val, best.predict(X_val))
    print(f"[{name}] VAL acc={val_acc:.4f} | best={gs.best_params_}")
    results.append((name, val_acc, gs.best_params_, best))

# pick best by validation accuracy
results.sort(key=lambda t: t[1], reverse=True)
best_name, best_val_acc, best_params, best_model = results[0]
print(f"\nSelected model: {best_name} | VAL acc={best_val_acc:.4f} | params={best_params}")

# Save chosen model
model_path = os.path.join(OUTPUT_DIR, "models", f"chosen_{best_name}_trainval_sessiondisjoint.joblib")
joblib.dump(best_model, model_path)
print("Saved model:", model_path)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[RandomForest] VAL acc=0.9265 | best={'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 300}
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[SVM_RBF] VAL acc=0.9294 | best={'clf__C': 10, 'clf__gamma': 'scale'}

Selected model: SVM_RBF | VAL acc=0.9294 | params={'clf__C': 10, 'clf__gamma': 'scale'}
Saved model: C:\Users\Admin\Desktop\ALVIN\output_spilt\models\chosen_SVM_RBF_trainval_sessiondisjoint.joblib


In [26]:
# === Cell 7: Evaluate on TEST (strict, unseen) ===
y_pred_test = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
print(f"\nTEST acc (session-disjoint) = {test_acc:.4f}")

cm = confusion_matrix(y_test, y_pred_test, labels=classes)
print("Confusion matrix shape:", cm.shape)


TEST acc (session-disjoint) = 0.9277
Confusion matrix shape: (109, 109)


In [27]:
# === Cell 8: Save plots & reports (TEST) ===
def plot_confusion_matrix(cm, classes, out_path, title):
    plt.figure(figsize=(10,9))
    plt.imshow(cm, cmap="Blues")
    plt.title(title)
    plt.xticks(np.arange(len(classes)), classes, rotation=90, fontsize=6)
    plt.yticks(np.arange(len(classes)), classes, fontsize=6)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, str(cm[i, j]), ha='center', va='center', fontsize=5)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

def compute_ovr_roc(proba, y_true, class_names, out_png, out_csv):
    Y = label_binarize(y_true, classes=class_names)
    aucs = []
    plt.figure(figsize=(8,7))
    for i, cname in enumerate(class_names):
        fpr, tpr, thr = roc_curve(Y[:, i], proba[:, i])
        roc_auc = auc(fpr, tpr); aucs.append(roc_auc)
    plt.plot([0,1],[0,1],'--', lw=1)
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title(f"OvR ROC (TEST) — Macro AUC={np.mean(aucs):.3f}")
    plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()
    pd.DataFrame({"Class": class_names, "AUC": aucs}).to_csv(out_csv, index=False)

# Save CM
cm_png = os.path.join(PLOTS_CM, f"cm_test_{best_name}_sessiondisjoint.png")
plot_confusion_matrix(cm, classes, cm_png, f"Confusion Matrix — {best_name} (TEST) | acc={test_acc:.4f}")
print("Saved:", cm_png)

# Save ROC (if available)
try:
    y_prob_test = best_model.predict_proba(X_test)
    roc_png = os.path.join(PLOTS_ROC, f"roc_test_{best_name}_sessiondisjoint.png")
    roc_csv = os.path.join(REPORTS_OUT, f"roc_test_{best_name}_sessiondisjoint.csv")
    compute_ovr_roc(y_prob_test, y_test, classes, roc_png, roc_csv)
    print("Saved:", roc_png)
    print("Saved:", roc_csv)
except Exception as e:
    print("ROC not available (predict_proba missing):", e)

# Classification report
rep = classification_report(y_test, y_pred_test, labels=classes, output_dict=True)
rep_df = pd.DataFrame(rep).transpose()
rep_csv = os.path.join(REPORTS_OUT, f"class_report_test_{best_name}_sessiondisjoint.csv")
rep_df.to_csv(rep_csv)
print("Saved:", rep_csv)

Saved: C:\Users\Admin\Desktop\ALVIN\output_spilt\plots\cm\cm_test_SVM_RBF_sessiondisjoint.png
Saved: C:\Users\Admin\Desktop\ALVIN\output_spilt\plots\roc\roc_test_SVM_RBF_sessiondisjoint.png
Saved: C:\Users\Admin\Desktop\ALVIN\output_spilt\reports\roc_test_SVM_RBF_sessiondisjoint.csv
Saved: C:\Users\Admin\Desktop\ALVIN\output_spilt\reports\class_report_test_SVM_RBF_sessiondisjoint.csv


In [28]:
# === Cell 9 (optional): Per-session majority vote on TEST ===
def majority_vote(series):
    s = pd.Series(series)
    return s.mode().iloc[0]

df_test = dfm[dfm["Split"]=="Test"].copy()
df_test["y_pred"] = y_pred_test

session_votes = (
    df_test
    .groupby(["Subject","Session"])
    .agg(true_subject=("Subject","first"),
         pred_subject=("y_pred", majority_vote),
         n_windows=("y_pred","size"))
    .reset_index(drop=True)
)
session_votes["correct"] = (session_votes["true_subject"] == session_votes["pred_subject"]).astype(int)
session_acc = session_votes["correct"].mean()

session_csv = os.path.join(SESSION_OUT, "test_session_majority_vote.csv")
session_votes.to_csv(session_csv, index=False)
print(f"Session-level accuracy (TEST, majority vote) = {session_acc:.4f}")
print("Saved:", session_csv)
session_votes.head(10)


Session-level accuracy (TEST, majority vote) = 0.9969
Saved: C:\Users\Admin\Desktop\ALVIN\output_spilt\session_eval\test_session_majority_vote.csv


Unnamed: 0,true_subject,pred_subject,n_windows,correct
0,S001,S001,25,1
1,S001,S001,25,1
2,S001,S001,25,1
3,S002,S002,24,1
4,S002,S002,24,1
5,S002,S002,24,1
6,S003,S003,25,1
7,S003,S003,25,1
8,S003,S003,25,1
9,S004,S004,24,1


In [29]:
# Define a subdirectory for session-level reports inside your OUTPUT_DIR
SESSION_OUT = os.path.join(OUTPUT_DIR, "session_reports")
os.makedirs(SESSION_OUT, exist_ok=True)
print("Session report outputs will be saved in:", SESSION_OUT)


Session report outputs will be saved in: C:\Users\Admin\Desktop\ALVIN\output_spilt\session_reports


In [30]:
# === Per-session report (TEST) — window accuracy + majority vote ===
import os
import numpy as np
import pandas as pd

os.makedirs(SESSION_OUT, exist_ok=True)

# 1) Ensure Test rows and predictions exist
try:
    df_test
except NameError:
    df_test = dfm[dfm["Split"] == "Test"].copy()

try:
    y_pred_test
except NameError:
    y_pred_test = best_model.predict(X_test)

# 2) Attach predictions to Test windows
df_test_pred = df_test.copy()
df_test_pred["y_true"]  = df_test_pred["Subject"]
df_test_pred["y_pred"]  = y_pred_test
df_test_pred["correct"] = (df_test_pred["y_true"] == df_test_pred["y_pred"]).astype(int)

# Optional: a single column like S001R09 for readability
df_test_pred["SessionID"] = df_test_pred["Subject"].astype(str) + df_test_pred["Session"].astype(str)

# 3) Per-session window accuracy + majority vote
def majority_vote(series):
    s = pd.Series(series)
    return s.mode().iloc[0]

per_session = (
    df_test_pred
    .groupby(["Subject","Session"], as_index=False)
    .agg(
        n_windows   = ("correct","size"),
        n_correct   = ("correct","sum"),
        win_acc     = ("correct","mean"),
        majority_pred = ("y_pred", majority_vote),
    )
)
per_session["majority_correct"] = (per_session["majority_pred"] == per_session["Subject"]).astype(int)
per_session["SessionID"] = per_session["Subject"].astype(str) + per_session["Session"].astype(str)

# 4) Summary metrics
overall_window_acc   = df_test_pred["correct"].mean()
overall_session_acc  = per_session["majority_correct"].mean()

# 5) Save CSVs
win_csv   = os.path.join(SESSION_OUT, "test_window_predictions.csv")                 # every window
sess_csv  = os.path.join(SESSION_OUT, "test_session_report.csv")                     # one row per session
summ_csv  = os.path.join(SESSION_OUT, "test_window_vs_session_summary.csv")          # one-line summary

df_test_pred.to_csv(win_csv, index=False)
per_session[["Subject","Session","SessionID","n_windows","n_correct","win_acc","majority_pred","majority_correct"]]\
    .sort_values(["Subject","Session"]).to_csv(sess_csv, index=False)

pd.DataFrame([{
    "PerWindow_Accuracy": float(overall_window_acc),
    "PerSession_MajorityVote_Accuracy": float(overall_session_acc),
    "Num_Test_Windows": int(len(df_test_pred)),
    "Num_Test_Sessions": int(len(per_session))
}]).to_csv(summ_csv, index=False)

print(f"Per-window Test accuracy (overall)        : {overall_window_acc:.4f}")
print(f"Per-session majority-vote Test accuracy   : {overall_session_acc:.4f}")
print("Saved:")
print(" -", win_csv)
print(" -", sess_csv)
print(" -", summ_csv)

# Preview a few rows
per_session.sort_values(["Subject","Session"]).head(10)

Per-window Test accuracy (overall)        : 0.9277
Per-session majority-vote Test accuracy   : 0.9969
Saved:
 - C:\Users\Admin\Desktop\ALVIN\output_spilt\session_reports\test_window_predictions.csv
 - C:\Users\Admin\Desktop\ALVIN\output_spilt\session_reports\test_session_report.csv
 - C:\Users\Admin\Desktop\ALVIN\output_spilt\session_reports\test_window_vs_session_summary.csv


Unnamed: 0,Subject,Session,n_windows,n_correct,win_acc,majority_pred,majority_correct,SessionID
0,S001,R12,25,19,0.76,S001,1,S001R12
1,S001,R13,25,25,1.0,S001,1,S001R13
2,S001,R14,25,25,1.0,S001,1,S001R14
3,S002,R12,24,24,1.0,S002,1,S002R12
4,S002,R13,24,21,0.875,S002,1,S002R13
5,S002,R14,24,24,1.0,S002,1,S002R14
6,S003,R12,25,21,0.84,S003,1,S003R12
7,S003,R13,25,23,0.92,S003,1,S003R13
8,S003,R14,25,24,0.96,S003,1,S003R14
9,S004,R12,24,23,0.958333,S004,1,S004R12
