## Importing Libraries

In [3]:
import os, glob, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.signal import welch
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             roc_curve, auc)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity
import joblib

## Defining pathway for the base dataset

In [5]:
INPUT_DIR  = r"C:\Users\Admin\Desktop\ALVIN\dataset"   # folder with CSV/EDF
OUTPUT_DIR = r"C:\Users\Admin\Desktop\ALVIN\outputs"   # where results go
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Organized subfolders
DIRS = {
    "csv":        os.path.join(OUTPUT_DIR, "corrected_csv"),
    "features":   os.path.join(OUTPUT_DIR, "features"),
    "plots":      os.path.join(OUTPUT_DIR, "plots"),
    "similarity": os.path.join(OUTPUT_DIR, "similarity"),
    "models":     os.path.join(OUTPUT_DIR, "models"),
}
for d in DIRS.values():
    os.makedirs(d, exist_ok=True)

# ---------------- Analysis parameters ----------------
WINDOW_SEC   = 5.0       # length of each window in seconds
OVERLAP      = 0.5       # 50% overlap
DURATION_SEC = 60.0      # use first 60 s (for EDF) / infer fs from rows/60 (for CSV)
FS_KNOWN     = None      # set e.g. 516.67 if you know exact fs; else leave None
RANDOM_SEED  = 42
np.random.seed(RANDOM_SEED)

# 19-channel target order
CHANNELS = ['Fp1','Fp2','F3','F4','F7','F8','T3','T4','C3','C4',
            'T5','T6','P3','P4','O1','O2','Fz','Cz','Pz']

# EEG bands
BANDS = {
    "Delta": (0.5, 4.0),
    "Theta": (4.0, 7.0),
    "Alpha": (8.0, 13.0),
    "Beta":  (13.0, 30.0),
    "Gamma": (30.0, 45.0),  # stop at 45 Hz to avoid 50 Hz region
}

print("Folders ready:")
for k, v in DIRS.items():
    print(f" - {k}: {v}")

Folders ready:
 - csv: C:\Users\Admin\Desktop\ALVIN\outputs\corrected_csv
 - features: C:\Users\Admin\Desktop\ALVIN\outputs\features
 - plots: C:\Users\Admin\Desktop\ALVIN\outputs\plots
 - similarity: C:\Users\Admin\Desktop\ALVIN\outputs\similarity
 - models: C:\Users\Admin\Desktop\ALVIN\outputs\models


## Loader, Header fixing

In [7]:
def likely_unlabeled(df_head, expected):
    cols = list(df_head.columns)
    if cols == expected:
        return False
    num_like = 0
    for c in cols:
        try:
            float(str(c).strip()); num_like += 1
        except Exception:
            pass
    return num_like >= len(cols) // 2

def ensure_labeled_csv(path, expected_channels=CHANNELS, corrected_dir=DIRS["csv"]):
    """Load CSV; if headers missing/misaligned, fix to the 19-channel order."""
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        df_try = pd.read_csv(path, nrows=3)
    except Exception:
        df = pd.read_csv(path, header=None)
        df.columns = expected_channels
        df.to_csv(os.path.join(corrected_dir, f"{stem}_corrected.csv"), index=False)
        return df, stem

    if likely_unlabeled(df_try, expected_channels):
        df = pd.read_csv(path, header=None)
        df.columns = expected_channels
        df.to_csv(os.path.join(corrected_dir, f"{stem}_corrected.csv"), index=False)
        return df, stem
    else:
        df = pd.read_csv(path)
        if list(df.columns) != expected_channels:
            if set(expected_channels).issubset(df.columns):
                df = df[expected_channels]
            else:
                raise ValueError(f"{path}: columns do not match expected 19 channels.")
        return df, stem

def load_edf_as_df(path, target_channels=CHANNELS, duration_sec=DURATION_SEC):
    """Read EDF, remap modern names to your legacy targets, return first 60 s as DataFrame."""
    import mne
    ALIAS = {'T7':'T3','T8':'T4','P7':'T5','P8':'T6'}  # modern -> legacy
    raw = mne.io.read_raw_edf(path, preload=True, verbose="ERROR")
    raw.rename_channels({ch: ALIAS.get(ch, ch) for ch in raw.ch_names})
    fs = float(raw.info['sfreq'])
    n = int(duration_sec * fs)
    picks = [ch for ch in target_channels if ch in raw.ch_names]
    if not picks:
        raise ValueError(f"{path}: none of the 19 target channels present.")
    data = raw.get_data(picks=picks)[:, :n].T
    # pre-allocate with correct number of rows (avoid length mismatch)
    df = pd.DataFrame(index=np.arange(data.shape[0]), columns=target_channels, dtype=float)
    df[:] = np.nan
    have = pd.DataFrame(data, columns=picks)
    for ch in picks:
        df[ch] = have[ch].values
    return df, fs

def load_eeg_as_df(path, target_channels=CHANNELS, duration_sec=DURATION_SEC):
    """Wrapper that accepts .csv or .edf and returns (df, subject_id, fs)."""
    stem = os.path.splitext(os.path.basename(path))[0]
    ext  = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        df, subj = ensure_labeled_csv(path, expected_channels=target_channels, corrected_dir=DIRS["csv"])
        fs = df.shape[0] / float(duration_sec)
        return df, subj, fs
    elif ext == ".edf":
        df, fs = load_edf_as_df(path, target_channels=target_channels, duration_sec=duration_sec)
        return df, stem, fs
    else:
        raise ValueError(f"Unsupported file type: {ext}")


## DSP, Welch, bands, windows

In [9]:
def welch_psd(x, fs, nperseg, noverlap):
    return welch(x, fs=fs, nperseg=int(nperseg), noverlap=int(noverlap), detrend='constant')

def integrate_bandpower(f, Pxx, band):
    fmin, fmax = band
    idx = (f >= fmin) & (f < fmax)
    if not np.any(idx):
        return 0.0
    return float(np.trapz(Pxx[idx], f[idx]))

def compute_wide_summary(df, fs, bands=BANDS, channels=CHANNELS, out_csv=None):
    """Absolute & relative band powers over full 60 s (one row)."""
    nperseg = int(4 * fs); noverlap = int(0.5 * nperseg)
    feats = {}
    for ch in channels:
        x = pd.to_numeric(df[ch], errors='coerce').fillna(0.0).values
        f, Pxx = welch_psd(x, fs, nperseg, noverlap)
        total = float(np.trapz(Pxx, f))
        for b, rng in bands.items():
            abs_p = integrate_bandpower(f, Pxx, rng)
            rel_p = abs_p / total if total > 0 else np.nan
            feats[f"{ch}_{b}"] = abs_p
            feats[f"{ch}_{b}_Rel"] = rel_p
    wide = pd.DataFrame([feats])
    if out_csv:
        wide.to_csv(out_csv, index=False)
    return wide

def segment_windows(n_samples, fs, win_sec=WINDOW_SEC, overlap=OVERLAP):
    win_len = int(win_sec * fs)
    hop = max(1, int(win_len * (1 - overlap)))
    return [(s, s + win_len) for s in range(0, n_samples - win_len + 1, hop)]

def compute_window_features(df, fs, window, bands=BANDS, channels=CHANNELS):
    s, e = window
    xw = df.iloc[s:e]
    nperseg = max(4, int(2 * fs)); noverlap = int(0.5 * nperseg)
    feats = {}
    for ch in channels:
        x = pd.to_numeric(xw[ch], errors='coerce').fillna(0.0).values
        f, Pxx = welch_psd(x, fs, nperseg, noverlap)
        total = float(np.trapz(Pxx, f))
        for b, rng in bands.items():
            abs_p = integrate_bandpower(f, Pxx, rng)
            rel_p = abs_p / total if total > 0 else np.nan
            feats[f"{ch}_{b}"] = abs_p
            feats[f"{ch}_{b}_Rel"] = rel_p
    feats["StartSample"] = s
    feats["Fs"] = fs
    return feats


## Process All Subject, Wide summaires Master Features

In [11]:
# gather CSV + EDF
candidates = sorted([p for p in glob.glob(os.path.join(INPUT_DIR, "*"))
                     if os.path.splitext(p)[1].lower() in [".csv", ".edf"]])
if not candidates:
    raise FileNotFoundError(f"No CSV/EDF files found in {INPUT_DIR}")

print(f"Found {len(candidates)} file(s).")

master_rows = []
subjects = []
wide_paths = []

for path in candidates:
    df, subj, fs_file = load_eeg_as_df(path, target_channels=CHANNELS, duration_sec=DURATION_SEC)
    fs = FS_KNOWN if FS_KNOWN else fs_file
    print(f"[{subj}] type={os.path.splitext(path)[1].lower()[1:]}, samples={len(df)}, fs={fs:.3f} Hz")

    # Per-subject wide summary (full 60 s)
    wide_csv = os.path.join(DIRS["features"], f"{subj}_bandpowers_wide.csv")
    wide = compute_wide_summary(df, fs, BANDS, CHANNELS, out_csv=wide_csv)
    wide_paths.append(wide_csv)
    subjects.append(subj)

    # Windowed features
    windows = segment_windows(len(df), fs, WINDOW_SEC, OVERLAP)
    for w in windows:
        feats = compute_window_features(df, fs, w, BANDS, CHANNELS)
        feats["Subject"] = subj
        master_rows.append(feats)

# Save master features
if not master_rows:
    raise RuntimeError("No windowed features produced. Check WINDOW_SEC/OVERLAP vs file length.")

master = pd.DataFrame(master_rows)
master_csv = os.path.join(DIRS["features"], f"eeg_biometric_features_{int(WINDOW_SEC)}s_master.csv")
master.to_csv(master_csv, index=False)

print("\nSaved master features:", master_csv)
print("Subjects processed:", sorted(set(subjects)))


Found 36 file(s).
[s00] type=csv, samples=31000, fs=516.667 Hz
[s01] type=csv, samples=31000, fs=516.667 Hz
[s02] type=csv, samples=31000, fs=516.667 Hz
[s03] type=csv, samples=31000, fs=516.667 Hz
[s04] type=csv, samples=31000, fs=516.667 Hz
[s05] type=csv, samples=31000, fs=516.667 Hz
[s06] type=csv, samples=31000, fs=516.667 Hz
[s07] type=csv, samples=31000, fs=516.667 Hz
[s08] type=csv, samples=31000, fs=516.667 Hz
[s09] type=csv, samples=31000, fs=516.667 Hz
[s10] type=csv, samples=31000, fs=516.667 Hz
[s11] type=csv, samples=31000, fs=516.667 Hz
[s12] type=csv, samples=31000, fs=516.667 Hz
[s13] type=csv, samples=31000, fs=516.667 Hz
[s14] type=csv, samples=31000, fs=516.667 Hz
[s15] type=csv, samples=31000, fs=516.667 Hz
[s16] type=csv, samples=31000, fs=516.667 Hz
[s17] type=csv, samples=31000, fs=516.667 Hz
[s18] type=csv, samples=31000, fs=516.667 Hz
[s19] type=csv, samples=31000, fs=516.667 Hz
[s20] type=csv, samples=31000, fs=516.667 Hz
[s21] type=csv, samples=31000, fs=516

## Train & Evaluate (RF + SVM, CV)

In [21]:
dfm = pd.read_csv(master_csv)
X = dfm.drop(columns=["Subject", "StartSample", "Fs"]).values
y = dfm["Subject"].values
classes = sorted(dfm["Subject"].unique().tolist())
print("Classes:", classes)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

rf_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", RandomForestClassifier(random_state=RANDOM_SEED))
])
rf_grid = {"clf__n_estimators":[300, 500], "clf__max_depth":[None, 18], "clf__min_samples_leaf":[1,2]}

svm_pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SVC(kernel="rbf", probability=True, random_state=RANDOM_SEED))
])
svm_grid = {"clf__C":[5,10], "clf__gamma":["scale", 0.001]}

results = []

for name, base, grid in [
    ("RandomForest", rf_pipe, rf_grid),
    ("SVM_RBF",      svm_pipe, svm_grid),
]:
    cms, accs, probas_all, y_true_all, y_pred_all = [], [], [], [], []
    for tr, te in skf.split(X, y):
        gs = GridSearchCV(base, grid, cv=3, n_jobs=-1, scoring="accuracy", verbose=0)
        gs.fit(X[tr], y[tr])
        best = gs.best_estimator_
        yp = best.predict(X[te])
        accs.append(accuracy_score(y[te], yp))
        cms.append(confusion_matrix(y[te], yp, labels=classes))
        probas_all.append(best.predict_proba(X[te]))
        y_true_all.append(y[te]); y_pred_all.append(yp)

    mean_cv_acc = float(np.mean(accs))
    cm_sum = np.sum(cms, axis=0)

    # Confusion matrix saved as PNG (no plt.show())
    plt.figure(figsize=(6, 5))
    plt.imshow(cm_sum, cmap="Blues")
    plt.title(f"Confusion Matrix (5-fold CV) — {name}\nmean acc={mean_cv_acc:.4f}")
    plt.xticks(np.arange(len(classes)), classes, rotation=45)
    plt.yticks(np.arange(len(classes)), classes)
    for i in range(cm_sum.shape[0]):
        for j in range(cm_sum.shape[1]):
            plt.text(j, i, str(cm_sum[i, j]), ha='center', va='center', fontsize=8)
    plt.colorbar()
    plt.tight_layout()
    cm_path = os.path.join(DIRS["plots"], f"cm_{name}.png")
    plt.savefig(cm_path, dpi=150)
    plt.close()

    # ROC curve saved as PNG (no plt.show())
    proba_stack = np.vstack(probas_all)
    y_true_stack = np.concatenate(y_true_all)
    Ybin = label_binarize(y_true_stack, classes=classes)
    aucs, eers = [], []
    plt.figure(figsize=(6,5))
    for i, cname in enumerate(classes):
        fpr, tpr, thr = roc_curve(Ybin[:, i], proba_stack[:, i])
        roc_auc = auc(fpr, tpr)
        fnr = 1 - tpr
        idx = np.nanargmin(np.abs(fnr - fpr))
        eer = float((fnr[idx] + fpr[idx]) / 2.0)
        aucs.append(float(roc_auc)); eers.append(float(eer))
        plt.plot(fpr, tpr, label=f"{cname} AUC={roc_auc:.3f}, EER={eer:.3f}")
    plt.plot([0,1],[0,1],'--', lw=1)
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title(f"One-vs-Rest ROC — {name}")
    plt.legend(fontsize=7, loc="lower right")
    plt.tight_layout()
    roc_png = os.path.join(DIRS["plots"], f"roc_{name}.png")
    plt.savefig(roc_png, dpi=150)
    plt.close()

    # Save verification table (per-class + macro row)
    verif_csv = os.path.join(DIRS["models"], f"verification_{name}.csv")
    verif_df = pd.DataFrame({"Class": classes, "AUC": aucs, "EER": eers})
    macro_row = pd.DataFrame([{
        "Class": "macro_avg",
        "AUC": float(np.mean(aucs)),
        "EER": float(np.mean(eers)),
    }])
    verif_df = pd.concat([verif_df, macro_row], ignore_index=True)
    verif_df.to_csv(verif_csv, index=False)

    # Train final best model on ALL data and save
    final_gs = GridSearchCV(base, grid, cv=3, n_jobs=-1, scoring="accuracy", verbose=0)
    final_gs.fit(X, y)
    best_model = final_gs.best_estimator_
    model_path = os.path.join(DIRS["models"], f"best_{name}.joblib")
    joblib.dump(best_model, model_path)

    # Classification report (stacked CV preds, indicative)
    y_pred_stack = np.concatenate(y_pred_all)
    report = classification_report(y_true_stack, y_pred_stack, labels=classes, output_dict=True)
    rep_df = pd.DataFrame(report).transpose()
    rep_path = os.path.join(DIRS["models"], f"class_report_{name}.csv")
    rep_df.to_csv(rep_path)

    results.append({
        "model": name,
        "mean_cv_acc": mean_cv_acc,
        "confusion_matrix_png": cm_path,
        "roc_png": roc_png,
        "verification_csv": verif_csv,
        "class_report_csv": rep_path,
        "saved_model": model_path,
        "best_params": final_gs.best_params_,
    })
    print(f"[{name}] mean CV acc = {mean_cv_acc:.4f} | best params: {final_gs.best_params_}")

summary = pd.DataFrame(results)
summary_path = os.path.join(DIRS["models"], "model_summary.csv")
summary.to_csv(summary_path, index=False)
print("\nSaved model summary:", summary_path)
summary

Classes: ['s00', 's01', 's02', 's03', 's04', 's05', 's06', 's07', 's08', 's09', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27', 's28', 's29', 's30', 's31', 's32', 's33', 's34', 's35']
[RandomForest] mean CV acc = 0.9879 | best params: {'clf__max_depth': 18, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 500}
[SVM_RBF] mean CV acc = 0.9879 | best params: {'clf__C': 10, 'clf__gamma': 0.001}

Saved model summary: C:\Users\Admin\Desktop\ALVIN\outputs\models\model_summary.csv


Unnamed: 0,model,mean_cv_acc,confusion_matrix_png,roc_png,verification_csv,class_report_csv,saved_model,best_params
0,RandomForest,0.98793,C:\Users\Admin\Desktop\ALVIN\outputs\plots\cm_...,C:\Users\Admin\Desktop\ALVIN\outputs\plots\roc...,C:\Users\Admin\Desktop\ALVIN\outputs\models\ve...,C:\Users\Admin\Desktop\ALVIN\outputs\models\cl...,C:\Users\Admin\Desktop\ALVIN\outputs\models\be...,"{'clf__max_depth': 18, 'clf__min_samples_leaf'..."
1,SVM_RBF,0.98793,C:\Users\Admin\Desktop\ALVIN\outputs\plots\cm_...,C:\Users\Admin\Desktop\ALVIN\outputs\plots\roc...,C:\Users\Admin\Desktop\ALVIN\outputs\models\ve...,C:\Users\Admin\Desktop\ALVIN\outputs\models\cl...,C:\Users\Admin\Desktop\ALVIN\outputs\models\be...,"{'clf__C': 10, 'clf__gamma': 0.001}"


### Subject-wise Similarity Report

In [28]:
# --- Subject-wise similarity (centroids, margin, top pairs) ---

# Requires: X, y, classes, DIRS
scaler_sim = StandardScaler(with_mean=True, with_std=True)
Xz = scaler_sim.fit_transform(X)

subjects = np.array(classes)
centroids = {s: Xz[y == s].mean(axis=0) for s in subjects}
C = np.vstack([centroids[s] for s in subjects])
S_mat = cosine_similarity(C)

# Matrix CSV + heatmap
sim_df = pd.DataFrame(S_mat, index=subjects, columns=subjects)
sim_csv = os.path.join(DIRS["similarity"], "subject_similarity_matrix.csv")
sim_df.to_csv(sim_csv)

plt.figure(figsize=(7,6))
plt.imshow(S_mat, vmin=-1, vmax=1, cmap="coolwarm")
plt.xticks(np.arange(len(subjects)), subjects, rotation=90)
plt.yticks(np.arange(len(subjects)), subjects)
plt.colorbar(label="Cosine similarity")
plt.title("Subject–Subject Centroid Similarity")
plt.tight_layout()
sim_png = os.path.join(DIRS["similarity"], "subject_similarity_matrix.png")
plt.savefig(sim_png, dpi=150); plt.close()

# Per-subject metrics
rows = []
for s in subjects:
    Xi = Xz[y == s]
    sims = cosine_similarity(Xi, centroids[s].reshape(1,-1)).ravel()
    genuine_mean = float(np.mean(sims)) if sims.size else np.nan
    genuine_std  = float(np.std(sims))  if sims.size else np.nan
    row = sim_df.loc[s].drop(labels=s)
    nearest_imp_id  = row.idxmax()
    nearest_imp_sim = float(row.max())
    safety_margin = genuine_mean - nearest_imp_sim if np.isfinite(genuine_mean) else np.nan
    rows.append({
        "Subject": s,
        "Genuine_Mean": genuine_mean,
        "Genuine_Std": genuine_std,
        "Nearest_Impostor_Subject": nearest_imp_id,
        "Nearest_Impostor_Sim": nearest_imp_sim,
        "Safety_Margin": safety_margin
    })

report_df = pd.DataFrame(rows).sort_values(by="Safety_Margin", ascending=False)
report_csv = os.path.join(DIRS["similarity"], "subject_similarity_report.csv")
report_df.to_csv(report_csv, index=False)
print("Saved similarity matrix & report:")
print(" -", sim_csv)
print(" -", sim_png)
print(" -", report_csv)
report_df.head(10)


Saved similarity matrix & report:
 - C:\Users\Admin\Desktop\ALVIN\outputs\similarity\subject_similarity_matrix.csv
 - C:\Users\Admin\Desktop\ALVIN\outputs\similarity\subject_similarity_matrix.png
 - C:\Users\Admin\Desktop\ALVIN\outputs\similarity\subject_similarity_report.csv


Unnamed: 0,Subject,Genuine_Mean,Genuine_Std,Nearest_Impostor_Subject,Nearest_Impostor_Sim,Safety_Margin
21,s21,0.791138,0.067362,s18,0.306389,0.484749
2,s02,0.909292,0.040191,s28,0.432762,0.47653
34,s34,0.745629,0.082809,s00,0.332648,0.412981
18,s18,0.811084,0.077162,s11,0.431162,0.379922
32,s32,0.843383,0.096294,s12,0.469653,0.37373
19,s19,0.836597,0.076883,s16,0.468548,0.36805
20,s20,0.783862,0.110625,s04,0.428982,0.35488
4,s04,0.840077,0.055229,s15,0.523232,0.316845
28,s28,0.799017,0.070228,s00,0.511023,0.287994
29,s29,0.893851,0.04427,s00,0.613133,0.280718


## Top Confusing paris

In [29]:
N = 10
pairs = []
for i, si in enumerate(subjects):
    for j, sj in enumerate(subjects):
        if i >= j: 
            continue
        pairs.append((si, sj, float(S_mat[i, j])))

pairs_sorted = sorted(pairs, key=lambda t: t[2], reverse=True)
top_pairs_df = pd.DataFrame(pairs_sorted[:N], columns=["Subject_A","Subject_B","Cosine_Similarity"])
top_pairs_csv = os.path.join(DIRS["similarity"], "subject_top_confusing_pairs.csv")
top_pairs_df.to_csv(top_pairs_csv, index=False)
print("Saved top confusing pairs:", top_pairs_csv)
top_pairs_df

Saved top confusing pairs: C:\Users\Admin\Desktop\ALVIN\outputs\similarity\subject_top_confusing_pairs.csv


Unnamed: 0,Subject_A,Subject_B,Cosine_Similarity
0,s13,s26,0.77681
1,s23,s25,0.684611
2,s06,s11,0.637231
3,s26,s33,0.632619
4,s00,s29,0.613133
5,s09,s26,0.610011
6,s12,s29,0.609457
7,s09,s13,0.594944
8,s08,s33,0.585876
9,s09,s33,0.569343
