# Importing necessary libraries:

In [3]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import welch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

## Explainging the basics:

input:- where my raw data is  
output:- where my updated data will be saved  
window:- the window size (5 seconds of eeg per feature vector)  
overlap:- fraction of overlap between windows(0.5=50%)  
duration:- each recording is 60 seconds  

In [23]:
INPUT_DIR = r"C:\Users\Admin\Desktop\ALVIN\dataset"
OUTPUT_DIR = r"C:\Users\Admin\Desktop\ALVIN\outputs"
WINDOW_SEC = 5.0
OVERLAP = 0.5
DURATION_SEC = 60.0

CHANNELS = ['Fp1','Fp2','F3','F4','F7','F8','T3','T4','C3','C4',
            'T5','T6','P3','P4','O1','O2','Fz','Cz','Pz']
BANDS = {
    "Delta": (0.5, 4.0),
    "Theta": (4.0, 7.0),
    "Alpha": (8.0, 13.0),
    "Beta": (13.0, 30.0),
    "Gamma": (30.0, 45.0),
}

### FIXING HEADERS, WELCH PSD AND BANDPOWER, PER-SUBJECT SUMMARY, WINDOWING, FEATURES PER WINDOW, TRAINING AND EVALUATION

In [24]:
def likely_unlabeled(df_head):
    cols = list(df_head.columns)
    if cols == CHANNELS:
        return False
    num_like = 0
    for c in cols:
        try:
            float(str(c).strip()); num_like += 1
        except:
            pass
    return num_like >= len(cols)//2

def ensure_labeled(path):
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        df_try = pd.read_csv(path, nrows=3)
    except:
        df = pd.read_csv(path, header=None); df.columns = CHANNELS
        df.to_csv(os.path.join(OUTPUT_DIR, f"{stem}_corrected.csv"), index=False)
        return df, stem
    if likely_unlabeled(df_try):
        df = pd.read_csv(path, header=None); df.columns = CHANNELS
        df.to_csv(os.path.join(OUTPUT_DIR, f"{stem}_corrected.csv"), index=False)
        return df, stem
    else:
        df = pd.read_csv(path)
        if list(df.columns) != CHANNELS:
            if set(CHANNELS).issubset(set(df.columns)):
                df = df[CHANNELS]
            else:
                raise ValueError(f"{path} has wrong columns")
        return df, stem

def welch_psd(x, fs, nperseg, noverlap):
    return welch(x, fs=fs, nperseg=nperseg, noverlap=noverlap, detrend='constant')

def integrate_bandpower(f, Pxx, band):
    fmin, fmax = band
    idx = (f >= fmin) & (f < fmax)
    if not np.any(idx): return 0.0
    return float(np.trapz(Pxx[idx], f[idx]))

def compute_wide_summary(df, fs, out_csv):
    nperseg = int(4*fs); noverlap = int(0.5*nperseg)
    feats = {}
    for ch in CHANNELS:
        f, Pxx = welch_psd(df[ch].values.astype(float), fs, nperseg, noverlap)
        total = np.trapz(Pxx, f)
        for b, rng in BANDS.items():
            abs_p = integrate_bandpower(f, Pxx, rng)
            rel_p = abs_p/total if total>0 else np.nan
            feats[f"{ch}_{b}"] = abs_p
            feats[f"{ch}_{b}_Rel"] = rel_p
    wide = pd.DataFrame([feats])
    wide.to_csv(out_csv, index=False)
    return wide

def segment_windows(df, fs, win_sec, overlap):
    win_len = int(win_sec*fs); hop = int(win_len*(1-overlap))
    return [(s, s+win_len) for s in range(0, len(df)-win_len+1, hop)]

def compute_window_features(df, fs, window):
    s,e = window; xw = df.iloc[s:e]
    nperseg = int(2*fs); noverlap = int(0.5*nperseg)
    feats = {}
    for ch in CHANNELS:
        f,Pxx = welch_psd(xw[ch].values.astype(float), fs, nperseg, noverlap)
        total = np.trapz(Pxx, f)
        for b,rng in BANDS.items():
            abs_p = integrate_bandpower(f,Pxx,rng)
            rel_p = abs_p/total if total>0 else np.nan
            feats[f"{ch}_{b}"] = abs_p; feats[f"{ch}_{b}_Rel"] = rel_p
    feats["StartSample"] = s; feats["Fs"] = fs
    return feats

def train_and_evaluate(features_csv):
    df = pd.read_csv(features_csv)
    X = df.drop(columns=["Subject","StartSample","Fs"]).values
    y = df["Subject"].values
    classes = sorted(df["Subject"].unique().tolist())
    if len(classes)<2: 
        print("Need at least 2 subjects"); return
    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(n_estimators=400, random_state=42))
    ])
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accs, cms = [], []
    for tr,te in skf.split(X,y):
        clf.fit(X[tr],y[tr]); y_pred = clf.predict(X[te])
        accs.append(accuracy_score(y[te],y_pred))
        cms.append(confusion_matrix(y[te],y_pred,labels=classes))
    print("Mean CV Accuracy:", np.mean(accs))
    cm_sum = np.sum(cms,axis=0)
    plt.imshow(cm_sum, cmap="Blues")
    plt.xticks(np.arange(len(classes)), classes)
    plt.yticks(np.arange(len(classes)), classes)
    plt.title("Confusion Matrix (5-fold CV)")
    plt.colorbar()
    plt.savefig(os.path.join(OUTPUT_DIR,"confusion_matrix.png"))
    plt.close()


### MAIN PIPELINE 

In [25]:
def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    candidates = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".csv")]
    exclude = re.compile(r"(bandpowers|features|corrected|wide|master)", re.I)
    candidates = [f for f in candidates if not exclude.search(f)]

    master_rows, subjects = [], []
    for fname in sorted(candidates):
        path = os.path.join(INPUT_DIR,fname)
        df, subj = ensure_labeled(path); subjects.append(subj)
        fs = df.shape[0]/DURATION_SEC
        wide_csv = os.path.join(OUTPUT_DIR,f"{subj}_bandpowers_wide.csv")
        compute_wide_summary(df, fs, wide_csv)
        for w in segment_windows(df,fs,WINDOW_SEC,OVERLAP):
            feats = compute_window_features(df,fs,w)
            feats["Subject"]=subj; master_rows.append(feats)

    if not master_rows: return
    master = pd.DataFrame(master_rows)
    master_csv = os.path.join(OUTPUT_DIR,f"eeg_biometric_features_{int(WINDOW_SEC)}s_master.csv")
    master.to_csv(master_csv,index=False)
    print("Saved features:", master_csv)
    if len(set(subjects))>=2: train_and_evaluate(master_csv)

if __name__=="__main__":
    main()

Saved features: C:\Users\Admin\Desktop\ALVIN\outputs\eeg_biometric_features_5s_master.csv
Mean CV Accuracy: 0.9915516611902154
