# Training and evaluating Random Forest classifiers trained on EC, EO and random epochs

# EC epochs 

In [None]:
import os
import random
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mne

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

In [None]:
# Load data
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this project.
with open("top_epochs_per_subject.pkl", "rb") as f:
    top_epochs_per_subject = pickle.load(f)
# Normalise the keys to stripped strings so they compare equal to the
# subject IDs read from the metadata CSV below.
top_epochs_per_subject = {str(k).strip(): v for k, v in top_epochs_per_subject.items()}

metadata = pd.read_csv("metadata_time_filtered.csv")
metadata["subject_id"] = metadata["subject_id"].astype(str).str.strip()

# Keep only subjects that have both an epoch selection and a metadata row.
# A set gives constant-time membership tests instead of scanning the whole
# ID column once per subject; the resulting list and its order are unchanged.
metadata_subject_ids = set(metadata["subject_id"])
all_subjects_EC_rf = [s for s in top_epochs_per_subject if s in metadata_subject_ids]

# Creating hold out set with 500 subjects (seeded, so the split is reproducible;
# random.sample raises ValueError when fewer than 500 subjects are available)
random.seed(13)
test_subjects_EC_rf = random.sample(all_subjects_EC_rf, 500)

# Saves the hold out set, so it can be used in the other models
with open("test_subjects_EC_rf.pkl", "wb") as f:
    pickle.dump(test_subjects_EC_rf, f)

# Every remaining subject is used for training / cross-validation.
held_out_subjects_EC_rf = set(test_subjects_EC_rf)
train_subjects_EC_rf = [s for s in all_subjects_EC_rf if s not in held_out_subjects_EC_rf]

# Define age groups
def assign_age_group(age):
    """Map an age to one of three age-group labels.

    Returns 0 for ages below 21, 1 for ages from 21 up to (but not
    including) 71, and 2 for everything else. A NaN age fails both
    comparisons and therefore also ends up in group 2.
    """
    # Walk the upper bounds in ascending order; the first bound the age is
    # below decides the group.
    for group, upper_bound in enumerate((21, 71)):
        if age < upper_bound:
            return group
    return 2

# Label every metadata row with its age group, then keep only the rows of
# subjects that take part in the EC experiment.
metadata["age_group"] = metadata["age"].map(assign_age_group)
is_ec_subject = metadata["subject_id"].isin(all_subjects_EC_rf)
metadata = metadata[is_ec_subject]

# Feature extraction function
def extract_psd_features(subject_id, epoch_indices, set_folder):
    """Return the mean Welch power spectrum of selected epochs of one subject.

    Parameters
    ----------
    subject_id : str
        Subject identifier; the file read is
        ``{set_folder}/{subject_id}_epoched.set``.
    epoch_indices : sequence of int
        Indices used to select epochs from ``epochs.get_data()``.
    set_folder : str
        Folder that contains the epoched EEGLAB ``.set`` files.

    Returns
    -------
    numpy.ndarray
        The Welch PSD (1-45 Hz, ``n_fft=200``) averaged over its first two
        axes, i.e. over epochs and channels, leaving one value per
        frequency bin.

    Raises
    ------
    ValueError
        If ``epoch_indices`` is empty.
    """
    # Fail early with a clear message instead of averaging over zero epochs.
    # Callers that handle exceptions per subject can then skip the subject.
    if np.size(epoch_indices) == 0:
        raise ValueError(f"No epochs selected for subject {subject_id}")

    path = f"{set_folder}/{subject_id}_epoched.set"
    epochs = mne.io.read_epochs_eeglab(path, verbose='ERROR')
    data = epochs.get_data()[epoch_indices]
    sfreq = epochs.info["sfreq"]

    # The frequency axis itself is not needed, only the power values.
    psds, _freqs = mne.time_frequency.psd_array_welch(
        data, sfreq=sfreq, fmin=1, fmax=45, n_fft=200, verbose=False
    )
    return psds.mean(axis=(0, 1))  # Average over epochs and channels

# Extracting training values
set_folder = "G:/ChristianMusaeus/Preprocessed_setfiles"
train_features_EC_rf = []
train_labels_EC_rf = []

for subj_id in train_subjects_EC_rf:
    try:
        psd_vector = extract_psd_features(subj_id, top_epochs_per_subject[subj_id], set_folder)
        subject_rows = metadata["subject_id"] == subj_id
        subject_label = metadata.loc[subject_rows, "age_group"].values[0]
    except Exception as e:
        # Best effort: report the subject and carry on with the others.
        print(f"Error processing {subj_id}: {e}")
    else:
        train_features_EC_rf.append(psd_vector)
        train_labels_EC_rf.append(subject_label)

# One row per successfully processed training subject.
X_train_EC_rf = np.array(train_features_EC_rf)
y_train_EC_rf = np.array(train_labels_EC_rf)

#  GRID SEARCH + CV
# Hyper-parameter candidates explored by the inner grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['log2', 'sqrt'],
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
accuracies_ec_rf = []

outer_splits = skf.split(X_train_EC_rf, y_train_EC_rf)
for fold, (train_idx, val_idx) in enumerate(outer_splits, 1):
    print(f"\n Fold {fold}")
    y_train = y_train_EC_rf[train_idx]
    y_val = y_train_EC_rf[val_idx]

    # Standardise with statistics from the training part of this fold only.
    scaler = StandardScaler().fit(X_train_EC_rf[train_idx])
    X_train_scaled = scaler.transform(X_train_EC_rf[train_idx])
    X_val_scaled = scaler.transform(X_train_EC_rf[val_idx])

    # Inner 3-fold grid search, selected by log loss.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=13),
        param_grid=param_grid,
        cv=3,
        scoring='neg_log_loss',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train_scaled, y_train)

    # Score the refitted best estimator on the held-out part of the fold.
    preds = grid_search.best_estimator_.predict(X_val_scaled)
    acc = accuracy_score(y_val, preds)
    accuracies_ec_rf.append(acc)

    print(f"Fold {fold} Accuracy: {acc:.3f}")
    print(f"Best params: {grid_search.best_params_}")

mean_cv_accuracy_ec_rf = np.mean(accuracies_ec_rf)
print(f"\n Mean CV Accuracy: {mean_cv_accuracy_ec_rf:.3f}")

# Final model
# Standardise with statistics from the complete training set.
scaler_final = StandardScaler()
X_scaled_full = scaler_final.fit_transform(X_train_EC_rf)

# Tune the hyper-parameters on the complete training set. Reading
# best_params_ from the cross-validation loop would only give the winner of
# the last outer fold, which was selected on a subset of the training data.
# This mirrors the final grid search used for the EO model.
final_grid_EC_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=13),
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=1
)
final_grid_EC_rf.fit(X_scaled_full, y_train_EC_rf)
best_params_EC_rf = final_grid_EC_rf.best_params_
print(f"Best params (full training set): {best_params_EC_rf}")

final_model = RandomForestClassifier(**best_params_EC_rf, random_state=13)
final_model.fit(X_scaled_full, y_train_EC_rf)

# Extract features for the hold-out subjects. Subjects that cannot be
# processed are skipped, so the IDs of the subjects that were actually
# evaluated are recorded next to their features and labels.
X_test_EC_rf, y_test_EC_rf = [], []
evaluated_test_subjects_EC_rf = []
for subj_id in test_subjects_EC_rf:
    try:
        features = extract_psd_features(subj_id, top_epochs_per_subject[subj_id], set_folder)
        age_group = metadata.loc[metadata["subject_id"] == subj_id, "age_group"].values[0]
        X_test_EC_rf.append(features)
        y_test_EC_rf.append(age_group)
        evaluated_test_subjects_EC_rf.append(subj_id)
    except Exception as e:
        print(f"Error processing test subject {subj_id}: {e}")

X_test_EC_rf = np.array(X_test_EC_rf)
y_test_EC_rf = np.array(y_test_EC_rf)
# Reuse the scaler fitted on the training data.
X_test_scaled = scaler_final.transform(X_test_EC_rf)

test_preds_ec_rf = final_model.predict(X_test_scaled)
test_acc = accuracy_score(y_test_EC_rf, test_preds_ec_rf)
print(f"\n Final Test Accuracy: {test_acc:.3f}")
print(classification_report(y_test_EC_rf, test_preds_ec_rf))

# Subject level analysis and export for anova
# Pair each score with the subject it belongs to: zipping against the full
# hold-out list would shift the IDs as soon as one subject was skipped above.
subject_scores = [{
    "subject_id": subj_id,
    "model_type": "RFC",
    "data_type": "EC",
    "score": int(true == pred)
} for subj_id, true, pred in zip(evaluated_test_subjects_EC_rf, y_test_EC_rf, test_preds_ec_rf)]

df_subject_scores_ec_rf = pd.DataFrame(subject_scores)
df_subject_scores_ec_rf.sort_values(by="subject_id").to_csv("subject_scores_rf_ec.csv", index=False)
print("Saved subject-level scores to 'subject_scores_rf_ec.csv'")

# Save outputs
with open("y_test_EC_rf.pkl", "wb") as f:
    pickle.dump(y_test_EC_rf, f)
with open("test_preds_EC_rf.pkl", "wb") as f:
    pickle.dump(test_preds_ec_rf, f)
# The complete hold-out list (not only the evaluated subjects) is what the
# other models reuse.
with open("test_subjects_EC_rf.pkl", "wb") as f:
    pickle.dump(test_subjects_EC_rf, f)


In [None]:
# Show the per-class precision / recall / F1 of the EC model on the test set.
report_ec_rf = classification_report(y_test_EC_rf, test_preds_ec_rf)
print(report_ec_rf)

### Distribution of subjects in the three age groups in both the test set and the training set

In [None]:
# Count how many test subjects fall into each age group.
unique_classes, counts = np.unique(y_test_EC_rf, return_counts=True)
percentages = 100 * counts / counts.sum()

print("test set age group distribtuion:")
for cls, count, pct in zip(unique_classes, counts, percentages):
    # Print the group label as well; without it the lines cannot be told apart.
    print(f"age group {cls}: {count} subjects ({pct:.2f}%)")


In [None]:
# Age-group distribution of the training subjects, as counts and percentages.
y_train_EC_rf = np.array(y_train_EC_rf)
age_group_distr = Counter(y_train_EC_rf)
total_subjects = len(y_train_EC_rf)
for group in sorted(age_group_distr):
    count = age_group_distr[group]
    pct = (count / total_subjects) * 100
    print(f" agegroup {group}: {count}, {pct:.2f} %")

### Confusion matrix for EC 

In [None]:
# Create and display confusion matrix
cm_ec_rf = confusion_matrix(y_test_EC_rf, test_preds_ec_rf)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_ec_rf)
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax, cmap='Blues', colorbar=True)

# Same labelling and layout as the EO and random-epoch confusion matrices.
plt.title("Confusion Matrix for EC RFC model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# EO epochs

### Getting 60 EO epochs per test subject

In [None]:
# Load the label predictions CSV
labels_df = pd.read_csv("label_predictions.csv")

# Only subjects that also have an EC epoch selection are considered.
valid_subjects = {str(k) for k in top_epochs_per_subject}
top_60_EO_epochs_per_subject = {}

# Group by subject ID (stored in the "Test subject ID" column)
for subject_id, group in labels_df.groupby("Test subject ID"):
    if str(subject_id) in valid_subjects:
        # Epochs labeled as eyes-open (label == 0), highest probability first
        is_eyes_open = group["Label"] == 0
        ranked_eyes_open = group[is_eyes_open].sort_values(by="Probability", ascending=False)

        # Keep at most 60 epoch numbers; subjects with fewer eyes-open epochs
        # keep all they have (possibly none).
        top_60_EO_epochs_per_subject[subject_id] = ranked_eyes_open.head(60)["Epoch number"].values

# Persist the selection so it can be reloaded without re-reading the CSV.
with open("top_60_EO_epochs_per_subject.pkl", "wb") as f:
    pickle.dump(top_60_EO_epochs_per_subject, f)


### Training the same RF model on the EO data set using the same hold out set as for EC 

In [None]:
#  LOAD AND PREPROCESS METADATA
# Normalise the keys to stripped strings so they match the metadata IDs.
top_60_EO_epochs_per_subject_rf = {str(k).strip(): v for k, v in top_60_EO_epochs_per_subject.items()}

metadata = pd.read_csv("metadata_time_filtered.csv")
metadata["subject_id"] = metadata["subject_id"].astype(str).str.strip()

# Sets give constant-time membership tests instead of scanning the ID column
# (and the hold-out list below) once per subject; the results are unchanged.
metadata_subject_ids = set(metadata["subject_id"])
all_subjects_EO_rf = [s for s in top_60_EO_epochs_per_subject_rf if s in metadata_subject_ids]

test_subjects_EO_rf = test_subjects_EC_rf  # Same test set
held_out_subjects_EO_rf = set(test_subjects_EO_rf)
train_subjects_EO_rf = [s for s in all_subjects_EO_rf if s not in held_out_subjects_EO_rf]

def assign_age_group(age):
    """Return the age-group label for ``age``.

    Ages below 21 give 0, ages from 21 up to (but not including) 71 give 1,
    and anything else gives 2. Because only ``<`` comparisons are used, a
    NaN age also yields 2.
    """
    return 0 if age < 21 else (1 if age < 71 else 2)

# Attach the age-group label to every row, then restrict the metadata to the
# subjects that take part in the EO experiment.
age_group_labels_EO = metadata["age"].apply(assign_age_group)
metadata["age_group"] = age_group_labels_EO
metadata = metadata.loc[metadata["subject_id"].isin(all_subjects_EO_rf)]

#  PSD FEATURE EXTRACTION FUNCTION
def extract_psd_features_eo_rf(subject_id, epoch_indices, set_folder):
    """Return the mean Welch power spectrum of selected epochs of one subject.

    Parameters
    ----------
    subject_id : str
        Subject identifier; the file read is
        ``{set_folder}/{subject_id}_epoched.set``.
    epoch_indices : sequence of int
        Indices used to select epochs from ``epochs.get_data()``.
    set_folder : str
        Folder that contains the epoched EEGLAB ``.set`` files.

    Returns
    -------
    numpy.ndarray
        The Welch PSD (1-45 Hz, ``n_fft=200``) averaged over its first two
        axes, i.e. over epochs and channels, leaving one value per
        frequency bin.

    Raises
    ------
    ValueError
        If ``epoch_indices`` is empty, e.g. for a subject without any
        eyes-open epochs.
    """
    # Fail early with a clear message instead of averaging over zero epochs.
    # Callers that handle exceptions per subject can then skip the subject.
    if np.size(epoch_indices) == 0:
        raise ValueError(f"No epochs selected for subject {subject_id}")

    path = f"{set_folder}/{subject_id}_epoched.set"
    epochs = mne.io.read_epochs_eeglab(path, verbose='ERROR')
    data = epochs.get_data()[epoch_indices]
    sfreq = epochs.info["sfreq"]

    # The frequency axis itself is not needed, only the power values.
    psds, _freqs = mne.time_frequency.psd_array_welch(
        data, sfreq=sfreq, fmin=1, fmax=45, n_fft=200, verbose=False
    )
    return psds.mean(axis=(0, 1))  # Average over epochs and channels

#  EXTRACT FEATURES
set_folder = "G:/ChristianMusaeus/Preprocessed_setfiles"

# Training subjects: subjects that cannot be processed are reported and skipped.
X_train_EO_rf, y_train_EO_rf = [], []
for subj_id in train_subjects_EO_rf:
    try:
        features = extract_psd_features_eo_rf(subj_id, top_60_EO_epochs_per_subject_rf[subj_id], set_folder)
        label = metadata.loc[metadata["subject_id"] == subj_id, "age_group"].values[0]
        X_train_EO_rf.append(features)
        y_train_EO_rf.append(label)
    except Exception as e:
        print(f"Error processing train subject {subj_id}: {e}")

# Test subjects: additionally record which subjects were actually evaluated,
# because a skipped subject leaves no row in X_test_EO_rf / y_test_EO_rf.
X_test_EO_rf, y_test_EO_rf = [], []
evaluated_test_subjects_EO_rf = []
for subj_id in test_subjects_EO_rf:
    try:
        features = extract_psd_features_eo_rf(subj_id, top_60_EO_epochs_per_subject_rf[subj_id], set_folder)
        label = metadata.loc[metadata["subject_id"] == subj_id, "age_group"].values[0]
        X_test_EO_rf.append(features)
        y_test_EO_rf.append(label)
        evaluated_test_subjects_EO_rf.append(subj_id)
    except Exception as e:
        print(f"Error processing test subject {subj_id}: {e}")

# Keep the subject list aligned row-by-row with y_test_EO_rf so that later
# per-subject exports pair every score with the right ID. The name is rebound
# (the list is not mutated) because the original list object is shared with
# the EC hold-out set.
test_subjects_EO_rf = evaluated_test_subjects_EO_rf

X_train_EO_rf = np.array(X_train_EO_rf)
y_train_EO_rf = np.array(y_train_EO_rf)
X_test_EO_rf = np.array(X_test_EO_rf)
y_test_EO_rf = np.array(y_test_EO_rf)

#  SCALE FEATURES
# Learn the standardisation from the training subjects and apply the same
# transformation to the test subjects.
scaler_EO_rf = StandardScaler().fit(X_train_EO_rf)
X_train_scaled_EO_rf = scaler_EO_rf.transform(X_train_EO_rf)
X_test_scaled_EO_rf = scaler_EO_rf.transform(X_test_EO_rf)

#  NESTED CV: OUTER LOOP FOR ANOVA
# Hyper-parameter candidates explored by the inner grid search.
param_grid_EO_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['log2', 'sqrt']
}

skf_EO_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
accuracies_eo_rf = []
anova_results_EO_rf = []

# Split the unscaled features and standardise inside each fold, so the
# validation part never contributes to the scaling statistics. This matches
# how the EC and random-epoch models are cross-validated.
for fold, (train_idx, val_idx) in enumerate(skf_EO_rf.split(X_train_EO_rf, y_train_EO_rf), start=1):
    print(f"\n Fold {fold} running...")
    y_tr, y_val = y_train_EO_rf[train_idx], y_train_EO_rf[val_idx]

    fold_scaler_EO_rf = StandardScaler()
    X_tr = fold_scaler_EO_rf.fit_transform(X_train_EO_rf[train_idx])
    X_val = fold_scaler_EO_rf.transform(X_train_EO_rf[val_idx])

    # Inner 3-fold grid search, selected by log loss.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=13),
        param_grid=param_grid_EO_rf,
        cv=3,
        scoring='neg_log_loss',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_tr, y_tr)
    best_model = grid_search.best_estimator_

    preds = best_model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    accuracies_eo_rf.append(acc)

    # One row per outer fold for the ANOVA table.
    anova_results_EO_rf.append({
        "fold": fold,
        "model_type": "RFC",
        "data_type": "EO",
        "score": acc
    })

    print(f"Fold {fold} Accuracy: {acc:.3f}")
    print(f"Best params: {grid_search.best_params_}")

print(f"\n Mean CV Accuracy: {np.mean(accuracies_eo_rf):.3f}")

#  SAVE ANOVA RESULTS
df_anova_EO_rf = pd.DataFrame(anova_results_EO_rf)
df_anova_EO_rf.to_csv("anova_results_rf_eo.csv", index=False)
print("Saved ANOVA results to 'anova_results_rf_eo.csv'")

#  FINAL GRID SEARCH ON ALL TRAINING DATA
# 5-fold grid search over the complete (scaled) training set, selected by
# log loss; fit() returns the fitted search object.
final_grid_EO_rf = GridSearchCV(
    RandomForestClassifier(random_state=13),
    param_grid_EO_rf,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_scaled_EO_rf, y_train_EO_rf)
best_params_EO_rf = final_grid_EO_rf.best_params_

#  FINAL MODEL TRAINING AND TEST EVALUATION
# Train a fresh forest with the selected hyper-parameters on all training data.
final_model_EO_rf = RandomForestClassifier(random_state=13, **best_params_EO_rf)
final_model_EO_rf.fit(X_train_scaled_EO_rf, y_train_EO_rf)

# Evaluate once on the hold-out subjects.
test_preds_EO_rf = final_model_EO_rf.predict(X_test_scaled_EO_rf)
test_acc_EO_rf = accuracy_score(y_test_EO_rf, test_preds_EO_rf)
print(f"\n Final Test Accuracy: {test_acc_EO_rf:.3f}")
report_EO_rf = classification_report(y_test_EO_rf, test_preds_EO_rf)
print(report_EO_rf)

#  SUBJECT-LEVEL ACCURACY EXPORT FOR ANOVA
# One record per test subject: score is 1 for a correct prediction, else 0.
scored_subjects_EO_rf = zip(test_subjects_EO_rf, y_test_EO_rf, test_preds_EO_rf)
subject_scores_eo = [
    dict(subject_id=subj_id, model_type="RFC", data_type="EO", score=int(true == pred))
    for subj_id, true, pred in scored_subjects_EO_rf
]

df_subject_scores_eo_rf = pd.DataFrame(subject_scores_eo)
sorted_scores_eo_rf = df_subject_scores_eo_rf.sort_values(by="subject_id")
sorted_scores_eo_rf.to_csv("subject_scores_rf_eo.csv", index=False)

print("Saved subject-level scores to 'subject_scores_rf_eo.csv'")


#  SAVE FINAL TEST RESULTS
# True labels and predictions are pickled separately.
for pkl_name, payload in (
    ('y_test_EO_rf.pkl', y_test_EO_rf),
    ('y_pred_EO_rf.pkl', test_preds_EO_rf),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)


In [None]:
# Show the per-class precision / recall / F1 of the EO model on the test set.
report_eo_rf = classification_report(y_test_EO_rf, test_preds_EO_rf)
print(report_eo_rf)

### Confusion matrix for EO

In [None]:
# Build the confusion matrix of the EO test predictions and draw it on a
# square figure.
fig, ax = plt.subplots(figsize=(8, 8))
cm_eo_rf = confusion_matrix(y_test_EO_rf, test_preds_EO_rf)
disp = ConfusionMatrixDisplay(cm_eo_rf)
disp.plot(cmap="Blues", colorbar=True, ax=ax)

plt.title("Confusion Matrix for EO Random Forest Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# Random epochs

### Extracting 60 random epochs 

In [None]:
import pandas as pd
import pickle

labels_df = pd.read_csv("label_predictions.csv")

# Sample 60 epochs per subject (mixed EC and EO)
# Subjects without an EC epoch selection, or with fewer than 60 epochs in the
# CSV, get no entry. The same seed is used for every subject.
valid_subjects = set(top_epochs_per_subject)
random_epochs_per_subject = {
    subj_id: group.sample(n=60, random_state=13)["Epoch number"].values
    for subj_id, group in labels_df.groupby("Test subject ID")
    if str(subj_id) in valid_subjects and len(group) >= 60
}

# Number of subjects that received a random selection.
print(len(random_epochs_per_subject))

# Save to pickle
with open("random_epochs_per_subject.pkl", "wb") as f:
    pickle.dump(random_epochs_per_subject, f)


### Training and testing the same model on the random epochs, using the same train and test set as the two other models 

In [None]:
#  LOAD RANDOM EPOCHS PER SUBJECT
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this project.
with open("random_epochs_per_subject.pkl", "rb") as f:
    random_epochs_per_subject = pickle.load(f)

# Normalise the keys to stripped strings so they match the metadata IDs.
random_epochs_per_subject_rf = {str(k).strip(): v for k, v in random_epochs_per_subject.items()}

#  LOAD METADATA
metadata = pd.read_csv("metadata_time_filtered.csv")
metadata["subject_id"] = metadata["subject_id"].astype(str).str.strip()

# Sets give constant-time membership tests instead of scanning the ID column
# (and the hold-out list below) once per subject; the results are unchanged.
metadata_subject_ids = set(metadata["subject_id"])
all_subjects_random_rf = [s for s in random_epochs_per_subject_rf if s in metadata_subject_ids]

# Reuse test subjects from EC-RF to ensure consistency
with open("test_subjects_EC_rf.pkl", "rb") as f:
    test_subjects_random_rf = pickle.load(f)

held_out_subjects_random_rf = set(test_subjects_random_rf)
train_subjects_random_rf = [s for s in all_subjects_random_rf if s not in held_out_subjects_random_rf]

#  DEFINE AGE GROUP LABELS 
def assign_age_group(age):
    """Classify ``age`` into one of three age groups.

    Group 0 covers ages below 21, group 1 covers ages from 21 up to (but not
    including) 71, and group 2 covers the rest. A NaN age is below neither
    limit and is therefore assigned to group 2.
    """
    young_limit, old_limit = 21, 71
    if age < young_limit:
        return 0
    return 1 if age < old_limit else 2

# Add the age-group label column, then keep only the subjects that have a
# random epoch selection.
metadata["age_group"] = metadata["age"].map(assign_age_group)
has_random_epochs = metadata["subject_id"].isin(all_subjects_random_rf)
metadata = metadata.loc[has_random_epochs]

#  FEATURE EXTRACTION
def extract_psd_features(subject_id, epoch_indices, set_folder):
    """Return the mean Welch power spectrum of selected epochs of one subject.

    Parameters
    ----------
    subject_id : str
        Subject identifier; the file read is
        ``{set_folder}/{subject_id}_epoched.set``.
    epoch_indices : sequence of int
        Indices used to select epochs from ``epochs.get_data()``.
    set_folder : str
        Folder that contains the epoched EEGLAB ``.set`` files.

    Returns
    -------
    numpy.ndarray
        The Welch PSD (1-45 Hz, ``n_fft=200``) averaged over its first two
        axes, i.e. over epochs and channels, leaving one value per
        frequency bin.

    Raises
    ------
    ValueError
        If ``epoch_indices`` is empty.
    """
    # Fail early with a clear message instead of averaging over zero epochs.
    # Callers that handle exceptions per subject can then skip the subject.
    if np.size(epoch_indices) == 0:
        raise ValueError(f"No epochs selected for subject {subject_id}")

    path = f"{set_folder}/{subject_id}_epoched.set"
    epochs = mne.io.read_epochs_eeglab(path, verbose='ERROR')
    data = epochs.get_data()[epoch_indices]
    sfreq = epochs.info["sfreq"]

    # The frequency axis itself is not needed, only the power values.
    psds, _freqs = mne.time_frequency.psd_array_welch(
        data,
        sfreq=sfreq,
        fmin=1,
        fmax=45,
        n_fft=200,
        verbose=False
    )
    mean_psd = psds.mean(axis=(0, 1))  # average over epochs and channels
    return mean_psd

#  EXTRACT FEATURES FROM TRAIN SET
set_folder = "G:/ChristianMusaeus/Preprocessed_setfiles"

train_features_random_rf = []
train_labels_random_rf = []

for subj_id in train_subjects_random_rf:
    try:
        epoch_inds = random_epochs_per_subject_rf[subj_id]
        psd_vector = extract_psd_features(subj_id, epoch_inds, set_folder)
        subject_rows = metadata["subject_id"] == subj_id
        subject_label = metadata.loc[subject_rows, "age_group"].values[0]
    except Exception as e:
        # Best effort: report the subject and carry on with the others.
        print(f"Error processing {subj_id}: {e}")
    else:
        train_features_random_rf.append(psd_vector)
        train_labels_random_rf.append(subject_label)

# One row per successfully processed training subject.
X_random_rf = np.array(train_features_random_rf)
y_random_rf = np.array(train_labels_random_rf)

#  GRID SEARCH + CROSS-VALIDATION
# Hyper-parameter candidates explored by the inner grid search.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['log2', 'sqrt'],
)

skf_random_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
accuracies_random_rf = []

outer_splits_random_rf = skf_random_rf.split(X_random_rf, y_random_rf)
for fold, (train_idx, val_idx) in enumerate(outer_splits_random_rf, start=1):
    print(f"\n Fold {fold} running...")
    y_train_rf = y_random_rf[train_idx]
    y_val_rf = y_random_rf[val_idx]

    # Standardise with statistics from the training part of this fold only.
    scaler_random_rf = StandardScaler().fit(X_random_rf[train_idx])
    X_train_scaled_rf = scaler_random_rf.transform(X_random_rf[train_idx])
    X_val_scaled_rf = scaler_random_rf.transform(X_random_rf[val_idx])

    # Inner 3-fold grid search, selected by log loss.
    grid_search_rf = GridSearchCV(
        RandomForestClassifier(random_state=13),
        param_grid_rf,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    grid_search_rf.fit(X_train_scaled_rf, y_train_rf)
    best_model_rf = grid_search_rf.best_estimator_

    # Score the refitted best estimator on the held-out part of the fold.
    preds_rf = best_model_rf.predict(X_val_scaled_rf)
    acc_rf = accuracy_score(y_val_rf, preds_rf)
    accuracies_random_rf.append(acc_rf)

    print(f"Fold {fold} Accuracy: {acc_rf:.3f}")
    print(f"Best params: {grid_search_rf.best_params_}")

mean_cv_accuracy_random_rf = np.mean(accuracies_random_rf)
print(f"\n Mean CV Accuracy: {mean_cv_accuracy_random_rf:.3f}")

#  FINAL MODEL TRAINING
# Standardise with statistics from the complete training set.
scaler_random_final_rf = StandardScaler()
X_scaled_random_rf = scaler_random_final_rf.fit_transform(X_random_rf)

# Tune the hyper-parameters on the complete training set. Reading
# best_params_ from the cross-validation loop would only give the winner of
# the last outer fold, which was selected on a subset of the training data.
# This mirrors the final grid search used for the EO model.
final_grid_random_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=13),
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=1
)
final_grid_random_rf.fit(X_scaled_random_rf, y_random_rf)

best_params_final_random_rf = final_grid_random_rf.best_params_
print(f"Best params (full training set): {best_params_final_random_rf}")
final_model_random_rf = RandomForestClassifier(**best_params_final_random_rf, random_state=13)
final_model_random_rf.fit(X_scaled_random_rf, y_random_rf)

#  TEST SET EVALUATION
# Subjects that cannot be processed (for example because they have no random
# epoch selection) are skipped, so the IDs of the subjects that were actually
# evaluated are recorded next to their features and labels.
X_test_random_rf, y_test_random_rf = [], []
evaluated_test_subjects_random_rf = []

for subj_id in test_subjects_random_rf:
    try:
        epoch_inds = random_epochs_per_subject_rf[subj_id]
        features = extract_psd_features(subj_id, epoch_inds, set_folder)
        age_group = metadata.loc[metadata["subject_id"] == subj_id, "age_group"].values[0]
        X_test_random_rf.append(features)
        y_test_random_rf.append(age_group)
        evaluated_test_subjects_random_rf.append(subj_id)
    except Exception as e:
        print(f"Error processing test subject {subj_id}: {e}")

X_test_random_rf = np.array(X_test_random_rf)
y_test_random_rf = np.array(y_test_random_rf)
# Reuse the scaler fitted on the training data.
X_test_scaled_random_rf = scaler_random_final_rf.transform(X_test_random_rf)

test_preds_random_rf = final_model_random_rf.predict(X_test_scaled_random_rf)
test_acc_random_rf = accuracy_score(y_test_random_rf, test_preds_random_rf)
print(f"\n Final Test Accuracy: {test_acc_random_rf:.3f}")
print(classification_report(y_test_random_rf, test_preds_random_rf))

#  SUBJECT-LEVEL ACCURACY FOR ANOVA
# Pair each score with the subject it belongs to: zipping against the full
# hold-out list would shift the IDs as soon as one subject was skipped above.
subject_level_scores_rf_random = [{
    "subject_id": subj_id,
    "model_type": "RFC",
    "data_type": "Random",
    "score": int(true == pred)
} for subj_id, true, pred in zip(evaluated_test_subjects_random_rf, y_test_random_rf, test_preds_random_rf)]

df_subject_scores_rf_random = pd.DataFrame(subject_level_scores_rf_random)
df_subject_scores_rf_random.sort_values(by="subject_id").to_csv("subject_scores_rf_random.csv", index=False)
print("Saved subject-level scores to 'subject_scores_rf_random.csv'")

#  SAVE y_test AND PREDICTIONS
with open('y_test_random_rf.pkl', 'wb') as f:
    pickle.dump(y_test_random_rf, f)

with open('test_preds_random_rf.pkl', 'wb') as f:
    pickle.dump(test_preds_random_rf, f)

In [None]:
# Show the per-class precision / recall / F1 of the random-epoch model on the
# test set.
report_random_rf = classification_report(y_test_random_rf, test_preds_random_rf)
print(report_random_rf)

### Confusion matrix for random epochs 

In [None]:
# Build the confusion matrix of the random-epoch test predictions and draw it
# on a square figure.
fig, ax = plt.subplots(figsize=(8, 8))
cm_random_rf = confusion_matrix(y_test_random_rf, test_preds_random_rf)
disp = ConfusionMatrixDisplay(cm_random_rf)
disp.plot(cmap="Blues", colorbar=True, ax=ax)

plt.title("Confusion Matrix for Random epochs RFC Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()