# Ensembling for labeling converted set files 

In [None]:
import os
import numpy as np
import joblib
import mne
import pandas as pd

In [None]:
# Raw strings keep the Windows backslashes literal. Sequences such as "\C" and
# "\P" are not valid escapes; non-raw literals only worked by accident and emit
# a DeprecationWarning/SyntaxWarning on current Python versions.
data_folder = r'G:\ChristianMusaeus\Preprocessed_setfiles'  # Folder with .set files
output_folder = r'E:\ChristianMusaeus'       # Output CSV folder
output_csv = os.path.join(output_folder, "label_predictions.csv")

# Channel count used when un-flattening each epoch's feature vector
n_channels = 19

# Feature reduction
def reduce_freq_resolution(X, n_channels, n_freqs, n_bins):
    """Average adjacent frequency values into ``n_bins`` equal-width bins.

    ``X`` is viewed as ``(-1, n_channels, n_freqs)``. Along the last axis the
    first ``n_bins * (n_freqs // n_bins)`` values are split into ``n_bins``
    consecutive groups and each group is replaced by its mean. When
    ``n_freqs`` is not a multiple of ``n_bins`` the trailing remainder is
    dropped.

    Args:
        X: Array whose elements can be reshaped to
            ``(-1, n_channels, n_freqs)``.
        n_channels: Number of channels per sample.
        n_freqs: Number of frequency values per channel.
        n_bins: Number of output bins per channel; must satisfy
            ``1 <= n_bins <= n_freqs``.

    Returns:
        Array of shape ``(X.shape[0], -1)`` holding the binned means, laid
        out channel-major (all bins of channel 0, then channel 1, ...).

    Raises:
        ValueError: If ``n_bins`` is outside ``1..n_freqs``. Without this
            check a too-large ``n_bins`` gives a bin width of zero and the
            result would silently be all NaN.
    """
    if n_bins < 1 or n_bins > n_freqs:
        raise ValueError(
            f"n_bins must be between 1 and n_freqs ({n_freqs}), got {n_bins}"
        )

    bin_size = n_freqs // n_bins
    X_reshaped = X.reshape(-1, n_channels, n_freqs)

    # Keep only the frequencies that fill whole bins, then expose each bin as
    # its own trailing axis so a single mean() replaces the per-bin loop.
    usable = X_reshaped[:, :, :n_bins * bin_size]
    reduced = usable.reshape(-1, n_channels, n_bins, bin_size).mean(axis=3)
    return reduced.reshape(X.shape[0], -1)

# -------- Load models and scalers --------
def _load_artifacts(tag):
    """Load the saved model, scaler and bin count for one model tag.

    Reads ``final_model_<tag>.pkl``, ``final_scaler_<tag>.pkl`` and
    ``final_n_bins_<tag>.npy`` relative to the current working directory and
    returns them as ``(model, scaler, n_bins)`` with ``n_bins`` cast to int.
    """
    model = joblib.load(f"final_model_{tag}.pkl")
    scaler = joblib.load(f"final_scaler_{tag}.pkl")
    n_bins = int(np.load(f"final_n_bins_{tag}.npy"))
    return model, scaler, n_bins


# One (model, scaler, n_bins) triple per ensemble member
lr_model, lr_scaler, lr_n_bins = _load_artifacts("lr")
svm_model, svm_scaler, svm_n_bins = _load_artifacts("svm")
rf_model, rf_scaler, rf_n_bins = _load_artifacts("rf")

# -------- Probability prediction --------
def get_model_proba(model, scaler, n_bins, X, n_channels, n_freqs):
    """Return ``model.predict_proba`` for ``X`` after binning and scaling.

    ``X`` is first reduced to ``n_bins`` frequency bins per channel with
    ``reduce_freq_resolution``, then passed through ``scaler.transform``
    before being handed to the model.
    """
    features = scaler.transform(
        reduce_freq_resolution(X, n_channels, n_freqs, n_bins)
    )
    return model.predict_proba(features)

# -------- Label and aggregate --------
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined CSV reproducible between runs and machines.
set_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.set'))
all_preds = []  # One DataFrame of per-epoch predictions per processed file

for fname in set_files:
    file_path = os.path.join(data_folder, fname)
    print(f"Processing {fname}...")

    # Subject ID is the file-name stem up to the first underscore. Stripping
    # the extension first keeps ".set" out of the ID when the name has no
    # underscore at all.
    subject_id = os.path.splitext(fname)[0].split('_')[0]

    # Best-effort batch: a failure in one file is reported and the loop
    # continues with the next file.
    try:
        epochs = mne.io.read_epochs_eeglab(file_path, verbose='ERROR')
        data = epochs.get_data()

        if np.isnan(data).any():
            print(f" Skipping {fname} due to NaN values")
            continue

        if data.ndim != 3:
            raise ValueError("Expected 3D data")

        # The feature reduction un-flattens with the fixed n_channels; a file
        # with a different channel count would be mis-reshaped, so fail here
        # with a clear message instead.
        if data.shape[1] != n_channels:
            raise ValueError(
                f"Expected {n_channels} channels, got {data.shape[1]}"
            )

        # NOTE(review): the last axis is treated as frequency values here —
        # presumably the converted .set files store spectra there; confirm.
        n_freqs = data.shape[2]
        X = data.reshape(data.shape[0], -1)  # one flat feature row per epoch

        probs_lr = get_model_proba(lr_model, lr_scaler, lr_n_bins, X, n_channels, n_freqs)
        probs_svm = get_model_proba(svm_model, svm_scaler, svm_n_bins, X, n_channels, n_freqs)
        probs_rf = get_model_proba(rf_model, rf_scaler, rf_n_bins, X, n_channels, n_freqs)

        # Soft voting: unweighted mean of the three probability matrices.
        # NOTE(review): assumes all three models order their probability
        # columns identically and that the column index equals the class
        # label — verify against each model's classes_.
        ensemble_probs = (probs_lr + probs_svm + probs_rf) / 3
        ensemble_pred = np.argmax(ensemble_probs, axis=1)
        # Probability of the winning column for every epoch
        ensemble_pred_prob = ensemble_probs[np.arange(len(ensemble_pred)), ensemble_pred]

        df = pd.DataFrame({
            "Test subject ID": subject_id,
            "Epoch number": np.arange(len(ensemble_pred)),
            "Label": ensemble_pred,
            "Probability": ensemble_pred_prob
        })
        all_preds.append(df)

    except Exception as e:
        print(f" Error processing {fname}: {e}")

# -------- Save final combined CSV --------
if all_preds:
    # Create the destination first so a missing folder does not discard the
    # results of the whole run at the very last step.
    os.makedirs(output_folder, exist_ok=True)
    final_df = pd.concat(all_preds, ignore_index=True)
    final_df.to_csv(output_csv, index=False)
    print(f"\n Saved combined predictions to: {output_csv}")
else:
    print("\n No valid predictions were saved.")


### Sanity check: percentage of epochs predicted as label 1, per test subject

In [None]:
# Raw string keeps the Windows backslashes literal ("\C" and "\l" are not
# valid escape sequences and trigger warnings in a normal string literal).
df = pd.read_csv(r"E:\ChristianMusaeus\label_predictions.csv")

# Per subject: number of labelled epochs and how many of them carry label 1
summary = df.groupby("Test subject ID").agg(
    total_epochs=("Label", "count"),
    num_ones=("Label", lambda labels: (labels == 1).sum()),
)

summary["percent_ones"] = 100 * summary["num_ones"] / summary["total_epochs"]

print(summary)