In [1]:
pip install wfdb

Note: you may need to restart the kernel to use updated packages.


In [212]:
import wfdb
import numpy as np
import os
import csv
from scipy.ndimage import gaussian_filter1d
from scipy.signal import medfilt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Extracting and Combining Data by Class

## Utility functions

In [213]:
# extract the ranges of each stage from their annotation
def get_stages(annotation, size):
    stages = []
    for i, (samp, note) in enumerate(zip(annotation.sample, annotation.aux_note)):
        stages.append({"Label" : note, "start": samp, "end": 0}) # dummy end
    
    for i in range((len(stages) - 1)):
        stages[i]["end"] = stages[i+1]["start"] # each stages end is the next one's beginning
    stages[-1]["end"] = size # set the end of last stage as the length of the data

    return stages  

In [214]:
# find label of a data point at a given index based on the ranges of each stage
def find_label(stages, index):
    for stage in stages:
        if index >= stage['start'] and index < stage['end']:
            return stage["Label"]
    return "Unknown"

In [215]:
# label the AccTempEDA data
def extract_labeled_AccTempEDA(record, annotation):
    stages = get_stages(annotation, record.p_signal.shape[0]) # get the range for each stage
    data = record.p_signal # extract the data into a numpy array
    labels = [] # array to store the labels corresponding to each data point in the record
    for idx, entry in enumerate(data): # loop over all the data
        labels.append((find_label(stages, idx))) # find the label based on the points index
    labeled_data = np.column_stack((data, labels)) # append the labels column to the data 
    return labeled_data # return the labeled data

In [216]:
# downsample by averaging BUT make sure you stay within a given class
def downsample_by_averaging(data, factor):
    downsampled_data = []
    i = 0
    while i + factor <= len(data): # cannot use for (range) as we may need to update i
        class_label = data[i][-1] # get the expected class lavel
        if class_label == data[i + factor - 1][-1]: # if all the data belongs to one class
            chunk = data[i: i + factor, :-1].astype(float) # extract a chunk (ignore last column [label] to average and cast to float to counter the upcasting when we added the label)
            avg_chunk = np.mean(chunk, axis=0)
            downsampled_data.append(np.append(avg_chunk, class_label))  # avg of each column while adding back the label
            i += factor
        else: # we have crossed into a new class, need to reset i to beginning of the new class
            for j in range(i + 1, i + factor):
                if data[j-1][-1] != data[j][-1]: # if data at index j has a different class from the one before it, update i and break
                    i = j
                    break
                    
    return np.array(downsampled_data)

In [217]:
# combine AccTempEDA and Spo2HR data while truncating them to be of the same length
# CRUCIAL but naive assumption! time stamps match after downsampling (technically incorrect)
def combine(AccTempEDA, Spo2HR):
    min_length = min(len(AccTempEDA), len(Spo2HR)) # finds the minimum length to align the points properly

    # concatenate the truncated columns from AccTempEDA with those from Spo2HR and add the labels at the end
    combined_data = np.column_stack((AccTempEDA[:min_length, :-1], Spo2HR[:min_length], AccTempEDA[:min_length, -1:]))

    return combined_data

In [218]:
def balance_classes(combined_data):
    balanced_data = []
    relaxed = False
    for data in combined_data:
        if data[-1] != "Relax": # we've passed the first relax
            relaxed = True
            balanced_data.append(data)
        elif not relaxed:
            balanced_data.append(data)
    return np.array(balanced_data)  

# Extracting Raw and Preprocessed data

In [219]:
# val is 1 incase labels are included 0 otherwise 
def preprocess(wavevec):
    final_vec = [] # stores the filtered signals
    for i in range(0,len(wavevec[0])): # goes through each point in the signal
        one_col = [wavevec[s][i].astype(float) for s in range(0,len(wavevec))] # filters column by column
        one_col = medfilt(one_col, kernel_size=9) # filtering each column
        if i == 0:
            final_vec = one_col
        else:
            final_vec = np.column_stack((final_vec, one_col))
    return final_vec

In [220]:
# get the full data for each subject
def get_subject_data_preprocessed(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA_record.p_signal = preprocess(AccTempEDA_record.p_signal)
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = preprocess(Spo2HR_record.p_signal) # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    #print(len(combined_data))
    return combined_data # return the resulting data

In [221]:
def get_subject_data_raw(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    #print(AccTempEDA)
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = Spo2HR_record.p_signal # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    #print(len(combined_data))
    return combined_data # return the resulting data

In [222]:
def get_subject_data_raw_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    #print(AccTempEDA)
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = Spo2HR_record.p_signal # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    balanced_data = balance_classes(combined_data)
    return balanced_data # return the resulting data

In [223]:
# get the full data for each subject
def get_subject_data_preprocessed_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA_record.p_signal = preprocess(AccTempEDA_record.p_signal)
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = preprocess(Spo2HR_record.p_signal) # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    balanced_data = balance_classes(combined_data)
    return balanced_data # return the resulting data

# Function to read and get raw data for all subjects and store them in csvs

In [204]:
len(all_sub_info)

0

# Function to get the data in the desired window sizes

In [224]:
def windowSel(data, factor):
    # stores the windowed sample
    windowed = []
    labels = []
    # stores the last i in case the window exceeds the size of the sample
    for i in range(0, len(data), factor):
        # checks if i exceeds the sample size
        if i + factor < len(data):
            class_label = data[i][-1] # get the expected class lavel
            if class_label == data[i + factor - 1][-1]: # if all the data belongs to one class
            # adds the windowed sample 
                windowed.append(data[i: i + factor, :-1])
                labels.append(class_label)
            else:
                continue     
    return windowed, labels

In [206]:
windowed_samps = []
val = 5
for i in range(0,20):
   windowSel(all_sub_info[1],val)
    

IndexError: list index out of range

# Compute Z score [Normalization] (importnat for methods like KNN, logistic regression, SVM and neural networks

In [225]:
# data includes the features without the labels column
def normalize_data(train_data, test_data):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_data)
    X_test_scaled = scaler.transform(test_data)
    return X_train_scaled, X_test_scaled
    
        

# Getting Data Raw, Preprocessed (Balanced and Unbalanced)

In [226]:
data_dir = rf"D:\University\8.Spring 2025\Machine\Project\non-eeg-dataset-for-assessment-of-neurological-status-1.0.0\non-eeg-dataset-for-assessment-of-neurological-status-1.0.0"
all_preprocessed_data_balanced=[]
all_preprocessed_data=[]
all_raw_data_balanced=[]
all_raw_data=[]
for i in range(1, 21):
    AccTempEDA_record = wfdb.rdrecord(fr"{data_dir}\Subject{i}_AccTempEDA")
    Spo2HR_record = wfdb.rdrecord(fr"{data_dir}\Subject{i}_SpO2HR")
    AccTempEDA_annotation = wfdb.rdann(fr"{data_dir}\Subject{i}_AccTempEDA", 'atr')
    
    data = get_subject_data_preprocessed(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    raw_data = get_subject_data_raw(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    data_balanced = get_subject_data_preprocessed_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    raw_data_balanced = get_subject_data_raw_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    
    all_preprocessed_data.append(data)
    all_raw_data.append(raw_data)
    all_preprocessed_data_balanced.append(data_balanced)
    all_raw_data_balanced.append(raw_data_balanced)

In [227]:
# select windows BUT make sure you stay within a given class
def window_data(data, factor):
    windowed_data = []
    labels = []
    i = 0
    while i + factor <= len(data): # cannot use for (range) as we may need to update i
        class_label = data[i][-1] # get the expected class lavel
        if class_label == data[i + factor - 1][-1]: # if all the data belongs to one class
            chunk = data[i: i + factor, : -1] # extract a chunk (ignore last column [label] to average and cast to float to counter the upcasting when we added the label)
            windowed_data.append(chunk)  # avg of each column while adding back the label
            labels.append(class_label)
            i += factor
        else: # we have crossed into a new class, need to reset i to beginning of the new class
            for j in range(i + 1, i + factor):
                if data[j-1][-1] != data[j][-1]: # if data at index j has a different class from the one before it, update i and break
                    i = j
                    break
    # flatten the windows

    windowed_data = [window.flatten() for window in windowed_data]
                    
    return np.array(windowed_data).astype(float), np.array(labels)

# Least Squares

In [228]:
def least_squared_all_subs(data, norm = False, window = 1):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        if(window == 1):
            X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        else:
            X, y = window_data(data[i-1], window)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = RidgeClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Least Sqaures classifier:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## Least Squares on Raw Data (Unbalanced)

In [229]:
least_squared_all_subs(all_raw_data, norm = False)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.881
Average Recall: 0.820
Average F1-Score: 0.825


## Least Squares on Raw Data (Balanced)

In [230]:
least_squared_all_subs(all_raw_data_balanced, norm = False)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.943
Average Recall: 0.942
Average F1-Score: 0.941


## Least Squares on Preprocessed (Unbalanced)

In [22]:
least_squared_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.836
Average Recall: 0.814
Average F1-Score: 0.793


## Least Squares on Preprocessed (Balanced)

In [23]:
least_squared_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.931
Average Recall: 0.912
Average F1-Score: 0.907


## Least Squares Windows

In [74]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 2)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.945
Average Recall: 0.947
Average F1-Score: 0.945


In [72]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 3)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.945
Average Recall: 0.946
Average F1-Score: 0.944


In [102]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 4)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.945
Average Recall: 0.946
Average F1-Score: 0.944


In [73]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 5)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.938
Average Recall: 0.940
Average F1-Score: 0.937


In [75]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 10)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.932
Average Recall: 0.934
Average F1-Score: 0.930


In [76]:
least_squared_all_subs(all_preprocessed_data_balanced, True, 15)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.925
Average Recall: 0.924
Average F1-Score: 0.922


### interesting observation => as windows grow we actually fall

# Logistic Regression

In [66]:
def logistic_regression_all_subs(data, norm = False, window = 1):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        if(window == 1):
            X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        else:
            X, y = window_data(data[i-1], window)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Logisitic Regression:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## doesn't work without normalization

## Logistic Regression on Preprocessed (Unbalanced)

In [75]:
logistic_regression_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.942
Average Recall: 0.933
Average F1-Score: 0.936


## Logistic Regression on Preprocessed (Balanced)

In [76]:
logistic_regression_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.985
Average Recall: 0.986
Average F1-Score: 0.985


## Logisitic Regression Windowed

In [78]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 2)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.984
Average Recall: 0.985
Average F1-Score: 0.984


In [79]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 3)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.982
Average Recall: 0.983
Average F1-Score: 0.982


In [101]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 4)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.980
Average Recall: 0.981
Average F1-Score: 0.980


In [80]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 5)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.980
Average Recall: 0.981
Average F1-Score: 0.980


In [81]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 10)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.973
Average Recall: 0.975
Average F1-Score: 0.973


In [82]:
logistic_regression_all_subs(all_preprocessed_data_balanced, True, 15)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.971
Average Recall: 0.971
Average F1-Score: 0.970


# SVM

In [67]:
def svm_all_subs(data, k, norm = False, window = 1):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        if(window == 1):
            X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        else:
            X, y = window_data(data[i-1], window)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = SVC(kernel= k, decision_function_shape='ovr')
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print(f"Overall averages for 20 subjects with SVM {k}:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## Unnormalized also doesn't run (takes too long)

### Linear SVM on Preprocessed (UnBalanced)

In [78]:
svm_all_subs(all_preprocessed_data, 'linear', norm = True)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.961
Average Recall: 0.952
Average F1-Score: 0.954


### Linear SVM on Preprocessed (Balanced)

In [79]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', norm = True)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.988
Average Recall: 0.989
Average F1-Score: 0.988


### Non-linear SVM on Preprocessed (UnBalanced)

In [58]:
svm_all_subs(all_preprocessed_data, 'rbf', norm = True)

Overall averages for 20 subjects with SVM rbf:
Average Precision: 0.866
Average Recall: 0.882
Average F1-Score: 0.869


### Non-linear SVM on Preprocessed (Balanced)

In [59]:
svm_all_subs(all_preprocessed_data_balanced, 'rbf', norm = True)

Overall averages for 20 subjects with SVM rbf:
Average Precision: 0.949
Average Recall: 0.950
Average F1-Score: 0.948


#### Observation (Linear actually better!!!) since high dimensionality

### SVM Linear Windows

In [84]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 2)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.987
Average Recall: 0.987
Average F1-Score: 0.987


In [85]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 3)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.985
Average Recall: 0.986
Average F1-Score: 0.985


In [100]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 4)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.981
Average Recall: 0.982
Average F1-Score: 0.982


In [86]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 5)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.982
Average Recall: 0.983
Average F1-Score: 0.982


In [87]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 10)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.973
Average Recall: 0.974
Average F1-Score: 0.973


In [88]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', True, 15)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.970
Average Recall: 0.971
Average F1-Score: 0.970


## Gradient Boosting

In [68]:
def gradient_boosting_all_subs(data, norm = False, window = 1):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        if(window == 1):
            X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        else:
            X, y = window_data(data[i-1], window)
        skf = StratifiedKFold(n_splits=5)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = GradientBoostingClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Gradient Boosting:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

In [67]:
gradient_boosting_all_subs(all_raw_data)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.866
Average Recall: 0.880
Average F1-Score: 0.869


In [68]:
gradient_boosting_all_subs(all_raw_data_balanced)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.931
Average Recall: 0.922
Average F1-Score: 0.924


In [69]:
gradient_boosting_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.869
Average Recall: 0.880
Average F1-Score: 0.871


In [70]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.931
Average Recall: 0.922
Average F1-Score: 0.923


### Note: no difference with standardization

### Gradient Boosting Windows

In [89]:
gradient_boosting_all_subs(all_raw_data_balanced, False, 2)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.930
Average Recall: 0.923
Average F1-Score: 0.924


In [91]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, True, 2)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.929
Average Recall: 0.922
Average F1-Score: 0.923


In [92]:
gradient_boosting_all_subs(all_raw_data_balanced, False, 3)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.936
Average Recall: 0.929
Average F1-Score: 0.930


In [93]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, True, 3)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.935
Average Recall: 0.928
Average F1-Score: 0.929


In [94]:
gradient_boosting_all_subs(all_raw_data_balanced, False, 5)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.929
Average Recall: 0.923
Average F1-Score: 0.924


In [95]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, True, 5)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.931
Average Recall: 0.924
Average F1-Score: 0.925


In [96]:
gradient_boosting_all_subs(all_raw_data_balanced, False, 10)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.917
Average Recall: 0.913
Average F1-Score: 0.912


In [97]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, True, 10)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.918
Average Recall: 0.914
Average F1-Score: 0.914


In [98]:
gradient_boosting_all_subs(all_raw_data_balanced, False, 15)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.916
Average Recall: 0.912
Average F1-Score: 0.912


In [99]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, True, 15)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.915
Average Recall: 0.911
Average F1-Score: 0.911


## Least Squares leave subject out

In [168]:
def least_squared_leave_one_out(data, norm = False, window = 1):
    all_preds = []
    all_true = []
    
    for i in range(20):
        leave_one_out = np.vstack([data[j] for j in range(20) if j != i])
        if(window == 1):    
            X_train, y_train = leave_one_out[:, :-1].astype(float), leave_one_out[:, -1]
            X_test, y_test = data[i][:, :-1].astype(float), data[i][:, -1]
        else:
            X_train, y_train = window_data(leave_one_out, window)
            X_test, y_test = window_data(data[i], window)

        if(norm):
            X_train, X_test = normalize_data(X_train, X_test)
                
        clf = RidgeClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        all_preds.extend(y_pred)
        all_true.extend(y_test)
    
    report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    macro_avg = report_dict['macro avg']
    
    avg_precision = macro_avg['precision']
    avg_recall = macro_avg['recall']
    avg_f1_score = macro_avg['f1-score']
    
    print("Overall averages for 20 subjects with Least Sqaures classifier:")
    print(f"Average Precision: {avg_precision:.3f}")
    print(f"Average Recall: {avg_recall:.3f}")
    print(f"Average F1-Score: {avg_f1_score:.3f}")
        

In [169]:
# just checking
least_squared_leave_one_out(all_raw_data, norm = False, window = 1)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.604
Average Recall: 0.599
Average F1-Score: 0.573


In [170]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 1)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.866
Average Recall: 0.812
Average F1-Score: 0.816


In [171]:
# (with old aggregation)
# least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 1)

In [172]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 2)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.875
Average Recall: 0.825
Average F1-Score: 0.830


In [173]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 3)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.875
Average Recall: 0.827
Average F1-Score: 0.831


In [174]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 4)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.866
Average Recall: 0.820
Average F1-Score: 0.825


In [175]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 5)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.851
Average Recall: 0.802
Average F1-Score: 0.805


In [176]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 10)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.831
Average Recall: 0.793
Average F1-Score: 0.794


In [177]:
least_squared_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 15)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.801
Average Recall: 0.780
Average F1-Score: 0.778


## Logistic regression leave subject out

In [178]:
def logistic_regression_leave_one_out(data, norm = False, window = 1):
    all_preds = []
    all_true = []
    
    for i in range(20):
        leave_one_out = np.vstack([data[j] for j in range(20) if j != i])
        if(window == 1):    
            X_train, y_train = leave_one_out[:, :-1].astype(float), leave_one_out[:, -1]
            X_test, y_test = data[i][:, :-1].astype(float), data[i][:, -1]
        else:
            X_train, y_train = window_data(leave_one_out, window)
            X_test, y_test = window_data(data[i], window)

        if(norm):
            X_train, X_test = normalize_data(X_train, X_test)
                
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state = 42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        # Collect predictions and true labels
        all_preds.extend(y_pred)
        all_true.extend(y_test)
    
    report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
    macro_avg = report_dict['macro avg']
    
    avg_precision = macro_avg['precision']
    avg_recall = macro_avg['recall']
    avg_f1_score = macro_avg['f1-score']
    
    print("Overall averages for 20 subjects with Logisitic Regression:")
    print(f"Average Precision: {avg_precision:.3f}")
    print(f"Average Recall: {avg_recall:.3f}")
    print(f"Average F1-Score: {avg_f1_score:.3f}")
        

In [180]:
logistic_regression_all_subs(all_preprocessed_data_balanced, norm = True, window = 1)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.985
Average Recall: 0.986
Average F1-Score: 0.985


In [181]:
logistic_regression_leave_one_out(all_preprocessed_data, norm = True, window = 1)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.613
Average Recall: 0.573
Average F1-Score: 0.583


In [179]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 1)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.700
Average Recall: 0.700
Average F1-Score: 0.700


In [182]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 2)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.702
Average Recall: 0.702
Average F1-Score: 0.702


In [183]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 3)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.703
Average Recall: 0.703
Average F1-Score: 0.703


In [184]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 4)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.701
Average Recall: 0.701
Average F1-Score: 0.701


In [185]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 5)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.700
Average Recall: 0.701
Average F1-Score: 0.700


In [186]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 10)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.710
Average Recall: 0.712
Average F1-Score: 0.711


In [187]:
logistic_regression_leave_one_out(all_preprocessed_data_balanced, norm = True, window = 15)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.713
Average Recall: 0.716
Average F1-Score: 0.714


## SVM Leave one out

In [232]:
def svm_leave_one_out(data, norm = False, window = 1):
    all_preds = []
    all_true = []
    
    for i in range(20):
        leave_one_out = np.vstack([data[j] for j in range(20) if j != i])
        if(window == 1):    
            X_train, y_train = leave_one_out[:, :-1].astype(float), leave_one_out[:, -1]
            X_test, y_test = data[i][:, :-1].astype(float), data[i][:, -1]
        else:
            X_train, y_train = window_data(leave_one_out, window)
            X_test, y_test = window_data(data[i], window)

        if(norm):
            X_train, X_test = normalize_data(X_train, X_test)
                
        clf = LinearSVC(random_state=42, max_iter=10000) 
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
            
        # Collect predictions and true labels
        all_preds.extend(y_pred)
        all_true.extend(y_test)
        
    
    report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
    macro_avg = report_dict['macro avg']
    
    avg_precision = macro_avg['precision']
    avg_recall = macro_avg['recall']
    avg_f1_score = macro_avg['f1-score']
    
    print(f"Overall averages for 20 subjects with SVM {k}:")
    print(f"Average Precision: {avg_precision:.3f}")
    print(f"Average Recall: {avg_recall:.3f}")
    print(f"Average F1-Score: {avg_f1_score:.3f}")
        

In [None]:
# svm_leave_one_out(all_preprocessed_data, 'linear', norm = True)

In [None]:
svm_leave_one_out(all_preprocessed_data_balanced, norm = True)

In [161]:
len(all_raw_data[0])*19

43548

In [153]:
data = all_raw_data
for i in range(1, 21):
    new_data = np.vstack([data[j] for j in range(20) if j != i])


In [162]:
len(new_data)

46089

## Class distribution

In [84]:
from collections import Counter

In [85]:
def class_distribution_all_subs(data):
    for i in range(1, 21):
        y = data[i - 1][:, -1]  # Assuming labels are in the last column
        class_counts = Counter(y)
        print(f"Subject {i} class distribution:")
        for cls, count in sorted(class_counts.items()):
            print(f"  Class {cls}: {count} instances")
        print()

In [87]:
class_distribution_all_subs(all_raw_data_balanced)

Subject 1 class distribution:
  Class CognitiveStress: 364 instances
  Class EmotionalStress: 400 instances
  Class PhysicalStress: 328 instances
  Class Relax: 300 instances

Subject 2 class distribution:
  Class CognitiveStress: 355 instances
  Class EmotionalStress: 408 instances
  Class PhysicalStress: 327 instances
  Class Relax: 300 instances

Subject 3 class distribution:
  Class CognitiveStress: 354 instances
  Class EmotionalStress: 403 instances
  Class PhysicalStress: 324 instances
  Class Relax: 300 instances

Subject 4 class distribution:
  Class CognitiveStress: 355 instances
  Class EmotionalStress: 399 instances
  Class PhysicalStress: 327 instances
  Class Relax: 300 instances

Subject 5 class distribution:
  Class CognitiveStress: 354 instances
  Class EmotionalStress: 400 instances
  Class PhysicalStress: 326 instances
  Class Relax: 300 instances

Subject 6 class distribution:
  Class CognitiveStress: 356 instances
  Class EmotionalStress: 399 instances
  Class Phys