In [1]:
pip install wfdb

Note: you may need to restart the kernel to use updated packages.


In [60]:
import wfdb
import numpy as np
import os
import csv
from scipy.ndimage import gaussian_filter1d
from scipy.signal import medfilt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
def downsample_by_averaging(signal, factor):
    downsampled_signal = []
    
    for i in range(0, len(signal), factor): # moves by index of 8
        chunk = signal[i:i+factor, :] 
        downsampled_signal.append(np.mean(chunk, axis=0))  # avg of each column
        
    return np.array(downsampled_signal)


In [4]:
data_list=[] #takes all the subjects 
directory="non-eeg-dataset-for-assessment-of-neurological-status-1.0.0" # directory to all the files ATR, DAT, HEA
csv_path = os.path.join(directory, 'subjectinfo.csv') 
for i in range(1,21):
    ACCTEMPEDA_Path = os.path.join(directory, f'Subject{i}_AccTempEDA') 
    SPO2HR_Path = os.path.join(directory, f'Subject{i}_SpO2HR') 

    ACCTEMPEDA_Record = wfdb.rdrecord(ACCTEMPEDA_Path) #  rdrecord read .dat and .hea file
    SPO2HR_Record = wfdb.rdrecord(SPO2HR_Path) 
    annotations = wfdb.rdann(ACCTEMPEDA_Path, 'atr') # rdann reads .atr file

    # ACCTEMPEDA_DownSampled = downsample_by_averaging(ACCTEMPEDA_Record.p_signal, 8) # downsample ACCTEMP EDA as it is 8 HZ
    # ACCTEMPEDA_DownSampled_Record = wfdb.Record(
    #     record_name=f'Subject{i}_AccTempEDA_DownSampled', 
    #     p_signal=ACCTEMPEDA_DownSampled, 
    #     fs=1,
    #     sig_name=ACCTEMPEDA_Record.sig_name,
    #     units=ACCTEMPEDA_Record.units        
    # )
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if int(row['subject']) == i:
                data_list.append({
                    'subject_id': i,
                    'acc_temp_eda': ACCTEMPEDA_Record,
                    'spo2_hr': SPO2HR_Record,
                    'annotations': annotations,
                    'age': int(row['age']),
                    'gender': row['gender'],
                    'height_cm': int(row['height/cm']),
                    'weight_kg': int(row['weight/kg'])
                })
                #print(f"Subject {i} Metadata:")
                #print(f"  Age: {int(row['age'])}")
                #print(f"  Gender: {row['gender']}")
                #print(f"  Height (cm): {int(row['height/cm'])}")
                #print(f"  Weight (kg): {int(row['weight/kg'])}")
                

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Legion Pro/non-eeg-dataset-for-assessment-of-neurological-status-1.0.0/Subject1_AccTempEDA.hea'

In [None]:
sub = data_list[1]
#print()
print(sub['acc_temp_eda'].p_signal)

In [None]:
subject = data_list[1]  
wfdb.plot_wfdb( record=subject['acc_temp_eda'], annotation=subject['annotations'], time_units='minutes', title='Subject 1 - Downsampled AccTempEDA (1Hz)')
wfdb.plot_wfdb( record=subject['spo2_hr'], annotation=subject['annotations'], time_units='minutes', title='Subject 1 - SpO2HR Signals')

In [None]:
preprocess(sub['acc_temp_eda'])

In [None]:
sub['acc_temp_eda'].p_signal = preprocess(sub['acc_temp_eda'].p_signal)
wfdb.plot_wfdb( record=sub['acc_temp_eda'], annotation=subject['annotations'], time_units='seconds', title='Subject 1 - median filtered AccTempEDA (1Hz)')

In [None]:
preprocess(sub['acc_temp_eda'])

In [None]:
sub['acc_temp_eda'].p_signal = preprocess(sub['acc_temp_eda'])
wfdb.plot_wfdb( record=sub['acc_temp_eda'], annotation=subject['annotations'], time_units='seconds', title='Subject 1 - gaussian filtered AccTempEDA (1Hz)')

# Extracting and Combining Data by Class

## Utility functions

In [5]:
# extract the ranges of each stage from their annotation
def get_stages(annotation, size):
    stages = []
    for i, (samp, note) in enumerate(zip(annotation.sample, annotation.aux_note)):
        stages.append({"Label" : note, "start": samp, "end": 0}) # dummy end
    
    for i in range((len(stages) - 1)):
        stages[i]["end"] = stages[i+1]["start"] # each stages end is the next one's beginning
    stages[-1]["end"] = size # set the end of last stage as the length of the data

    return stages  

In [6]:
# find label of a data point at a given index based on the ranges of each stage
def find_label(stages, index):
    for stage in stages:
        if index >= stage['start'] and index < stage['end']:
            return stage["Label"]
    return "Unknown"

In [7]:
# label the AccTempEDA data
def extract_labeled_AccTempEDA(record, annotation):
    stages = get_stages(annotation, record.p_signal.shape[0]) # get the range for each stage
    data = record.p_signal # extract the data into a numpy array
    labels = [] # array to store the labels corresponding to each data point in the record
    for idx, entry in enumerate(data): # loop over all the data
        labels.append((find_label(stages, idx))) # find the label based on the points index
    labeled_data = np.column_stack((data, labels)) # append the labels column to the data 
    return labeled_data # return the labeled data

In [8]:
# downsample by averaging BUT make sure you stay within a given class
def downsample_by_averaging(data, factor):
    downsampled_data = []
    i = 0
    while i + factor <= len(data): # cannot use for (range) as we may need to update i
        class_label = data[i][-1] # get the expected class lavel
        if class_label == data[i + factor - 1][-1]: # if all the data belongs to one class
            chunk = data[i: i + factor, :-1].astype(float) # extract a chunk (ignore last column [label] to average and cast to float to counter the upcasting when we added the label)
            avg_chunk = np.mean(chunk, axis=0)
            downsampled_data.append(np.append(avg_chunk, class_label))  # avg of each column while adding back the label
            i += factor
        else: # we have crossed into a new class, need to reset i to beginning of the new class
            for j in range(i + 1, i + factor):
                if data[j-1][-1] != data[j][-1]: # if data at index j has a different class from the one before it, update i and break
                    i = j
                    break
                    
    return np.array(downsampled_data)

In [9]:
# combine AccTempEDA and Spo2HR data while truncating them to be of the same length
# CRUCIAL but naive assumption! time stamps match after downsampling (technically incorrect)
def combine(AccTempEDA, Spo2HR):
    min_length = min(len(AccTempEDA), len(Spo2HR)) # finds the minimum length to align the points properly

    # concatenate the truncated columns from AccTempEDA with those from Spo2HR and add the labels at the end
    combined_data = np.column_stack((AccTempEDA[:min_length, :-1], Spo2HR[:min_length], AccTempEDA[:min_length, -1:]))

    return combined_data

In [10]:
def balance_classes(combined_data):
    balanced_data = []
    relaxed = False
    for data in combined_data:
        if data[-1] != "Relax": # we've passed the first relax
            relaxed = True
            balanced_data.append(data)
        elif not relaxed:
            balanced_data.append(data)
    return np.array(balanced_data)  

# Extracting Raw and Preprocessed data

In [11]:
# val is 1 incase labels are included 0 otherwise 
def preprocess(wavevec):
    final_vec = [] # stores the filtered signals
    for i in range(0,len(wavevec[0])): # goes through each point in the signal
        one_col = [wavevec[s][i].astype(float) for s in range(0,len(wavevec))] # filters column by column
        one_col = medfilt(one_col, kernel_size=9) # filtering each column
        if i == 0:
            final_vec = one_col
        else:
            final_vec = np.column_stack((final_vec, one_col))
    return final_vec

In [12]:
# get the full data for each subject
def get_subject_data_preprocessed(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA_record.p_signal = preprocess(AccTempEDA_record.p_signal)
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = preprocess(Spo2HR_record.p_signal) # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    #print(len(combined_data))
    return combined_data # return the resulting data

In [13]:
def get_subject_data_raw(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    #print(AccTempEDA)
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = Spo2HR_record.p_signal # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    #print(len(combined_data))
    return combined_data # return the resulting data

In [14]:
def get_subject_data_raw_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    #print(AccTempEDA)
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = Spo2HR_record.p_signal # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    balanced_data = balance_classes(combined_data)
    return balanced_data # return the resulting data

In [15]:
# get the full data for each subject
def get_subject_data_preprocessed_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation):
    AccTempEDA_record.p_signal = preprocess(AccTempEDA_record.p_signal)
    AccTempEDA = extract_labeled_AccTempEDA(AccTempEDA_record, AccTempEDA_annotation) # extract and label AccTempEDA record
    #print(AccTempEDA[3000])
    AccTempEDA = downsample_by_averaging(AccTempEDA, 8) # downsample to match the other Spo2HR's rate
    Spo2HR = preprocess(Spo2HR_record.p_signal) # extract Spo2HR record
    #print(Spo2HR.shape)
    combined_data = combine(AccTempEDA, Spo2HR) # combine both records
    balanced_data = balance_classes(combined_data)
    return balanced_data # return the resulting data

# Function to read and get raw data for all subjects and store them in csvs

In [None]:
read_directory="non-eeg-dataset-for-assessment-of-neurological-status-1.0.0" # directory to all the files ATR, DAT, HEA
type = "Preprocessed"
write_directory= f"Subject Data {type}"
all_sub_info = []
for i in range(1,21):
    csv_path = os.path.join(write_directory, f'subject_{i}_data_{type}.csv')
    ACCTEMPEDA_Path = os.path.join(read_directory, f'Subject{i}_AccTempEDA') 
    SPO2HR_Path = os.path.join(read_directory, f'Subject{i}_SpO2HR') 

    ACCTEMPEDA_Record = wfdb.rdrecord(ACCTEMPEDA_Path) #  rdrecord read .dat and .hea file
    SPO2HR_Record = wfdb.rdrecord(SPO2HR_Path) 
    annotations = wfdb.rdann(ACCTEMPEDA_Path, 'atr') # rdann reads .atr file
    subject_data = get_subject_data_preprocessed(ACCTEMPEDA_Record, SPO2HR_Record, annotations)
    all_sub_info.append(subject_data)
    # #print(f'subject{i}: {len(subject_data)}') 
    # with open(csv_path, mode='w', newline='') as csvfile:
    #     writer = csv.writer(csvfile)
    #     # Write headers
    #     writer.writerow(['ax', 'ay', 'az', 'temp', 'EDA', 'SpO2', 'HR', 'label'])
    #     # Write each row of subject data
    #     for row in subject_data:
    #         writer.writerow(row)

In [None]:
len(all_sub_info)

# Function to get the data in the desired window sizes

In [16]:
def windowSel(data, factor):
    # stores the windowed sample
    windowed = []
    labels = []
    # stores the last i in case the window exceeds the size of the sample
    for i in range(0, len(data), factor):
        # checks if i exceeds the sample size
        if i + factor < len(data):
            class_label = data[i][-1] # get the expected class lavel
            if class_label == data[i + factor - 1][-1]: # if all the data belongs to one class
            # adds the windowed sample 
                windowed.append(data[i: i + factor, :-1])
                labels.append(class_label)
            else:
                continue     
    return windowed, labels

In [None]:
windowed_samps = []
val = 5
for i in range(0,20):
   windowSel(all_sub_info[1],val)
    

# Compute Z score [Normalization] (importnat for methods like KNN, logistic regression, SVM and neural networks

In [17]:
# data includes the features without the labels column
def normalize_data(train_data, test_data):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_data)
    X_test_scaled = scaler.transform(test_data)
    return X_train_scaled, X_test_scaled
    
        

# Getting Data Raw, Preprocessed (Balanced and Unbalanced)

In [18]:
data_dir = rf"D:\University\8.Spring 2025\Machine\Project\non-eeg-dataset-for-assessment-of-neurological-status-1.0.0\non-eeg-dataset-for-assessment-of-neurological-status-1.0.0"
all_preprocessed_data_balanced=[]
all_preprocessed_data=[]
all_raw_data_balanced=[]
all_raw_data=[]
for i in range(1, 21):
    AccTempEDA_record = wfdb.rdrecord(fr"{data_dir}\Subject{i}_AccTempEDA")
    Spo2HR_record = wfdb.rdrecord(fr"{data_dir}\Subject{i}_SpO2HR")
    AccTempEDA_annotation = wfdb.rdann(fr"{data_dir}\Subject{i}_AccTempEDA", 'atr')
    
    data = get_subject_data_preprocessed(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    raw_data = get_subject_data_raw(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    data_balanced = get_subject_data_preprocessed_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    raw_data_balanced = get_subject_data_raw_balanced(AccTempEDA_record, Spo2HR_record, AccTempEDA_annotation)
    
    all_preprocessed_data.append(data)
    all_raw_data.append(raw_data)
    all_preprocessed_data_balanced.append(data_balanced)
    all_raw_data_balanced.append(raw_data_balanced)

# Least Squares

In [49]:
def least_squared_all_subs(data, norm = False):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        skf = StratifiedKFold(n_splits=5)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = RidgeClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Least Sqaures classifier:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## Least Squares on Raw Data (Unbalanced)

In [50]:
least_squared_all_subs(all_raw_data, norm = False)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.783
Average Recall: 0.766
Average F1-Score: 0.765


## Least Squares on Raw Data (Balanced)

In [51]:
least_squared_all_subs(all_raw_data_balanced, norm = False)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.912
Average Recall: 0.908
Average F1-Score: 0.906


## Least Squares on Preprocessed (Unbalanced)

In [22]:
least_squared_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.836
Average Recall: 0.814
Average F1-Score: 0.793


## Least Squares on Preprocessed (Balanced)

In [23]:
least_squared_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Least Sqaures classifier:
Average Precision: 0.931
Average Recall: 0.912
Average F1-Score: 0.907


# Logistic Regression

In [52]:
def logistic_regression_all_subs(data, norm = False):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        skf = StratifiedKFold(n_splits=5)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Logisitic Regression:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## doesn't work without normalization

## Logistic Regression on Preprocessed (Unbalanced)

In [53]:
logistic_regression_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.809
Average Recall: 0.827
Average F1-Score: 0.813


## Logistic Regression on Preprocessed (Balanced)

In [54]:
logistic_regression_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Logisitic Regression:
Average Precision: 0.941
Average Recall: 0.940
Average F1-Score: 0.939


# SVM

In [55]:
def svm_all_subs(data, k, norm = False):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        skf = StratifiedKFold(n_splits=5)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = SVC(kernel= k, decision_function_shape='ovr')
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print(f"Overall averages for 20 subjects with SVM {k}:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

## Unnormalized also doesn't run (takes too long)

### Linear SVM on Preprocessed (UnBalanced)

In [56]:
svm_all_subs(all_preprocessed_data, 'linear', norm = True)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.831
Average Recall: 0.856
Average F1-Score: 0.837


### Linear SVM on Preprocessed (Balanced)

In [57]:
svm_all_subs(all_preprocessed_data_balanced, 'linear', norm = True)

Overall averages for 20 subjects with SVM linear:
Average Precision: 0.948
Average Recall: 0.948
Average F1-Score: 0.946


### Non-linear SVM on Preprocessed (UnBalanced)

In [58]:
svm_all_subs(all_preprocessed_data, 'rbf', norm = True)

Overall averages for 20 subjects with SVM rbf:
Average Precision: 0.866
Average Recall: 0.882
Average F1-Score: 0.869


### Non-linear SVM on Preprocessed (Balanced)

In [59]:
svm_all_subs(all_preprocessed_data_balanced, 'rbf', norm = True)

Overall averages for 20 subjects with SVM rbf:
Average Precision: 0.949
Average Recall: 0.950
Average F1-Score: 0.948


#### Observation (little difference between linear and non-linear) since high dimensionality

## Gradient Boosting

In [66]:
def gradient_boosting_all_subs(data, norm = False):
    precision_list_squares = []
    recall_list_squares = []
    f1_list_squares = []
    
    for i in range(1, 21):
        X, y = data[i - 1][:, :-1].astype(float), data[i - 1][:, -1]
        skf = StratifiedKFold(n_splits=5)
        # Prepare lists to store metrics for each fold
        all_preds = []
        all_true = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if(norm):
                X_train, X_test = normalize_data(X_train, X_test)
                
            clf = GradientBoostingClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            # Collect predictions and true labels
            all_preds.extend(y_pred)
            all_true.extend(y_test)
        
        # Final combined classification report
        # print(f"Subject {i}")
        # print(classification_report(all_true, all_preds, zero_division=0))
        # print("\n")
    
        report_dict = classification_report(all_true, all_preds, output_dict=True, zero_division=0)
        macro_avg = report_dict['macro avg']
    
        avg_precision = macro_avg['precision']
        avg_recall = macro_avg['recall']
        avg_f1_score = macro_avg['f1-score']
    
        precision_list_squares.append(avg_precision)
        recall_list_squares.append(avg_recall)
        f1_list_squares.append(avg_f1_score)
        
    overall_avg_precision_squares = sum(precision_list_squares) / len(precision_list_squares)
    overall_avg_recall_squares = sum(recall_list_squares) / len(recall_list_squares)
    overall_avg_f1_squares = sum(f1_list_squares) / len(f1_list_squares)
    
    print("Overall averages for 20 subjects with Gradient Boosting:")
    print(f"Average Precision: {overall_avg_precision_squares:.3f}")
    print(f"Average Recall: {overall_avg_recall_squares:.3f}")
    print(f"Average F1-Score: {overall_avg_f1_squares:.3f}")
        

In [67]:
gradient_boosting_all_subs(all_raw_data)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.866
Average Recall: 0.880
Average F1-Score: 0.869


In [68]:
gradient_boosting_all_subs(all_raw_data_balanced)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.931
Average Recall: 0.922
Average F1-Score: 0.924


In [69]:
gradient_boosting_all_subs(all_preprocessed_data, norm = True)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.869
Average Recall: 0.880
Average F1-Score: 0.871


In [70]:
gradient_boosting_all_subs(all_preprocessed_data_balanced, norm = True)

Overall averages for 20 subjects with Gradient Boosting:
Average Precision: 0.931
Average Recall: 0.922
Average F1-Score: 0.923


### Note: no difference with standardization