In [1]:
import pickle
import numpy as np
import pandas as pd
import statistics
from os import listdir
import pycaret
from pycaret.classification import *
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

## SWELL

In [2]:
swell = pd.read_csv("Final_CSVs/swell.csv")

In [3]:
swell = swell.drop(columns=["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "timestamp", "C"])

In [4]:
swell['Condition'] = np.where(swell['Condition']=='N',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='R',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='I',1,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='T',1,swell['Condition'])

## User-based Models

In [5]:
unique_participants4 = swell["PP"].unique()

In [6]:
len(unique_participants4)

25

In [7]:
print(unique_participants4)

['PP1' 'PP2' 'PP3' 'PP4' 'PP5' 'PP6' 'PP7' 'PP8' 'PP9' 'PP10' 'PP11'
 'PP12' 'PP13' 'PP14' 'PP15' 'PP16' 'PP17' 'PP18' 'PP19' 'PP20' 'PP21'
 'PP22' 'PP23' 'PP24' 'PP25']


In [8]:
person_group4 = swell.groupby('PP')

In [72]:
from sklearn.model_selection import train_test_split
import os  # Import os module for folder creation

# Specify folder name
folder_name = "SWELL_User_Based_Splitting_Output_Files"

# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

accuracies_swell = []
precision_swell = []
recall_swell = []
f1scores_swell = []

for participant in unique_participants4:
    print("Participant: ",participant)    
    part_df = person_group4.get_group(participant)

    # -----------------------------------------------------

    train_data, test_data = train_test_split(part_df)

    fold_groups = train_data.PP

    # Save the 'id' column from the test set
    test_ids = test_data['PP']

    train_data = train_data.drop(columns=['PP'])
    test_data = test_data.drop(columns=['PP'])

    # -----------------------------------------------------

    # grid = setup(data=train_data, target='Condition', fix_imbalance = True, html=False, verbose=False, fold_strategy='groupkfold', fold=3, fold_groups=fold_groups, test_data=test_data) #fix_imbalance = True,
    grid = setup(data=train_data, target='Condition', fix_imbalance = True, html=False, verbose=False, test_data=test_data) #fix_imbalance = True,
    best = compare_models(sort='F1')
    accuracies_swell.append(pull()['Accuracy'][0])
    precision_swell.append(pull()['Prec.'][0])
    recall_swell.append(pull()['Recall'][0])
    f1scores_swell.append(pull()['F1'][0])
    print(best)

    # ---------------------------------------------------

    # Make predictions using the best model
    predictions = predict_model(best, data=test_data)

    # Add 'id' column back to predictions DataFrame
    predictions['PP'] = test_ids  # Use the 'id' column from the test data

    # Merge the dataframes based on the 'PP' column
    predictions_with_info = predictions.merge(df_user_info, on="PP", how="left")

    # Filter data for the current participant
    # participant_data = all_data[all_data['PP'] == participant]

    # Extract true labels (y_true), rename the column
    y_true = predictions_with_info[['Condition']].rename(columns={'Condition': 'y_true'})

    # Extract predicted labels (y_pred), rename the column
    y_pred = predictions_with_info[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

    # Identify protected attribute columns (assuming you know the column names)
    protected_attributes = predictions_with_info[['Age', 'Gender', 'Occupation']]

    # Concatenate DataFrames containing predictions and protected attributes
    all_data = pd.concat([test_ids.reset_index(drop=True), y_true, y_pred, protected_attributes], axis=1)

    # Create filename
    filename = f"predictions_{participant}.csv"

    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)  # Create folder using makedirs()

    # Create full path with folder name
    filepath = os.path.join(folder_name, filename)

    # Save predictions
    all_data.to_csv(filepath, index=False)

    print(f"Predictions saved to: {filepath}")

Participant:  PP1


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.6800  0.6895   0.670  0.6983   
svm                   SVM - Linear Kernel    0.5289  0.0000   0.920  0.5500   
dt               Decision Tree Classifier    0.6456  0.6425   0.665  0.6450   
lightgbm  Light Gradient Boosting Machine    0.6478  0.7305   0.625  0.6738   
knn                K Neighbors Classifier    0.6789  0.7510   0.620  0.6571   
lda          Linear Discriminant Analysis    0.6367  0.6970   0.605  0.6663   
rf               Random Forest Classifier    0.6267  0.6895   0.625  0.6267   
gbc          Gradient Boosting Classifier    0.5856  0.7155   0.625  0.5788   
ridge                    Ridge Classifier    0.6156  0.0000   0.580  0.6321   
ada                  Ada Boost Classifier    0.5622  0.6425   0.575  0.5671   
lr                    Logistic Regression    0.5644  0.6620   0.540  0.5683   
qda       Quadratic Discriminant Analysis    0.5222 

                                                           

                                    Model  Accuracy    AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.9222  0.990   0.930  0.9183   
rf               Random Forest Classifier    0.9111  0.980   0.930  0.9100   
ada                  Ada Boost Classifier    0.9111  0.975   0.905  0.9300   
gbc          Gradient Boosting Classifier    0.9111  0.960   0.905  0.9300   
et                 Extra Trees Classifier    0.9111  0.975   0.910  0.9300   
ridge                    Ridge Classifier    0.8889  0.000   0.975  0.8364   
lda          Linear Discriminant Analysis    0.8889  0.900   0.975  0.8364   
dt               Decision Tree Classifier    0.9000  0.900   0.885  0.9183   
knn                K Neighbors Classifier    0.8889  0.955   0.930  0.8600   
qda       Quadratic Discriminant Analysis    0.8333  0.910   0.930  0.7902   
lr                    Logistic Regression    0.8556  0.895   0.860  0.8664   
nb                            Naive Bayes    0.8222  0.865   0.9

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7200  0.8281  0.7667  0.7633   
ada                  Ada Boost Classifier    0.7344  0.8115  0.6967  0.8145   
gbc          Gradient Boosting Classifier    0.6711  0.7927  0.6767  0.7345   
rf               Random Forest Classifier    0.6600  0.7805  0.6967  0.6995   
et                 Extra Trees Classifier    0.6600  0.7413  0.6967  0.7045   
dt               Decision Tree Classifier    0.6589  0.6819  0.6767  0.7212   
lightgbm  Light Gradient Boosting Machine    0.6989  0.7413  0.5933  0.7983   
svm                   SVM - Linear Kernel    0.5611  0.0000  0.5833  0.7111   
ridge                    Ridge Classifier    0.6656  0.0000  0.3833  1.0000   
lda          Linear Discriminant Analysis    0.6656  0.5781  0.3833  1.0000   
qda       Quadratic Discriminant Analysis    0.6444  0.4894  0.3833  0.9000   
lr                    Logistic Regression    0.6444 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.8556  0.8386  0.8900  0.8581   
lightgbm  Light Gradient Boosting Machine    0.8556  0.8829  0.8933  0.8514   
ada                  Ada Boost Classifier    0.8456  0.8632  0.8900  0.8414   
et                 Extra Trees Classifier    0.8444  0.8517  0.8700  0.8642   
knn                K Neighbors Classifier    0.8444  0.9011  0.8633  0.8579   
gbc          Gradient Boosting Classifier    0.8456  0.8712  0.8700  0.8531   
rf               Random Forest Classifier    0.8344  0.8698  0.8500  0.8608   
nb                            Naive Bayes    0.6778  0.8486  0.9400  0.6644   
qda       Quadratic Discriminant Analysis    0.6144  0.8416  0.9400  0.5916   
lr                    Logistic Regression    0.7067  0.8507  0.5967  0.8517   
ridge                    Ridge Classifier    0.7067  0.0000  0.5967  0.8517   
lda          Linear Discriminant Analysis    0.7067 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7289  0.7785    0.80  0.7526   
lightgbm  Light Gradient Boosting Machine    0.7311  0.7295    0.82  0.7242   
et                 Extra Trees Classifier    0.7422  0.7875    0.74  0.7850   
ada                  Ada Boost Classifier    0.7067  0.6955    0.72  0.7483   
rf               Random Forest Classifier    0.7100  0.7600    0.70  0.7550   
gbc          Gradient Boosting Classifier    0.7100  0.7960    0.70  0.7733   
dt               Decision Tree Classifier    0.6789  0.6775    0.68  0.7555   
lr                    Logistic Regression    0.6244  0.6100    0.76  0.6470   
ridge                    Ridge Classifier    0.5944  0.0000    0.72  0.6006   
lda          Linear Discriminant Analysis    0.5944  0.5840    0.72  0.6006   
svm                   SVM - Linear Kernel    0.5722  0.0000    0.62  0.5994   
nb                            Naive Bayes    0.5167 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.7878  0.8165    0.84  0.7989   
gbc          Gradient Boosting Classifier    0.7767  0.8170    0.84  0.8014   
et                 Extra Trees Classifier    0.7656  0.7240    0.84  0.7636   
dt               Decision Tree Classifier    0.7644  0.7600    0.82  0.7681   
rf               Random Forest Classifier    0.7422  0.7705    0.84  0.7362   
ada                  Ada Boost Classifier    0.7100  0.7865    0.76  0.7333   
ridge                    Ridge Classifier    0.7467  0.0000    0.68  0.8429   
lda          Linear Discriminant Analysis    0.7467  0.7910    0.68  0.8429   
lr                    Logistic Regression    0.7467  0.8230    0.66  0.8350   
nb                            Naive Bayes    0.6811  0.7300    0.74  0.7100   
qda       Quadratic Discriminant Analysis    0.7233  0.8320    0.66  0.8198   
knn                K Neighbors Classifier    0.6700 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.7967  0.8242  0.8100  0.8348   
ridge                    Ridge Classifier    0.7356  0.0000  0.9667  0.7014   
lda          Linear Discriminant Analysis    0.7356  0.7762  0.9667  0.7014   
knn                K Neighbors Classifier    0.7844  0.8486  0.8267  0.8270   
dt               Decision Tree Classifier    0.7833  0.7842  0.7933  0.8150   
lightgbm  Light Gradient Boosting Machine    0.7867  0.8525  0.7533  0.8538   
et                 Extra Trees Classifier    0.7644  0.8522  0.7933  0.7986   
nb                            Naive Bayes    0.7156  0.8267  0.9267  0.6927   
qda       Quadratic Discriminant Analysis    0.7156  0.8523  0.9267  0.6927   
rf               Random Forest Classifier    0.7556  0.8370  0.8133  0.7571   
gbc          Gradient Boosting Classifier    0.7656  0.8788  0.7767  0.7871   
lr                    Logistic Regression    0.7144 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5958  0.6081   0.590  0.6050   
nb                            Naive Bayes    0.5958  0.6131   0.590  0.6050   
ridge                    Ridge Classifier    0.5958  0.0000   0.590  0.6050   
lda          Linear Discriminant Analysis    0.5958  0.6131   0.590  0.6050   
lightgbm  Light Gradient Boosting Machine    0.5625  0.5462   0.520  0.5833   
dt               Decision Tree Classifier    0.5653  0.5331   0.360  0.6500   
gbc          Gradient Boosting Classifier    0.5653  0.5069   0.360  0.6500   
ada                  Ada Boost Classifier    0.5542  0.5244   0.385  0.5917   
rf               Random Forest Classifier    0.5542  0.4906   0.340  0.6333   
et                 Extra Trees Classifier    0.5542  0.4881   0.340  0.6333   
knn                K Neighbors Classifier    0.5542  0.5963   0.370  0.5750   
svm                   SVM - Linear Kernel    0.5278 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
qda       Quadratic Discriminant Analysis    0.6389  0.7705   0.920  0.6098   
ridge                    Ridge Classifier    0.7011  0.0000   0.680  0.7388   
lda          Linear Discriminant Analysis    0.7011  0.7455   0.680  0.7388   
et                 Extra Trees Classifier    0.6811  0.7410   0.710  0.6983   
rf               Random Forest Classifier    0.6911  0.7345   0.690  0.7333   
dt               Decision Tree Classifier    0.6478  0.6355   0.640  0.7050   
lightgbm  Light Gradient Boosting Machine    0.6478  0.7060   0.640  0.7000   
lr                    Logistic Regression    0.6567  0.7005   0.640  0.6738   
knn                K Neighbors Classifier    0.6267  0.6905   0.655  0.6417   
gbc          Gradient Boosting Classifier    0.6144  0.7195   0.600  0.6488   
ada                  Ada Boost Classifier    0.6144  0.6790   0.595  0.6371   
nb                            Naive Bayes    0.5278 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7744  0.8045   0.920  0.7239   
et                 Extra Trees Classifier    0.7844  0.8095   0.775  0.8000   
ridge                    Ridge Classifier    0.7078  0.0000   0.940  0.6558   
rf               Random Forest Classifier    0.7644  0.8360   0.755  0.7850   
lightgbm  Light Gradient Boosting Machine    0.7422  0.7515   0.835  0.7151   
lda          Linear Discriminant Analysis    0.6978  0.8140   0.940  0.6425   
ada                  Ada Boost Classifier    0.7400  0.8320   0.755  0.7638   
gbc          Gradient Boosting Classifier    0.7300  0.8090   0.735  0.7467   
dt               Decision Tree Classifier    0.7322  0.7300   0.715  0.7550   
lr                    Logistic Regression    0.6900  0.8100   0.780  0.6733   
svm                   SVM - Linear Kernel    0.6656  0.0000   0.665  0.6948   
qda       Quadratic Discriminant Analysis    0.5856 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8400  0.8096  1.0000  0.7952   
et                 Extra Trees Classifier    0.8400  0.8169  1.0000  0.7952   
lightgbm  Light Gradient Boosting Machine    0.8400  0.8189  1.0000  0.7952   
dt               Decision Tree Classifier    0.8300  0.8107  0.9833  0.7929   
rf               Random Forest Classifier    0.8300  0.8019  0.9833  0.7929   
ada                  Ada Boost Classifier    0.8300  0.8169  0.9833  0.7929   
gbc          Gradient Boosting Classifier    0.8300  0.8169  0.9833  0.7929   
lr                    Logistic Regression    0.8000  0.8189  0.9033  0.7917   
nb                            Naive Bayes    0.8000  0.8189  0.9033  0.7917   
ridge                    Ridge Classifier    0.8000  0.0000  0.9033  0.7917   
lda          Linear Discriminant Analysis    0.8000  0.8264  0.9033  0.7917   
svm                   SVM - Linear Kernel    0.6900 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.9467  1.0000    0.94  0.9667   
et                 Extra Trees Classifier    0.9467  0.9685    0.94  0.9667   
ada                  Ada Boost Classifier    0.9367  0.9790    0.94  0.9500   
gbc          Gradient Boosting Classifier    0.9378  0.9750    0.94  0.9500   
dt               Decision Tree Classifier    0.9267  0.9275    0.94  0.9333   
lightgbm  Light Gradient Boosting Machine    0.9067  0.9600    0.94  0.9173   
qda       Quadratic Discriminant Analysis    0.8856  0.9650    0.88  0.9181   
ridge                    Ridge Classifier    0.8433  0.0000    0.90  0.8406   
lda          Linear Discriminant Analysis    0.8333  0.8550    0.88  0.8358   
lr                    Logistic Regression    0.8333  0.8390    0.88  0.8373   
knn                K Neighbors Classifier    0.8400  0.9625    0.84  0.8862   
svm                   SVM - Linear Kernel    0.7467 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7967  0.7915   0.845  0.7600   
lightgbm  Light Gradient Boosting Machine    0.7533  0.8385   0.730  0.7533   
rf               Random Forest Classifier    0.7522  0.8060   0.750  0.7250   
et                 Extra Trees Classifier    0.7322  0.7945   0.755  0.7055   
ada                  Ada Boost Classifier    0.7222  0.7105   0.660  0.7350   
nb                            Naive Bayes    0.7000  0.7970   0.670  0.7050   
gbc          Gradient Boosting Classifier    0.7111  0.7985   0.665  0.7100   
lda          Linear Discriminant Analysis    0.6889  0.7265   0.645  0.6883   
qda       Quadratic Discriminant Analysis    0.6889  0.7720   0.625  0.7083   
dt               Decision Tree Classifier    0.7000  0.6890   0.595  0.7267   
lr                    Logistic Regression    0.6789  0.6785   0.625  0.6800   
ridge                    Ridge Classifier    0.6789 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.9078  0.9435  0.9233  0.9131   
rf               Random Forest Classifier    0.9067  0.9597  0.9067  0.9250   
dt               Decision Tree Classifier    0.8878  0.8892  0.9033  0.8933   
knn                K Neighbors Classifier    0.8867  0.9068  0.9200  0.8707   
gbc          Gradient Boosting Classifier    0.8778  0.9560  0.8867  0.8883   
lightgbm  Light Gradient Boosting Machine    0.8733  0.9580  0.8800  0.8750   
ada                  Ada Boost Classifier    0.8356  0.8882  0.8867  0.8386   
svm                   SVM - Linear Kernel    0.7033  0.0000  0.8400  0.6509   
lda          Linear Discriminant Analysis    0.6378  0.6855  0.6633  0.6731   
lr                    Logistic Regression    0.6544  0.7942  0.5867  0.7517   
ridge                    Ridge Classifier    0.6044  0.0000  0.5833  0.6514   
qda       Quadratic Discriminant Analysis    0.6311 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.9333  0.9650   0.980  0.9033   
knn                K Neighbors Classifier    0.9222  0.9625   0.980  0.8914   
lightgbm  Light Gradient Boosting Machine    0.8889  0.9700   0.955  0.8581   
nb                            Naive Bayes    0.8222  0.9550   1.000  0.7400   
ridge                    Ridge Classifier    0.8222  0.0000   1.000  0.7400   
lda          Linear Discriminant Analysis    0.8222  0.9650   1.000  0.7400   
ada                  Ada Boost Classifier    0.8556  0.9450   0.840  0.8683   
dt               Decision Tree Classifier    0.8444  0.8450   0.865  0.8283   
rf               Random Forest Classifier    0.8444  0.9425   0.840  0.8433   
gbc          Gradient Boosting Classifier    0.8444  0.9325   0.840  0.8533   
et                 Extra Trees Classifier    0.8444  0.9325   0.840  0.8433   
qda       Quadratic Discriminant Analysis    0.4667 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.9278  0.9735  0.9267  0.9457   
knn                K Neighbors Classifier    0.9289  0.9422  0.8933  0.9800   
et                 Extra Trees Classifier    0.9167  0.9577  0.9267  0.9290   
lightgbm  Light Gradient Boosting Machine    0.9044  0.9538  0.9267  0.9225   
ada                  Ada Boost Classifier    0.8856  0.9415  0.9067  0.9143   
gbc          Gradient Boosting Classifier    0.8756  0.9567  0.8933  0.9038   
lr                    Logistic Regression    0.8756  0.9663  0.8900  0.8990   
dt               Decision Tree Classifier    0.8633  0.8625  0.8900  0.8833   
svm                   SVM - Linear Kernel    0.8656  0.0000  0.8033  0.9633   
lda          Linear Discriminant Analysis    0.7667  0.8945  0.8100  0.8079   
ridge                    Ridge Classifier    0.7689  0.0000  0.8133  0.8040   
qda       Quadratic Discriminant Analysis    0.6756 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.9144  0.9640    0.96  0.9071   
lightgbm  Light Gradient Boosting Machine    0.9044  0.9450    0.94  0.9000   
rf               Random Forest Classifier    0.9056  0.9630    0.92  0.9133   
ada                  Ada Boost Classifier    0.9067  0.9435    0.92  0.9100   
gbc          Gradient Boosting Classifier    0.8956  0.9260    0.92  0.8933   
knn                K Neighbors Classifier    0.8633  0.9140    0.90  0.8758   
lr                    Logistic Regression    0.8633  0.9230    0.88  0.8767   
dt               Decision Tree Classifier    0.8544  0.8550    0.88  0.8583   
ridge                    Ridge Classifier    0.8144  0.0000    0.88  0.7981   
lda          Linear Discriminant Analysis    0.8144  0.8500    0.88  0.7981   
qda       Quadratic Discriminant Analysis    0.7700  0.9130    1.00  0.7123   
nb                            Naive Bayes    0.7500 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.6958  0.6842  0.7433  0.7493   
ada                  Ada Boost Classifier    0.6833  0.7506  0.6867  0.7450   
ridge                    Ridge Classifier    0.7083  0.0000  0.6600  0.7898   
qda       Quadratic Discriminant Analysis    0.6292  0.7150  0.8200  0.6323   
gbc          Gradient Boosting Classifier    0.6736  0.7597  0.6667  0.7483   
et                 Extra Trees Classifier    0.6292  0.6161  0.7233  0.6604   
lr                    Logistic Regression    0.6528  0.7361  0.6633  0.7258   
lda          Linear Discriminant Analysis    0.6639  0.7033  0.6233  0.7581   
nb                            Naive Bayes    0.5736  0.5917  0.7400  0.6062   
lightgbm  Light Gradient Boosting Machine    0.6389  0.7469  0.6433  0.6962   
rf               Random Forest Classifier    0.5944  0.6744  0.6667  0.6339   
knn                K Neighbors Classifier    0.5486 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.7644  0.8115   0.815  0.7571   
gbc          Gradient Boosting Classifier    0.7611  0.8180   0.745  0.7833   
knn                K Neighbors Classifier    0.7533  0.8625   0.745  0.7876   
nb                            Naive Bayes    0.7433  0.8490   0.735  0.7683   
qda       Quadratic Discriminant Analysis    0.7433  0.7860   0.715  0.7767   
lr                    Logistic Regression    0.7333  0.6790   0.695  0.7683   
et                 Extra Trees Classifier    0.7122  0.7880   0.715  0.7133   
ridge                    Ridge Classifier    0.7233  0.0000   0.675  0.7617   
lda          Linear Discriminant Analysis    0.7233  0.6870   0.675  0.7617   
rf               Random Forest Classifier    0.6711  0.7960   0.710  0.6699   
dt               Decision Tree Classifier    0.7022  0.6925   0.665  0.7121   
lightgbm  Light Gradient Boosting Machine    0.6511 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8433  0.9144  0.9233  0.8458   
et                 Extra Trees Classifier    0.7722  0.8844  0.8733  0.7810   
ada                  Ada Boost Classifier    0.7578  0.8824  0.8433  0.7719   
rf               Random Forest Classifier    0.7489  0.9044  0.7933  0.8151   
dt               Decision Tree Classifier    0.7367  0.8564  0.8133  0.7828   
gbc          Gradient Boosting Classifier    0.7267  0.8674  0.7933  0.7773   
lightgbm  Light Gradient Boosting Machine    0.6844  0.8489  0.7567  0.7267   
qda       Quadratic Discriminant Analysis    0.5756  0.6874  0.9633  0.5675   
svm                   SVM - Linear Kernel    0.4778  0.0000  0.6233  0.4500   
lda          Linear Discriminant Analysis    0.4589  0.5736  0.4067  0.4362   
ridge                    Ridge Classifier    0.4778  0.0000  0.3633  0.4433   
lr                    Logistic Regression    0.4256 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.6232  0.5788  0.6250  0.6595   
ada                  Ada Boost Classifier    0.5982  0.5898  0.5667  0.5917   
svm                   SVM - Linear Kernel    0.4429  0.0000  0.9000  0.4054   
gbc          Gradient Boosting Classifier    0.5607  0.5688  0.5500  0.5595   
rf               Random Forest Classifier    0.5857  0.6429  0.5250  0.6708   
dt               Decision Tree Classifier    0.4589  0.4683  0.5167  0.4258   
knn                K Neighbors Classifier    0.5125  0.5227  0.4500  0.4683   
lr                    Logistic Regression    0.6089  0.6658  0.4000  0.5433   
lda          Linear Discriminant Analysis    0.5750  0.6279  0.3750  0.4833   
qda       Quadratic Discriminant Analysis    0.6214  0.6075  0.3500  0.5167   
ridge                    Ridge Classifier    0.5857  0.0000  0.3750  0.4750   
lightgbm  Light Gradient Boosting Machine    0.4964 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.8711  0.8821  0.8467  0.9267   
nb                            Naive Bayes    0.8278  0.9195  1.0000  0.7798   
dt               Decision Tree Classifier    0.8389  0.8383  0.8267  0.8898   
rf               Random Forest Classifier    0.8300  0.9417  0.8133  0.8948   
knn                K Neighbors Classifier    0.8189  0.9192  0.8500  0.8483   
gbc          Gradient Boosting Classifier    0.8211  0.9360  0.7733  0.9148   
ridge                    Ridge Classifier    0.7867  0.0000  0.8600  0.8298   
ada                  Ada Boost Classifier    0.8089  0.9313  0.7533  0.9033   
lr                    Logistic Regression    0.7744  0.9358  0.7900  0.8490   
lda          Linear Discriminant Analysis    0.7556  0.9358  0.7767  0.8431   
lightgbm  Light Gradient Boosting Machine    0.7533  0.8992  0.7333  0.8250   
svm                   SVM - Linear Kernel    0.7667 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.7389  0.7465   0.635  0.7550   
nb                            Naive Bayes    0.5311  0.7285   1.000  0.5222   
lightgbm  Light Gradient Boosting Machine    0.7289  0.7445   0.615  0.7550   
qda       Quadratic Discriminant Analysis    0.5111  0.0000   1.000  0.5111   
ridge                    Ridge Classifier    0.7267  0.0000   0.595  0.7800   
lda          Linear Discriminant Analysis    0.7267  0.7735   0.595  0.7800   
knn                K Neighbors Classifier    0.5278  0.6525   0.900  0.5540   
rf               Random Forest Classifier    0.7067  0.7145   0.575  0.7550   
et                 Extra Trees Classifier    0.7067  0.7110   0.575  0.7550   
gbc          Gradient Boosting Classifier    0.6856  0.6450   0.535  0.7500   
ada                  Ada Boost Classifier    0.6644  0.6370   0.490  0.7333   
dt               Decision Tree Classifier    0.6544 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.9167  0.9405    1.00  0.8714   
rf               Random Forest Classifier    0.9178  0.9440    0.98  0.8833   
gbc          Gradient Boosting Classifier    0.9178  0.9450    0.98  0.8833   
dt               Decision Tree Classifier    0.9067  0.9110    0.98  0.8667   
lightgbm  Light Gradient Boosting Machine    0.8956  0.9240    0.98  0.8548   
et                 Extra Trees Classifier    0.8844  0.9500    0.96  0.8625   
ada                  Ada Boost Classifier    0.8756  0.9190    0.96  0.8381   
qda       Quadratic Discriminant Analysis    0.8144  0.7420    0.80  0.8433   
ridge                    Ridge Classifier    0.8044  0.0000    0.80  0.8283   
lda          Linear Discriminant Analysis    0.8044  0.7490    0.80  0.8283   
lr                    Logistic Regression    0.7844  0.7370    0.80  0.8033   
nb                            Naive Bayes    0.7422 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.9178  0.9438  0.9467  0.9133   
rf               Random Forest Classifier    0.8956  0.9246  0.9067  0.9148   
ada                  Ada Boost Classifier    0.8844  0.9042  0.9067  0.8981   
lightgbm  Light Gradient Boosting Machine    0.8667  0.8876  0.9433  0.8371   
gbc          Gradient Boosting Classifier    0.8744  0.8664  0.8900  0.8981   
et                 Extra Trees Classifier    0.8622  0.8895  0.8667  0.8898   
dt               Decision Tree Classifier    0.8522  0.8310  0.8500  0.8898   
nb                            Naive Bayes    0.6967  0.9203  1.0000  0.6600   
qda       Quadratic Discriminant Analysis    0.5522  0.0000  1.0000  0.5522   
svm                   SVM - Linear Kernel    0.6978  0.0000  0.6633  0.8606   
lr                    Logistic Regression    0.7178  0.9203  0.5433  0.9300   
ridge                    Ridge Classifier    0.7178 

In [73]:
mean_acc_swell = statistics.mean(accuracies_swell)
mean_prec_swell = statistics.mean(precision_swell)
mean_rec_swell = statistics.mean(recall_swell)
mean_f1_swell = statistics.mean(f1scores_swell)
std_acc_swell = statistics.stdev(accuracies_swell)
std_prec_swell = statistics.stdev(precision_swell)
std_rec_swell = statistics.stdev(recall_swell)
std_f1_swell = statistics.stdev(f1scores_swell)

In [74]:
print("Mean Accuracy SWELL: ",mean_acc_swell)
print("Mean Precision SWELL: ",mean_prec_swell)
print("Mean Recall SWELL: ",mean_rec_swell)
print("Mean F1-score SWELL: ",mean_f1_swell)
print("Standard deviation of Accuracy SWELL: ",std_acc_swell)
print("Standard deviation of Precision SWELL: ",std_prec_swell)
print("Standard deviation of Recall SWELL: ",std_rec_swell)
print("Standard deviation of F1-score SWELL: ",std_f1_swell)

Mean Accuracy SWELL:  0.805528
Mean Precision SWELL:  0.809288
Mean Recall SWELL:  0.849868
Mean F1-score SWELL:  0.816344
Standard deviation of Accuracy SWELL:  0.10705632676306431
Standard deviation of Precision SWELL:  0.10372032234170248
Standard deviation of Recall SWELL:  0.12015854013205497
Standard deviation of F1-score SWELL:  0.1092197405844444


In [75]:
folder_path = "SWELL_User_Based_Splitting_Output_Files"

# List all CSV files in the folder:
csv_files = []
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_files.append(os.path.join(folder_path, filename))

# Concatenate files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

In [76]:
output_folder = "Output_Files"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)  # Create folder using makedirs()

# Create the full path with the output folder name
output_filepath = os.path.join(output_folder, "SWELL_User_Based_Splitting.csv")

# Save the concatenated DataFrame to the new CSV file
combined_df.to_csv(output_filepath, index=False)

print(f"All predictions saved to: {output_filepath}")

All predictions saved to: Output_Files\SWELL_User_Based_Splitting.csv


### User-Based Splitting model with protected attributes

In [77]:
swell = pd.read_csv("Final_CSVs/swell.csv")

In [78]:
swell = swell.drop(columns=["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "timestamp", "C"])

In [79]:
swell['Condition'] = np.where(swell['Condition']=='N',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='R',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='I',1,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='T',1,swell['Condition'])

In [80]:
# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

# Merge the dataframes based on the 'PP' column
swell_with_info = swell.merge(df_user_info, on="PP", how="left")

# Print the result
print(swell_with_info)

        PP Condition   HR       RMSSD         SCL  Age Gender  \
0      PP1         0  999  999.000000   80.239727   27      m   
1      PP1         0   61    0.061420   77.365127   27      m   
2      PP1         0   64    0.049663   77.359559   27      m   
3      PP1         0   60    0.052487   76.728772   27      m   
4      PP1         0   61    0.051189   76.512877   27      m   
...    ...       ...  ...         ...         ...  ...    ...   
3135  PP25         1  999  999.000000  999.000000   26      m   
3136  PP25         1  999  999.000000  999.000000   26      m   
3137  PP25         1  999  999.000000  999.000000   26      m   
3138  PP25         1  999  999.000000  999.000000   26      m   
3139  PP25         1  999  999.000000  999.000000   26      m   

                      Occupation Dominant hand Glasses  smoke  coffee  \
0                        student         right      no      6       6   
1                        student         right      no      6       6   


In [81]:
unique_participants4 = swell_with_info["PP"].unique()

In [82]:
person_group4 = swell_with_info.groupby('PP')

In [85]:
from sklearn.model_selection import train_test_split
import os  # Import os module for folder creation

# Specify folder name
folder_name = "SWELL_User_Based_Splitting_Output_Files_Bias"

accuracies_swell = []
precision_swell = []
recall_swell = []
f1scores_swell = []

for participant in unique_participants4:
    print("Participant: ",participant)    
    part_df = person_group4.get_group(participant)

    # -----------------------------------------------------

    train_data, test_data = train_test_split(part_df)

    fold_groups = train_data.PP

    # Save the 'id' column from the test set
    test_ids = test_data['PP']

    train_data = train_data.drop(columns=['PP'])
    test_data = test_data.drop(columns=['PP'])

    # -----------------------------------------------------

    grid = setup(data=train_data, target='Condition', fix_imbalance = True, html=False, verbose=False, test_data=test_data) #fix_imbalance = True,
    best = compare_models(sort='F1')
    accuracies_swell.append(pull()['Accuracy'][0])
    precision_swell.append(pull()['Prec.'][0])
    recall_swell.append(pull()['Recall'][0])
    f1scores_swell.append(pull()['F1'][0])
    print(best)

    # ---------------------------------------------------

    # Make predictions using the best model
    predictions = predict_model(best, data=test_data)

    # Add 'id' column back to predictions DataFrame
    predictions['PP'] = test_ids  # Use the 'id' column from the test data

    # Extract true labels (y_true), rename the column
    y_true = predictions[['Condition']].rename(columns={'Condition': 'y_true'})

    # Extract predicted labels (y_pred), rename the column
    y_pred = predictions[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

    # Identify protected attribute columns (assuming you know the column names)
    protected_attributes = predictions[['Age', 'Gender', 'Occupation']]

    # Concatenate DataFrames containing predictions and protected attributes
    all_data = pd.concat([test_ids, y_true, y_pred, protected_attributes], axis=1)

    # Create filename
    filename = f"predictions_{participant}.csv"

    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)  # Create folder using makedirs()

    # Create full path with folder name
    filepath = os.path.join(folder_name, filename)

    # Save predictions
    all_data.to_csv(filepath, index=False)

    print(f"Predictions saved to: {filepath}")

Participant:  PP1




                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.7378  0.7073  0.6867  0.8238   
rf               Random Forest Classifier    0.7056  0.6960  0.7067  0.7405   
gbc          Gradient Boosting Classifier    0.6989  0.7228  0.6867  0.7688   
knn                K Neighbors Classifier    0.6756  0.7457  0.6833  0.6948   
dt               Decision Tree Classifier    0.6667  0.6683  0.6467  0.7355   
ada                  Ada Boost Classifier    0.6378  0.6780  0.6100  0.6800   
lr                    Logistic Regression    0.6322  0.7070  0.6467  0.6743   
ridge                    Ridge Classifier    0.6222  0.0000  0.6667  0.6586   
lightgbm  Light Gradient Boosting Machine    0.6244  0.6725  0.6267  0.6650   
lda          Linear Discriminant Analysis    0.6122  0.7000  0.6267  0.6543   
svm                   SVM - Linear Kernel    0.5200  0.0000  0.5600  0.3939   
nb                            Naive Bayes    0.4578 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.9222  0.9525   0.960  0.9067   
lda          Linear Discriminant Analysis    0.9111  0.9200   1.000  0.8695   
rf               Random Forest Classifier    0.9111  0.9450   0.930  0.9050   
ridge                    Ridge Classifier    0.8889  0.0000   0.955  0.8662   
lr                    Logistic Regression    0.9000  0.9250   0.905  0.9000   
et                 Extra Trees Classifier    0.9000  0.9400   0.910  0.9100   
nb                            Naive Bayes    0.8667  0.9025   0.980  0.8171   
ada                  Ada Boost Classifier    0.8889  0.9300   0.885  0.8967   
gbc          Gradient Boosting Classifier    0.8667  0.9250   0.840  0.8933   
lightgbm  Light Gradient Boosting Machine    0.8556  0.9550   0.815  0.8850   
dt               Decision Tree Classifier    0.8556  0.8500   0.815  0.8833   
svm                   SVM - Linear Kernel    0.6444 



                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7289  0.8449  0.7467  0.7889   
knn                K Neighbors Classifier    0.6989  0.8249  0.7833  0.7332   
ada                  Ada Boost Classifier    0.7100  0.8439  0.7133  0.7996   
lightgbm  Light Gradient Boosting Machine    0.7000  0.8546  0.6800  0.8290   
et                 Extra Trees Classifier    0.6578  0.7998  0.7133  0.7232   
gbc          Gradient Boosting Classifier    0.6667  0.8214  0.6900  0.7255   
dt               Decision Tree Classifier    0.6544  0.6954  0.6733  0.7421   
svm                   SVM - Linear Kernel    0.5533  0.0000  0.7200  0.5092   
lda          Linear Discriminant Analysis    0.5778  0.8851  0.2900  0.7833   
lr                    Logistic Regression    0.5556  0.8809  0.2300  0.8000   
ridge                    Ridge Classifier    0.5556  0.0000  0.2300  0.8000   
nb                            Naive Bayes    0.4933 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8544  0.9160  0.8800  0.8588   
lightgbm  Light Gradient Boosting Machine    0.8444  0.9050  0.8600  0.8588   
dt               Decision Tree Classifier    0.8333  0.8202  0.8433  0.8588   
gbc          Gradient Boosting Classifier    0.8233  0.8660  0.8233  0.8621   
ada                  Ada Boost Classifier    0.7900  0.8585  0.8233  0.7971   
et                 Extra Trees Classifier    0.7911  0.8264  0.8033  0.8088   
rf               Random Forest Classifier    0.7800  0.8787  0.8033  0.7888   
nb                            Naive Bayes    0.6800  0.8390  0.7067  0.7399   
svm                   SVM - Linear Kernel    0.6756  0.0000  0.5867  0.6955   
lr                    Logistic Regression    0.6778  0.8317  0.5467  0.7717   
ridge                    Ridge Classifier    0.6778  0.0000  0.5467  0.7717   
lda          Linear Discriminant Analysis    0.6778 



                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8067  0.7904  0.8833  0.7983   
knn                K Neighbors Classifier    0.7967  0.8429  0.8333  0.8323   
ada                  Ada Boost Classifier    0.7856  0.8488  0.7867  0.8412   
lr                    Logistic Regression    0.7289  0.6872  0.9233  0.7076   
gbc          Gradient Boosting Classifier    0.7744  0.8196  0.7867  0.8198   
lda          Linear Discriminant Analysis    0.7189  0.6872  0.9033  0.7042   
ridge                    Ridge Classifier    0.7178  0.0000  0.9033  0.7009   
rf               Random Forest Classifier    0.7622  0.8222  0.7833  0.8017   
et                 Extra Trees Classifier    0.7533  0.8082  0.7667  0.8000   
dt               Decision Tree Classifier    0.7189  0.7142  0.7433  0.7612   
svm                   SVM - Linear Kernel    0.5289  0.0000  0.3367  0.7056   
nb                            Naive Bayes    0.5389 



                                    Model  Accuracy     AUC  Recall   Prec.  \
nb                            Naive Bayes    0.6411  0.7740   0.920  0.6131   
lightgbm  Light Gradient Boosting Machine    0.7200  0.8165   0.705  0.7600   
ada                  Ada Boost Classifier    0.6811  0.8140   0.670  0.7148   
rf               Random Forest Classifier    0.6556  0.6820   0.685  0.6714   
lda          Linear Discriminant Analysis    0.6989  0.7130   0.585  0.8342   
et                 Extra Trees Classifier    0.6156  0.5940   0.665  0.6288   
lr                    Logistic Regression    0.6889  0.7230   0.585  0.8142   
ridge                    Ridge Classifier    0.6889  0.0000   0.565  0.8288   
gbc          Gradient Boosting Classifier    0.6444  0.7195   0.640  0.6433   
knn                K Neighbors Classifier    0.6367  0.7200   0.645  0.6702   
dt               Decision Tree Classifier    0.5944  0.5900   0.580  0.6033   
svm                   SVM - Linear Kernel    0.4589 



                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.8656  0.9122  0.9033  0.8776   
rf               Random Forest Classifier    0.8533  0.9173  0.9233  0.8476   
ada                  Ada Boost Classifier    0.8556  0.8951  0.8667  0.8848   
et                 Extra Trees Classifier    0.8333  0.9220  0.8833  0.8473   
lightgbm  Light Gradient Boosting Machine    0.8111  0.8753  0.8433  0.8429   
dt               Decision Tree Classifier    0.8022  0.7983  0.8067  0.8431   
lr                    Logistic Regression    0.7622  0.8382  0.9067  0.7538   
ridge                    Ridge Classifier    0.7211  0.0000  0.9433  0.6735   
lda          Linear Discriminant Analysis    0.7211  0.8010  0.9433  0.6735   
nb                            Naive Bayes    0.7111  0.8458  0.9267  0.6681   
knn                K Neighbors Classifier    0.7433  0.8323  0.7267  0.8276   
svm                   SVM - Linear Kernel    0.6589 



                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5361  0.6062   0.555  0.5638   
nb                            Naive Bayes    0.5361  0.6062   0.555  0.5638   
ridge                    Ridge Classifier    0.5361  0.0000   0.555  0.5638   
lda          Linear Discriminant Analysis    0.5361  0.6062   0.555  0.5638   
lightgbm  Light Gradient Boosting Machine    0.4917  0.4456   0.370  0.5767   
rf               Random Forest Classifier    0.4486  0.4612   0.350  0.4567   
dt               Decision Tree Classifier    0.4833  0.4613   0.305  0.4833   
knn                K Neighbors Classifier    0.5083  0.5019   0.285  0.5100   
et                 Extra Trees Classifier    0.4722  0.4531   0.305  0.4667   
gbc          Gradient Boosting Classifier    0.4833  0.4613   0.280  0.4900   
ada                  Ada Boost Classifier    0.5181  0.4775   0.285  0.4750   
svm                   SVM - Linear Kernel    0.4722 



                                    Model  Accuracy     AUC  Recall   Prec.  \
ridge                    Ridge Classifier    0.7911  0.0000   0.805  0.7964   
lda          Linear Discriminant Analysis    0.7911  0.8150   0.805  0.7964   
lr                    Logistic Regression    0.7800  0.8100   0.780  0.7964   
lightgbm  Light Gradient Boosting Machine    0.7578  0.8010   0.755  0.7714   
knn                K Neighbors Classifier    0.6911  0.7945   0.730  0.6893   
gbc          Gradient Boosting Classifier    0.6478  0.7465   0.690  0.6667   
ada                  Ada Boost Classifier    0.6367  0.7130   0.695  0.6317   
et                 Extra Trees Classifier    0.6689  0.7020   0.640  0.7029   
rf               Random Forest Classifier    0.6578  0.6975   0.640  0.6862   
dt               Decision Tree Classifier    0.6356  0.6155   0.665  0.6521   
nb                            Naive Bayes    0.5933  0.7220   0.650  0.5705   
svm                   SVM - Linear Kernel    0.5389 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8433  0.8798  0.9200  0.8169   
lr                    Logistic Regression    0.8333  0.8717  0.8833  0.8255   
lightgbm  Light Gradient Boosting Machine    0.7989  0.7707  0.9000  0.7752   
ridge                    Ridge Classifier    0.7744  0.0000  0.9800  0.7270   
lda          Linear Discriminant Analysis    0.7744  0.8817  0.9800  0.7270   
ada                  Ada Boost Classifier    0.8111  0.8532  0.8333  0.8281   
rf               Random Forest Classifier    0.8111  0.8815  0.8100  0.8567   
gbc          Gradient Boosting Classifier    0.7811  0.8112  0.8100  0.7960   
dt               Decision Tree Classifier    0.7800  0.7883  0.7967  0.8112   
et                 Extra Trees Classifier    0.7689  0.8528  0.7533  0.8398   
svm                   SVM - Linear Kernel    0.7311  0.0000  0.9200  0.7230   
nb                            Naive Bayes    0.5533 



                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8022  0.7890    0.96  0.7554   
dt               Decision Tree Classifier    0.7922  0.7675    0.96  0.7411   
rf               Random Forest Classifier    0.7922  0.7755    0.96  0.7411   
ada                  Ada Boost Classifier    0.7922  0.7675    0.96  0.7411   
gbc          Gradient Boosting Classifier    0.7922  0.7675    0.96  0.7411   
et                 Extra Trees Classifier    0.7922  0.7755    0.96  0.7411   
lr                    Logistic Regression    0.7800  0.7850    0.92  0.7473   
nb                            Naive Bayes    0.7800  0.7855    0.92  0.7473   
ridge                    Ridge Classifier    0.7800  0.0000    0.92  0.7473   
lda          Linear Discriminant Analysis    0.7800  0.7855    0.92  0.7473   
svm                   SVM - Linear Kernel    0.6467  0.0000    0.56  0.4458   
knn                K Neighbors Classifier    0.4644 



                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.9478  0.9603  0.9433  0.9667   
et                 Extra Trees Classifier    0.9478  0.9825  0.9233  0.9833   
dt               Decision Tree Classifier    0.9378  0.9350  0.9400  0.9524   
rf               Random Forest Classifier    0.9367  0.9900  0.9233  0.9667   
gbc          Gradient Boosting Classifier    0.9278  0.9583  0.9233  0.9500   
lightgbm  Light Gradient Boosting Machine    0.9167  0.9462  0.9300  0.9333   
knn                K Neighbors Classifier    0.8967  0.9709  0.9267  0.8933   
lr                    Logistic Regression    0.8356  0.8952  0.8700  0.8455   
ridge                    Ridge Classifier    0.8244  0.0000  0.8500  0.8424   
lda          Linear Discriminant Analysis    0.8044  0.8853  0.8100  0.8374   
svm                   SVM - Linear Kernel    0.7267  0.0000  0.8000  0.6087   
nb                            Naive Bayes    0.7189 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8511  0.8655   0.900  0.8400   
rf               Random Forest Classifier    0.8300  0.8970   0.860  0.8431   
gbc          Gradient Boosting Classifier    0.8211  0.9090   0.860  0.8200   
lightgbm  Light Gradient Boosting Machine    0.8189  0.8675   0.855  0.8231   
ada                  Ada Boost Classifier    0.8100  0.8470   0.840  0.8217   
et                 Extra Trees Classifier    0.8111  0.8945   0.840  0.8214   
dt               Decision Tree Classifier    0.7911  0.7950   0.800  0.8031   
lr                    Logistic Regression    0.7256  0.6940   0.660  0.7733   
nb                            Naive Bayes    0.7256  0.8690   0.660  0.7733   
ridge                    Ridge Classifier    0.7256  0.0000   0.660  0.7733   
lda          Linear Discriminant Analysis    0.7256  0.6790   0.660  0.7733   
svm                   SVM - Linear Kernel    0.6700 



                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.8633  0.9535   0.895  0.8614   
dt               Decision Tree Classifier    0.8533  0.8525   0.875  0.8642   
et                 Extra Trees Classifier    0.8533  0.8985   0.875  0.8581   
ada                  Ada Boost Classifier    0.8411  0.9120   0.875  0.8442   
lightgbm  Light Gradient Boosting Machine    0.8322  0.9190   0.875  0.8406   
knn                K Neighbors Classifier    0.8344  0.8960   0.875  0.8195   
gbc          Gradient Boosting Classifier    0.8111  0.9030   0.860  0.8125   
lr                    Logistic Regression    0.7256  0.8130   0.710  0.7725   
lda          Linear Discriminant Analysis    0.6867  0.7370   0.740  0.6981   
ridge                    Ridge Classifier    0.6656  0.0000   0.715  0.6776   
svm                   SVM - Linear Kernel    0.6022  0.0000   0.675  0.6090   
nb                            Naive Bayes    0.6156 



                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.9444  0.9300    1.00  0.9181   
knn                K Neighbors Classifier    0.9222  0.9675    0.96  0.9148   
dt               Decision Tree Classifier    0.9111  0.9100    0.92  0.9300   
gbc          Gradient Boosting Classifier    0.9000  0.9700    0.92  0.9100   
rf               Random Forest Classifier    0.9000  0.9750    0.92  0.9100   
et                 Extra Trees Classifier    0.9000  0.9175    0.92  0.9100   
nb                            Naive Bayes    0.8778  0.9100    1.00  0.8401   
ridge                    Ridge Classifier    0.8778  0.0000    1.00  0.8401   
lda          Linear Discriminant Analysis    0.8778  0.9300    1.00  0.8401   
ada                  Ada Boost Classifier    0.8889  0.9550    0.90  0.9100   
lightgbm  Light Gradient Boosting Machine    0.8667  0.9325    0.92  0.8614   
svm                   SVM - Linear Kernel    0.7667 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.9133  0.9430   0.915  0.9250   
rf               Random Forest Classifier    0.9033  0.9635   0.915  0.9131   
svm                   SVM - Linear Kernel    0.9011  0.0000   0.895  0.9333   
lr                    Logistic Regression    0.8933  0.9720   0.895  0.9131   
et                 Extra Trees Classifier    0.8733  0.9480   0.895  0.8733   
gbc          Gradient Boosting Classifier    0.8811  0.9600   0.895  0.9012   
ada                  Ada Boost Classifier    0.8811  0.9550   0.875  0.9179   
lightgbm  Light Gradient Boosting Machine    0.8744  0.9270   0.855  0.9131   
dt               Decision Tree Classifier    0.8511  0.8525   0.895  0.8543   
lda          Linear Discriminant Analysis    0.7478  0.8860   0.820  0.7471   
ridge                    Ridge Classifier    0.7378  0.0000   0.800  0.7471   
nb                            Naive Bayes    0.6022 



                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.9567  0.9725  0.9633  0.9667   
et                 Extra Trees Classifier    0.9233  0.9779  0.9833  0.9095   
rf               Random Forest Classifier    0.9256  0.9775  0.9300  0.9500   
ada                  Ada Boost Classifier    0.9156  0.9592  0.8933  0.9633   
dt               Decision Tree Classifier    0.8700  0.8750  0.8500  0.9183   
lightgbm  Light Gradient Boosting Machine    0.8333  0.9258  0.8633  0.8729   
knn                K Neighbors Classifier    0.8211  0.8754  0.8767  0.8479   
lda          Linear Discriminant Analysis    0.7978  0.8575  0.8400  0.8457   
nb                            Naive Bayes    0.7378  0.9350  1.0000  0.6966   
lr                    Logistic Regression    0.7778  0.8725  0.8233  0.8314   
ridge                    Ridge Classifier    0.7778  0.0000  0.8233  0.8314   
svm                   SVM - Linear Kernel    0.6211 



                                    Model  Accuracy     AUC  Recall   Prec.  \
lda          Linear Discriminant Analysis    0.7542  0.7917  0.7633  0.8214   
rf               Random Forest Classifier    0.7181  0.7892  0.7833  0.7486   
dt               Decision Tree Classifier    0.7194  0.7058  0.7867  0.7527   
et                 Extra Trees Classifier    0.7181  0.7514  0.7833  0.7421   
ridge                    Ridge Classifier    0.7306  0.0000  0.7267  0.8064   
ada                  Ada Boost Classifier    0.7292  0.7314  0.7400  0.7855   
lr                    Logistic Regression    0.7181  0.7906  0.7000  0.7867   
nb                            Naive Bayes    0.6861  0.6722  0.7667  0.7104   
lightgbm  Light Gradient Boosting Machine    0.6958  0.7669  0.7067  0.7567   
knn                K Neighbors Classifier    0.6986  0.7619  0.7233  0.7314   
gbc          Gradient Boosting Classifier    0.6500  0.7500  0.7067  0.6936   
svm                   SVM - Linear Kernel    0.5472 



                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.6900  0.7846  0.7500  0.7119   
ada                  Ada Boost Classifier    0.7089  0.7989  0.6900  0.7621   
ridge                    Ridge Classifier    0.7111  0.0000  0.6333  0.7998   
lda          Linear Discriminant Analysis    0.7111  0.7930  0.6333  0.7998   
nb                            Naive Bayes    0.7311  0.7865  0.6133  0.8598   
lr                    Logistic Regression    0.7011  0.8020  0.6333  0.7831   
knn                K Neighbors Classifier    0.6989  0.8268  0.6533  0.7740   
et                 Extra Trees Classifier    0.6767  0.7668  0.6700  0.7131   
lightgbm  Light Gradient Boosting Machine    0.6600  0.7878  0.6767  0.7142   
dt               Decision Tree Classifier    0.6678  0.6660  0.6533  0.6981   
rf               Random Forest Classifier    0.6567  0.8033  0.6533  0.6964   
svm                   SVM - Linear Kernel    0.5978 



                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7811  0.8807  0.8800  0.7669   
knn                K Neighbors Classifier    0.7800  0.8725  0.9000  0.7520   
dt               Decision Tree Classifier    0.7489  0.8365  0.8400  0.7359   
et                 Extra Trees Classifier    0.7400  0.8352  0.8200  0.7494   
ada                  Ada Boost Classifier    0.7067  0.8515  0.7667  0.7222   
lightgbm  Light Gradient Boosting Machine    0.6944  0.8563  0.7067  0.7606   
gbc          Gradient Boosting Classifier    0.6733  0.8440  0.7400  0.6953   
lr                    Logistic Regression    0.4067  0.4525  0.3700  0.3706   
ridge                    Ridge Classifier    0.4567  0.0000  0.2900  0.5112   
lda          Linear Discriminant Analysis    0.4700  0.4525  0.2700  0.3933   
nb                            Naive Bayes    0.4800  0.5772  0.2567  0.5095   
svm                   SVM - Linear Kernel    0.4678 



                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.6839  0.7292  0.6750  0.6383   
rf               Random Forest Classifier    0.6571  0.6535  0.6167  0.6000   
gbc          Gradient Boosting Classifier    0.6554  0.6996  0.6083  0.6167   
knn                K Neighbors Classifier    0.6071  0.5973  0.6500  0.6129   
et                 Extra Trees Classifier    0.6161  0.6227  0.5500  0.5833   
svm                   SVM - Linear Kernel    0.5554  0.0000  0.6417  0.5845   
lightgbm  Light Gradient Boosting Machine    0.5571  0.5850  0.5417  0.4800   
lr                    Logistic Regression    0.6071  0.7092  0.4167  0.6167   
ridge                    Ridge Classifier    0.6071  0.0000  0.4167  0.6167   
lda          Linear Discriminant Analysis    0.6071  0.7292  0.4167  0.6167   
dt               Decision Tree Classifier    0.5554  0.5442  0.4833  0.4567   
nb                            Naive Bayes    0.6589 



                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.8922  0.8925    0.88  0.9314   
rf               Random Forest Classifier    0.8922  0.9455    0.88  0.9314   
et                 Extra Trees Classifier    0.8811  0.8895    0.88  0.9064   
gbc          Gradient Boosting Classifier    0.8822  0.9405    0.86  0.9314   
nb                            Naive Bayes    0.8378  0.9297    1.00  0.7869   
ada                  Ada Boost Classifier    0.8500  0.9140    0.80  0.9214   
lr                    Logistic Regression    0.8267  0.9508    0.80  0.9024   
lda          Linear Discriminant Analysis    0.8178  0.9508    0.78  0.9131   
knn                K Neighbors Classifier    0.7833  0.9078    0.78  0.8240   
svm                   SVM - Linear Kernel    0.7800  0.0000    0.82  0.8317   
ridge                    Ridge Classifier    0.7867  0.0000    0.76  0.8893   
lightgbm  Light Gradient Boosting Machine    0.7878 



                                    Model  Accuracy     AUC  Recall   Prec.  \
ridge                    Ridge Classifier    0.7667  0.0000  0.6900  0.8917   
lda          Linear Discriminant Analysis    0.7667  0.8213  0.6900  0.8917   
rf               Random Forest Classifier    0.7678  0.8121  0.6733  0.9167   
et                 Extra Trees Classifier    0.7678  0.8096  0.6733  0.9167   
lr                    Logistic Regression    0.7567  0.8147  0.6733  0.8917   
nb                            Naive Bayes    0.5922  0.7813  1.0000  0.5856   
lightgbm  Light Gradient Boosting Machine    0.7589  0.7958  0.6400  0.9300   
dt               Decision Tree Classifier    0.7500  0.7617  0.6067  0.9550   
knn                K Neighbors Classifier    0.7167  0.7917  0.6933  0.8417   
gbc          Gradient Boosting Classifier    0.7367  0.7854  0.6000  0.9217   
ada                  Ada Boost Classifier    0.7178  0.6938  0.5500  0.9550   
svm                   SVM - Linear Kernel    0.6244 



                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.9578  0.9750  1.0000  0.9417   
gbc          Gradient Boosting Classifier    0.9578  0.9833  1.0000  0.9417   
dt               Decision Tree Classifier    0.9478  0.9667  1.0000  0.9274   
knn                K Neighbors Classifier    0.9278  0.9542  1.0000  0.9024   
et                 Extra Trees Classifier    0.9278  0.9750  0.9833  0.9095   
lightgbm  Light Gradient Boosting Machine    0.9078  0.9458  0.9833  0.8845   
ada                  Ada Boost Classifier    0.8967  0.9283  0.9433  0.8955   
lr                    Logistic Regression    0.7867  0.7222  0.7567  0.8764   
ridge                    Ridge Classifier    0.7867  0.0000  0.7567  0.8764   
lda          Linear Discriminant Analysis    0.7867  0.7213  0.7567  0.8764   
nb                            Naive Bayes    0.6956  0.8337  0.7067  0.7489   
svm                   SVM - Linear Kernel    0.5589 



                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8878  0.9115   0.935  0.8681   
lightgbm  Light Gradient Boosting Machine    0.8678  0.8700   0.915  0.8606   
rf               Random Forest Classifier    0.8467  0.9050   0.850  0.8548   
dt               Decision Tree Classifier    0.8356  0.7870   0.850  0.8429   
ada                  Ada Boost Classifier    0.8356  0.8525   0.850  0.8429   
gbc          Gradient Boosting Classifier    0.8356  0.8460   0.850  0.8429   
et                 Extra Trees Classifier    0.8356  0.8740   0.850  0.8348   
nb                            Naive Bayes    0.6167  0.9200   1.000  0.5687   
lr                    Logistic Regression    0.7622  0.9200   0.555  0.9167   
ridge                    Ridge Classifier    0.7622  0.0000   0.555  0.9167   
lda          Linear Discriminant Analysis    0.7622  0.9200   0.555  0.9167   
svm                   SVM - Linear Kernel    0.6422 

In [86]:
mean_acc_swell = statistics.mean(accuracies_swell)
mean_prec_swell = statistics.mean(precision_swell)
mean_rec_swell = statistics.mean(recall_swell)
mean_f1_swell = statistics.mean(f1scores_swell)
std_acc_swell = statistics.stdev(accuracies_swell)
std_prec_swell = statistics.stdev(precision_swell)
std_rec_swell = statistics.stdev(recall_swell)
std_f1_swell = statistics.stdev(f1scores_swell)

In [87]:
print("Mean Accuracy SWELL: ",mean_acc_swell)
print("Mean Precision SWELL: ",mean_prec_swell)
print("Mean Recall SWELL: ",mean_rec_swell)
print("Mean F1-score SWELL: ",mean_f1_swell)
print("Standard deviation of Accuracy SWELL: ",std_acc_swell)
print("Standard deviation of Precision SWELL: ",std_prec_swell)
print("Standard deviation of Recall SWELL: ",std_rec_swell)
print("Standard deviation of F1-score SWELL: ",std_f1_swell)

Mean Accuracy SWELL:  0.816788
Mean Precision SWELL:  0.82596
Mean Recall SWELL:  0.856396
Mean F1-score SWELL:  0.8288719999999999
Standard deviation of Accuracy SWELL:  0.10761224496620571
Standard deviation of Precision SWELL:  0.10686676673940003
Standard deviation of Recall SWELL:  0.11587951889786219
Standard deviation of F1-score SWELL:  0.1050498672060084


In [88]:
folder_path = "SWELL_User_Based_Splitting_Output_Files_Bias"

# List all CSV files in the folder:
csv_files = []
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_files.append(os.path.join(folder_path, filename))

# Concatenate files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

In [89]:
output_folder = "Output_Files"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)  # Create folder using makedirs()

# Create the full path with the output folder name
output_filepath = os.path.join(output_folder, "SWELL_User_Based_Splitting_Bias.csv")

# Save the concatenated DataFrame to the new CSV file
combined_df.to_csv(output_filepath, index=False)

print(f"All predictions saved to: {output_filepath}")

All predictions saved to: Output_Files\SWELL_User_Based_Splitting_Bias.csv
