## Import Libraries

In [1]:
import pickle
import numpy as np
import pandas as pd
from os import listdir
import pycaret
from pycaret.classification import *
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from datetime import datetime

modalities = ['ACC', 'BVP', 'EDA', 'TEMP']

#sampling frequencies
ACC_WE = 32
BVP_WE = 64
EDA_WE = 4
TEMP_WE = 4

ACC_AD = 1920
BVP_AD = 3840
EDA_AD = 240
TEMP_AD = 240

## SWELL

In [2]:
swell = pd.read_csv("Final_CSVs/swell.csv")

In [3]:
swell["C"].value_counts()

1    1179
2     981
3     980
Name: C, dtype: int64

In [4]:
swell = swell.drop(columns=["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "timestamp", "C"])

In [5]:
swell["Condition"].value_counts()

N    1029
I     996
T     664
R     451
Name: Condition, dtype: int64

In [6]:
swell

Unnamed: 0,PP,Condition,HR,RMSSD,SCL
0,PP1,R,999,999.000000,80.239727
1,PP1,R,61,0.061420,77.365127
2,PP1,R,64,0.049663,77.359559
3,PP1,R,60,0.052487,76.728772
4,PP1,R,61,0.051189,76.512877
...,...,...,...,...,...
3135,PP25,T,999,999.000000,999.000000
3136,PP25,T,999,999.000000,999.000000
3137,PP25,T,999,999.000000,999.000000
3138,PP25,T,999,999.000000,999.000000


In [7]:
# swell['Condition'] = np.where(swell['Condition']=='N','No-Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='R','No-Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='I','Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='T','Stress',swell['Condition'])

swell['Condition'] = np.where(swell['Condition']=='N',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='R',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='I',1,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='T',1,swell['Condition'])

In [8]:
swell.columns

Index(['PP', 'Condition', 'HR', 'RMSSD', 'SCL'], dtype='object')

In [13]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.PP))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.PP.isin(users_train)], data[data.PP.isin(users_test)]

In [14]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split_per_user(swell)
# train_data, test_data = train_test_split(swell)

fold_groups = train_data.PP

# Save the 'id' column from the test set
test_ids = test_data['PP']

train_data = train_data.drop(columns=['PP'])
test_data = test_data.drop(columns=['PP'])

In [15]:
grid = setup(data=train_data, target='Condition', fix_imbalance = True, html=False, verbose=False, fold_strategy='groupkfold', fold=3, fold_groups=fold_groups, test_data=test_data) #fix_imbalance = True,
best = compare_models(sort='F1')

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5310  0.5384  0.5963  0.5402   
ada                  Ada Boost Classifier    0.5114  0.5085  0.5542  0.5387   
ridge                    Ridge Classifier    0.5327  0.0000  0.5263  0.5516   
lda          Linear Discriminant Analysis    0.5321  0.5258  0.5236  0.5511   
qda       Quadratic Discriminant Analysis    0.5136  0.5208  0.5057  0.5481   
gbc          Gradient Boosting Classifier    0.5034  0.4774  0.4931  0.5325   
dt               Decision Tree Classifier    0.4818  0.4648  0.4808  0.5099   
nb                            Naive Bayes    0.5173  0.5384  0.4767  0.5382   
rf               Random Forest Classifier    0.4781  0.4629  0.4537  0.5073   
lightgbm  Light Gradient Boosting Machine    0.4761  0.4468  0.4413  0.5059   
svm                   SVM - Linear Kernel    0.4895  0.0000  0.4398  0.5177   
et                 Extra Trees Classifier    0.4713 



In [20]:
print(best)
# plot_model(best)
evaluate_model(best)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=8825, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [21]:
predictions = predict_model(best, data=test_data)

                 Model  Accuracy     AUC  Recall   Prec.      F1   Kappa  \
0  Logistic Regression    0.5373  0.5841  0.5918  0.5596  0.5753  0.0681   

      MCC  
0  0.0682  


In [22]:
print(predictions.columns)

Index(['HR', 'RMSSD', 'SCL', 'Condition', 'prediction_label',
       'prediction_score'],
      dtype='object')


In [139]:
# Add 'id' column back to predictions DataFrame
predictions['PP'] = test_ids  # Use the 'id' column from the test data

In [140]:
print(predictions['PP'])

0        PP1
1        PP1
2        PP1
3        PP1
4        PP1
        ... 
2009    PP16
2010    PP16
2011    PP16
2012    PP16
2013    PP16
Name: PP, Length: 1018, dtype: object


In [141]:
print(predictions.columns)

Index(['HR', 'RMSSD', 'SCL', 'Condition', 'prediction_label',
       'prediction_score', 'PP'],
      dtype='object')


In [142]:
print(predictions)

       HR       RMSSD         SCL Condition  prediction_label  \
0     999  999.000000   80.239723         0                 0   
1      61    0.061420   77.365128         0                 1   
2      64    0.049663   77.359558         0                 0   
3      60    0.052487   76.728775         0                 1   
4      61    0.051189   76.512878         0                 1   
...   ...         ...         ...       ...               ...   
2009   82    0.074241  179.787384         1                 1   
2010   78    0.089203  176.769348         1                 1   
2011   83    0.084138  174.879044         1                 1   
2012   79    0.083535  179.373138         1                 1   
2013   74    0.094341  171.047012         1                 1   

      prediction_score    PP  
0               0.5896   PP1  
1               0.5002   PP1  
2               0.5004   PP1  
3               0.5003   PP1  
4               0.5001   PP1  
...                ...   ...  
20

In [143]:
# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

# Merge the dataframes based on the 'PP' column
predictions_with_info = predictions.merge(df_user_info, on="PP", how="left")

# Print the result
print(predictions_with_info)

       HR       RMSSD         SCL Condition  prediction_label  \
0     999  999.000000   80.239723         0                 0   
1      61    0.061420   77.365128         0                 1   
2      64    0.049663   77.359558         0                 0   
3      60    0.052487   76.728775         0                 1   
4      61    0.051189   76.512878         0                 1   
...   ...         ...         ...       ...               ...   
1013   82    0.074241  179.787384         1                 1   
1014   78    0.089203  176.769348         1                 1   
1015   83    0.084138  174.879044         1                 1   
1016   79    0.083535  179.373138         1                 1   
1017   74    0.094341  171.047012         1                 1   

      prediction_score    PP  Age Gender Occupation Dominant hand Glasses  \
0               0.5896   PP1   27      m    student         right      no   
1               0.5002   PP1   27      m    student         right

In [144]:
print(predictions_with_info)

       HR       RMSSD         SCL Condition  prediction_label  \
0     999  999.000000   80.239723         0                 0   
1      61    0.061420   77.365128         0                 1   
2      64    0.049663   77.359558         0                 0   
3      60    0.052487   76.728775         0                 1   
4      61    0.051189   76.512878         0                 1   
...   ...         ...         ...       ...               ...   
1013   82    0.074241  179.787384         1                 1   
1014   78    0.089203  176.769348         1                 1   
1015   83    0.084138  174.879044         1                 1   
1016   79    0.083535  179.373138         1                 1   
1017   74    0.094341  171.047012         1                 1   

      prediction_score    PP  Age Gender Occupation Dominant hand Glasses  \
0               0.5896   PP1   27      m    student         right      no   
1               0.5002   PP1   27      m    student         right

In [145]:
import os

# Specify folder name
folder_name = "Output_Files"

id = predictions_with_info['PP']

# Extract true labels (y_true), rename the column
y_true = predictions_with_info[['Condition']].rename(columns={'Condition': 'y_true'})

# Extract predicted labels (y_pred), rename the column
y_pred = predictions_with_info[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

# Identify protected attribute columns
protected_attributes = predictions_with_info[['Age', 'Gender', 'Occupation']]

# Concatenate DataFrames containing predictions and protected attributes
all_data = pd.concat([id, y_true, y_pred, protected_attributes], axis=1)

# Create filename
filename = f"SWELL_Generic_Model.csv"

# Create folder if it doesn't exist
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Create full path with folder name
filepath = os.path.join(folder_name, filename)

# Save predictions
all_data.to_csv(filepath, index=False)

In [146]:
mismatched_predictions = (all_data['y_pred'] != all_data['y_true']).sum()

print(f"Number of rows with different y_pred and y_true values: {mismatched_predictions}")

Number of rows with different y_pred and y_true values: 461


In [147]:
matched_predictions = (all_data['y_pred'] == all_data['y_true']).sum()

print(f"Number of rows with same y_pred and y_true values: {matched_predictions}")

Number of rows with same y_pred and y_true values: 557


In [148]:
accuracy = matched_predictions / (matched_predictions+mismatched_predictions)
print(accuracy)

0.5471512770137524


### ML model trained with protected attributes

In [149]:
swell = pd.read_csv("Final_CSVs/swell.csv")

In [150]:
swell = swell.drop(columns=["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "timestamp", "C"])

In [151]:
# swell['Condition'] = np.where(swell['Condition']=='N','No-Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='R','No-Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='I','Stress',swell['Condition'])
# swell['Condition'] = np.where(swell['Condition']=='T','Stress',swell['Condition'])

swell['Condition'] = np.where(swell['Condition']=='N',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='R',0,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='I',1,swell['Condition'])
swell['Condition'] = np.where(swell['Condition']=='T',1,swell['Condition'])

In [152]:
swell

Unnamed: 0,PP,Condition,HR,RMSSD,SCL
0,PP1,0,999,999.000000,80.239727
1,PP1,0,61,0.061420,77.365127
2,PP1,0,64,0.049663,77.359559
3,PP1,0,60,0.052487,76.728772
4,PP1,0,61,0.051189,76.512877
...,...,...,...,...,...
3135,PP25,1,999,999.000000,999.000000
3136,PP25,1,999,999.000000,999.000000
3137,PP25,1,999,999.000000,999.000000
3138,PP25,1,999,999.000000,999.000000


In [153]:
# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

# Merge the dataframes based on the 'PP' column
swell_with_info = swell.merge(df_user_info, on="PP", how="left")

# Print the result
print(swell_with_info)

        PP Condition   HR       RMSSD         SCL  Age Gender  \
0      PP1         0  999  999.000000   80.239727   27      m   
1      PP1         0   61    0.061420   77.365127   27      m   
2      PP1         0   64    0.049663   77.359559   27      m   
3      PP1         0   60    0.052487   76.728772   27      m   
4      PP1         0   61    0.051189   76.512877   27      m   
...    ...       ...  ...         ...         ...  ...    ...   
3135  PP25         1  999  999.000000  999.000000   26      m   
3136  PP25         1  999  999.000000  999.000000   26      m   
3137  PP25         1  999  999.000000  999.000000   26      m   
3138  PP25         1  999  999.000000  999.000000   26      m   
3139  PP25         1  999  999.000000  999.000000   26      m   

                      Occupation Dominant hand Glasses  smoke  coffee  \
0                        student         right      no      6       6   
1                        student         right      no      6       6   


In [154]:
print(swell_with_info['Age'])

0       27
1       27
2       27
3       27
4       27
        ..
3135    26
3136    26
3137    26
3138    26
3139    26
Name: Age, Length: 3140, dtype: int64


In [155]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.PP))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.PP.isin(users_train)], data[data.PP.isin(users_test)]

In [156]:
train_data, test_data = train_test_split_per_user(swell_with_info)

fold_groups = train_data.PP

# Save the 'id' column from the test set
test_ids = test_data['PP']

train_data = train_data.drop(columns=['PP'])
test_data = test_data.drop(columns=['PP'])

In [157]:
grid = setup(data=train_data, target='Condition', fix_imbalance = True, html=False, verbose=False, fold_strategy='groupkfold', fold=3, fold_groups=fold_groups, test_data=test_data) #fix_imbalance = True,
best = compare_models(sort='F1')

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
qda       Quadratic Discriminant Analysis    0.5286  0.5000  1.0000  0.5286   
svm                   SVM - Linear Kernel    0.5399  0.0000  0.7194  0.5512   
nb                            Naive Bayes    0.5140  0.5182  0.7411  0.5232   
ridge                    Ridge Classifier    0.5364  0.0000  0.5548  0.5623   
ada                  Ada Boost Classifier    0.5186  0.4997  0.5722  0.5412   
lda          Linear Discriminant Analysis    0.5382  0.5330  0.5560  0.5562   
knn                K Neighbors Classifier    0.5059  0.5136  0.4329  0.5458   
lr                    Logistic Regression    0.5241  0.5282  0.3931  0.5625   
dt               Decision Tree Classifier    0.5080  0.5261  0.3983  0.5373   
gbc          Gradient Boosting Classifier    0.4823  0.4724  0.3834  0.5090   
lightgbm  Light Gradient Boosting Machine    0.4511  0.4612  0.3595  0.4724   
rf               Random Forest Classifier    0.4188 



In [159]:
print(best)
# plot_model(best)
evaluate_model(best)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [160]:
predictions = predict_model(best, data=test_data)

                             Model  Accuracy  AUC  Recall   Prec.      F1  \
0  Quadratic Discriminant Analysis    0.5295  0.5     1.0  0.5295  0.6924   

   Kappa  MCC  
0    0.0  0.0  


In [161]:
# Add 'id' column back to predictions DataFrame
predictions['PP'] = test_ids  # Use the 'id' column from the test data

In [162]:
predictions.columns

Index(['HR', 'RMSSD', 'SCL', 'Age', 'Gender', 'Occupation', 'Dominant hand',
       'Glasses', 'smoke', 'coffee', 'alcohol', 'physical', 'stress',
       'heart disease', 'medicine', 'Internal control index', 'Condition',
       'prediction_label', 'prediction_score', 'PP'],
      dtype='object')

In [163]:
import os

# Specify folder name
folder_name = "Output_Files"

id = predictions['PP']

# Extract true labels (y_true), rename the column
y_true = predictions[['Condition']].rename(columns={'Condition': 'y_true'})

# Extract predicted labels (y_pred), rename the column
y_pred = predictions[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

# Identify protected attribute columns
protected_attributes = predictions[['Age', 'Gender', 'Occupation']]

# Concatenate DataFrames containing predictions and protected attributes
all_data = pd.concat([id, y_true, y_pred, protected_attributes], axis=1)

# Create filename
filename = f"SWELL_Generic_Model_Bias.csv"

# Create folder if it doesn't exist
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Create full path with folder name
filepath = os.path.join(folder_name, filename)

# Save predictions
all_data.to_csv(filepath, index=False)

In [164]:
mismatched_predictions = (all_data['y_pred'] != all_data['y_true']).sum()

print(f"Number of rows with different y_pred and y_true values: {mismatched_predictions}")

Number of rows with different y_pred and y_true values: 479


In [165]:
matched_predictions = (all_data['y_pred'] == all_data['y_true']).sum()

print(f"Number of rows with same y_pred and y_true values: {matched_predictions}")

Number of rows with same y_pred and y_true values: 539


In [166]:
accuracy = matched_predictions / (matched_predictions+mismatched_predictions)
print(accuracy)

0.5294695481335953
