In [1]:
import numpy as np
import pandas as pd
import statistics
import pycaret
from pycaret.classification import *
import matplotlib.pyplot as plt


## SWELL

In [2]:
swell_dataset = pd.read_csv('Final_CSVs/swell_fuzzy10_14.csv')
#swell_dataset

In [3]:
# Drop 'dataset' to run pycaret tests based on "Cluster".

swell_dataset = swell_dataset.drop('dataset', axis = 1)
#swell_dataset

In [4]:
unique_participants = swell_dataset["Cluster"].unique()
swell_group = swell_dataset.groupby('Cluster')

In [5]:
#for participant in unique_participants:
#    print("Participant: ",participant)
#    part_df = swell_group.get_group(participant)
#    print(len(part_df))

In [6]:
print(unique_participants)

[0 1 2 4 6 9 8 7 5 3]


In [7]:
for participant in unique_participants:
    print("Participant: ",participant)    
    part_df = swell_group.get_group(participant)

    unique_ids = part_df['id'].unique()
    print("Unique IDs:", unique_ids)

Participant:  0
Unique IDs: ['PP1' 'PP2' 'PP13' 'PP16' 'PP17']
Participant:  1
Unique IDs: ['PP1' 'PP2' 'PP3' 'PP5' 'PP6' 'PP7' 'PP10' 'PP12' 'PP13' 'PP15' 'PP17'
 'PP20' 'PP22']
Participant:  2
Unique IDs: ['PP1' 'PP2' 'PP3' 'PP4' 'PP13' 'PP15' 'PP17' 'PP20' 'PP22' 'PP23' 'PP25']
Participant:  4
Unique IDs: ['PP1' 'PP2' 'PP3' 'PP5' 'PP6' 'PP7' 'PP12' 'PP13' 'PP15' 'PP17' 'PP19'
 'PP20' 'PP22' 'PP23']
Participant:  6
Unique IDs: ['PP1' 'PP2' 'PP13' 'PP14' 'PP17' 'PP18']
Participant:  9
Unique IDs: ['PP1' 'PP2' 'PP12' 'PP13' 'PP15' 'PP17' 'PP20' 'PP22' 'PP23']
Participant:  8
Unique IDs: ['PP2' 'PP13' 'PP15' 'PP17' 'PP20' 'PP23' 'PP24']
Participant:  7
Unique IDs: ['PP8' 'PP11']
Participant:  5
Unique IDs: ['PP9' 'PP17']
Participant:  3
Unique IDs: ['PP17' 'PP21']


In [8]:
for participant in unique_participants:
  print("Group: ", participant)
  part_df = swell_group.get_group(participant)

  # Count the number of rows for each ID using value_counts()
  id_counts = part_df['id'].value_counts()

  # Print the ID and its corresponding count
  print("ID Counts:")
  for id, count in id_counts.items():
    print(f"  ID: {id}, Count: {count}")

Group:  0
ID Counts:
  ID: PP16, Count: 127
  ID: PP2, Count: 109
  ID: PP17, Count: 87
  ID: PP1, Count: 3
  ID: PP13, Count: 2
Group:  1
ID Counts:
  ID: PP10, Count: 128
  ID: PP7, Count: 125
  ID: PP3, Count: 123
  ID: PP6, Count: 121
  ID: PP1, Count: 112
  ID: PP22, Count: 107
  ID: PP13, Count: 65
  ID: PP17, Count: 10
  ID: PP2, Count: 5
  ID: PP20, Count: 5
  ID: PP12, Count: 2
  ID: PP15, Count: 2
  ID: PP5, Count: 1
Group:  2
ID Counts:
  ID: PP25, Count: 128
  ID: PP4, Count: 127
  ID: PP13, Count: 8
  ID: PP20, Count: 7
  ID: PP22, Count: 4
  ID: PP15, Count: 3
  ID: PP17, Count: 2
  ID: PP1, Count: 1
  ID: PP2, Count: 1
  ID: PP3, Count: 1
  ID: PP23, Count: 1
Group:  4
ID Counts:
  ID: PP19, Count: 130
  ID: PP12, Count: 125
  ID: PP5, Count: 124
  ID: PP20, Count: 114
  ID: PP13, Count: 36
  ID: PP17, Count: 15
  ID: PP22, Count: 12
  ID: PP1, Count: 8
  ID: PP6, Count: 4
  ID: PP7, Count: 4
  ID: PP3, Count: 3
  ID: PP2, Count: 2
  ID: PP23, Count: 2
  ID: PP15, Count:

In [9]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.id))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.id.isin(users_train)], data[data.id.isin(users_test)]

In [10]:
from sklearn.model_selection import train_test_split
import os  # Import os module for folder creation

# Specify folder name
folder_name = "SWELL_Fuzzy_Splitting_Output_Files"

# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

accuracies_swell = []
precision_swell = []
recall_swell = []
f1scores_swell = []

for participant in unique_participants:
    print("Group: ",participant)    
    part_df = swell_group.get_group(participant)

    # -----------------------------------------------------

    train_data, test_data = train_test_split_per_user(part_df)

    fold_groups = train_data.id

    # Save the 'id' column from the test set
    test_ids = test_data['id']

    train_data = train_data.drop(columns=['id'])
    test_data = test_data.drop(columns=['id'])

    # -----------------------------------------------------

    grid = setup(data=train_data, target='stress', fix_imbalance = True, html=False, verbose=False, test_data=test_data) #fix_imbalance = True,
    best = compare_models(sort='F1')
    accuracies_swell.append(pull()['Accuracy'][0])
    precision_swell.append(pull()['Prec.'][0])
    recall_swell.append(pull()['Recall'][0])
    f1scores_swell.append(pull()['F1'][0])
    print(best)

    # ---------------------------------------------------

    # Make predictions using the best model
    predictions = predict_model(best, data=test_data)

    # Add 'id' column back to predictions DataFrame
    predictions['PP'] = test_ids  # Use the 'id' column from the test data

    # Merge the dataframes based on the 'PP' column
    predictions_with_info = predictions.merge(df_user_info, on="PP", how="left")

    # Extract true labels (y_true), rename the column
    y_true = predictions_with_info[['stress_x']].rename(columns={'stress_x': 'y_true'})

    # Extract predicted labels (y_pred), rename the column
    y_pred = predictions_with_info[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

    # Identify protected attribute columns (assuming you know the column names)
    protected_attributes = predictions_with_info[['Age', 'Gender', 'Occupation']]

    # Concatenate DataFrames containing predictions and protected attributes
    all_data = pd.concat([test_ids.reset_index(drop=True), y_true, y_pred, protected_attributes], axis=1)

    # Create filename
    filename = f"predictions_{participant}.csv"

    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)  # Create folder using makedirs()

    # Create full path with folder name
    filepath = os.path.join(folder_name, filename)

    # Save predictions
    all_data.to_csv(filepath, index=False)

    print(f"Predictions saved to: {filepath}")

Group:  0


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.8976  0.9485  0.9242  0.8931   
rf               Random Forest Classifier    0.8915  0.9545  0.9359  0.8748   
lightgbm  Light Gradient Boosting Machine    0.8884  0.9407  0.9131  0.8844   
et                 Extra Trees Classifier    0.8759  0.9334  0.9072  0.8716   
gbc          Gradient Boosting Classifier    0.8697  0.9318  0.8892  0.8758   
ada                  Ada Boost Classifier    0.8603  0.9158  0.8899  0.8674   
dt               Decision Tree Classifier    0.8295  0.8287  0.8487  0.8438   
nb                            Naive Bayes    0.6072  0.5333  0.9941  0.5773   
ridge                    Ridge Classifier    0.6633  0.0000  0.7065  0.7446   
lda          Linear Discriminant Analysis    0.6633  0.7995  0.7007  0.7465   
lr                    Logistic Regression    0.6631  0.8053  0.6716  0.7456   
svm                   SVM - Linear Kernel    0.6298 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.6008  0.6497  0.5984  0.6085   
knn                K Neighbors Classifier    0.6548  0.7110  0.5221  0.6952   
rf               Random Forest Classifier    0.5849  0.6915  0.5682  0.6119   
lightgbm  Light Gradient Boosting Machine    0.5987  0.7206  0.5721  0.5737   
gbc          Gradient Boosting Classifier    0.5828  0.6911  0.5876  0.5071   
dt               Decision Tree Classifier    0.5507  0.6314  0.5339  0.5529   
ada                  Ada Boost Classifier    0.5186  0.6214  0.5346  0.4531   
nb                            Naive Bayes    0.4623  0.4144  0.4547  0.4053   
lda          Linear Discriminant Analysis    0.4764  0.3258  0.3665  0.4003   
lr                    Logistic Regression    0.4583  0.3216  0.3670  0.3805   
ridge                    Ridge Classifier    0.4443  0.0000  0.3288  0.3387   
svm                   SVM - Linear Kernel    0.4991 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8591  0.9049  0.9000  0.8541   
knn                K Neighbors Classifier    0.8479  0.8904  0.8857  0.8489   
ada                  Ada Boost Classifier    0.8329  0.8615  0.9071  0.8104   
gbc          Gradient Boosting Classifier    0.8255  0.8689  0.8590  0.8301   
dt               Decision Tree Classifier    0.8218  0.7857  0.8514  0.8277   
rf               Random Forest Classifier    0.8218  0.8721  0.8371  0.8362   
et                 Extra Trees Classifier    0.8181  0.8238  0.8371  0.8306   
nb                            Naive Bayes    0.6212  0.8650  0.9500  0.5893   
lr                    Logistic Regression    0.7288  0.8691  0.5671  0.9015   
ridge                    Ridge Classifier    0.7288  0.0000  0.5671  0.9015   
lda          Linear Discriminant Analysis    0.7288  0.8705  0.5671  0.9015   
svm                   SVM - Linear Kernel    0.6768 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7165  0.7542  0.8119  0.7143   
lightgbm  Light Gradient Boosting Machine    0.7194  0.8043  0.7801  0.7301   
et                 Extra Trees Classifier    0.7090  0.7316  0.7749  0.7169   
rf               Random Forest Classifier    0.6886  0.7483  0.7569  0.7045   
ada                  Ada Boost Classifier    0.6914  0.7238  0.7424  0.7124   
gbc          Gradient Boosting Classifier    0.6814  0.7583  0.7424  0.6943   
dt               Decision Tree Classifier    0.6662  0.6686  0.7665  0.6680   
lr                    Logistic Regression    0.5626  0.5300  0.5818  0.5923   
lda          Linear Discriminant Analysis    0.5451  0.5215  0.5500  0.5771   
ridge                    Ridge Classifier    0.5501  0.0000  0.5312  0.5869   
nb                            Naive Bayes    0.5088  0.5548  0.4574  0.5137   
svm                   SVM - Linear Kernel    0.5265 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.7371  0.7669  0.8187  0.7429   
rf               Random Forest Classifier    0.7292  0.7941  0.8049  0.7451   
lightgbm  Light Gradient Boosting Machine    0.7178  0.8157  0.7764  0.7549   
gbc          Gradient Boosting Classifier    0.7134  0.8018  0.7753  0.7259   
dt               Decision Tree Classifier    0.7098  0.7064  0.7401  0.7411   
knn                K Neighbors Classifier    0.6898  0.7285  0.7473  0.7130   
ada                  Ada Boost Classifier    0.6902  0.7429  0.7544  0.7162   
ridge                    Ridge Classifier    0.6275  0.0000  0.7352  0.6430   
lda          Linear Discriminant Analysis    0.6155  0.6560  0.6852  0.6327   
lr                    Logistic Regression    0.5922  0.6661  0.6423  0.6223   
svm                   SVM - Linear Kernel    0.4838  0.0000  0.6000  0.4818   
nb                            Naive Bayes    0.4318 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7245  0.7204  0.9154  0.7212   
rf               Random Forest Classifier    0.7660  0.8362  0.6314  0.8721   
lightgbm  Light Gradient Boosting Machine    0.7580  0.8474  0.6314  0.8534   
et                 Extra Trees Classifier    0.7578  0.8148  0.6237  0.8572   
ada                  Ada Boost Classifier    0.7540  0.8116  0.6237  0.8527   
dt               Decision Tree Classifier    0.7540  0.7939  0.6083  0.8677   
gbc          Gradient Boosting Classifier    0.7537  0.8275  0.6071  0.8771   
ridge                    Ridge Classifier    0.6038  0.0000  0.8603  0.6062   
lr                    Logistic Regression    0.5997  0.6881  0.7904  0.5941   
lda          Linear Discriminant Analysis    0.5837  0.6805  0.7596  0.5883   
svm                   SVM - Linear Kernel    0.5490  0.0000  0.6737  0.4554   
nb                            Naive Bayes    0.4343 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.9192  0.9372  0.9714  0.8927   
rf               Random Forest Classifier    0.9192  0.9265  0.9714  0.8927   
gbc          Gradient Boosting Classifier    0.9192  0.9384  0.9714  0.8927   
lightgbm  Light Gradient Boosting Machine    0.9115  0.9313  0.9857  0.8728   
et                 Extra Trees Classifier    0.9044  0.9286  0.9429  0.8899   
ada                  Ada Boost Classifier    0.8967  0.8997  0.9571  0.8708   
knn                K Neighbors Classifier    0.8115  0.9321  0.8000  0.8613   
lr                    Logistic Regression    0.7692  0.7272  0.7571  0.8010   
ridge                    Ridge Classifier    0.7692  0.0000  0.7571  0.8010   
lda          Linear Discriminant Analysis    0.7692  0.7150  0.7571  0.8010   
nb                            Naive Bayes    0.7165  0.8588  0.7714  0.7261   
svm                   SVM - Linear Kernel    0.5341 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5879  0.6169  0.5767  0.5814   
nb                            Naive Bayes    0.5879  0.6169  0.5767  0.5814   
ridge                    Ridge Classifier    0.5879  0.0000  0.5767  0.5814   
lda          Linear Discriminant Analysis    0.5879  0.6169  0.5767  0.5814   
lightgbm  Light Gradient Boosting Machine    0.5720  0.5742  0.4933  0.5800   
svm                   SVM - Linear Kernel    0.4841  0.0000  0.6500  0.4258   
et                 Extra Trees Classifier    0.5447  0.4878  0.3533  0.5067   
rf               Random Forest Classifier    0.5280  0.4897  0.3367  0.4950   
gbc          Gradient Boosting Classifier    0.5280  0.4947  0.3367  0.4950   
ada                  Ada Boost Classifier    0.5098  0.5025  0.3233  0.5050   
dt               Decision Tree Classifier    0.5197  0.4806  0.3200  0.4783   
knn                K Neighbors Classifier    0.5098 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lda          Linear Discriminant Analysis    0.7295  0.7437  0.7071  0.7823   
et                 Extra Trees Classifier    0.7199  0.7429  0.7429  0.7555   
ridge                    Ridge Classifier    0.7295  0.0000  0.6929  0.7879   
lr                    Logistic Regression    0.7141  0.7463  0.7071  0.7554   
rf               Random Forest Classifier    0.7128  0.7681  0.7286  0.7458   
gbc          Gradient Boosting Classifier    0.6878  0.7968  0.7238  0.7215   
dt               Decision Tree Classifier    0.6872  0.6677  0.6810  0.7189   
ada                  Ada Boost Classifier    0.6712  0.7381  0.6810  0.7204   
nb                            Naive Bayes    0.6487  0.6807  0.7095  0.6692   
lightgbm  Light Gradient Boosting Machine    0.6397  0.7728  0.6619  0.6825   
knn                K Neighbors Classifier    0.5987  0.6975  0.6476  0.6420   
svm                   SVM - Linear Kernel    0.5256 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.6336  0.6850   0.660  0.5848   
ada                  Ada Boost Classifier    0.6436  0.6767   0.600  0.6350   
gbc          Gradient Boosting Classifier    0.6445  0.6842   0.545  0.6467   
rf               Random Forest Classifier    0.6227  0.6988   0.575  0.5633   
dt               Decision Tree Classifier    0.5945  0.5942   0.555  0.5588   
svm                   SVM - Linear Kernel    0.5564  0.0000   0.670  0.5580   
et                 Extra Trees Classifier    0.6055  0.6717   0.530  0.5355   
lda          Linear Discriminant Analysis    0.6309  0.6217   0.430  0.6400   
lr                    Logistic Regression    0.6145  0.6108   0.410  0.5833   
ridge                    Ridge Classifier    0.6036  0.0000   0.410  0.5829   
lightgbm  Light Gradient Boosting Machine    0.5400  0.6125   0.420  0.5157   
nb                            Naive Bayes    0.6427 

In [11]:
mean_acc = statistics.mean(accuracies_swell)
mean_prec = statistics.mean(precision_swell)
mean_rec = statistics.mean(recall_swell)
mean_f1 = statistics.mean(f1scores_swell)

In [12]:
print("Mean Accuracy SWELL - Fuzzy Clustering: ", mean_acc)
print("Mean Precision SWELL - Fuzzy Clustering: ", mean_prec)
print("Mean Recall SWELL - Fuzzy Clustering: ", mean_rec)
print("Mean F1-score SWELL - Fuzzy Clustering: ", mean_f1)

Mean Accuracy SWELL - Fuzzy Clustering:  0.74058
Mean Precision SWELL - Fuzzy Clustering:  0.73753
Mean Recall SWELL - Fuzzy Clustering:  0.78838
Mean F1-score SWELL - Fuzzy Clustering:  0.7512099999999999


In [13]:
folder_path = "SWELL_Fuzzy_Splitting_Output_Files"

# List all CSV files in the folder:
csv_files = []
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_files.append(os.path.join(folder_path, filename))

# Concatenate files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)


In [14]:
output_folder = "Output_Files"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)  # Create folder using makedirs()

# Create the full path with the output folder name
output_filepath = os.path.join(output_folder, "SWELL_Fuzzy_Splitting.csv")

# Save the concatenated DataFrame to the new CSV file
combined_df.to_csv(output_filepath, index=False)

print(f"All predictions saved to: {output_filepath}")

All predictions saved to: Output_Files\SWELL_Fuzzy_Splitting.csv


### Fuzzy Splitting model trained with protected attributes

In [15]:
swell_dataset = pd.read_csv('Final_CSVs/swell_fuzzy10_14.csv')

In [16]:
# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")
df_user_info.drop(columns=['stress'], inplace=True)
df_user_info.columns

Index(['PP', 'Age', 'Gender', 'Occupation', 'Dominant hand', 'Glasses',
       'smoke', 'coffee', 'alcohol', 'physical', 'heart disease', 'medicine',
       'Internal control index'],
      dtype='object')

In [17]:
# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

# Rename the 'PP' column to 'id'
df_user_info.rename(columns={'PP': 'id'}, inplace=True)

# Merge the dataframes based on the 'PP' column
swell_dataset = swell_dataset.merge(df_user_info, on="id", how="left")

# Print the result
print(swell_dataset.columns)

Index(['HR', 'RMSSD', 'SCL', 'id', 'dataset', 'stress_x', 'Cluster', 'Age',
       'Gender', 'Occupation', 'Dominant hand', 'Glasses', 'smoke', 'coffee',
       'alcohol', 'physical', 'stress_y', 'heart disease', 'medicine',
       'Internal control index'],
      dtype='object')


In [18]:
# Drop 'dataset' to run pycaret tests based on "Cluster".

swell_dataset = swell_dataset.drop('dataset', axis = 1)
#swell_dataset

In [19]:
unique_participants = swell_dataset["Cluster"].unique()
swell_group = swell_dataset.groupby('Cluster')

In [20]:
from sklearn.model_selection import train_test_split
import os  # Import os module for folder creation

# Specify folder name
folder_name = "SWELL_Fuzzy_Splitting_Output_Files_Bias"

# Load the CSV data into a pandas dataframe
df_user_info = pd.read_csv("Scored_Surveys/swell_person.csv", sep=";")

accuracies_swell = []
precision_swell = []
recall_swell = []
f1scores_swell = []

for participant in unique_participants:
    print("Group: ",participant)    
    part_df = swell_group.get_group(participant)

    # -----------------------------------------------------

    train_data, test_data = train_test_split_per_user(part_df)

    fold_groups = train_data.id

    # Save the 'id' column from the test set
    test_ids = test_data['id']

    train_data = train_data.drop(columns=['id'])
    test_data = test_data.drop(columns=['id'])

    # -----------------------------------------------------

    grid = setup(data=train_data, target='stress_x', fix_imbalance = True, html=False, verbose=False, test_data=test_data) #fix_imbalance = True,
    best = compare_models(sort='F1')
    accuracies_swell.append(pull()['Accuracy'][0])
    precision_swell.append(pull()['Prec.'][0])
    recall_swell.append(pull()['Recall'][0])
    f1scores_swell.append(pull()['F1'][0])
    print(best)

    # ---------------------------------------------------

    # Make predictions using the best model
    predictions = predict_model(best, data=test_data)

    # Add 'id' column back to predictions DataFrame
    predictions['PP'] = test_ids  # Use the 'id' column from the test data

    # Extract true labels (y_true), rename the column
    y_true = predictions[['stress_x']].rename(columns={'stress_x': 'y_true'})

    # Extract predicted labels (y_pred), rename the column
    y_pred = predictions[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})

    # Identify protected attribute columns (assuming you know the column names)
    protected_attributes = predictions[['Age', 'Gender', 'Occupation']]

    # Concatenate DataFrames containing predictions and protected attributes
    all_data = pd.concat([test_ids, y_true, y_pred, protected_attributes], axis=1)

    # Create filename
    filename = f"predictions_{participant}.csv"

    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)  # Create folder using makedirs()

    # Create full path with folder name
    filepath = os.path.join(folder_name, filename)

    # Save predictions
    all_data.to_csv(filepath, index=False)

    print(f"Predictions saved to: {filepath}")

Group:  0


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.9009  0.9569  0.9242  0.8954   
knn                K Neighbors Classifier    0.9008  0.9491  0.9127  0.9071   
et                 Extra Trees Classifier    0.8947  0.9541  0.9307  0.8823   
gbc          Gradient Boosting Classifier    0.8790  0.9259  0.8951  0.8875   
lightgbm  Light Gradient Boosting Machine    0.8759  0.9428  0.8833  0.8883   
ada                  Ada Boost Classifier    0.8571  0.9061  0.8660  0.8756   
dt               Decision Tree Classifier    0.8357  0.8341  0.8670  0.8458   
ridge                    Ridge Classifier    0.7592  0.0000  0.7745  0.7797   
lda          Linear Discriminant Analysis    0.7562  0.7971  0.7634  0.7799   
lr                    Logistic Regression    0.7443  0.7952  0.7458  0.7587   
nb                            Naive Bayes    0.6072  0.4850  0.9941  0.5773   
svm                   SVM - Linear Kernel    0.5741 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.7696  0.8020  0.7567  0.8033   
rf               Random Forest Classifier    0.7636  0.8369  0.7453  0.8004   
lightgbm  Light Gradient Boosting Machine    0.7496  0.8225  0.6956  0.8027   
gbc          Gradient Boosting Classifier    0.7436  0.8295  0.7105  0.7941   
knn                K Neighbors Classifier    0.7356  0.7880  0.6731  0.8044   
dt               Decision Tree Classifier    0.7096  0.7110  0.7000  0.7430   
nb                            Naive Bayes    0.5371  0.4295  0.9665  0.5353   
ada                  Ada Boost Classifier    0.6233  0.6790  0.6846  0.6321   
svm                   SVM - Linear Kernel    0.5211  0.0000  0.7889  0.4249   
qda       Quadratic Discriminant Analysis    0.5151  0.7002  0.8000  0.4211   
lda          Linear Discriminant Analysis    0.4103  0.3405  0.3872  0.4077   
lr                    Logistic Regression    0.4244 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.8628  0.8435  0.9214  0.8458   
knn                K Neighbors Classifier    0.8477  0.8997  0.8786  0.8548   
gbc          Gradient Boosting Classifier    0.8369  0.8754  0.8657  0.8421   
lightgbm  Light Gradient Boosting Machine    0.8258  0.8993  0.8614  0.8366   
rf               Random Forest Classifier    0.8219  0.8879  0.8448  0.8359   
et                 Extra Trees Classifier    0.8182  0.8523  0.8376  0.8338   
dt               Decision Tree Classifier    0.8181  0.7908  0.8248  0.8367   
lr                    Logistic Regression    0.7103  0.8433  0.5671  0.8679   
ridge                    Ridge Classifier    0.7140  0.0000  0.5600  0.8759   
lda          Linear Discriminant Analysis    0.7140  0.8433  0.5600  0.8759   
svm                   SVM - Linear Kernel    0.6843  0.0000  0.7286  0.6667   
nb                            Naive Bayes    0.6584 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7192  0.7748  0.7848  0.7260   
lightgbm  Light Gradient Boosting Machine    0.7317  0.8109  0.6998  0.7917   
gbc          Gradient Boosting Classifier    0.7193  0.7979  0.7145  0.7526   
rf               Random Forest Classifier    0.7140  0.7797  0.6996  0.7603   
dt               Decision Tree Classifier    0.7091  0.7373  0.6755  0.7706   
ada                  Ada Boost Classifier    0.6891  0.7405  0.6957  0.7328   
et                 Extra Trees Classifier    0.6938  0.7807  0.6623  0.7492   
ridge                    Ridge Classifier    0.5346  0.0000  0.5067  0.5761   
lda          Linear Discriminant Analysis    0.5347  0.5221  0.5061  0.5607   
lr                    Logistic Regression    0.5372  0.4983  0.4881  0.5725   
svm                   SVM - Linear Kernel    0.5290  0.0000  0.6310  0.4797   
nb                            Naive Bayes    0.4583 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7489  0.8190  0.8121  0.7681   
lightgbm  Light Gradient Boosting Machine    0.7454  0.8314  0.8264  0.7582   
et                 Extra Trees Classifier    0.7451  0.7721  0.7984  0.7631   
gbc          Gradient Boosting Classifier    0.7334  0.8208  0.8038  0.7556   
dt               Decision Tree Classifier    0.7094  0.7029  0.7687  0.7283   
knn                K Neighbors Classifier    0.6778  0.7204  0.7396  0.6986   
ada                  Ada Boost Classifier    0.6622  0.7120  0.7407  0.6791   
lda          Linear Discriminant Analysis    0.6431  0.6430  0.7352  0.6575   
ridge                    Ridge Classifier    0.6354  0.0000  0.7423  0.6485   
lr                    Logistic Regression    0.6152  0.6385  0.6610  0.6465   
qda       Quadratic Discriminant Analysis    0.4968  0.3685  0.6786  0.4289   
nb                            Naive Bayes    0.4923 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.7205  0.7823  0.9000  0.7223   
lightgbm  Light Gradient Boosting Machine    0.8020  0.8558  0.7160  0.8639   
rf               Random Forest Classifier    0.8062  0.8507  0.7013  0.8828   
et                 Extra Trees Classifier    0.8018  0.8468  0.6929  0.8870   
lr                    Logistic Regression    0.7778  0.8210  0.7385  0.8101   
ada                  Ada Boost Classifier    0.7897  0.8397  0.6686  0.8847   
dt               Decision Tree Classifier    0.7818  0.7842  0.6699  0.8580   
gbc          Gradient Boosting Classifier    0.7817  0.8378  0.6609  0.8723   
lda          Linear Discriminant Analysis    0.7577  0.8293  0.7224  0.7962   
ridge                    Ridge Classifier    0.7457  0.0000  0.7147  0.7801   
qda       Quadratic Discriminant Analysis    0.5120  0.0500  0.9000  0.4640   
svm                   SVM - Linear Kernel    0.6012 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
knn                K Neighbors Classifier    0.9269  0.9417  1.0000  0.8881   
dt               Decision Tree Classifier    0.9192  0.9372  0.9714  0.8927   
rf               Random Forest Classifier    0.9192  0.9245  0.9714  0.8927   
gbc          Gradient Boosting Classifier    0.9192  0.9384  0.9714  0.8927   
et                 Extra Trees Classifier    0.9044  0.9262  0.9571  0.8792   
ada                  Ada Boost Classifier    0.8967  0.9102  0.9571  0.8708   
lightgbm  Light Gradient Boosting Machine    0.8890  0.9194  0.9857  0.8489   
lr                    Logistic Regression    0.7692  0.7313  0.7571  0.8010   
ridge                    Ridge Classifier    0.7692  0.0000  0.7571  0.8010   
lda          Linear Discriminant Analysis    0.7692  0.7211  0.7571  0.8010   
nb                            Naive Bayes    0.5341  0.8670  0.9714  0.5325   
qda       Quadratic Discriminant Analysis    0.5269 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5879  0.6169  0.5767  0.5814   
nb                            Naive Bayes    0.5879  0.6169  0.5767  0.5814   
ridge                    Ridge Classifier    0.5879  0.0000  0.5767  0.5814   
lda          Linear Discriminant Analysis    0.5879  0.6169  0.5767  0.5814   
lightgbm  Light Gradient Boosting Machine    0.5621  0.5206  0.4900  0.5767   
dt               Decision Tree Classifier    0.5280  0.4944  0.3367  0.4950   
rf               Random Forest Classifier    0.5280  0.5019  0.3367  0.4950   
gbc          Gradient Boosting Classifier    0.5280  0.4897  0.3367  0.4950   
ada                  Ada Boost Classifier    0.5015  0.4608  0.3233  0.5000   
knn                K Neighbors Classifier    0.5098  0.5525  0.3233  0.4783   
et                 Extra Trees Classifier    0.5288  0.4881  0.3200  0.4867   
svm                   SVM - Linear Kernel    0.4742 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.7269  0.7527  0.7286  0.7818   
ridge                    Ridge Classifier    0.7295  0.0000  0.7071  0.7769   
lda          Linear Discriminant Analysis    0.7295  0.7579  0.7071  0.7769   
lr                    Logistic Regression    0.7218  0.7398  0.7214  0.7577   
gbc          Gradient Boosting Classifier    0.6795  0.7803  0.7381  0.6894   
lightgbm  Light Gradient Boosting Machine    0.6635  0.7851  0.7381  0.6847   
rf               Random Forest Classifier    0.6795  0.7435  0.6952  0.7173   
ada                  Ada Boost Classifier    0.6615  0.7881  0.6952  0.6977   
dt               Decision Tree Classifier    0.6622  0.6497  0.6643  0.6921   
nb                            Naive Bayes    0.6160  0.6842  0.6619  0.6288   
knn                K Neighbors Classifier    0.5987  0.6842  0.6333  0.6300   
svm                   SVM - Linear Kernel    0.5583 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ada                  Ada Boost Classifier    0.6718  0.7167   0.665  0.6517   
et                 Extra Trees Classifier    0.6618  0.6838   0.640  0.6221   
rf               Random Forest Classifier    0.6509  0.7058   0.580  0.6655   
gbc          Gradient Boosting Classifier    0.6436  0.7125   0.585  0.6450   
knn                K Neighbors Classifier    0.6036  0.6508   0.615  0.5464   
svm                   SVM - Linear Kernel    0.5227  0.0000   0.780  0.4785   
lightgbm  Light Gradient Boosting Machine    0.5964  0.6337   0.510  0.5646   
dt               Decision Tree Classifier    0.6045  0.5942   0.505  0.5545   
lda          Linear Discriminant Analysis    0.6127  0.6217   0.430  0.6513   
lr                    Logistic Regression    0.6227  0.6250   0.430  0.6275   
ridge                    Ridge Classifier    0.6127  0.0000   0.430  0.6233   
nb                            Naive Bayes    0.6336 

In [21]:
mean_acc = statistics.mean(accuracies_swell)
mean_prec = statistics.mean(precision_swell)
mean_rec = statistics.mean(recall_swell)
mean_f1 = statistics.mean(f1scores_swell)

In [22]:
print("Mean Accuracy SWELL- Cluster Personality: ", mean_acc)
print("Mean Precision SWELL- Cluster Personality: ", mean_prec)
print("Mean Recall SWELL- Cluster Personality: ", mean_rec)
print("Mean F1-score SWELL- Cluster Personality: ", mean_f1)

Mean Accuracy SWELL- Cluster Personality:  0.76354
Mean Precision SWELL- Cluster Personality:  0.76639
Mean Recall SWELL- Cluster Personality:  0.8069500000000001
Mean F1-score SWELL- Cluster Personality:  0.77432


In [23]:
folder_path = "SWELL_Fuzzy_Splitting_Output_Files_Bias"

# List all CSV files in the folder:
csv_files = []
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_files.append(os.path.join(folder_path, filename))

# Concatenate files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

In [24]:
output_folder = "Output_Files"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)  # Create folder using makedirs()

# Create the full path with the output folder name
output_filepath = os.path.join(output_folder, "SWELL_Fuzzy_Splitting_Bias.csv")

# Save the concatenated DataFrame to the new CSV file
combined_df.to_csv(output_filepath, index=False)

print(f"All predictions saved to: {output_filepath}")

All predictions saved to: Output_Files\SWELL_Fuzzy_Splitting_Bias.csv
