In [1]:
import numpy as np
import pandas as pd
import statistics
import pycaret
from pycaret.classification import *
import matplotlib.pyplot as plt

In [2]:
lifesnaps_dataset = pd.read_csv('Final_CSVs/lifesnaps_fuzzy8_13.csv')

In [3]:
lifesnaps_dataset['age']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
5132    0.0
5133    0.0
5134    0.0
5135    0.0
5136    0.0
Name: age, Length: 5137, dtype: float64

In [4]:
lifesnaps_dataset['gender']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
5132    1.0
5133    1.0
5134    1.0
5135    1.0
5136    1.0
Name: gender, Length: 5137, dtype: float64

In [5]:
lifesnaps_dataset['bmi']

0       18.0
1       18.0
2       18.0
3       18.0
4       18.0
        ... 
5132    23.0
5133    23.0
5134    23.0
5135    23.0
5136    23.0
Name: bmi, Length: 5137, dtype: float64

In [6]:
# Drop 'dataset' to run pycaret tests based on "Cluster".

lifesnaps_dataset = lifesnaps_dataset.drop('dataset', axis = 1)

In [7]:
unique_participants = lifesnaps_dataset["Cluster"].unique()
lifesnaps_group = lifesnaps_dataset.groupby('Cluster')

In [8]:
for participant in unique_participants:
    print("Participant: ",participant)
    part_df = lifesnaps_group.get_group(participant)
    print(len(part_df))

Participant:  0
554
Participant:  1
971
Participant:  2
403
Participant:  3
367
Participant:  4
1154
Participant:  5
379
Participant:  6
657
Participant:  7
652


In [9]:
for participant in unique_participants:
    print("Participant: ",participant)
    part_df = lifesnaps_group.get_group(participant)
    print(part_df.columns)

Participant:  0
Index(['ENTERTAINMENT', 'GYM', 'HOME', 'HOME1OFFICE', 'OTHER', 'OUTDOORS',
       'TRANSIT', 'WORK/SCHOOL', 'age', 'bmi', 'bpm', 'calories',
       'daily1temperature1variation', 'day', 'day1cos', 'day1sin', 'distance',
       'filteredDemographicVO2Max', 'full1sleep1breathing1rate', 'gender',
       'id', 'lightly1active1minutes', 'max1goal', 'min1goal',
       'mindfulness1session', 'minutesAfterWakeup', 'minutesAsleep',
       'minutesAwake', 'minutesToFallAsleep', 'minutes1below1default1zone11',
       'minutes1in1default1zone11', 'minutes1in1default1zone12',
       'minutes1in1default1zone13', 'moderately1active1minutes', 'month',
       'month1cos', 'month1sin', 'nightly1temperature', 'nremhr', 'resting1hr',
       'rmssd', 'scl1avg', 'sedentary1minutes', 'sleep1deep1ratio',
       'sleep1duration', 'sleep1efficiency', 'sleep1light1ratio',
       'sleep1rem1ratio', 'sleep1wake1ratio', 'spo2', 'step1goal',
       'step1goal1label', 'steps', 'very1active1minutes', '

In [10]:
for participant in unique_participants:
    print("Participant: ",participant)
    part_df = lifesnaps_group.get_group(participant)
    print(part_df.id)
    break

Participant:  0
0       621e2e8e67b776a24055b564
161     621e2ed667b776a24085d8d1
343     621e2f6167b776a240e082a9
499     621e2f9167b776a240011ccb
500     621e2f9167b776a240011ccb
                  ...           
4855    621e36dd67b776a240ce9a45
4856    621e36dd67b776a240ce9a45
4857    621e36dd67b776a240ce9a45
4953    621e36f967b776a240e5e7c9
4954    621e36f967b776a240e5e7c9
Name: id, Length: 554, dtype: object


In [11]:
print(unique_participants)

[0 1 2 3 4 5 6 7]


In [12]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.id))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.id.isin(users_train)], data[data.id.isin(users_test)]

In [13]:
import os  # Import os module for folder creation

# Specify folder name
folder_name = "LifeSnaps_Fuzzy_Splitting_Output_Files"

accuracies = []
precision = []
recall = []
f1scores = []

for participant in unique_participants:
    print("Group: ",participant)
    part_df = lifesnaps_group.get_group(participant)

    # ----------------------------------------------------

    train_data, test_data = train_test_split_per_user(part_df)

    fold_groups = train_data.id

    test_ids = test_data['id']

    train_data = train_data.drop(columns=['id'])
    test_data = test_data.drop(columns=['id'])

    # -----------------------------------------------------

    part_df = part_df.drop('id', axis = 1)
    grid = setup(data=train_data, target='stress', fix_imbalance = True, html=False, verbose=False, fold_strategy='groupkfold', fold=3, fold_groups=fold_groups, test_data=test_data) #fix_imbalance = True,

    best = compare_models(sort="F1")
    accuracies.append(pull()['Accuracy'][0])
    precision.append(pull()['Prec.'][0])
    recall.append(pull()['Recall'][0])
    f1scores.append(pull()['F1'][0])
    print(best)

    # ---------------------------------------------------

    # Make predictions using the best model
    predictions = predict_model(best, data=test_data)

    # Extract true labels (y_true), rename the column
    y_true = predictions[['stress']].rename(columns={'stress': 'y_true'})  # Rename using rename()

    # Extract predicted labels (y_pred), rename the column
    y_pred = predictions[['prediction_label']].rename(columns={'prediction_label': 'y_pred'})  # Rename using rename()

    # Identify protected attribute columns (assuming you know the column names)
    protected_attributes = predictions[['age', 'gender', 'bmi']]

    # Concatenate DataFrames containing predictions and protected attributes
    # all_data = pd.concat([y_true, y_pred, protected_attributes], axis=1)
    all_data = pd.concat([test_ids, y_true, y_pred, protected_attributes], axis=1)

    # Create filename
    filename = f"predictions_{participant}.csv"

    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)  # Create folder using makedirs()

    # Create full path with folder name
    filepath = os.path.join(folder_name, filename)

    # Save predictions
    all_data.to_csv(filepath, index=False)

    print(f"Predictions saved to: {filepath}")

Group:  0


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.9463  0.9622  0.8056  0.5079   
lightgbm  Light Gradient Boosting Machine    0.9383  0.9622  0.6898  0.4708   
knn                K Neighbors Classifier    0.9302  0.8926  0.7685  0.4444   
rf               Random Forest Classifier    0.9329  0.9516  0.5324  0.3963   
et                 Extra Trees Classifier    0.9383  0.9588  0.4954  0.4730   
ada                  Ada Boost Classifier    0.9302  0.9303  0.4259  0.3481   
dt               Decision Tree Classifier    0.9409  0.6648  0.3611  0.3205   
ridge                    Ridge Classifier    0.8873  0.0000  0.4583  0.2417   
nb                            Naive Bayes    0.7451  0.7512  0.6944  0.1688   
lda          Linear Discriminant Analysis    0.8283  0.7006  0.4537  0.1410   
lr                    Logistic Regression    0.7611  0.4863  0.3796  0.1099   
svm                   SVM - Linear Kernel    0.0537 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.8607  0.8782  0.4666  0.5019   
dt               Decision Tree Classifier    0.8451  0.7022  0.4992  0.4322   
ada                  Ada Boost Classifier    0.8529  0.8063  0.3615  0.4061   
lightgbm  Light Gradient Boosting Machine    0.8555  0.8784  0.3157  0.4013   
rf               Random Forest Classifier    0.8385  0.8678  0.2551  0.3735   
knn                K Neighbors Classifier    0.7161  0.6954  0.5362  0.1914   
et                 Extra Trees Classifier    0.8477  0.8570  0.2451  0.3968   
nb                            Naive Bayes    0.6927  0.5857  0.4740  0.1762   
ridge                    Ridge Classifier    0.7695  0.0000  0.4645  0.2230   
lda          Linear Discriminant Analysis    0.7565  0.5850  0.3811  0.1650   
lr                    Logistic Regression    0.6732  0.4452  0.3664  0.1385   
qda       Quadratic Discriminant Analysis    0.8841 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
et                 Extra Trees Classifier    0.6049  0.3137  0.1111  0.1667   
lightgbm  Light Gradient Boosting Machine    0.6049  0.3072  0.1111  0.1667   
ada                  Ada Boost Classifier    0.6049  0.2571  0.1111  0.1111   
gbc          Gradient Boosting Classifier    0.5988  0.3007  0.1111  0.0833   
lr                    Logistic Regression    0.5494  0.3028  0.2222  0.0556   
lda          Linear Discriminant Analysis    0.5741  0.1983  0.1111  0.0417   
knn                K Neighbors Classifier    0.5432  0.2288  0.1111  0.0278   
nb                            Naive Bayes    0.5556  0.2200  0.0000  0.0000   
dt               Decision Tree Classifier    0.5679  0.1569  0.0000  0.0000   
svm                   SVM - Linear Kernel    0.3148  0.0000  0.0000  0.0000   
ridge                    Ridge Classifier    0.5802  0.0000  0.0000  0.0000   
rf               Random Forest Classifier    0.5926 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.9197  0.9258  0.3512  0.3056   
et                 Extra Trees Classifier    0.9312  0.9487  0.3036  0.3889   
ada                  Ada Boost Classifier    0.9311  0.8682  0.2619  0.3000   
nb                            Naive Bayes    0.7906  0.8611  0.6071  0.2326   
rf               Random Forest Classifier    0.9274  0.9531  0.1369  0.4444   
lightgbm  Light Gradient Boosting Machine    0.9235  0.9375  0.1845  0.2540   
dt               Decision Tree Classifier    0.9197  0.5539  0.1369  0.2778   
knn                K Neighbors Classifier    0.6494  0.6099  0.5119  0.1293   
lr                    Logistic Regression    0.4393  0.5361  0.7024  0.0922   
lda          Linear Discriminant Analysis    0.5426  0.3133  0.1310  0.0347   
ridge                    Ridge Classifier    0.5542  0.0000  0.2560  0.0333   
svm                   SVM - Linear Kernel    0.6553 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.8876  0.7159  0.4957  0.4580   
ada                  Ada Boost Classifier    0.8964  0.8662  0.4800  0.4861   
gbc          Gradient Boosting Classifier    0.8839  0.8963  0.4654  0.4670   
nb                            Naive Bayes    0.7169  0.7722  0.7789  0.2985   
et                 Extra Trees Classifier    0.8961  0.9264  0.4095  0.3382   
rf               Random Forest Classifier    0.8926  0.9305  0.3324  0.3154   
lda          Linear Discriminant Analysis    0.8148  0.6695  0.4412  0.2706   
ridge                    Ridge Classifier    0.8094  0.0000  0.3812  0.2450   
knn                K Neighbors Classifier    0.5946  0.6858  0.6300  0.2146   
lightgbm  Light Gradient Boosting Machine    0.8788  0.9217  0.2908  0.2728   
svm                   SVM - Linear Kernel    0.6466  0.0000  0.3333  0.0419   
lr                    Logistic Regression    0.7018 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.7155  0.8291  0.6745  0.4334   
ada                  Ada Boost Classifier    0.7701  0.6613  0.5382  0.5173   
dt               Decision Tree Classifier    0.7356  0.6915  0.5676  0.4710   
gbc          Gradient Boosting Classifier    0.7184  0.7940  0.6245  0.4817   
knn                K Neighbors Classifier    0.6839  0.7342  0.6735  0.4044   
et                 Extra Trees Classifier    0.6868  0.8275  0.6108  0.4619   
nb                            Naive Bayes    0.6379  0.7018  0.5696  0.3749   
rf               Random Forest Classifier    0.6782  0.8356  0.5618  0.5143   
ridge                    Ridge Classifier    0.4828  0.0000  0.6696  0.3324   
lr                    Logistic Regression    0.5575  0.5721  0.5225  0.2993   
lda          Linear Discriminant Analysis    0.4971  0.7184  0.6304  0.3811   
svm                   SVM - Linear Kernel    0.7845 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.8376  0.8932  0.5281  0.5275   
gbc          Gradient Boosting Classifier    0.8087  0.8560  0.5279  0.4852   
lightgbm  Light Gradient Boosting Machine    0.8123  0.8726  0.4743  0.5093   
dt               Decision Tree Classifier    0.8214  0.6693  0.4463  0.4594   
nb                            Naive Bayes    0.7042  0.7691  0.5686  0.3180   
ada                  Ada Boost Classifier    0.7997  0.8435  0.4354  0.4968   
et                 Extra Trees Classifier    0.8250  0.8854  0.3773  0.5668   
lr                    Logistic Regression    0.6483  0.5643  0.5437  0.2503   
lda          Linear Discriminant Analysis    0.6932  0.6616  0.4882  0.2832   
ridge                    Ridge Classifier    0.6896  0.0000  0.4819  0.2840   
knn                K Neighbors Classifier    0.6194  0.6249  0.5087  0.2475   
svm                   SVM - Linear Kernel    0.6953 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dt               Decision Tree Classifier    0.7766  0.6132  0.3732  0.3149   
gbc          Gradient Boosting Classifier    0.7698  0.7853  0.3496  0.3068   
nb                            Naive Bayes    0.6410  0.5723  0.4184  0.2200   
lr                    Logistic Regression    0.5265  0.5218  0.6176  0.1824   
lightgbm  Light Gradient Boosting Machine    0.7743  0.8091  0.2563  0.2932   
ada                  Ada Boost Classifier    0.7743  0.7739  0.2851  0.2721   
svm                   SVM - Linear Kernel    0.3944  0.0000  0.6667  0.1129   
rf               Random Forest Classifier    0.7698  0.7860  0.2056  0.1709   
et                 Extra Trees Classifier    0.7856  0.7777  0.1784  0.1712   
knn                K Neighbors Classifier    0.6208  0.5223  0.2414  0.1174   
ridge                    Ridge Classifier    0.6795  0.0000  0.1823  0.1366   
lda          Linear Discriminant Analysis    0.6840 



In [14]:
mean_acc = statistics.mean(accuracies)
mean_prec = statistics.mean(precision)
mean_rec = statistics.mean(recall)
mean_f1 = statistics.mean(f1scores)

In [15]:
print("Mean Accuracy LifeSnaps - Fuzzy Clustering: ", mean_acc)
print("Mean Precision LifeSnaps - Fuzzy Clustering: ", mean_prec)
print("Mean Recall LifeSnaps - Fuzzy Clustering: ", mean_rec)
print("Mean F1-score LifeSnaps - Fuzzy Clustering: ", mean_f1)

Mean Accuracy LifeSnaps - Fuzzy Clustering:  0.8186125
Mean Precision LifeSnaps - Fuzzy Clustering:  0.4019875
Mean Recall LifeSnaps - Fuzzy Clustering:  0.47575
Mean F1-score LifeSnaps - Fuzzy Clustering:  0.4156375


In [16]:
folder_path = "LifeSnaps_Fuzzy_Splitting_Output_Files"

# List all CSV files in the folder:
csv_files = []
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_files.append(os.path.join(folder_path, filename))

# Concatenate files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)


In [17]:
output_folder = "Output_Files"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)  # Create folder using makedirs()

# Create the full path with the output folder name
output_filepath = os.path.join(output_folder, "LifeSnaps_Fuzzy_Splitting.csv")

# Save the concatenated DataFrame to the new CSV file
combined_df.to_csv(output_filepath, index=False)

print(f"All predictions saved to: {output_filepath}")

All predictions saved to: Output_Files\LifeSnaps_Fuzzy_Splitting.csv


In [18]:
predictions = pd.read_csv('Output_Files/LifeSnaps_Fuzzy_Splitting.csv')

In [19]:
predictions['bmi'] = predictions.bmi.apply(lambda bmi: 0 if bmi < 18.5 else (1 if bmi < 25 else (
    2 if bmi < 30 else 3)))  # 0: Underweight, 1: Normal, 2: Overweight, 3: Obese

In [20]:
predictions.to_csv('Output_Files/LifeSnaps_Fuzzy_Splitting.csv', index=False)