## Model fitting and evaluation

In [84]:
Y_train = df_combined["stress_level"]
X_train = df_combined.drop(["stress_level"], axis= 1)
Y_test = test["stress_level"]
X_test = test.drop(["stress_level"], axis= 1)

In [86]:
Y_train=Y_train.astype(int)

In [88]:
print("X_train shape is",X_train.shape)
print("X_test shape is",X_test.shape)
print("Y_train shape is",Y_train.shape)
print("Y_test shape is",Y_test.shape)

X_train shape is (944, 8)
X_test shape is (158, 8)
Y_train shape is (944,)
Y_test shape is (158,)


### 1.) K-Nearest Neighbours

In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    "n_neighbors": range(1, 31),
    "weights": ["uniform", "distance"],  
    "metric": ["euclidean", "manhattan", "minkowski"]
}

# Initialize KNN model
knn = KNeighborsClassifier()

# Perform GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid_search.fit(X_train, Y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}


In [92]:
knn = KNeighborsClassifier(metric="manhattan", n_neighbors=5, weights="uniform")
knn_model = knn.fit(X_train, Y_train)
knn_model

In [71]:
Y_pred_train = knn_model.predict(X_train)
Y_pred_test = knn_model.predict(X_test)
print("Train Accuracy:",accuracy_score(Y_train, Y_pred_train),"\n""Test Accuracy:",accuracy_score(Y_test, Y_pred_test))

Train Accuracy: 0.9936440677966102 
Test Accuracy: 1.0


In [75]:
print(classification_report(Y_test, Y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        34
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        27

    accuracy                           1.00       158
   macro avg       1.00      1.00      1.00       158
weighted avg       1.00      1.00      1.00       158



### 2.) Random Forest

In [124]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid 
param_dist = {
    "n_estimators": np.arange(100, 1000, 50), 
    "max_depth": np.arange(5, 50, 2),  
    "min_samples_split": np.arange(1, 20),  
    "min_samples_leaf": np.arange(1, 10),  
    "max_features": ["sqrt", "log2", None],  
    "bootstrap": [True, False]  
}

# Create Random Forest model
rf = RandomForestClassifier(random_state=42)

# RandomizedSearchCV 
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', 
                                      n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, Y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 750, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 19, 'bootstrap': False}


In [122]:
rf_model = RandomForestClassifier(
    n_estimators=750,    
    criterion='gini',
    max_features="sqrt",
    min_samples_split=7,
    min_samples_leaf=4,
    max_depth=19, 
    random_state=42)
rf_model.fit(X_train, Y_train)

In [85]:
Y_pred_train = rf_model.predict(X_train)
Y_pred_test = rf_model.predict(X_test)
print("Train Accuracy:",accuracy_score(Y_train, Y_pred_train),"\n""Test Accuracy",accuracy_score(Y_test, Y_pred_test))

Train Accuracy: 0.9947033898305084 
Test Accuracy 0.9936708860759493


In [87]:
print(classification_report(Y_test, Y_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        32
           1       1.00      0.97      0.99        34
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        27

    accuracy                           0.99       158
   macro avg       0.99      0.99      0.99       158
weighted avg       0.99      0.99      0.99       158



### 3.) Logistic Regression

In [90]:
from sklearn.linear_model import LogisticRegression

In [134]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [94]:
prediction_train = np.round(logistic_model.predict(X_train))
prediction_test = np.round(logistic_model.predict(X_test))
print("Train Accuracy:",accuracy_score(Y_train,prediction_train),"\n""Test Accuracy:",accuracy_score(Y_test,prediction_test))

Train Accuracy: 0.9788135593220338 
Test Accuracy: 0.9556962025316456


In [96]:
print(classification_report(Y_test, prediction_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        32
           1       0.89      1.00      0.94        34
           2       0.94      1.00      0.97        29
           3       0.97      0.94      0.96        36
           4       1.00      0.96      0.98        27

    accuracy                           0.96       158
   macro avg       0.96      0.96      0.96       158
weighted avg       0.96      0.96      0.96       158



### 4.) Support Vector Classifier

In [99]:
from sklearn.svm import SVC

# Define the parameter grid
param_grid = {
    "C": [0.1, 1, 10],  
    "kernel": ["linear", "rbf"],  
    "gamma": ["scale", "auto", 0.01, 0.1]
}

# Initialize SVC
svc = SVC()

# Perform Grid Search
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

# Fit model
grid_search.fit(X_train, Y_train)

# Best Parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [110]:
svc_model = SVC(kernel='rbf', C=1, gamma=0.01)
svc_model.fit(X_train, Y_train)

In [103]:
Y_pred_train = svc_model.predict(X_train)
Y_pred_test = svc_model.predict(X_test)
print("Train Accuracy:",accuracy_score(Y_train, Y_pred_train),"\n""Test Accuracy:",accuracy_score(Y_test, Y_pred_test))

Train Accuracy: 0.9936440677966102 
Test Accuracy: 0.9746835443037974


In [105]:
print(classification_report(Y_test, Y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        32
           1       1.00      1.00      1.00        34
           2       0.97      0.97      0.97        29
           3       0.92      0.97      0.95        36
           4       1.00      1.00      1.00        27

    accuracy                           0.97       158
   macro avg       0.98      0.98      0.98       158
weighted avg       0.98      0.97      0.97       158



### 5.) Naive Bayes Classifier

In [108]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)

In [110]:
Y_pred_train = nb_model.predict(X_train)
Y_pred_test = nb_model.predict(X_test)
print("Train Accuracy:",accuracy_score(Y_train, Y_pred_train),"\n""Test Accuracy:",accuracy_score(Y_test, Y_pred_test))

Train Accuracy: 0.9788135593220338 
Test Accuracy: 0.9620253164556962


In [112]:
print(classification_report(Y_test, Y_pred_test))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        32
           1       0.94      0.97      0.96        34
           2       0.94      1.00      0.97        29
           3       0.97      0.94      0.96        36
           4       1.00      0.96      0.98        27

    accuracy                           0.96       158
   macro avg       0.96      0.96      0.96       158
weighted avg       0.96      0.96      0.96       158



### 6.) CatBoost Classifier

In [190]:
from catboost import CatBoostClassifier

# Define parameter grid
param_grid = {
    "iterations": [200, 500], 
    "learning_rate": [0.03, 0.1],  
    "depth": [4, 6, 8],  
    "l2_leaf_reg": [1, 3, 5],  
}

# Initialize CatBoostClassifier
catboost = CatBoostClassifier(verbose=0)

# Perform Grid Search
grid_search = GridSearchCV(catboost, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid_search.fit(X_train, Y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.03}


In [114]:
cb_model = CatBoostClassifier(random_state = 42, iterations=200, depth=8, learning_rate=0.03, l2_leaf_reg=1)
cb_model.fit(X_train, Y_train)

0:	learn: 1.5118095	total: 237ms	remaining: 47.2s
1:	learn: 1.4251454	total: 286ms	remaining: 28.3s
2:	learn: 1.3472667	total: 326ms	remaining: 21.4s
3:	learn: 1.2780060	total: 365ms	remaining: 17.9s
4:	learn: 1.2113925	total: 414ms	remaining: 16.1s
5:	learn: 1.1517758	total: 490ms	remaining: 15.8s
6:	learn: 1.0968555	total: 538ms	remaining: 14.8s
7:	learn: 1.0459552	total: 578ms	remaining: 13.9s
8:	learn: 0.9975863	total: 623ms	remaining: 13.2s
9:	learn: 0.9552527	total: 668ms	remaining: 12.7s
10:	learn: 0.9169212	total: 716ms	remaining: 12.3s
11:	learn: 0.8797469	total: 761ms	remaining: 11.9s
12:	learn: 0.8445233	total: 806ms	remaining: 11.6s
13:	learn: 0.8111654	total: 849ms	remaining: 11.3s
14:	learn: 0.7814804	total: 926ms	remaining: 11.4s
15:	learn: 0.7524660	total: 1.03s	remaining: 11.8s
16:	learn: 0.7242023	total: 1.08s	remaining: 11.7s
17:	learn: 0.6980093	total: 1.14s	remaining: 11.5s
18:	learn: 0.6739389	total: 1.2s	remaining: 11.4s
19:	learn: 0.6491078	total: 1.26s	remainin

<catboost.core.CatBoostClassifier at 0x239766da630>

In [116]:
Y_pred_train = cb_model.predict(X_train)
Y_pred_test = cb_model.predict(X_test)
print("Train Accuracy:",accuracy_score(Y_train, Y_pred_train),"\n""Test Accuracy:",accuracy_score(Y_test, Y_pred_test))

Train Accuracy: 0.9947033898305084 
Test Accuracy: 0.9873417721518988


In [119]:
print(classification_report(Y_test, Y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        32
           1       0.94      1.00      0.97        34
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        27

    accuracy                           0.99       158
   macro avg       0.99      0.99      0.99       158
weighted avg       0.99      0.99      0.99       158



## Cross-Validation

In [126]:
from sklearn.model_selection import cross_val_score

models = {"KNN":knn_model,"RandomForest":rf_model,"Logistic":logistic_model,"SVC":svc_model,"NaiveBayes":nb_model,"CatBoost":cb_model}

# Perform cross-validation for each model
cv_results = {}

for name, model in models.items():
    scores = cross_val_score(model, X_train, Y_train, cv=5, scoring="accuracy", n_jobs=-1)
    cv_results[name] = (np.mean(scores), np.std(scores))
    print(f"{name}: Mean Accuracy = {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Display sorted results
sorted_results = sorted(cv_results.items(), key=lambda x: x[1][0], reverse=True)
print("\nCross-Validation Summary")
for name, (mean_acc, std_acc) in sorted_results:
    print(f"{name}: {mean_acc:.4f} ± {std_acc:.4f}")

KNN: Mean Accuracy = 0.9936 ± 0.0085
RandomForest: Mean Accuracy = 0.9936 ± 0.0085
Logistic: Mean Accuracy = 0.9650 ± 0.0110
SVC: Mean Accuracy = 0.9883 ± 0.0091
NaiveBayes: Mean Accuracy = 0.9608 ± 0.0179
CatBoost: Mean Accuracy = 0.9947 ± 0.0067

Cross-Validation Summary
CatBoost: 0.9947 ± 0.0067
KNN: 0.9936 ± 0.0085
RandomForest: 0.9936 ± 0.0085
SVC: 0.9883 ± 0.0091
Logistic: 0.9650 ± 0.0110
NaiveBayes: 0.9608 ± 0.0179
