In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error,make_scorer, root_mean_squared_error

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
file_path = "playground-series-s4e11//"

df_train = pd.read_csv(file_path + "train.csv")

In [104]:
df_train.head(5)

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [105]:
df_train['Occupation Satisfaction'] = df_train['Study Satisfaction'].combine_first(df_train['Job Satisfaction'])
df_train['Overall Pressure'] = df_train['Academic Pressure'].combine_first(df_train['Work Pressure'])
df_train = df_train.drop(['Study Satisfaction','Job Satisfaction','Academic Pressure', 'Work Pressure','CGPA'], axis=1)
df_train.head(5)

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Occupation Satisfaction,Overall Pressure
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0,2.0,5.0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1,3.0,4.0
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1,2.0,5.0
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1,1.0,5.0
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0,1.0,1.0


In [106]:
df_train_modified = df_train.drop(['Name', 'Depression','id','City'], axis=1)
df_dummy_train = pd.get_dummies(df_train_modified,dtype=float)

In [107]:
y = df_train['Depression']


In [108]:
X_train, X_test, y_train, y_test = train_test_split(df_dummy_train,y,test_size=0.2, random_state=69)

In [21]:
seed=2
def objective(params):
    est=int(params['n_estimators'])
    md=int(params['max_depth'])
    msl=int(params['min_samples_leaf'])
    mss=int(params['min_samples_split'])
    model=RandomForestClassifier(n_estimators=est,max_depth=md,min_samples_leaf=msl,min_samples_split=mss)
    model.fit(X_train,y_train)
    pred=model.predict(X_test)
    score=mean_squared_error(y_test,pred)
    return score

def optimize(trial):
    params={'n_estimators':hp.uniform('n_estimators',100,500),
           'max_depth':hp.uniform('max_depth',5,20),
           'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
           'min_samples_split':hp.uniform('min_samples_split',2,6)}
    best=fmin(fn=objective,space=params,algo=tpe.suggest,trials=trial,max_evals=10)
    return best

In [22]:
trials = Trials()

best_hyperparams = optimize(trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

100%|██████████| 10/10 [02:03<00:00, 12.39s/trial, best loss: 0.0]                
The best hyperparameters are :  

{'max_depth': 18.850945873092968, 'min_samples_leaf': 3.9708411517731985, 'min_samples_split': 2.328560406588981, 'n_estimators': 487.43552388296905}


In [109]:
rf_model = RandomForestClassifier(max_depth = 14 ,
                               min_samples_leaf = 1,
                               min_samples_split = 3,
                               n_estimators =235,
                                random_state=25,
                                class_weight='balanced')


rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)



print(mean_squared_error(y_test, y_pred,squared=False))
print(rf_model.score(X_train,y_train))



0.28502047956942883
0.9278873489694385


In [110]:
importances = rf_model.feature_importances_
columns = df_dummy_train.columns
i = 0

while i< len(columns):
    if importances[i]*100 > 1.0:
        print (f" The importance of feature '{columns[i]}' is {round(importances[i]*100, 2)}%.")
    i += 1

 The importance of feature 'Age' is 29.0%.
 The importance of feature 'Work/Study Hours' is 2.88%.
 The importance of feature 'Financial Stress' is 4.08%.
 The importance of feature 'Occupation Satisfaction' is 2.27%.
 The importance of feature 'Overall Pressure' is 6.36%.
 The importance of feature 'Working Professional or Student_Student' is 11.93%.
 The importance of feature 'Working Professional or Student_Working Professional' is 12.22%.
 The importance of feature 'Profession_Teacher' is 1.07%.
 The importance of feature 'Dietary Habits_Unhealthy' is 1.21%.
 The importance of feature 'Degree_Class 12' is 3.78%.
 The importance of feature 'Have you ever had suicidal thoughts ?_No' is 9.08%.
 The importance of feature 'Have you ever had suicidal thoughts ?_Yes' is 8.99%.


In [111]:
# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# Get cross-validated predictions
y_pred = cross_val_predict(rf_model, df_dummy_train, y, cv=cv)

# Calculate F1-score
f1 = f1_score(y, y_pred)
print("F1 Score:", f1)

# Generate confusion matrix
cm = confusion_matrix(y, y_pred)
print("Confusion Matrix:")
print(cm)

# Breakdown of the confusion matrix
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")

F1 Score: 0.7994225986538896
Confusion Matrix:
[[106184   8949]
 [  2584  22983]]
True Negatives (TN): 106184
False Positives (FP): 8949
False Negatives (FN): 2584
True Positives (TP): 22983
