<a href="https://colab.research.google.com/github/aditi-gup/ML_Project_Depression_Survey/blob/main/ML_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Comparison of Different ML Algorithms on Depression Survey Dataset from Kaggle

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
df = pd.read_csv('depression_survey.csv')
df.isnull().sum()

Unnamed: 0,0
Name,0
Gender,0
Age,0
City,0
Working Professional or Student,0
Profession,673
Academic Pressure,2054
Work Pressure,502
CGPA,2054
Study Satisfaction,2054


In [None]:
df.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress
count,2556.0,502.0,2054.0,502.0,502.0,2054.0,2556.0,2556.0
mean,39.043036,3.003984,3.021908,7.567809,3.075697,3.015093,6.023865,2.968701
std,12.260596,1.390007,1.417312,1.465367,1.37349,1.418432,3.771743,1.415527
min,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0
25%,28.0,2.0,2.0,6.21,2.0,2.0,3.0,2.0
50%,39.0,3.0,3.0,7.605,3.0,3.0,6.0,3.0
75%,50.0,4.0,4.0,8.825,4.0,4.0,9.0,4.0
max,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0


Dataset has people who are either students or working professionals

Based on this:

Academic stress = only for students

Work-life balance = only for working professionals


 ==> Spliting the data based on role (student/working)

In [None]:
student_df = df[df['Working Professional or Student'] == 'Student'].copy()
working_df = df[df['Working Professional or Student'] == 'Working Professional'].copy()

Analyzing student_df using the following ML models:
  * Logistic Regression
  * Random Forest
  * XGBoost
  * Support Vector Machine (SVM)
  * K-Nearest Neighbors (KNN)



In [None]:
# Removing Columns NOT Relevant to Students
student_df.drop(columns=['Profession','Work Pressure', 'Job Satisfaction'], inplace=True, errors='ignore')
# percentage of missing values per column => to decide whether to drop the rows with missing values or replace them with statistical data
# student_df.isnull().mean() * 100
student_df.dropna(inplace=True)

In [None]:
label_encoders = {}
for column in student_df.select_dtypes(include='object'):
    if column != 'Depression':  # target label ko alag handle karenge
        le = LabelEncoder()
        student_df[column] = le.fit_transform(student_df[column])
        label_encoders[column] = le

# Encode target column
student_df['Depression'] = student_df['Depression'].map({'Yes': 1, 'No': 0})

In [None]:
X_s = student_df.drop('Depression', axis=1)
y_s = student_df['Depression']
X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(X_s, y_s, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_s_train_scaled = scaler.fit_transform(X_s_train)
X_s_test_scaled = scaler.transform(X_s_test)

1. Logistic Regression

In [9]:
lr = LogisticRegression()
lr.fit(X_s_train_scaled, y_s_train)
lr_pred_s = lr.predict(X_s_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_s_test, lr_pred_s))
print(classification_report(y_s_test, lr_pred_s))


Logistic Regression Accuracy: 0.9702970297029703
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        48
           1       0.96      0.98      0.97        53

    accuracy                           0.97       101
   macro avg       0.97      0.97      0.97       101
weighted avg       0.97      0.97      0.97       101



2. Random Forest

In [10]:
rf = RandomForestClassifier()
rf.fit(X_s_train, y_s_train)
rf_pred_s = rf.predict(X_s_test)

print("Random Forest Accuracy:", accuracy_score(y_s_test, rf_pred_s))
print(classification_report(y_s_test, rf_pred_s))

Random Forest Accuracy: 0.9207920792079208
              precision    recall  f1-score   support

           0       0.95      0.88      0.91        48
           1       0.89      0.96      0.93        53

    accuracy                           0.92       101
   macro avg       0.92      0.92      0.92       101
weighted avg       0.92      0.92      0.92       101



3. XGBoost

In [11]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_s_train, y_s_train)
xgb_pred_s = xgb.predict(X_s_test)

print("XGBoost Accuracy:", accuracy_score(y_s_test, xgb_pred_s))
print(classification_report(y_s_test, xgb_pred_s))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9306930693069307
              precision    recall  f1-score   support

           0       0.96      0.90      0.92        48
           1       0.91      0.96      0.94        53

    accuracy                           0.93       101
   macro avg       0.93      0.93      0.93       101
weighted avg       0.93      0.93      0.93       101



4. Support Vector Machine (SVM)

In [12]:
svm = SVC()
svm.fit(X_s_train_scaled, y_s_train)
svm_pred_s = svm.predict(X_s_test_scaled)

print("SVM Accuracy:", accuracy_score(y_s_test, svm_pred_s))
print(classification_report(y_s_test, svm_pred_s))

SVM Accuracy: 0.9306930693069307
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        48
           1       0.93      0.94      0.93        53

    accuracy                           0.93       101
   macro avg       0.93      0.93      0.93       101
weighted avg       0.93      0.93      0.93       101



5. K-Nearest Neighbors (KNN)

In [13]:
knn = KNeighborsClassifier()
knn.fit(X_s_train_scaled, y_s_train)
knn_pred_s = knn.predict(X_s_test_scaled)

print("KNN Accuracy:", accuracy_score(y_s_test, knn_pred_s))
print(classification_report(y_s_test, knn_pred_s))

KNN Accuracy: 0.8910891089108911
              precision    recall  f1-score   support

           0       0.91      0.85      0.88        48
           1       0.88      0.92      0.90        53

    accuracy                           0.89       101
   macro avg       0.89      0.89      0.89       101
weighted avg       0.89      0.89      0.89       101



Analyzing working_df using the following ML models:
  * Logistic Regression
  * Random Forest
  * XGBoost
  * Support Vector Machine (SVM)
  * K-Nearest Neighbors (KNN)

In [14]:
# Removing Columns NOT Relevant to working
working_df.drop(columns=['Academic Pressure','CGPA','Study Satisfaction',], inplace=True, errors='ignore')
# percentage of missing values per column => to decide whether to drop the rows with missing values or replace them with statistical data
# working_df.isnull().mean() * 100
working_df.dropna(inplace=True)

In [15]:
label_encoders = {}
for column in working_df.select_dtypes(include='object'):
    if column != 'Depression':  # target label ko alag handle karenge
        le = LabelEncoder()
        working_df[column] = le.fit_transform(working_df[column])
        label_encoders[column] = le

# Encode target column
working_df['Depression'] = working_df['Depression'].map({'Yes': 1, 'No': 0})

In [16]:
X_w = working_df.drop('Depression', axis=1)
y_w = working_df['Depression']
X_w_train, X_w_test, y_w_train, y_w_test = train_test_split(X_w, y_w, test_size=0.2, random_state=42)

In [17]:
scaler = StandardScaler()
X_w_train_scaled = scaler.fit_transform(X_w_train)
X_w_test_scaled = scaler.transform(X_w_test)

1. Logistic Regression

In [18]:
lr = LogisticRegression()
lr.fit(X_w_train_scaled, y_w_train)
lr_pred_w = lr.predict(X_w_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_w_test, lr_pred_w))
print(classification_report(y_w_test, lr_pred_w))

Logistic Regression Accuracy: 0.986737400530504
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       354
           1       0.88      0.91      0.89        23

    accuracy                           0.99       377
   macro avg       0.93      0.95      0.94       377
weighted avg       0.99      0.99      0.99       377



2. Random Forest

In [19]:
rf = RandomForestClassifier()
rf.fit(X_w_train, y_w_train)
rf_pred_w = rf.predict(X_w_test)

print("Random Forest Accuracy:", accuracy_score(y_w_test, rf_pred_w))
print(classification_report(y_w_test, rf_pred_w))

Random Forest Accuracy: 0.9628647214854111
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       354
           1       1.00      0.39      0.56        23

    accuracy                           0.96       377
   macro avg       0.98      0.70      0.77       377
weighted avg       0.96      0.96      0.96       377



3. XGBoost

In [20]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_w_train, y_w_train)
xgb_pred_w = xgb.predict(X_w_test)

print("XGBoost Accuracy:", accuracy_score(y_w_test, xgb_pred_w))
print(classification_report(y_w_test, xgb_pred_w))

XGBoost Accuracy: 0.9787798408488063
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       354
           1       0.94      0.70      0.80        23

    accuracy                           0.98       377
   macro avg       0.96      0.85      0.89       377
weighted avg       0.98      0.98      0.98       377



Parameters: { "use_label_encoder" } are not used.



4. Support Vector Machine (SVM)

In [21]:
svm = SVC()
svm.fit(X_w_train_scaled, y_w_train)
svm_pred_w = svm.predict(X_w_test_scaled)

print("SVM Accuracy:", accuracy_score(y_w_test, svm_pred_w))
print(classification_report(y_w_test, svm_pred_w))

SVM Accuracy: 0.9734748010610079
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       354
           1       0.88      0.65      0.75        23

    accuracy                           0.97       377
   macro avg       0.93      0.82      0.87       377
weighted avg       0.97      0.97      0.97       377



5. K-Nearest Neighbors (KNN)

In [22]:
knn = KNeighborsClassifier()
knn.fit(X_w_train_scaled, y_w_train)
knn_pred_w = knn.predict(X_w_test_scaled)

print("KNN Accuracy:", accuracy_score(y_w_test, knn_pred_w))
print(classification_report(y_w_test, knn_pred_w))

KNN Accuracy: 0.9496021220159151
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       354
           1       0.67      0.35      0.46        23

    accuracy                           0.95       377
   macro avg       0.81      0.67      0.72       377
weighted avg       0.94      0.95      0.94       377

