# Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Load Dataset

In [None]:
# For Google Colab (mount google drive)
# The import is guarded so that running the notebook top to bottom outside
# Colab does not stop here with ModuleNotFoundError; in that case `df` is left
# to the local-environment cell that follows.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    df = pd.read_csv('/content/drive/MyDrive/Portfolio/DS Projects/Depression Risk/Student Depression Dataset.csv')
else:
    print("google.colab is not available; skipping the Google Drive mount.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# For Local Environment
# Reads the CSV from a relative path and (re)binds `df`, replacing any frame
# that the Colab cell may have loaded before.
df = pd.read_csv('dataset/Student Depression Dataset.csv')

In [3]:
# Preview the first rows of the raw dataset.
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [4]:
# Column names, non-null counts and dtypes of the raw dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [5]:
# Summary statistics of the numeric columns of the raw dataset.
df.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27898.0,27901.0
mean,70442.149421,25.8223,3.141214,0.00043,7.656104,2.943837,0.000681,7.156984,3.139867,0.585499
std,40641.175216,4.905687,1.381465,0.043992,1.470707,1.361148,0.044394,3.707642,1.437347,0.492645
min,2.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,35039.0,21.0,2.0,0.0,6.29,2.0,0.0,4.0,2.0,0.0
50%,70684.0,25.0,3.0,0.0,7.77,3.0,0.0,8.0,3.0,1.0
75%,105818.0,30.0,4.0,0.0,8.92,4.0,0.0,10.0,4.0,1.0
max,140699.0,59.0,5.0,5.0,10.0,5.0,4.0,12.0,5.0,1.0


# Data Preprocessing

In [6]:
# Per-column count of missing (NaN) entries in the raw data.
missing_values = df.isna().sum()
print(missing_values)

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [7]:
# Remove every row that contains at least one missing value.
# `dropna` returns a new frame, so the raw `df` is left untouched.
df_cleaned = df.dropna()

# Report how many rows the drop removed.
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")

# Confirm that no missing values remain in the cleaned frame.
missing_values_after = df_cleaned.isna().sum()
print("Missing values after dropping rows:")
print(missing_values_after)

Original shape: (27901, 18)
Cleaned shape: (27898, 18)
Missing values after dropping rows:
id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [8]:
# Remove the columns judged irrelevant (based on domain knowledge & goal of analysis).
# NOTE: this rebinds `df_cleaned`, so running the cell a second time raises
# KeyError because the columns are already gone.
df_cleaned = df_cleaned.drop(
    columns=['id', 'Profession', 'City', 'Job Satisfaction', 'Work Pressure']
)

In [None]:
# Preview the frame after dropping rows with missing values and the unused columns.
df_cleaned.head()

Unnamed: 0,Gender,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,8.97,2.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,5.9,5.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,7.03,5.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,5.59,2.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,8.13,3.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [10]:
# Column names, non-null counts and dtypes of the cleaned frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27898 entries, 0 to 27900
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27898 non-null  object 
 1   Age                                    27898 non-null  float64
 2   Academic Pressure                      27898 non-null  float64
 3   CGPA                                   27898 non-null  float64
 4   Study Satisfaction                     27898 non-null  float64
 5   Sleep Duration                         27898 non-null  object 
 6   Dietary Habits                         27898 non-null  object 
 7   Degree                                 27898 non-null  object 
 8   Have you ever had suicidal thoughts ?  27898 non-null  object 
 9   Work/Study Hours                       27898 non-null  float64
 10  Financial Stress                       27898 non-null  float64
 11  Family 

In [11]:
# One independent LabelEncoder per text column, keyed by column name so the
# fitted encoders stay available for later inspection.
label_encoders = {
    column: LabelEncoder()
    for column in (
        'Gender',
        'Sleep Duration',
        'Dietary Habits',
        'Degree',
        'Have you ever had suicidal thoughts ?',
        'Family History of Mental Illness',
    )
}

# Replace each text column with its integer codes and print the mapping.
# A category's code is its position in `encoder.classes_`.
# NOTE(review): the codes follow the order of `classes_`, not a meaningful
# ranking (e.g. "Less than 5 hours" is 2 while "5-6 hours" is 0), so they
# should not be read as ordinal.
for feature, encoder in label_encoders.items():
    df_cleaned[feature] = encoder.fit_transform(df_cleaned[feature])

    print(f"\nMapping feature '{feature}':")
    for value, category in enumerate(encoder.classes_):
        print(f"{category} -> {value}")

print("\nDataFrame after Label Encoding:")
df_cleaned.head()


Mapping feature 'Gender':
Female -> 0
Male -> 1

Mapping feature 'Sleep Duration':
5-6 hours -> 0
7-8 hours -> 1
Less than 5 hours -> 2
More than 8 hours -> 3
Others -> 4

Mapping feature 'Dietary Habits':
Healthy -> 0
Moderate -> 1
Others -> 2
Unhealthy -> 3

Mapping feature 'Degree':
B.Arch -> 0
B.Com -> 1
B.Ed -> 2
B.Pharm -> 3
B.Tech -> 4
BA -> 5
BBA -> 6
BCA -> 7
BE -> 8
BHM -> 9
BSc -> 10
Class 12 -> 11
LLB -> 12
LLM -> 13
M.Com -> 14
M.Ed -> 15
M.Pharm -> 16
M.Tech -> 17
MA -> 18
MBA -> 19
MBBS -> 20
MCA -> 21
MD -> 22
ME -> 23
MHM -> 24
MSc -> 25
Others -> 26
PhD -> 27

Mapping feature 'Have you ever had suicidal thoughts ?':
No -> 0
Yes -> 1

Mapping feature 'Family History of Mental Illness':
No -> 0
Yes -> 1

DataFrame after Label Encoding:


Unnamed: 0,Gender,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,1,33.0,5.0,8.97,2.0,0,0,3,1,3.0,1.0,0,1
1,0,24.0,2.0,5.9,5.0,0,1,10,0,3.0,2.0,1,0
2,1,31.0,3.0,7.03,5.0,2,0,5,0,9.0,1.0,1,0
3,0,28.0,3.0,5.59,2.0,1,1,7,1,4.0,5.0,1,1
4,0,25.0,4.0,8.13,3.0,0,1,17,1,1.0,1.0,0,0


# Feature Importance

In [12]:
# Separate the target vector from the feature matrix.
y = df_cleaned['Depression']
X = df_cleaned.drop(columns=['Depression'])

In [22]:
random_state_ = 42  # Seed passed to every model and to the train/test split
n = 10  # Number of top features to display (the same features are later used for modelling)

In [13]:
# Function to extract top features and save them
def get_top_features(model, model_name, feature_names=None, top_n=None):
    """Rank a fitted model's features by importance and return the best ones.

    The importance is taken from ``model.feature_importances_`` when the model
    has that attribute, otherwise from the absolute value of the first row of
    ``model.coef_``. The ranked table is printed and displayed as a side effect.

    Parameters
    ----------
    model : fitted estimator
        Must expose either ``feature_importances_`` or ``coef_``.
    model_name : str
        Label used in the printed heading.
    feature_names : sequence of str, optional
        Feature names in the same order as the model's importances.
        Defaults to the notebook-level ``X.columns``.
    top_n : int, optional
        Number of features to keep. Defaults to the notebook-level ``n``.

    Returns
    -------
    numpy.ndarray
        Names of the ``top_n`` highest-ranked features, most important first.

    Raises
    ------
    AttributeError
        If the model exposes neither ``feature_importances_`` nor ``coef_``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if feature_names is None:
        feature_names = X.columns
    if top_n is None:
        top_n = n

    # Dispatch on what the estimator provides rather than on its display name.
    if hasattr(model, "feature_importances_"):
        importance = model.feature_importances_
    elif hasattr(model, "coef_"):
        # The sign only gives the direction of the effect; rank by magnitude.
        importance = np.abs(model.coef_[0])
    else:
        raise AttributeError(
            f"{model_name} exposes neither 'feature_importances_' nor 'coef_'"
        )

    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importance}
    ).sort_values(by='Importance', ascending=False)
    top_features = importance_df.head(top_n)
    print(f"\nTop {top_n} Features from {model_name}:")
    display(top_features)
    return top_features['Feature'].values

In [14]:
# 1. Random Forest Classifier
# Entropy-criterion forest whose `feature_importances_` supply the ranking.
# NOTE(review): the forest is fitted on all rows of X, including those that
# train_and_evaluate_model later holds out as the test split, so the feature
# selection has already seen the test data.
random_forest_classifier_entropy = RandomForestClassifier(
    criterion='entropy',
    random_state=random_state_,
).fit(X, y)
top_features_rf = get_top_features(random_forest_classifier_entropy, "Random Forest")


Top 10 Features from Random Forest:


Unnamed: 0,Feature,Importance
8,Have you ever had suicidal thoughts ?,0.186505
2,Academic Pressure,0.155957
3,CGPA,0.117439
1,Age,0.100126
10,Financial Stress,0.092791
9,Work/Study Hours,0.086226
7,Degree,0.082891
4,Study Satisfaction,0.052984
5,Sleep Duration,0.044251
6,Dietary Habits,0.039558


In [15]:
# 2. Extra Trees Classifier
# Default extra-trees ensemble whose `feature_importances_` supply the ranking.
# NOTE(review): fitted on all rows of X, including the later test split.
extra_trees_classifier = ExtraTreesClassifier(random_state=random_state_).fit(X, y)
top_features_et = get_top_features(extra_trees_classifier, "Extra Trees")


Top 10 Features from Extra Trees:


Unnamed: 0,Feature,Importance
8,Have you ever had suicidal thoughts ?,0.245702
2,Academic Pressure,0.156853
10,Financial Stress,0.09782
1,Age,0.088031
9,Work/Study Hours,0.080054
3,CGPA,0.078139
7,Degree,0.067623
4,Study Satisfaction,0.053962
5,Sleep Duration,0.048699
6,Dietary Habits,0.039998


In [16]:
# 3. Logistic Regression
# Features are ranked by the absolute value of the fitted coefficients.
# NOTE(review): no scaling step is visible before this fit, so coefficient
# magnitudes depend on each feature's units; fitted on all rows of X,
# including the later test split.
logistic_model = LogisticRegression(
    max_iter=1000,
    random_state=random_state_,
).fit(X, y)
top_features_logistic = get_top_features(logistic_model, "Logistic Regression")


Top 10 Features from Logistic Regression:


Unnamed: 0,Feature,Importance
8,Have you ever had suicidal thoughts ?,2.509588
2,Academic Pressure,0.836118
10,Financial Stress,0.548398
6,Dietary Habits,0.350208
11,Family History of Mental Illness,0.246299
4,Study Satisfaction,0.244796
9,Work/Study Hours,0.117489
1,Age,0.108388
3,CGPA,0.062474
5,Sleep Duration,0.030192


In [17]:
# 4. AdaBoost Classifier
# Default AdaBoost ensemble whose `feature_importances_` supply the ranking.
# NOTE(review): fitted on all rows of X, including the later test split.
adaboost_classifier = AdaBoostClassifier(random_state=random_state_).fit(X, y)
top_features_ab = get_top_features(adaboost_classifier, "AdaBoost")


Top 10 Features from AdaBoost:


Unnamed: 0,Feature,Importance
2,Academic Pressure,0.287806
8,Have you ever had suicidal thoughts ?,0.146945
10,Financial Stress,0.146884
1,Age,0.133104
9,Work/Study Hours,0.083626
6,Dietary Habits,0.078495
4,Study Satisfaction,0.049291
5,Sleep Duration,0.043283
11,Family History of Mental Illness,0.013227
3,CGPA,0.011693


In [18]:
# 5. Support Vector Machine (SVM)
# A linear kernel is used so the fitted model exposes `coef_`, which
# get_top_features ranks by absolute value.
# NOTE(review): no scaling step is visible before this fit; fitted on all rows
# of X, including the later test split.
svm_model = SVC(
    kernel='linear',
    random_state=random_state_,
).fit(X, y)
top_features_svm = get_top_features(svm_model, "SVM")


Top 10 Features from SVM:


Unnamed: 0,Feature,Importance
8,Have you ever had suicidal thoughts ?,1.780188
2,Academic Pressure,0.568317
10,Financial Stress,0.372992
6,Dietary Habits,0.229096
11,Family History of Mental Illness,0.175586
4,Study Satisfaction,0.158895
9,Work/Study Hours,0.079606
1,Age,0.074821
3,CGPA,0.037278
5,Sleep Duration,0.026405


# Modelling & Evaluation

In [40]:
# Define a function to create train-test splits for each model based on selected features
def train_and_evaluate_model(X, y, selected_features, model, model_name, test_size=0.2):
    """Fit ``model`` on a train split of the selected features and print test metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Full feature matrix; only ``selected_features`` columns are used.
    y : array-like
        Target labels aligned with the rows of ``X``.
    selected_features : sequence of str
        Column names of ``X`` to train on.
    model : estimator
        Unfitted estimator with ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Label used in the printed report.
    test_size : float, default 0.2
        Fraction of rows held out for testing.

    Returns
    -------
    None
        Accuracy, precision, recall, F1, ROC AUC (binary targets only) and the
        confusion matrix are printed, not returned.

    Notes
    -----
    Precision, recall and F1 use ``average='weighted'``, so they are
    support-weighted means over both classes rather than positive-class scores.
    The split is seeded with the notebook-level ``random_state_``.
    """
    # Compute the label set once; it drives both the binary check and the
    # confusion-matrix axes.
    class_labels = np.unique(y)

    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=test_size, random_state=random_state_
    )

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    # For binary classification, calculate ROC AUC score if applicable.
    # Prefer probabilities; fall back to decision_function scores so models
    # without predict_proba can still be evaluated.
    roc_auc = None
    if len(class_labels) == 2:
        if hasattr(model, "predict_proba"):
            scores = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            scores = model.decision_function(X_test)
        else:
            scores = None
        if scores is not None:
            roc_auc = roc_auc_score(y_test, scores)

    # Pin the label order so the matrix always has one row/column per class in
    # `y`, matching the index/columns built below even if a class is missing
    # from the test split.
    confusion = confusion_matrix(y_test, predictions, labels=class_labels)

    # Create a DataFrame for the confusion matrix with labels
    confusion_df = pd.DataFrame(
        confusion,
        index=[f'Actual {label}' for label in class_labels],
        columns=[f'Predicted {label}' for label in class_labels],
    )

    print(f"\n{model_name} Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc:.2f}")
    print("Confusion Matrix:\n", confusion_df)

In [41]:
# Train and evaluate each model with its selected features
# Random Forest: a fresh entropy-criterion forest restricted to its own top features.
train_and_evaluate_model(
    X,
    y,
    selected_features=top_features_rf,
    model=RandomForestClassifier(criterion='entropy', random_state=random_state_),
    model_name="Random Forest",
)


Random Forest Evaluation Metrics:
Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1 Score: 0.84
ROC AUC: 0.91
Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0         1807          541
Actual 1          374         2858


In [None]:
# Extra Trees: a fresh default ensemble restricted to its own top features.
train_and_evaluate_model(
    X,
    y,
    selected_features=top_features_et,
    model=ExtraTreesClassifier(random_state=random_state_),
    model_name="Extra Trees",
)


Extra Trees Evaluation Metrics:
Accuracy: 0.83
Precision: 0.83
Recall: 0.83
F1 Score: 0.83
ROC AUC: 0.91
Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0         1807          541
Actual 1          396         2836


In [43]:
# Logistic Regression: a fresh model restricted to its own top features.
train_and_evaluate_model(
    X,
    y,
    selected_features=top_features_logistic,
    model=LogisticRegression(max_iter=1000, random_state=random_state_),
    model_name="Logistic Regression",
)


Logistic Regression Evaluation Metrics:
Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1 Score: 0.84
ROC AUC: 0.92
Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0         1824          524
Actual 1          343         2889


In [44]:
# AdaBoost: a fresh default ensemble restricted to its own top features.
train_and_evaluate_model(
    X,
    y,
    selected_features=top_features_ab,
    model=AdaBoostClassifier(random_state=random_state_),
    model_name="AdaBoost",
)


AdaBoost Evaluation Metrics:
Accuracy: 0.85
Precision: 0.85
Recall: 0.85
F1 Score: 0.85
ROC AUC: 0.92
Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0         1825          523
Actual 1          335         2897


In [47]:
# Linear SVM with probability estimates switched on, so that `predict_proba`
# is available for the ROC AUC computed during evaluation.
svm_model = SVC(
    kernel='linear',
    probability=True,
    random_state=random_state_,
)

# Evaluate the SVM on its own top features.
train_and_evaluate_model(
    X,
    y,
    selected_features=top_features_svm,
    model=svm_model,
    model_name="SVM",
)


SVM Evaluation Metrics:
Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1 Score: 0.84
ROC AUC: 0.92
Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0         1810          538
Actual 1          331         2901


# Hyperparameter Tuning

In [None]:
# TODO: Normalization (placeholder: this cell contains no code yet)


# Save Model

In [None]:
!pip install pycaret

In [None]:
# prompt: deploy ML model that already saved using pycaret
# NOTE(review): this cell looks unrelated to the analysis above and cannot run
# as written on a fresh kernel:
#   - `joblib`, `deploy_model`, `predict_model` and `undeploy_model` are never
#     imported in the visible notebook (only `import pycaret` is);
#   - the model file name and the input keys (O2Content, FuelGasBehindCv, ...)
#     do not match any column of the student depression dataset;
#   - no visible cell saves a model.
# Presumably a leftover from another project; confirm, then remove or rewrite.

import pycaret
# Load the saved model
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a
# trusted source.
model = joblib.load('/content/model_ETRFinalBalikpapan.joblib')

# Deploy the model as an API
# NOTE(review): the return value is assumed to expose `.url` below; verify
# against the PyCaret API.
app = deploy_model(model, model_name='my_model')

# Print the URL of the deployed API
print(app.url)

# Make a prediction using the deployed API
data = {'O2Content': 73254.20730, 'FuelGasBehindCv': 4995.837888, 'ExcessAir': 0.205098, 'MainSteamHeader': 74216.97196, 'EconomizerWaterOutlet': 203.372497}
prediction = predict_model(app, data=data)

# Print the prediction
print(prediction)

# Undeploy the model
undeploy_model(app)
