In [881]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [882]:
df=df = pd.read_csv('StudentDepressionDataset.csv')

In [883]:
df.sample(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
13799,69876,Male,34.0,Kanpur,Student,3.0,0.0,8.59,2.0,0.0,Less than 5 hours,Unhealthy,B.Com,Yes,8.0,3.0,No,1
823,4243,Female,29.0,Meerut,Student,4.0,0.0,8.96,3.0,0.0,More than 8 hours,Unhealthy,B.Tech,No,9.0,2.0,No,0
15983,80804,Male,30.0,Ludhiana,Student,3.0,0.0,6.47,4.0,0.0,Less than 5 hours,Moderate,M.Com,Yes,11.0,5.0,No,1


In [884]:
# remove id column as it is not needed
df.drop('id', axis=1, inplace=True)
df.shape



(27901, 17)

In [885]:
df.nunique()

Gender                                     2
Age                                       34
City                                      52
Profession                                14
Academic Pressure                          6
Work Pressure                              3
CGPA                                     332
Study Satisfaction                         6
Job Satisfaction                           5
Sleep Duration                             5
Dietary Habits                             4
Degree                                    28
Have you ever had suicidal thoughts ?      2
Work/Study Hours                          13
Financial Stress                           5
Family History of Mental Illness           2
Depression                                 2
dtype: int64

In [886]:
missing_values_count = df.isnull().sum()
print("Number of Missing Values in Each Column:\n", missing_values_count)

Number of Missing Values in Each Column:
 Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [887]:
# make missing values of financial stress to be the mean of the column
df['Financial Stress'].fillna(df['Financial Stress'].mean(), inplace=True)
# ensure that there are no missing values
df.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Financial Stress'].fillna(df['Financial Stress'].mean(), inplace=True)


Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [888]:
df.shape

(27901, 17)

In [889]:
# checking unique values in financial stress column
df['Financial Stress'].value_counts()

Financial Stress
5.000000    6715
4.000000    5775
3.000000    5226
1.000000    5121
2.000000    5061
3.139867       3
Name: count, dtype: int64

In [890]:
# removing rows with values of age appearing less than or equal to 4 times as they are not enough to make a good prediction
df = df[df['Financial Stress'].map(df['Financial Stress'].value_counts()) >=4]

In [891]:
df.shape

(27898, 17)

In [892]:
# while evaluating the df.nununique()
# job satisfaction is mostly 0, so we can drop it, as it will not be useful
# profession is mostly student, so we can drop it, as it will not be useful
# work pressure is mostly 0, so we can drop it, as it will not be useful
df.drop(['Job Satisfaction', 'Profession','Work Pressure'], axis=1, inplace=True)


In [893]:
df.shape

(27898, 14)

In [894]:
#  Remove Rows with 'Others' in Sleep Duration, Dietary Habits, Degree as they appear to less times
#  reminder better to do a visual for the others
df = df[~df['Sleep Duration'].str.contains('Others', na=False)]
df = df[~df['Dietary Habits'].str.contains('Others', na=False)]
df = df[~df['Degree'].str.contains('Others', na=False)]

#  Remove Rows with Study Satisfaction Value Equal to 0
df = df[df['Study Satisfaction'] != 0]

# check the shape of the data after cleaning
df.shape


In [896]:
df.nunique()

Gender                                     2
Age                                       34
City                                      52
Academic Pressure                          6
CGPA                                     332
Study Satisfaction                         5
Sleep Duration                             4
Dietary Habits                             3
Degree                                    27
Have you ever had suicidal thoughts ?      2
Work/Study Hours                          13
Financial Stress                           5
Family History of Mental Illness           2
Depression                                 2
dtype: int64

In [897]:
df['Age'].value_counts()

Age
24.0    2254
20.0    2230
28.0    2126
29.0    1946
33.0    1890
25.0    1781
21.0    1724
23.0    1638
18.0    1581
19.0    1559
34.0    1464
27.0    1457
31.0    1423
32.0    1255
22.0    1157
26.0    1152
30.0    1138
35.0      10
38.0       7
36.0       7
42.0       4
39.0       3
48.0       3
46.0       2
43.0       2
37.0       2
44.0       1
51.0       1
49.0       1
58.0       1
54.0       1
59.0       1
56.0       1
41.0       1
Name: count, dtype: int64

In [898]:
# removing rows with values of age appearing less than or equal to 10 times as they are not enough to make a good prediction
df = df[df['Age'].map(df['Age'].value_counts()) >= 11]

# check the shape of the data after cleaning
df.shape

In [900]:
df.nunique()

Gender                                     2
Age                                       17
City                                      52
Academic Pressure                          6
CGPA                                     331
Study Satisfaction                         5
Sleep Duration                             4
Dietary Habits                             3
Degree                                    27
Have you ever had suicidal thoughts ?      2
Work/Study Hours                          13
Financial Stress                           5
Family History of Mental Illness           2
Depression                                 2
dtype: int64

In [901]:
df['City'].value_counts()

City
Kalyan                1565
Srinagar              1365
Hyderabad             1335
Vasai-Virar           1286
Lucknow               1150
Thane                 1138
Ludhiana              1106
Agra                  1089
Surat                 1076
Kolkata               1055
Jaipur                1035
Patna                  999
Pune                   966
Visakhapatnam          964
Ahmedabad              944
Bhopal                 929
Chennai                883
Meerut                 820
Rajkot                 810
Bangalore              765
Delhi                  765
Ghaziabad              741
Mumbai                 694
Vadodara               690
Varanasi               682
Nagpur                 646
Indore                 642
Kanpur                 607
Nashik                 544
Faridabad              458
Saanvi                   2
Bhavna                   2
City                     2
Harsha                   2
Less Delhi               1
M.Tech                   1
3.0                    

In [902]:
# removing rows with values of City appearing less than or equal to 3 times as they are not enough to make a good prediction
df = df[df['City'].map(df['City'].value_counts()) >= 3]

In [903]:
df.shape

(27749, 14)

In [904]:
# there is too  many degrees, so we can group them into 3 categories by using our existing knowledge of degrees
df['Degree'].value_counts()


Degree
Class 12    6054
B.Ed        1859
B.Com       1501
B.Arch      1474
BCA         1429
MSc         1185
B.Tech      1151
MCA         1037
M.Tech      1018
BHM          923
BSc          882
M.Ed         815
B.Pharm      809
M.Com        731
BBA          696
MBBS         689
LLB          668
BE           609
BA           595
M.Pharm      580
MD           569
MBA          560
MA           544
PhD          515
LLM          481
MHM          191
ME           184
Name: count, dtype: int64

In [905]:
# we can see that there are many degrees, so we can group them into 3 categories for better encoding later 

# making class 12th as higher secondary
df['Degree'] = df['Degree'].replace('Class 12th', 'Higher Secondary')


In [906]:
# making bachelors
bachelors_replacements = {
    'B.Ed': 'Bachelor',
    'B.Com': 'Bachelor',
    'B.Pharm': 'Bachelor',
    'BCA': 'Bachelor',
    'BE': 'Bachelor',
    'BHM': 'Bachelor',
    'MBBS': 'Bachelor', 
    'LLB': 'Bachelor',
    'BA': 'Bachelor',
    'B.Arch': 'Bachelor',
    'BBA': 'Bachelor',
    'B.Tech': 'Bachelor',
    'BSc': 'Bachelor',
}

# Apply replacements
df['Degree'] = df['Degree'].replace(bachelors_replacements)

In [907]:
# making masters
master_replacements = {
    'M.Tech': 'Master',
    'MCA': 'Master',
    'MBA': 'Master',
    'MSc': 'Master',
    'M.Com': 'Master',
    'MA': 'Master',
    'MHM': 'Master',
    'M.Arch': 'Master',
    'M.Pharm': 'Master',
    'ME': 'Master',  
    'LLM': 'Master',
    'MD': 'Master',  
    'M.Ed': 'Master',
    'PhD': 'Master' 
}

# Apply replacements
df['Degree'] = df['Degree'].replace(master_replacements)



In [908]:
df.sample(3)

Unnamed: 0,Gender,Age,City,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
4751,Male,33.0,Nagpur,4.0,6.75,2.0,Less than 5 hours,Moderate,Master,No,0.0,1.0,Yes,0
10985,Female,22.0,Kolkata,4.0,7.14,1.0,Less than 5 hours,Healthy,Master,Yes,10.0,3.0,No,1
9702,Male,19.0,Chennai,2.0,7.83,5.0,7-8 hours,Healthy,Class 12,Yes,7.0,5.0,Yes,1


In [909]:
# doing one hot encoding for the categorical variables
df = pd.get_dummies(df, columns=['City', 'Degree', 'Sleep Duration', 'Dietary Habits'])


In [910]:
df.shape

(27749, 50)

In [911]:
df.info ()

<class 'pandas.core.frame.DataFrame'>
Index: 27749 entries, 0 to 27900
Data columns (total 50 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27749 non-null  object 
 1   Age                                    27749 non-null  float64
 2   Academic Pressure                      27749 non-null  float64
 3   CGPA                                   27749 non-null  float64
 4   Study Satisfaction                     27749 non-null  float64
 5   Have you ever had suicidal thoughts ?  27749 non-null  object 
 6   Work/Study Hours                       27749 non-null  float64
 7   Financial Stress                       27749 non-null  float64
 8   Family History of Mental Illness       27749 non-null  object 
 9   Depression                             27749 non-null  int64  
 10  City_Agra                              27749 non-null  bool   
 11  City_Ah

In [912]:
# mental illness,sucidial thoughts, and gender are objects but only got 2 answers but in string  so we can convert them to binary
df['Family History of Mental Illness'] = df['Family History of Mental Illness'].map({'Yes': 1, 'No': 0})
df['Have you ever had suicidal thoughts ?'] = df['Have you ever had suicidal thoughts ?'].map({'Yes': 1, 'No': 0})
df['Gender']=df['Gender'].map({'Male':1 , 'Female':0})



In [913]:
# split data into X and y
X=df.drop('Depression', axis=1)
y=df['Depression']


In [914]:
# split data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [915]:
# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

LR=LogisticRegression(solver='lbfgs', max_iter=1000)
LR.fit(X_train, y_train)
y_pred_LR=LR.predict(X_test)
accuracy=accuracy_score(y_test, y_pred_LR)
print('Accuracy:', accuracy)



# mse
from sklearn.metrics import mean_squared_error
mse_LR=mean_squared_error(y_test, y_pred_LR)
print('Mean Squared Error:', mse_LR)


# mae
from sklearn.metrics import mean_absolute_error
mae_LR=mean_absolute_error(y_test, y_pred_LR)
print('Mean Absolute Error:', mae_LR)





Accuracy: 0.8367567567567568
Mean Squared Error: 0.16324324324324324
Mean Absolute Error: 0.16324324324324324


In [916]:
print (classification_report(y_test,y_pred_LR))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80      2296
           1       0.85      0.88      0.86      3254

    accuracy                           0.84      5550
   macro avg       0.83      0.83      0.83      5550
weighted avg       0.84      0.84      0.84      5550



In [917]:
# linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

LDA=LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)
y_pred_LDA=LDA.predict(X_test)
accuracy=accuracy_score(y_test, y_pred_LDA)
print('Accuracy:', accuracy)




# mse
from sklearn.metrics import mean_squared_error
mse_LDA=mean_squared_error(y_test, y_pred_LDA)
print('Mean Squared Error:', mse_LDA)

# mae
from sklearn.metrics import mean_absolute_error
mae_LDA=mean_absolute_error(y_test, y_pred_LDA)
print('Mean Absolute Error:', mae_LDA)


Accuracy: 0.8369369369369369
Mean Squared Error: 0.16306306306306306
Mean Absolute Error: 0.16306306306306306


In [918]:
# k neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

KNN=KNeighborsClassifier()
KNN.fit(X_train, y_train)
y_pred_KNN=KNN.predict(X_test)
accuracy=accuracy_score(y_test, y_pred_KNN)
print('Accuracy:', accuracy)

# mse
from sklearn.metrics import mean_squared_error
mse_KNN=mean_squared_error(y_test, y_pred_KNN)
print('Mean Squared Error:', mse_KNN)

# mae
from sklearn.metrics import mean_absolute_error
mae_KNN=mean_absolute_error(y_test, y_pred_KNN)
print('Mean Absolute Error:', mae_KNN)



Accuracy: 0.7972972972972973
Mean Squared Error: 0.20270270270270271
Mean Absolute Error: 0.20270270270270271


In [919]:
# decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

DTC=DecisionTreeClassifier()
DTC.fit(X_train, y_train)
y_pred_DTC=DTC.predict(X_test)
accuracy=accuracy_score(y_test, y_pred_DTC)
print('Accuracy:', accuracy)

# mae
from sklearn.metrics import mean_absolute_error
mae=mean_absolute_error(y_test, y_pred_DTC)
print('mae:',mae)

# mse
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test, y_pred_DTC)
print('mse:',mse)



Accuracy: 0.7609009009009009
mae: 0.2390990990990991
mse: 0.2390990990990991


In [920]:
# guassian naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

GNB=GaussianNB()
GNB.fit(X_train, y_train)
y_pred_GNB=GNB.predict(X_test)

accuracy=accuracy_score(y_test, y_pred_GNB)
print('Accuracy:', accuracy)

# mse
from sklearn.metrics import mean_squared_error
mse_GNB=mean_squared_error(y_test, y_pred_GNB)
print('Mean Squared Error:', mse_GNB)

# mae
from sklearn.metrics import mean_absolute_error
mae_GNB=mean_absolute_error(y_test, y_pred_GNB)
print('Mean Absolute Error:', mae_GNB)



Accuracy: 0.7612612612612613
Mean Squared Error: 0.23873873873873874
Mean Absolute Error: 0.23873873873873874


In [921]:
# svc
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

SVM=SVC(gamma='auto')
SVM.fit(X_train, y_train)
y_pred_SVC=SVM.predict(X_test)
accuracy=accuracy_score(y_test, y_pred_SVC)
print('Accuracy:', accuracy)

# mse
from sklearn.metrics import mean_squared_error
mse_SVC=mean_squared_error(y_test, y_pred_SVC)
print('Mean Squared Error:', mse_SVC)


# mae
from sklearn.metrics import mean_absolute_error
mae_SVC=mean_absolute_error(y_test, y_pred_SVC)
print('Mean Absolute Error:', mae_SVC)



Accuracy: 0.8356756756756757
Mean Squared Error: 0.1643243243243243
Mean Absolute Error: 0.1643243243243243


In [922]:
# dumping the model
import joblib
# choosing lr as the best model since the accuracy is the highest
joblib.dump(LR, 'model.pkl')


['model.pkl']