In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import mutual_info_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV

import warnings

warnings.filterwarnings('ignore')

In [2]:
# Location of the raw survey export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on another machine after pointing DATA_PATH at a local copy.
DATA_PATH = 'C:/Users/WOYES/Downloads/students.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Normalise column names and text values to lower-case with underscores
# in place of spaces, so later cells can refer to them uniformly.

def _normalize(text_values):
    """Lower-case string data and replace spaces with underscores."""
    return text_values.str.lower().str.replace(' ', '_')


df.columns = _normalize(df.columns)

# Columns stored with the generic object dtype (the text columns here)
cat_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for c in cat_columns:
    df[c] = _normalize(df[c])

# recheck the dataset
df.head()

Unnamed: 0,id,gender,age,city,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,family_history_of_mental_illness,depression
0,2,male,33.0,visakhapatnam,student,5.0,0.0,8.97,2.0,0.0,5-6_hours,healthy,b.pharm,yes,3.0,1.0,no,1
1,8,female,24.0,bangalore,student,2.0,0.0,5.9,5.0,0.0,5-6_hours,moderate,bsc,no,3.0,2.0,yes,0
2,26,male,31.0,srinagar,student,3.0,0.0,7.03,5.0,0.0,less_than_5_hours,healthy,ba,no,9.0,1.0,yes,0
3,30,female,28.0,varanasi,student,3.0,0.0,5.59,2.0,0.0,7-8_hours,moderate,bca,yes,4.0,5.0,yes,1
4,32,female,25.0,jaipur,student,4.0,0.0,8.13,3.0,0.0,5-6_hours,moderate,m.tech,yes,1.0,1.0,no,0


In [4]:
# The `id` column is not used as a feature, so remove it
df = df.drop(columns=['id'])

In [5]:
# Keep only the rows where `financial_stress` is present
df = df.dropna(subset=['financial_stress'])

In [6]:
# Index labels of the rows whose sleep duration is the 'others' category;
# those rows are removed because 'others' has no code in the mapping used later.
others = df.loc[df['sleep_duration'].eq('others')].index
df = df.drop(index=others)

In [7]:
# Integer codes for the sleep-duration categories.
# NOTE(review): the codes follow this listing order, not increasing duration
# ('5-6_hours' -> 0 while 'less_than_5_hours' -> 1) — confirm this is intended,
# since the column is later used as a numeric feature.
sleep_labels = ['5-6_hours', 'less_than_5_hours', '7-8_hours', 'more_than_8_hours']
cat_map = {label: code for code, label in enumerate(sleep_labels)}

# Any value that is not a key of cat_map becomes NaN after .map()
df['sleep_duration'] = df['sleep_duration'].map(cat_map)

To improve model performance, we remove the least-correlated features from the dataset. Features such as `work_pressure`, `job_satisfaction`, `gender`, `profession`, `family_history_of_mental_illness`, and `city` contribute little to predicting the response variable (`depression`), so we drop them.

In [8]:
# Columns this notebook treats as weakly related to `depression`
low_signal_columns = [
    'work_pressure',
    'job_satisfaction',
    'gender',
    'profession',
    'family_history_of_mental_illness',
    'city',
]

df = df.drop(columns=low_signal_columns)
df.head()

Unnamed: 0,age,academic_pressure,cgpa,study_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,depression
0,33.0,5.0,8.97,2.0,0,healthy,b.pharm,yes,3.0,1.0,1
1,24.0,2.0,5.9,5.0,0,moderate,bsc,no,3.0,2.0,0
2,31.0,3.0,7.03,5.0,1,healthy,ba,no,9.0,1.0,0
3,28.0,3.0,5.59,2.0,2,moderate,bca,yes,4.0,5.0,1
4,25.0,4.0,8.13,3.0,0,moderate,m.tech,yes,1.0,1.0,0


In [10]:
# Example record with the same feature keys as the training frame
# (every remaining column except the `depression` target).
student = {
    'age': 33.0,
    'academic_pressure': 5.0,
    'cgpa': 8.97,
    'study_satisfaction': 2.0,
    'sleep_duration': 0,
    'dietary_habits': 'healthy',
    'degree': 'b.pharm',
    'have_you_ever_had_suicidal_thoughts_?': 'yes',
    'work/study_hours': 3.0,
    'financial_stress': 1.0,
}

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)

# Target labels of the held-out rows, as a NumPy array
y_test = df_test['depression'].values

In [12]:
# Feature columns fed to the model, split by how they are encoded:
# `num` holds the numeric columns, `cat` the string-valued ones.
num = [
    'age',
    'academic_pressure',
    'cgpa',
    'study_satisfaction',
    'sleep_duration',
    'work/study_hours',
    'financial_stress',
]

cat = [
    'dietary_habits',
    'degree',
    'have_you_ever_had_suicidal_thoughts_?',
]

In [13]:
def train(df_train, y_train, C=0.1):
    """Fit a feature vectorizer and a logistic-regression classifier.

    Parameters
    ----------
    df_train : DataFrame
        Training rows; must contain every column listed in the notebook-level
        `cat` and `num` lists.
    y_train : array-like
        Target labels aligned with the rows of `df_train`.
    C : float, default 0.1
        Inverse regularization strength forwarded to `LogisticRegression`.

    Returns
    -------
    (dv, model)
        The fitted `DictVectorizer` and the fitted `LogisticRegression`.
    """
    dicts = df_train[cat + num].to_dict(orient='records')

    # Learn the feature layout from the training records and encode them
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    # Use the caller-supplied C; it was previously hard-coded to 0.1,
    # which silently ignored the argument.
    model = LogisticRegression(C=C, solver='lbfgs', max_iter=1000)
    model.fit(X_train, y_train)

    return dv, model

In [14]:
def predict(df, dv, model):
    """Score the rows of `df` with an already-fitted vectorizer and model.

    Returns column 1 of `model.predict_proba`, one value per row of `df`
    (with 0/1 labels this is the probability of class 1).
    """
    feature_columns = cat + num
    records = df[feature_columns].to_dict(orient='records')

    # Encode with the fitted vectorizer: transform only, no refitting
    X = dv.transform(records)

    return model.predict_proba(X)[:, 1]

In [15]:
# Fit on the full training split
y_full_train = df_full_train['depression'].values
dv, model = train(df_full_train, y_full_train, C=0.1)

# Score the held-out rows and measure ranking quality with ROC AUC
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.9189522761755399

In [16]:
import pickle

In [17]:
# File name (relative to the working directory) for the pickled artifacts
output_file = "student.pkl"

In [18]:
# Persist the vectorizer and the model together as one tuple, so a consumer
# loads both from the same file and encodes inputs the same way.
artifacts = (dv, model)

with open(output_file, mode='wb') as f_out:
    pickle.dump(artifacts, f_out)