## Machine Learning Based Depression Test at Work Place

This Project is aimed to help those employees who are working in Tech space. This ML Based Test recommends them to visit Psychiatrist based on the answers given by the user. It helps them know whether they are suffering from Mental illness or not.

DataSet Used : https://www.kaggle.com/osmi/mental-health-in-tech-survey

In [1]:
#import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
#View the DataFrame
df = pd.read_csv('survey.csv') # import the dataset
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
#Drop all the columns which are not useful
df = df.drop(columns=['Timestamp','comments','state'])

In [5]:
#Fill all the null values
df['self_employed'] = df['self_employed'].fillna('No')
print(df['self_employed'].unique())

['No' 'Yes']


In [6]:
#Check all the null values in each column
df.isnull().sum()

Age                            0
Gender                         0
Country                        0
self_employed                  0
family_history                 0
treatment                      0
work_interfere               264
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       0
care_options                   0
wellness_program               0
seek_help                      0
anonymity                      0
leave                          0
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
dtype: int64

In [7]:
print("Missing data of work_interefence : ",round((264/1259)*100,2),'percent')

Missing data of work_interefence :  20.97 percent


In [8]:
print("Missing data of self_employed : ",round((18/1259)*100,2),'percent')

Missing data of self_employed :  1.43 percent


In [9]:
df['work_interfere'].value_counts()

Sometimes    465
Never        213
Rarely       173
Often        144
Name: work_interfere, dtype: int64

In [10]:
df['self_employed'].value_counts()

No     1113
Yes     146
Name: self_employed, dtype: int64

In [11]:
df.self_employed.unique()

array(['No', 'Yes'], dtype=object)

In [12]:
df.work_interfere.unique()

array(['Often', 'Rarely', 'Never', 'Sometimes', nan], dtype=object)

In [13]:
df['work_interfere'] = df['work_interfere'].fillna('Don\'t know' )
print(df['work_interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes' "Don't know"]


In [14]:
#Gender column need to be cleaned
df['Gender'].unique()

array(['Female', 'M', 'Male', 'male', 'female', 'm', 'Male-ish', 'maile',
       'Trans-female', 'Cis Female', 'F', 'something kinda male?',
       'Cis Male', 'Woman', 'f', 'Mal', 'Male (CIS)', 'queer/she/they',
       'non-binary', 'Femake', 'woman', 'Make', 'Nah', 'All', 'Enby',
       'fluid', 'Genderqueer', 'Female ', 'Androgyne', 'Agender',
       'cis-female/femme', 'Guy (-ish) ^_^', 'male leaning androgynous',
       'Male ', 'Man', 'Trans woman', 'msle', 'Neuter', 'Female (trans)',
       'queer', 'Female (cis)', 'Mail', 'cis male', 'A little about you',
       'Malr', 'p', 'femail', 'Cis Man',
       'ostensibly male, unsure what that really means'], dtype=object)

In [15]:
## cleaning Gender Column
gender = df['Gender'].str.lower()

male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]           
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

for (row, col) in df.iterrows():

    if str.lower(col.Gender) in male_str:
        df['Gender'].replace(to_replace=col.Gender, value='male', inplace=True)

    if str.lower(col.Gender) in female_str:
        df['Gender'].replace(to_replace=col.Gender, value='female', inplace=True)

    if str.lower(col.Gender) in trans_str:
        df['Gender'].replace(to_replace=col.Gender, value='trans', inplace=True)

#Get rid of BS
stk_list = ['A little about you', 'p']
df = df[~df['Gender'].isin(stk_list)]

print(df['Gender'].unique())

['female' 'male' 'trans']


In [16]:
##categorizing Age
s = pd.Series(df['Age'])
s[s<18] = df['Age'].median()
df['Age'] = s
s = pd.Series(df['Age'])
s[s>120] = df['Age'].median()
df['Age'] = s

#Ranges of Age
df['age_range'] = pd.cut(df['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

In [17]:
df.age_range.unique()

[31-65, 21-30, 0-20, 66-100]
Categories (4, object): [0-20 < 21-30 < 31-65 < 66-100]

In [18]:
#Encoding all the columns
labelDict = {}
for feature in df:
    le = LabelEncoder()
    le.fit(df[feature])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    df[feature] = le.transform(df[feature])
    # Get labels
    labelKey = 'label_' + feature
    labelValue = [*le_name_mapping]
    labelDict[labelKey] =labelValue
    
for key, value in labelDict.items():     
    print(key, value)

#Get rid of 'Country'
df = df.drop(['Country'], axis= 1)
df.head()

label_Age [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61, 62, 65, 72]
label_Gender ['female', 'male', 'trans']
label_Country ['Australia', 'Austria', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Canada', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Hungary', 'India', 'Ireland', 'Israel', 'Italy', 'Japan', 'Latvia', 'Mexico', 'Moldova', 'Netherlands', 'New Zealand', 'Nigeria', 'Norway', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Thailand', 'United Kingdom', 'United States', 'Uruguay', 'Zimbabwe']
label_self_employed ['No', 'Yes']
label_family_history ['No', 'Yes']
label_treatment ['No', 'Yes']
label_work_interfere ["Don't know", 'Never', 'Often', 'Rarely', 'Sometimes']
label_no_emp

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,19,0,0,0,1,2,4,0,1,2,...,2,1,1,1,2,1,0,2,0,2
1,26,1,0,0,0,3,5,0,0,0,...,0,0,1,0,0,1,1,0,0,2
2,14,1,0,0,0,3,4,0,1,1,...,1,1,1,2,2,2,2,1,0,2
3,13,1,0,1,1,2,2,0,1,1,...,1,2,2,1,0,0,0,1,1,2
4,13,1,0,0,0,1,1,1,1,2,...,0,1,1,1,2,2,2,0,0,2


In [19]:
#New Encoded DataFrame
df

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,19,0,0,0,1,2,4,0,1,2,...,2,1,1,1,2,1,0,2,0,2
1,26,1,0,0,0,3,5,0,0,0,...,0,0,1,0,0,1,1,0,0,2
2,14,1,0,0,0,3,4,0,1,1,...,1,1,1,2,2,2,2,1,0,2
3,13,1,0,1,1,2,2,0,1,1,...,1,2,2,1,0,0,0,1,1,2
4,13,1,0,0,0,1,1,1,1,2,...,0,1,1,1,2,2,2,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,8,1,0,0,1,0,2,0,1,1,...,2,1,1,1,1,1,1,0,0,1
1255,14,1,0,1,1,2,2,1,1,2,...,1,1,1,1,2,1,1,2,0,2
1256,16,1,0,1,1,4,5,0,1,2,...,1,2,2,0,0,1,1,1,0,2
1257,28,0,0,0,0,0,1,1,1,1,...,0,2,1,0,0,1,1,1,0,2


In [20]:
#Using Scaler to normalize the data
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,0.431818,0,0,0,1,2,4,0,1,2,...,2,1,1,1,2,1,0,2,0,2
1,0.590909,1,0,0,0,3,5,0,0,0,...,0,0,1,0,0,1,1,0,0,2
2,0.318182,1,0,0,0,3,4,0,1,1,...,1,1,1,2,2,2,2,1,0,2
3,0.295455,1,0,1,1,2,2,0,1,1,...,1,2,2,1,0,0,0,1,1,2
4,0.295455,1,0,0,0,1,1,1,1,2,...,0,1,1,1,2,2,2,0,0,2


In [21]:
df.columns

Index(['Age', 'Gender', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'age_range'],
      dtype='object')

In [22]:
#Drop unnecessary columns that we don't require
df = df.drop(columns = ['self_employed', 'obs_consequence', 'age_range'])

In [23]:
df.columns

Index(['Age', 'Gender', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical'],
      dtype='object')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1257 entries, 0 to 1258
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1257 non-null   float64
 1   Gender                     1257 non-null   int32  
 2   family_history             1257 non-null   int32  
 3   treatment                  1257 non-null   int32  
 4   work_interfere             1257 non-null   int32  
 5   no_employees               1257 non-null   int32  
 6   remote_work                1257 non-null   int32  
 7   tech_company               1257 non-null   int32  
 8   benefits                   1257 non-null   int32  
 9   care_options               1257 non-null   int32  
 10  wellness_program           1257 non-null   int32  
 11  seek_help                  1257 non-null   int32  
 12  anonymity                  1257 non-null   int32  
 13  leave                      1257 non-null   int32

# Model Building

In [25]:
#Split the Dataset
X = df.drop(columns=['treatment'])
y = df.treatment

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [26]:
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression

In [67]:
#Evalution Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sklearn.metrics as metrics

### Logistic Regression

In [28]:
log_reg = LogisticRegression()

log_reg.fit(X_train,y_train)

LogisticRegression()

In [29]:
y_pred = log_reg.predict(X_test)

In [30]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.783068783068783

In [31]:
# Confusion Matrix
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[138,  53],
       [ 29, 158]], dtype=int64)

In [32]:
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]

In [33]:
Accuracy = (true_positive + true_negative) / (true_positive +false_positive + false_negative + true_negative)
Accuracy

0.783068783068783

In [34]:
# Precison
Precision = true_positive/(true_positive+false_positive)
Precision

0.7225130890052356

In [35]:
# Recall
Recall = true_positive/(true_positive+false_negative)
Recall

0.8263473053892215

In [36]:
# F1 Score
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
F1_Score

0.7709497206703911

In [37]:
# Area Under Curve
auc = roc_auc_score(y_test, y_pred)
auc

0.7837164375507462

## Decision Tree classifier

In [38]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [39]:
import random as rd

In [40]:
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

DecisionTreeClassifier()

In [41]:
clf.score(X_train,y_train)

1.0

In [42]:
py_pred = clf.predict(X_test)

In [43]:
#Accuracy score before Hyperparamter Tuning
clf.score(X_test,y_test)

0.7354497354497355

In [44]:
param_dist = {"max_depth": [3, None],
              "max_features": rd.randint(1, len(X_test.columns)),
              "min_samples_split": rd.randint(2, 9),
              "min_samples_leaf": rd.randint(1, 9),
              "criterion": ["gini", "entropy"]}

In [45]:
# Using RandomizedSeachCV for Hyperparameter Tuning
tune = RandomizedSearchCV(clf, param_dist,cv=5,n_jobs =-1)

In [46]:
tune

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, None],
                                        'max_features': 2,
                                        'min_samples_leaf': 8,
                                        'min_samples_split': 9})

In [47]:
tree = DecisionTreeClassifier(max_depth=3, min_samples_split=4, max_features=12, criterion='entropy', min_samples_leaf=9)
tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features=12,
                       min_samples_leaf=9, min_samples_split=4)

In [48]:
#Accuracy Score after HyperParammeter Tuning
tree.score(X_test,y_test)

0.8068783068783069

## Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
rand_clf = RandomForestClassifier(random_state=6)

In [51]:
rand_clf.fit(X_train,y_train)

RandomForestClassifier(random_state=6)

In [52]:
#Accuracy Score Befor HyperParameter Tuning
rand_clf.score(X_test,y_test)

0.8121693121693122

In [53]:
# Using RandomizedSeachCV for Hyperparameter Tuning
tuneforest = RandomizedSearchCV(rand_clf, param_dist,cv=5,n_jobs =-1)

In [54]:
tuneforest

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=6),
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, None],
                                        'max_features': 2,
                                        'min_samples_leaf': 8,
                                        'min_samples_split': 9})

In [55]:
forest = RandomForestClassifier(max_depth = None, min_samples_leaf=12, min_samples_split=4, n_estimators = 20, random_state = 1)
my_forest = forest.fit(X_train, y_train)

In [56]:
#Accuracy Score after Hyperparameter Tuning
forest.score(X_test,y_test)

0.8174603174603174

## Knn

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=10)

In [59]:
knn.score(X_test,y_test)

0.7645502645502645

In [60]:
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

In [61]:
param_dist = {"n_neighbors":k_range,
              "weights":weight_options}

In [62]:
kNN = RandomizedSearchCV(knn, param_dist)

In [63]:
kNN

RandomizedSearchCV(estimator=KNeighborsClassifier(n_neighbors=10),
                   param_distributions={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30],
                                        'weights': ['uniform', 'distance']})

In [64]:
knn = KNeighborsClassifier(n_neighbors=27, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=27)

In [65]:
y_pred_class = knn.predict(X_test)

In [68]:
#Accuracy Score
metrics.accuracy_score(y_test, y_pred_class)

0.791005291005291

## Bagging

In [69]:
from sklearn.ensemble import BaggingClassifier

In [70]:
bag = BaggingClassifier(DecisionTreeClassifier(), max_samples=1.0, max_features=1.0, bootstrap_features=False)
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier())

In [71]:
y_pred_class = bag.predict(X_test)

In [80]:
accuracy_score = metrics.accuracy_score(y_test, y_pred_class)

In [73]:
#Accuracy Score
accuracy_score

0.783068783068783

## Boosting

In [74]:
from sklearn.ensemble import AdaBoostClassifier

In [75]:
boost = AdaBoostClassifier(tree,n_estimators=500)
boost.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=3,
                                                         max_features=12,
                                                         min_samples_leaf=9,
                                                         min_samples_split=4),
                   n_estimators=500)

In [76]:
y_pred_class = boost.predict(X_test)

In [77]:
#Accuracy Score
metrics.accuracy_score(y_test, y_pred_class)

0.7301587301587301

#### Using Random Forest Classifier as it gives best score

In [None]:
""""import pickle
filename = 'final.pkl'
outfile = open(filename,'wb')
pickle.dump(forest,outfile)
outfile.close()""""

In [None]:
"""""import pickle 
pickle_in = open('final.pkl',"rb")
forest_classifier = pickle.load(pickle_in)
forest_classifier.predict([Input])""""

In [78]:
# Checking the Sklearn Version
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.2.


In [79]:
# Checking the Pickle Version
import pickle 
print(pickle.format_version)

4.0
