In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Cap/EmpathVariables_PartsOfSpeech.csv', delimiter = ',',encoding='utf-8')

In [3]:
decode_map = {0: "NORMAL", 1: "DEPRESSED"}
def decode_sentiment(label):
    return decode_map[int(label)]

df.Depressed = df.Depressed.apply(lambda x: decode_sentiment(x))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Depressed','Body','Username']), df.Depressed, test_size=0.2)

In [5]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.53      0.52      0.52      7227
      NORMAL       0.49      0.49      0.49      6682

   micro avg       0.51      0.51      0.51     13909
   macro avg       0.51      0.51      0.51     13909
weighted avg       0.51      0.51      0.51     13909



[[3775 3452]
 [3406 3276]]


0.5069379538428356
0.5256928004456204
0.5223467552234675
0.5240144364242088
0.5063095643821618
0.012616130717324541


In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.82      0.33      0.47      7227
      NORMAL       0.56      0.92      0.70      6682

   micro avg       0.62      0.62      0.62     13909
   macro avg       0.69      0.63      0.58     13909
weighted avg       0.70      0.62      0.58     13909



[[2389 4838]
 [ 514 6168]]


0.6152131713279172
0.8229417843610058
0.3305659333056593
0.4716683119447187
0.6268214281912913
0.31182908917398083


In [7]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.74      0.44      0.55      7227
      NORMAL       0.58      0.84      0.68      6682

   micro avg       0.63      0.63      0.63     13909
   macro avg       0.66      0.64      0.62     13909
weighted avg       0.66      0.63      0.61     13909



[[3145 4082]
 [1092 5590]]


0.6280106405924222
0.7422704743922587
0.43517365435173655
0.5486741102581997
0.635874764919059
0.2949943434801865


In [8]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

   DEPRESSED       0.76      0.00      0.00      7227
      NORMAL       0.48      1.00      0.65      6682

   micro avg       0.48      0.48      0.48     13909
   macro avg       0.62      0.50      0.33     13909
weighted avg       0.63      0.48      0.31     13909



[[  13 7214]
 [   4 6678]]


0.48105543173484794
0.7647058823529411
0.0017988100179881002
0.0035891772501380455
0.5006000934256358
0.017162252194358685


In [9]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

   DEPRESSED       0.65      0.70      0.67      7227
      NORMAL       0.64      0.58      0.61      6682

   micro avg       0.64      0.64      0.64     13909
   macro avg       0.64      0.64      0.64     13909
weighted avg       0.64      0.64      0.64     13909



[[5056 2171]
 [2774 3908]]


0.644474800488892
0.6457215836526181
0.6995987269959872
0.6715813243009896
0.6422267804390291
0.2865145523245508


In [10]:
from xgboost import XGBClassifier
clf = XGBClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.74      0.54      0.62      7227
      NORMAL       0.61      0.79      0.69      6682

   micro avg       0.66      0.66      0.66     13909
   macro avg       0.68      0.67      0.66     13909
weighted avg       0.68      0.66      0.66     13909



[[3881 3346]
 [1381 5301]]


0.6601481055431735
0.7375522614975295
0.5370139753701397
0.6215069260949636
0.6651696635306251
0.3403175855569219


In [11]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.68      0.60      0.64      7227
      NORMAL       0.61      0.69      0.65      6682

   micro avg       0.64      0.64      0.64     13909
   macro avg       0.65      0.65      0.64     13909
weighted avg       0.65      0.64      0.64     13909



[[4337 2890]
 [2070 4612]]


0.6433963620677259
0.6769158732636179
0.600110696001107
0.6362036086254951
0.6451616036126456
0.2910034407575635


In [12]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.65      0.71      0.68      7227
      NORMAL       0.65      0.59      0.62      6682

   micro avg       0.65      0.65      0.65     13909
   macro avg       0.65      0.65      0.65     13909
weighted avg       0.65      0.65      0.65     13909



[[5112 2115]
 [2710 3972]]


0.6531023078582213
0.6535412937867553
0.7073474470734745
0.6793806897468271
0.6508901258115053
0.3039222688311758
