In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Cap/PartsOfSpeech.csv', delimiter = ',',encoding='utf-8')

In [3]:
decode_map = {0: "NORMAL", 1: "DEPRESSED"}
def decode_sentiment(label):
    return decode_map[int(label)]

df.Depressed = df.Depressed.apply(lambda x: decode_sentiment(x))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Depressed','Body','Username']), df.Depressed, test_size=0.2)

In [5]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.52      0.53      0.52      7203
      NORMAL       0.49      0.48      0.48      6706

   micro avg       0.51      0.51      0.51     13909
   macro avg       0.50      0.50      0.50     13909
weighted avg       0.51      0.51      0.51     13909



[[3788 3415]
 [3468 3238]]


0.5051405564742253
0.5220507166482911
0.5258919894488407
0.5239643128847085
0.5043715837491743
0.008745806848547855


In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.61      0.46      0.52      7203
      NORMAL       0.54      0.68      0.60      6706

   micro avg       0.57      0.57      0.57     13909
   macro avg       0.57      0.57      0.56     13909
weighted avg       0.57      0.57      0.56     13909



[[3300 3903]
 [2146 4560]]


0.5651017326910633
0.6059493206022769
0.45814244064972925
0.5217803778954858
0.5690652555172296
0.14140901968624717


In [7]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.63      0.52      0.57      7203
      NORMAL       0.56      0.67      0.61      6706

   micro avg       0.59      0.59      0.59     13909
   macro avg       0.60      0.59      0.59     13909
weighted avg       0.60      0.59      0.59     13909



[[3748 3455]
 [2225 4481]]


0.5916313178517507
0.627490373346727
0.5203387477439956
0.5689131754705525
0.594273161524846
0.19033097063386367


In [8]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

   DEPRESSED       0.60      0.70      0.64      7203
      NORMAL       0.60      0.50      0.54      6706

   micro avg       0.60      0.60      0.60     13909
   macro avg       0.60      0.60      0.59     13909
weighted avg       0.60      0.60      0.60     13909



[[5019 2184]
 [3378 3328]]


0.600115033431591
0.5977134690961058
0.6967930029154519
0.6434615384615385
0.5965324990718028
0.19723107692960454


In [9]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

   DEPRESSED       0.61      0.57      0.59      7203
      NORMAL       0.57      0.61      0.59      6706

   micro avg       0.59      0.59      0.59     13909
   macro avg       0.59      0.59      0.59     13909
weighted avg       0.59      0.59      0.59     13909



[[4096 3107]
 [2626 4080]]


0.587820835430297
0.6093424576019042
0.5686519505761488
0.5882944344703769
0.5885311646707169
0.1770482256635464


In [12]:
from xgboost import XGBClassifier
clf = XGBClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.65      0.56      0.60      7203
      NORMAL       0.59      0.67      0.63      6706

   micro avg       0.61      0.61      0.61     13909
   macro avg       0.62      0.62      0.61     13909
weighted avg       0.62      0.61      0.61     13909



[[4056 3147]
 [2223 4483]]


0.6139190452225178
0.6459627329192547
0.5630987088713036
0.6016911437472184
0.6158022622793738
0.23255624735145747


In [13]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.64      0.56      0.59      7203
      NORMAL       0.58      0.66      0.62      6706

   micro avg       0.61      0.61      0.61     13909
   macro avg       0.61      0.61      0.61     13909
weighted avg       0.61      0.61      0.61     13909



[[4010 3193]
 [2289 4417]]


0.6058667050111438
0.6366089855532624
0.5567124809107317
0.5939860761368686
0.6076881820002511
0.21620134444043673


In [14]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

   DEPRESSED       0.62      0.69      0.65      7203
      NORMAL       0.62      0.54      0.58      6706

   micro avg       0.62      0.62      0.62     13909
   macro avg       0.62      0.61      0.61     13909
weighted avg       0.62      0.62      0.61     13909



[[4937 2266]
 [3082 3624]]


0.6155007549068948
0.6156628008479861
0.6854088574205193
0.6486664038891079
0.6129102145736656
0.22836732048978453
