In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df4=pd.read_csv('Cap/Empath_variables.csv', delimiter = ',',encoding='utf-8')
df5=df4.copy()
df4.drop(df4.columns.difference(['Username','Subreddit','Depressed']), 1, inplace=True)
df4.drop_duplicates(keep='first',inplace=True) 
df4.reset_index(drop=True)
df4 = df4.astype(str)
df6=df4.groupby(['Username','Depressed'])['Subreddit'].apply(lambda x:','.join(x)).reset_index(name ='Subreddits')

In [3]:
df6['Subreddits'] = df6['Subreddits'].str.replace('depression,', '')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df6.Subreddits)
X_train_counts.shape

(4479, 8586)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts).toarray()
X_train_tfidf.shape

(4479, 8586)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, list(map(int,df6.Depressed)), test_size=0.2)

In [7]:
#print("AUC:", roc_auc_score(y_test, y_pred))
#print("Precision:", precision_score(y_test, y_pred))
#print("Recall:", recall_score(y_test, y_pred))
#print("F1 Score:", f1_score(y_test, y_pred))

#from sklearn.metrics import roc_auc_score
#from sklearn.metrics import precision_score
#from sklearn.metrics import recall_score
#from sklearn.metrics import f1_score

In [8]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.42      0.43      0.43       389
           1       0.55      0.54      0.55       507

   micro avg       0.49      0.49      0.49       896
   macro avg       0.49      0.49      0.49       896
weighted avg       0.50      0.49      0.49       896



[[168 221]
 [233 274]]


0.49330357142857145
0.5535353535353535
0.5404339250493096
0.5469061876247504
0.4861552658665572
-0.02760060554702006


In [9]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.76      0.61      0.68       389
           1       0.74      0.85      0.79       507

   micro avg       0.75      0.75      0.75       896
   macro avg       0.75      0.73      0.74       896
weighted avg       0.75      0.75      0.74       896



[[238 151]
 [ 74 433]]


0.7488839285714286
0.7414383561643836
0.854043392504931
0.7937671860678278
0.7329342926534939
0.48468377930615264


In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.98      0.16      0.27       389
           1       0.61      1.00      0.76       507

   micro avg       0.63      0.63      0.63       896
   macro avg       0.80      0.58      0.51       896
weighted avg       0.77      0.63      0.55       896



[[ 62 327]
 [  1 506]]


0.6339285714285714
0.6074429771908764
0.9980276134122288
0.7552238805970151
0.578705323415626
0.3051547316640906


In [11]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

           0       0.96      0.89      0.92       389
           1       0.92      0.97      0.95       507

   micro avg       0.94      0.94      0.94       896
   macro avg       0.94      0.93      0.93       896
weighted avg       0.94      0.94      0.94       896



[[348  41]
 [ 16 491]]


0.9363839285714286
0.9229323308270677
0.9684418145956607
0.9451395572666025
0.9315216785060565
0.8709733895792744


In [12]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', penalty='l2',alpha=1e-3, max_iter=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC



              precision    recall  f1-score   support

           0       0.95      0.92      0.93       389
           1       0.94      0.96      0.95       507

   micro avg       0.94      0.94      0.94       896
   macro avg       0.95      0.94      0.94       896
weighted avg       0.94      0.94      0.94       896



[[357  32]
 [ 18 489]]


0.9441964285714286
0.9385796545105566
0.9644970414201184
0.9513618677042802
0.9411174153116015
0.8863974225219698


In [14]:
from xgboost import XGBClassifier
clf = XGBClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.97      0.90      0.93       389
           1       0.93      0.98      0.95       507

   micro avg       0.94      0.94      0.94       896
   macro avg       0.95      0.94      0.94       896
weighted avg       0.94      0.94      0.94       896



[[349  40]
 [ 12 495]]


0.9419642857142857
0.9252336448598131
0.9763313609467456
0.9500959692898273
0.9367517987253007
0.8826997148544918


In [15]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       389
           1       0.93      0.97      0.95       507

   micro avg       0.94      0.94      0.94       896
   macro avg       0.94      0.94      0.94       896
weighted avg       0.94      0.94      0.94       896



[[351  38]
 [ 16 491]]


0.9397321428571429
0.9281663516068053
0.9684418145956607
0.9478764478764479
0.935377719637162
0.8776353509387709


In [16]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("\n")
print(metrics.confusion_matrix(y_test, y_pred)) 
print("\n")

y_test1= [0 if x=="NORMAL" else x for x in y_test]
y_test1= [1 if x=="DEPRESSED" else x for x in y_test1]

y_pred= [0 if x=="NORMAL" else x for x in y_pred]
y_pred= [1 if x=="DEPRESSED" else x for x in y_pred]

print(metrics.accuracy_score(y_test1, y_pred))            #Accuracy
print(precision_score(y_test1, y_pred))                   #Precision
print(recall_score(y_test1, y_pred))                      #Recall
print(f1_score(y_test1, y_pred))                          #F1 Score
print(roc_auc_score(y_test1, y_pred))                     #AUC
print(matthews_corrcoef(y_test1, y_pred))                 #MCC

              precision    recall  f1-score   support

           0       0.93      0.92      0.92       389
           1       0.94      0.95      0.94       507

   micro avg       0.94      0.94      0.94       896
   macro avg       0.93      0.93      0.93       896
weighted avg       0.94      0.94      0.94       896



[[357  32]
 [ 26 481]]


0.9352678571428571
0.9376218323586745
0.9487179487179487
0.9431372549019608
0.9332278689605168
0.8680946763317605
