In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [46]:
df = pd.read_csv('../data/combined.csv')

In [47]:
df.drop(['Unnamed: 0', 'score'], axis=1, inplace=True)

In [48]:
df

Unnamed: 0,id,title,text,label
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0
1,149j9jo,omfg guys I'm actually so happy.. my mood has ...,,0
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0
...,...,...,...,...
5938,en197s,I am 1 year self-harm free!,"As of today, I am one year self-harm free! Tha...",1
5939,du3c70,I drew myself and how I feel recently. Its har...,,1
5940,bdlqts,🖤,,1
5941,asvm6s,Who can relate,,1


In [53]:
df[df['text'].notnull()]['label'].value_counts()

0    1759
1    1755
Name: label, dtype: int64

In [54]:
df = df[df['text'].notnull()]

In [55]:
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['combined'] = df['title'] + " " + df['text']

In [56]:
df

Unnamed: 0,id,title,text,label,combined
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...
...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...


In [57]:
#preprocessing the combined text

def preprocesstext(text):
    text = text.lower() #convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text) #remove symbols
    tokens = word_tokenize(text)
    stopword = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in stopword]
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [58]:
def listofwords(text):
    return text.split()

In [59]:
df['processed_combined'] = df['combined'].apply(preprocesstext)

In [60]:
df['list_words'] = df['processed_combined'].apply(listofwords)

In [61]:
df

Unnamed: 0,id,title,text,label,combined,processed_combined,list_words
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...,mania coping skill many people experience incr...,"[mania, coping, skill, many, people, experienc..."
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...,telltale sign going hypo manic episode noticed...,"[telltale, sign, going, hypo, manic, episode, ..."
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...,cope go manic depressed overnight manic two we...,"[cope, go, manic, depressed, overnight, manic,..."
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...,quit job last week episode regretting school t...,"[quit, job, last, week, episode, regretting, s..."
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...,relationship anyone feel like lone forever try...,"[relationship, anyone, feel, like, lone, forev..."
...,...,...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...,think safe space used love browsing subreddit ...,"[think, safe, space, used, love, browsing, sub..."
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...,struggled lot weight year especially med final...,"[struggled, lot, weight, year, especially, med..."
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...,amount stuff see annoying make anyone else unc...,"[amount, stuff, see, annoying, make, anyone, e..."
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...,schizophrenia specia scam need mod see person ...,"[schizophrenia, specia, scam, need, mod, see, ..."


In [62]:
df['subreddit'] = df['label'].replace({0: 'bipolar', 1: 'schizophrenia'})


In [63]:
df_bipolar = df[df['subreddit'] == 'bipolar']
df_schizophrenia = df[df['subreddit'] == 'schizophrenia']


In [15]:
df_bipolar

Unnamed: 0,id,title,text,label,combined,processed_combined,list_words,subreddit
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...,mania coping skill many people experience incr...,"[mania, coping, skill, many, people, experienc...",bipolar
1,149j9jo,omfg guys I'm actually so happy.. my mood has ...,,0,omfg guys I'm actually so happy.. my mood has ...,omfg guy actually happy mood improved much sin...,"[omfg, guy, actually, happy, mood, improved, m...",bipolar
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...,telltale sign going hypo manic episode noticed...,"[telltale, sign, going, hypo, manic, episode, ...",bipolar
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...,cope go manic depressed overnight manic two we...,"[cope, go, manic, depressed, overnight, manic,...",bipolar
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...,quit job last week episode regretting school t...,"[quit, job, last, week, episode, regretting, s...",bipolar
...,...,...,...,...,...,...,...,...
2975,df2qtw,Hurry up and wait.,,0,Hurry up and wait.,hurry wait,"[hurry, wait]",bipolar
2976,d0wrwm,It might actually be day three of bedtime atti...,,0,It might actually be day three of bedtime atti...,might actually day three bedtime attire never ...,"[might, actually, day, three, bedtime, attire,...",bipolar
2977,qduhxm,Like I remember all the meds I've tried? I'm B...,,0,Like I remember all the meds I've tried? I'm B...,like remember med tried bipolar w adhd remembe...,"[like, remember, med, tried, bipolar, w, adhd,...",bipolar
2978,mm9b25,Drew this pre-diagnosis...it makes sense though,,0,Drew this pre-diagnosis...it makes sense though,drew pre diagnosis make sense though,"[drew, pre, diagnosis, make, sense, though]",bipolar


In [86]:
X = df['processed_combined']
y = df['subreddit']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [89]:
tfid = TfidfVectorizer(min_df=0.005, max_df=0.90)

In [90]:
X_train_tfid = tfid.fit_transform(X_train)

In [91]:
X_test_tfid = tfid.transform(X_test)

In [92]:
X_train_df = pd.DataFrame(X_train_tfid.toarray(),
                          columns=tfid.get_feature_names_out())
X_train_df

Unnamed: 0,abandoned,abilify,ability,able,absolutely,abuse,abusive,accept,accepted,accepting,...,yelling,yes,yesterday,yet,young,younger,youtu,youtube,zero,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.069303,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.071181,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Define the models
model_lr = LogisticRegression()
model_nb = MultinomialNB()
model_svc = SVC()
model_rf = RandomForestClassifier()
model_dt = DecisionTreeClassifier()
model_knn = KNeighborsClassifier(n_neighbors=25, weights = 'distance')
model_gb = GradientBoostingClassifier()
model_ab = AdaBoostClassifier()

# List of models
models = [model_lr, model_nb, model_svc, model_rf, model_dt, model_knn, model_gb, model_ab]

# VotingClassifier for ensemble learning
model_vc = VotingClassifier(estimators=[('lr', model_lr), ('nb', model_nb), ('svc', model_svc), 
                                        ('rf', model_rf), ('dt', model_dt), ('knn', model_knn), 
                                        ('gb', model_gb), ('ab', model_ab)], voting='hard')
models.append(model_vc)

# Loop over the models
for model in models:
    model.fit(X_train_tfid, y_train)
    y_pred = model.predict(X_test_tfid)
    print(f'Model: {model.__class__.__name__}, Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Model: {model.__class__.__name__}, Train score is {model.score(X_train_tfid, y_train)}, Test score is {model.score(X_test_tfid, y_test)}')
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, columns = ['Pred B', 'Pred S'], index = ['Actual B','Actual S'])
    display(cm_df)
    print()

Model: LogisticRegression, Accuracy: 0.842654028436019
Model: LogisticRegression, Train score is 0.9300528670191135, Test score is 0.842654028436019


Unnamed: 0,Pred B,Pred S
Actual B,446,84
Actual S,82,443



Model: MultinomialNB, Accuracy: 0.8350710900473933
Model: MultinomialNB, Train score is 0.8950793005286702, Test score is 0.8350710900473933


Unnamed: 0,Pred B,Pred S
Actual B,466,64
Actual S,110,415



Model: SVC, Accuracy: 0.8360189573459715
Model: SVC, Train score is 0.9947132980886539, Test score is 0.8360189573459715


Unnamed: 0,Pred B,Pred S
Actual B,440,90
Actual S,83,442



Model: RandomForestClassifier, Accuracy: 0.837914691943128
Model: RandomForestClassifier, Train score is 0.9987799918666125, Test score is 0.837914691943128


Unnamed: 0,Pred B,Pred S
Actual B,451,79
Actual S,92,433



Model: DecisionTreeClassifier, Accuracy: 0.809478672985782
Model: DecisionTreeClassifier, Train score is 0.9987799918666125, Test score is 0.809478672985782


Unnamed: 0,Pred B,Pred S
Actual B,422,108
Actual S,93,432



Model: KNeighborsClassifier, Accuracy: 0.7696682464454976
Model: KNeighborsClassifier, Train score is 0.9987799918666125, Test score is 0.7696682464454976


Unnamed: 0,Pred B,Pred S
Actual B,384,146
Actual S,97,428



Model: GradientBoostingClassifier, Accuracy: 0.8530805687203792
Model: GradientBoostingClassifier, Train score is 0.9280195201301342, Test score is 0.8530805687203792


Unnamed: 0,Pred B,Pred S
Actual B,444,86
Actual S,69,456



Model: AdaBoostClassifier, Accuracy: 0.8255924170616113
Model: AdaBoostClassifier, Train score is 0.8922326148840992, Test score is 0.8255924170616113


Unnamed: 0,Pred B,Pred S
Actual B,418,112
Actual S,72,453



Model: VotingClassifier, Accuracy: 0.8540284360189574
Model: VotingClassifier, Train score is 0.9894265961773079, Test score is 0.8540284360189574


Unnamed: 0,Pred B,Pred S
Actual B,460,70
Actual S,84,441





---
Testing
--- 

In [95]:
title = 'Is 4 months in the psych ward a long time?'
post = 'I keep hearing how people only stay for 3 days or a week. I just feel like a freak. 4 months is a long time and I just wonder why so long? I don\’t know anyone who has been there that long…people who freak out just stay for a tiny bit and they are free to go. I just don’t get it. Was I really that crazy? I just feel like I don\’t fit in society…'

In [96]:
combinepost = title + post

In [108]:
preprocesstext(combinepost)

'month psych ward long time keep hearing people stay day week feel like freak month long time wonder long know anyone long people freak stay tiny bit free go get really crazy feel like fit society'

In [115]:
series = pd.Series(preprocesstext(combinepost))

In [116]:
tfid.transform(series)

<1x1805 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [117]:
model_vc.predict(tfid.transform(series))

array(['bipolar'], dtype=object)