In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df=pd.read_csv('D:\\Datasets\\Depression Prediction\\mht.csv',index_col=0)

In [3]:
df.head(3)

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1


In [4]:
df=df.drop(labels=['post_id','post_created','user_id','followers','friends','favourites','statuses','retweets'],axis=1)

In [5]:
df.head(3)

Unnamed: 0,post_text,label
0,It's just over 2 years since I was diagnosed w...,1
1,"It's Sunday, I need a break, so I'm planning t...",1
2,Awake but tired. I need to sleep but my brain ...,1


In [6]:
# Steps to follow:
#    1. Convert text to lower case
#    2. keep only characters a-z, remove all other things (i.e. special characters etc.)
#    3. tokenise the statements
#    4. stemming of words
#    5. apply Countvectorizer to convert the text into numerical data

In [7]:
df['post_text'][0]

"It's just over 2 years since I was diagnosed with #anxiety and #depression. Today I'm taking a moment to reflect on how far I've come since."

In [8]:
ps=PorterStemmer()
stpwrds=stopwords.words('english')
print(stpwrds)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
def preprocessing(text):
    text=text.lower()
    text=re.sub(pattern='[^a-z]+',repl=' ',string=text)
    text=re.split(pattern=' ',string=text)
    text_stemmed=[ps.stem(word) for word in text if word not in stpwrds]
    text=' '.join(text_stemmed)
    return text

In [10]:
df['preprocessed']=df['post_text'].apply(preprocessing)

In [11]:
df.head(5)

Unnamed: 0,post_text,label,preprocessed
0,It's just over 2 years since I was diagnosed w...,1,year sinc diagnos anxieti depress today take m...
1,"It's Sunday, I need a break, so I'm planning t...",1,sunday need break plan spend littl time possibl
2,Awake but tired. I need to sleep but my brain ...,1,awak tire need sleep brain idea
3,RT @SewHQ: #Retro bears make perfect gifts and...,1,rt sewhq retro bear make perfect gift great be...
4,It’s hard to say whether packing lists are mak...,1,hard say whether pack list make life easier re...


In [12]:
vectorizer=CountVectorizer(ngram_range=(1, 1))

In [13]:
vectorizer.fit(df['preprocessed'])

In [14]:
bow=vectorizer.transform(df['preprocessed'])

In [15]:
bow_table=bow.toarray()

In [16]:
X=bow_table

In [17]:
X.shape

(20000, 28758)

In [18]:
y=df['label']

In [19]:
y.shape

(20000,)

In [20]:
# Model building

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
model=LogisticRegression()

In [23]:
model.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
model.score(X,y)

0.97495

# Pickle the model

In [25]:
import pickle

In [26]:
pickle.dump(model,open('depression.pkl','wb'))