# Machine Learning Approach for Depression Prediction Through Tweets

In [1]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd

# load dataset

In [2]:
data = pd.read_csv("tweets_combined2.csv")

# Preprocessing

In [3]:
data.head()

Unnamed: 0,Index,tweet,Affect_Dimension,target
0,0,Best evening adult drink w/chocolate #satisfac...,joy,0
1,1,A SPELLBINDING performance by @DebsNewbold. El...,joy,0
2,2,@LogitechUK Been there done that,fear,0
3,3,I love #tattoos but seriously #justtattooofus ...,anger,0
4,4,Well. Its been like two weeks and I still feel...,Depression,1


In [4]:
data = data.drop(['Affect_Dimension'], axis=1)

In [5]:
# change long column names to short one
data.rename(columns={'tweet':'Message',
                     'target':'Label'}, inplace= True)

In [6]:
data.head()

Unnamed: 0,Index,Message,Label
0,0,Best evening adult drink w/chocolate #satisfac...,0
1,1,A SPELLBINDING performance by @DebsNewbold. El...,0
2,2,@LogitechUK Been there done that,0
3,3,I love #tattoos but seriously #justtattooofus ...,0
4,4,Well. Its been like two weeks and I still feel...,1


In [7]:
# basically index a non-informative parameter, so we can drop index column
data.drop(columns=["Index"], axis= 1, inplace= True)

In [8]:
# yes, we didi it, lets check new data
data.head()

Unnamed: 0,Message,Label
0,Best evening adult drink w/chocolate #satisfac...,0
1,A SPELLBINDING performance by @DebsNewbold. El...,0
2,@LogitechUK Been there done that,0
3,I love #tattoos but seriously #justtattooofus ...,0
4,Well. Its been like two weeks and I still feel...,1


In [9]:
# lets take a view of data fromall possible ways
data.shape

(1755, 2)

In [10]:
#we have 10314 row and 2 columns

In [11]:
data.describe()

Unnamed: 0,Label
count,1755.0
mean,0.354986
std,0.478645
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Message  1755 non-null   object
 1   Label    1755 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.5+ KB


In [13]:
data.count()

Message    1755
Label      1755
dtype: int64

In [14]:
data['Label'].value_counts()

0    1132
1     623
Name: Label, dtype: int64

In [15]:
# processing

In [16]:
# import all required libraries here
! pip install nltk




In [17]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
# imported regular expressions
# object stopwords is not callable here to create object
ps = PorterStemmer()



In [18]:
#Create a empty corpus to hold vector value


In [19]:
# Lets start processing
corpus = []
for i in range(0, data.shape[0]):
#    if i < 10:
#        print("\n",data['Message'][i])
    msg = data["Message"][i]
    # email addess
    msg = re.sub("\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", "emailaddr", data["Message"][i])
    
    
    # url 
    
    msg = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", "url", data['Message'][i])
    
    # Mobile no
    
    msg = re.sub("/^(\+\d{1,3}[- ]?)?\d{10}$/","mobile", data['Message'][i])
    
    # Number
    
    msg = re.sub("\d+(\.\d+)?","number",data['Message'][i])
    #Phone number
    msg = re.sub("\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b","phoneno",data['Message'][i])
    
    # punctuation
    
    msg = re.sub("[^\w\d\s]"," ", data['Message'][i])
    
    # convert to lower case
    
    msg = msg.lower()
    # craete tokens using split
    msg = msg.split()
    #print only for 10 messages
    #if i < 10:
        #print("Message-After Split: ",msg)
    
    # apply stemming on it to remove stopwords
    # porter stemmer
    
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    
    #print("Message-After Stemming: ",msg)
    # preparing message with remaining tokens
    msg = " ".join(msg)
    
    # preparing a corpus
    
    corpus.append(msg)
print(corpus)

['best even adult drink w chocol satisfact nestl dark hot chocol chili powder cinnamon whiskey delici 鈽曪笍', 'spellbind perform debsnewbold electr physic sheer awe pleasur watch thank ueafli', 'logitechuk done', 'love tattoo serious justtattooofu reveng amp', 'well like two week still feel cut friendship fail constantli reassur could best us die insid constantli think fault', 'look good news today find twitter bummer 馃槩 depress badnew', 'depress think instead want mind healthi take break', 'vin diesel pretti cool seen movi riddick pitch black must work someth smile', 'wanna die', 'liz_hanburi oh jumper turn colder 馃槦 terrif day馃檵馃徏馃尭馃尲馃尰馃挄馃挄', 'love one would openli share length breadth mental health issu us knew sure turn around make us', 'sad talk highli someon end disappoint u make u look like pendeja', 's1dharthm asli_jacquelin foxstarhindi krishdk rajnidimoru look fun 馃榾 cant wait watch', 'want sleep dream bizarr still escap problem anxieti depress tiredoflif', 'good morn famili to

In [20]:
# Prepare a vector using countvectorizer

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(corpus)
sparse_input = cv.fit_transform(corpus)

# during model implementation we need data in dense format and current input is a sparse data
# convert it to array
input = sparse_input.toarray()

In [21]:
input

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
data.head()

Unnamed: 0,Message,Label
0,Best evening adult drink w/chocolate #satisfac...,0
1,A SPELLBINDING performance by @DebsNewbold. El...,0
2,@LogitechUK Been there done that,0
3,I love #tattoos but seriously #justtattooofus ...,0
4,Well. Its been like two weeks and I still feel...,1


In [23]:
# select output i.e Label 

output = data.iloc[:,-1]

In [24]:
output.head()

0    0
1    0
2    0
3    0
4    1
Name: Label, dtype: int64

In [25]:
# we have input and output ready, now split data in input and output sets
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(input, output, test_size=0.2, random_state=0)

In [26]:
xtrain.shape

(1404, 4946)

In [27]:
ytrain.shape

(1404,)

In [28]:
xtest.shape

(351, 4946)

In [29]:
ytest.shape

(351,)

In [30]:
# Create and implement a model for prediction
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()
# fit and predict values
model_gnb.fit(xtrain,ytrain)
pred = model_gnb.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.7863247863247863
Confusion Matrix:
  [[166  44]
 [ 31 110]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       0.84      0.79      0.82       210
           1       0.71      0.78      0.75       141

    accuracy                           0.79       351
   macro avg       0.78      0.79      0.78       351
weighted avg       0.79      0.79      0.79       351



In [31]:
# Create and implement a model for prediction
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier()
# fit and predict values
model_dt.fit(xtrain,ytrain)
pred = model_dt.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.792022792022792
Confusion Matrix:
  [[161  49]
 [ 24 117]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       0.87      0.77      0.82       210
           1       0.70      0.83      0.76       141

    accuracy                           0.79       351
   macro avg       0.79      0.80      0.79       351
weighted avg       0.80      0.79      0.79       351



In [32]:
# Create and implement a model for prediction
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
# fit and predict values
model_rf.fit(xtrain,ytrain)
pred = model_rf.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.8575498575498576
Confusion Matrix:
  [[183  27]
 [ 23 118]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       0.89      0.87      0.88       210
           1       0.81      0.84      0.83       141

    accuracy                           0.86       351
   macro avg       0.85      0.85      0.85       351
weighted avg       0.86      0.86      0.86       351



In [33]:
import pickle

In [34]:
pickle.dump(model_rf,open('depression_model.pkl','wb'))

In [38]:
model=pickle.load(open('depression_model.pkl','rb'))

In [39]:
model.predict(xtest)

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,