# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Getting data from Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Run the Colab authentication step, then hand the application-default
# credentials to PyDrive so the Drive client below can fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# First file: fetched by Drive file id, saved locally as d_tweets.csv and read
# into `df_pos` (these rows are labelled 1 further down).
downloaded = drive.CreateFile({'id':"1FYUnUbaHtu_P0Ap6DhnJtYYtFyMiGA93"})
downloaded.GetContentFile('d_tweets.csv')
df_pos = pd.read_csv("d_tweets.csv")
# Second file: saved locally as non_d_tweets.csv and read into `df_neg`
# (these rows are labelled 0 further down).
downloaded = drive.CreateFile({'id':"1ZKT-f-ZuVGWSWshvvr14JoaKaQsIX919"})
downloaded.GetContentFile('non_d_tweets.csv')
df_neg = pd.read_csv("non_d_tweets.csv")

# Since the depression and non-depression data are stored in separate files, let's concatenate them

In [None]:
# Keep only the tweet text. The explicit .copy() makes `df_pos` an independent
# frame, so adding the label column below no longer raises pandas'
# SettingWithCopyWarning (assignment on a slice of the loaded frame).
df_pos = df_pos[['tweet']].copy()
# Label 1 marks the rows loaded from d_tweets.csv.
df_pos['label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos['label'] = 1


In [None]:
# Sanity check: the frame should now expose exactly 'tweet' and 'label'.
df_pos.columns

Index(['tweet', 'label'], dtype='object')

In [None]:
# Keep only the tweet text. The explicit .copy() makes `df_neg` an independent
# frame, so the column assignment below is not performed on a slice of the
# loaded frame (the pattern pandas flags with SettingWithCopyWarning).
df_neg = df_neg[['tweet']].copy()
# Label 0 marks the rows loaded from non_d_tweets.csv.
df_neg['label'] = 0
df_neg.columns

Index(['tweet', 'label'], dtype='object')

In [None]:
# Stack the label-1 rows on top of the label-0 rows; ignore_index=True
# renumbers the combined rows 0..n-1 instead of keeping each file's own index.
df = pd.concat([df_pos,df_neg],ignore_index=True)


# Final form of dataset after concatenation

In [None]:
# Display the combined frame (the rendered output is truncated by pandas).
df

Unnamed: 0,tweet,label
0,the real reason why you're sad? you're attache...,1
1,my biggest problem is overthinking everything,1
2,the worst sadness is the sadness you've taught...,1
3,i cannot make you understand. i cannot make an...,1
4,i don't think anyone really understands how ti...,1
...,...,...
8300,Cardi B wants to trademark her catchphrase “Ok...,0
8301,I’ll bet Kellyanne and George Conway have pret...,0
8302,Fans are always asking me how they can watch t...,0
8303,"Ray Romano is a hilarious comedian, a kind sou...",0


# Preprocessing

## Lowercase

In [None]:
# Lower-case every tweet in place, then display the column to check the result.
df['tweet'] = df['tweet'].str.lower()
df['tweet']

0       the real reason why you're sad? you're attache...
1           my biggest problem is overthinking everything
2       the worst sadness is the sadness you've taught...
3       i cannot make you understand. i cannot make an...
4       i don't think anyone really understands how ti...
                              ...                        
8300    cardi b wants to trademark her catchphrase “ok...
8301    i’ll bet kellyanne and george conway have pret...
8302    fans are always asking me how they can watch t...
8303    ray romano is a hilarious comedian, a kind sou...
8304    mueller's report may be finished, but mine is ...
Name: tweet, Length: 8305, dtype: object

## Remove Links

In [None]:
def remove_links(text):
    """Return `text` with URL-like tokens deleted.

    A token is removed when it starts with ``http``, ``www`` or ``https`` and
    runs up to the next whitespace character. The surrounding whitespace is
    left untouched.
    """
    url_pattern = re.compile(r"http\S+|www\S+|https\S+", re.MULTILINE)
    return url_pattern.sub('', text)

# Run remove_links over every tweet, then display the frame to inspect it.
df['tweet'] = df['tweet'].apply(remove_links)
df

Unnamed: 0,tweet,label
0,the real reason why you're sad? you're attache...,1
1,my biggest problem is overthinking everything,1
2,the worst sadness is the sadness you've taught...,1
3,i cannot make you understand. i cannot make an...,1
4,i don't think anyone really understands how ti...,1
...,...,...
8300,cardi b wants to trademark her catchphrase “ok...,0
8301,i’ll bet kellyanne and george conway have pret...,0
8302,fans are always asking me how they can watch t...,0
8303,"ray romano is a hilarious comedian, a kind sou...",0


## Removing Hashtag and Username

In [None]:
def remove_hashtags(text):
    """Strip the ``#`` and ``@`` marker characters from `text`.

    Only the symbols are removed; the hashtag word and the username that
    follow them are kept (``"#sad @bob"`` becomes ``"sad bob"``).

    The previous version returned right after stripping ``#``, so its second
    ``return`` (the ``@`` removal) could never run and usernames kept their
    ``@`` prefix.

    To drop the whole hashtag / mention instead of just the symbol, use the
    pattern ``r"[#@]\\w+"``.

    Args:
        text: The tweet text.

    Returns:
        The text with every ``#`` and ``@`` character removed.
    """
    return re.sub(r"[#@]", '', text)

# Run remove_hashtags over every tweet, then display the frame to inspect it.
df['tweet'] = df['tweet'].apply(remove_hashtags)
df

Unnamed: 0,tweet,label
0,the real reason why you're sad? you're attache...,1
1,my biggest problem is overthinking everything,1
2,the worst sadness is the sadness you've taught...,1
3,i cannot make you understand. i cannot make an...,1
4,i don't think anyone really understands how ti...,1
...,...,...
8300,cardi b wants to trademark her catchphrase “ok...,0
8301,i’ll bet kellyanne and george conway have pret...,0
8302,fans are always asking me how they can watch t...,0
8303,"ray romano is a hilarious comedian, a kind sou...",0


## Tokenization + Removing Stopwords

In [None]:
# Fetch the 'punkt' tokenizer data; the tokenization cell below calls word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Replace each tweet string with its list of NLTK word tokens.
df['tweet'] = df['tweet'].apply(word_tokenize)
df

Unnamed: 0,tweet,label
0,"[the, real, reason, why, you, 're, sad, ?, you...",1
1,"[my, biggest, problem, is, overthinking, every...",1
2,"[the, worst, sadness, is, the, sadness, you, '...",1
3,"[i, can, not, make, you, understand, ., i, can...",1
4,"[i, do, n't, think, anyone, really, understand...",1
...,...,...
8300,"[cardi, b, wants, to, trademark, her, catchphr...",0
8301,"[i, ’, ll, bet, kellyanne, and, george, conway...",0
8302,"[fans, are, always, asking, me, how, they, can...",0
8303,"[ray, romano, is, a, hilarious, comedian, ,, a...",0


In [None]:
# Fetch the stop-word corpus and keep the English list in `stopwrd`, which
# remove_stopwords reads as a global; the bare name displays it for inspection.
nltk.download("stopwords")
stopwrd = stopwords.words('english')
stopwrd

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a token list and join what remains with spaces.

    Args:
        text: Iterable of tokens (here, the list produced by ``word_tokenize``).
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``stopwrd`` list is used, which is the original
            behaviour.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when none survive.
    """
    # Build a set once per call: testing membership against the raw list was a
    # linear scan of the whole stop-word list for every token.
    excluded = set(stopwrd if stop_words is None else stop_words)
    return ' '.join(word for word in text if word not in excluded)

# Filter each token list through remove_stopwords; the function joins the
# surviving tokens, so the column holds plain strings again afterwards.
df['tweet'] = df['tweet'].apply(remove_stopwords)
df

Unnamed: 0,tweet,label
0,real reason 're sad ? 're attached people dist...,1
1,biggest problem overthinking everything,1
2,worst sadness sadness 've taught hide .,1
3,make understand . make anyone understand happe...,1
4,n't think anyone really understands tiring act...,1
...,...,...
8300,cardi b wants trademark catchphrase “ okurr ” ...,0
8301,’ bet kellyanne george conway pretty disturbin...,0
8302,"fans always asking watch `` old stuff , '' fin...",0
8303,"ray romano hilarious comedian , kind soul , ra...",0


## Removing Punctuation and Numbers

In [None]:
def remove_punctuation(text):
    """Delete every character that is not an ASCII letter or a space.

    Digits, punctuation and non-ASCII symbols are removed outright (nothing is
    inserted in their place), so spaces around them are preserved as-is.
    """
    disallowed = re.compile("[^a-zA-Z ]")
    return disallowed.sub('', text)

# Run remove_punctuation over every tweet, then display the frame to inspect it.
df['tweet'] = df['tweet'].apply(remove_punctuation)
df

Unnamed: 0,tweet,label
0,real reason re sad re attached people distant...,1
1,biggest problem overthinking everything,1
2,worst sadness sadness ve taught hide,1
3,make understand make anyone understand happen...,1
4,nt think anyone really understands tiring act ...,1
...,...,...
8300,cardi b wants trademark catchphrase okurr th...,0
8301,bet kellyanne george conway pretty disturbing...,0
8302,fans always asking watch old stuff finally ...,0
8303,ray romano hilarious comedian kind soul rare...,0


In [None]:
# Fetch the WordNet corpus; the WordNetLemmatizer used in the next cell presumably needs it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Lemmatize every whitespace-separated word of `sentence`.

    Each word is passed through the WordNet lemmatizer three times in a fixed
    order (as a verb, then as an adjective, then as a noun), each pass feeding
    the next. The results are joined back with single spaces.
    """
    lemmas = []
    for token in sentence.split():
        for pos_tag in ('v', 'a', 'n'):
            token = lemmatizer.lemmatize(token, pos=pos_tag)
        lemmas.append(token)
    return ' '.join(lemmas)

df['tweet'] = df['tweet'].apply(lemmatize_sentence)

In [None]:
# Display the frame after lemmatization.
df

Unnamed: 0,tweet,label
0,real reason re sad re attach people distant re...,1
1,big problem overthinking everything,1
2,bad sadness sadness ve teach hide,1
3,make understand make anyone understand happen ...,1
4,nt think anyone really understand tire act oka...,1
...,...,...
8300,cardi b want trademark catchphrase okurr think...,0
8301,bet kellyanne george conway pretty disturb mak...,0
8302,fan always ask watch old stuff finally answer ...,0
8303,ray romano hilarious comedian kind soul rare n...,0


In [None]:
# Features are the cleaned tweet strings, targets are the 0/1 labels.
X = df['tweet']
y = df['label']
# Hold out 30% of the rows for testing; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorization using TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training tweets only, then reuse that fitted
# vectorizer to transform the test tweets.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)


# Logistic Regression (LR)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the TF-IDF features (fit returns the estimator
# itself) and predict the held-out tweets.
model_LR = LogisticRegression().fit(tfidf_train, y_train)
y_pred_LR = model_LR.predict(tfidf_test)

# Score the predictions; precision, recall and F1 are support-weighted
# averages over the classes.
LR_acc = accuracy_score(y_test, y_pred_LR)
LR_precision = precision_score(y_test, y_pred_LR, average='weighted')
LR_recall = recall_score(y_test, y_pred_LR, average='weighted')
LR_F1 = f1_score(y_test, y_pred_LR, average='weighted')

# Order matches the final table: accuracy, F1, precision, recall.
LR_summary = [LR_acc, LR_F1, LR_precision, LR_recall]
LR_summary

[0.836677367576244, 0.8350349087897012, 0.8377909757789501, 0.836677367576244]

# SVM

In [None]:
from sklearn.svm import SVC

# Train a support-vector classifier with default settings on the TF-IDF
# features (fit returns the estimator itself) and predict the held-out tweets.
model_SVM = SVC().fit(tfidf_train, y_train)
y_pred_SVM = model_SVM.predict(tfidf_test)

# Score the predictions; precision, recall and F1 are support-weighted
# averages over the classes.
SVM_acc = accuracy_score(y_test, y_pred_SVM)
SVM_precision = precision_score(y_test, y_pred_SVM, average='weighted')
SVM_recall = recall_score(y_test, y_pred_SVM, average='weighted')
SVM_F1 = f1_score(y_test, y_pred_SVM, average='weighted')

# Order matches the final table: accuracy, F1, precision, recall.
SVM_summary = [SVM_acc, SVM_F1, SVM_precision, SVM_recall]
SVM_summary

[0.8595505617977528,
 0.8587367906274584,
 0.8597532087173745,
 0.8595505617977528]

# LSTM (note: the model below is a feed-forward network, not a recurrent LSTM)

In [None]:
import scipy
import torch.optim as optim

In [None]:
# Inspect the TF-IDF training matrix (shape and sparsity) before converting it to a tensor.
tfidf_train

<5813x6819 sparse matrix of type '<class 'numpy.float64'>'
	with 46588 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the labels to integer (long) tensors, the target dtype NLLLoss expects.
# Each label vector is converted exactly once: the previous version wrapped the
# resulting tensor in torch.tensor() a second time, which only triggered a
# PyTorch copy-construct warning. Going through np.asarray accepts both the
# pandas Series from train_test_split and an already-converted tensor, so
# re-running this cell no longer fails.
y_train = torch.tensor(np.asarray(y_train), dtype=torch.long)
y_test = torch.tensor(np.asarray(y_test), dtype=torch.long)

# The TF-IDF matrices are sparse; densify them and cast to float32 for nn.Linear.
x_train = torch.tensor(tfidf_train.toarray()).float()
x_test = torch.tensor(tfidf_test.toarray()).float()

  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)


In [None]:
from torch import nn

# Seed the weight initialisation so repeated runs build the same model
# (same seed value as the random_state used for the train/test split).
torch.manual_seed(42)

# Feed-forward classifier over the dense TF-IDF features: one hidden layer of
# 64 ReLU units, then one log-probability per distinct label.
model = nn.Sequential(
    nn.Linear(x_train.shape[1], 64),
    nn.ReLU(),
    nn.Linear(64, df['label'].nunique()),
    nn.LogSoftmax(dim=1),
)

# Define the loss: NLLLoss consumes the log-probabilities emitted by LogSoftmax.
criterion = nn.NLLLoss()

# Optimizers need parameters to optimize and a learning rate.
# The forward/backward pass that used to run before this line was dropped: the
# training loop zeroes the gradients before its first step, so that pass had no
# effect on training.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Full-batch training: every epoch is a single optimizer step on all of x_train.
epochs = 20
# Make sure the model is in training mode even if the evaluation cell
# (which calls model.eval()) was run before this one.
model.train()
for e in range(epochs):
    optimizer.zero_grad()
    # Call the module rather than .forward() so registered hooks are honoured.
    output = model(x_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

In [None]:
# Switch to inference mode and skip gradient tracking while scoring the test set.
model.eval()
with torch.no_grad():
    log_ps = model(x_test)
    test_loss = criterion(log_ps, y_test)
    # Back from log-probabilities to probabilities, then take the top class per row.
    ps = torch.exp(log_ps)
    top_p, top_class = ps.topk(1, dim=1)
    # Squeeze only the top-k axis: a bare .squeeze() would also drop the batch
    # axis for a single-row test set and yield a 0-d array.
    predicted_labels = top_class.squeeze(dim=1).cpu().numpy()
    true_labels = y_test.cpu().numpy()

In [None]:
# Score the network's predictions against `true_labels`, the NumPy copy of the
# test labels prepared in the evaluation cell. The previous version passed the
# torch tensor `y_test` to scikit-learn and left `true_labels` unused.
# Precision, recall and F1 are support-weighted averages over the classes.
LSTM_acc = accuracy_score(true_labels, predicted_labels)
LSTM_F1 = f1_score(true_labels, predicted_labels, average='weighted')
LSTM_precision = precision_score(true_labels, predicted_labels, average='weighted')
LSTM_recall = recall_score(true_labels, predicted_labels, average='weighted')
# Order matches the final table: accuracy, F1, precision, recall.
LSTM_summary = [LSTM_acc,LSTM_F1,LSTM_precision,LSTM_recall]
LSTM_summary

[0.8382825040128411,
 0.8390819410158599,
 0.8519050986147407,
 0.8382825040128411]

# Final result

In [None]:
# One row per metric, one column per model; each *_summary list is ordered
# accuracy, F1, precision, recall to line up with `metrics`.
metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
data = {
    'Metrics': metrics,
    'Logistic Regression': LR_summary,
    'SVM': SVM_summary,
    'LSTM': LSTM_summary,
}
summary = pd.DataFrame.from_dict(data)
summary

Unnamed: 0,Metrics,Logistic Regression,SVM,LSTM
0,Accuracy,0.836677,0.859551,0.838283
1,F1-Score,0.835035,0.858737,0.839082
2,Precision,0.837791,0.859753,0.851905
3,Recall,0.836677,0.859551,0.838283
