In [21]:
# !git clone https://github.com/Inusette/Identifying-depression.git

In [22]:
# !pip install contractions

In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import bigrams, trigrams

import re, string, random
import contractions

import itertools
import collections
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from wordcloud import WordCloud
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
stopwords_set = stopwords.words("english")
# add_custom = ["add new words"]

In [3]:
dep_data_path = "./Identifying-depression/Data_Collector/reddit_depression/"
non_dep_data_path = "./Identifying-depression/Data_Collector/reddit_non_depression/"

dep_txt_list = os.listdir(dep_data_path)
non_dep_txt_list = os.listdir(non_dep_data_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 16


In [4]:
porter = PorterStemmer()
stemming = False #reduce the words to their root form

def sentence_preprocess(sentence):

    """""""""""""""""""""""""""
    Decontract the words in given sentence, and remove stopwords, URLs, and punctuations
    Also, reduce the words to their root form, if applicable 

    """""""""""""""""""""""""""

    extend_sentence = contractions.fix(sentence)
    token_words = word_tokenize(extend_sentence)
    # rm_stop_words = [word for word in token_words if not word.lower in stopwords_set]
    
    if stemming == True:
        filtered_words = [porter.stem(word) for word in token_words if word.lower() not in stopwords_set]
 
    else:
        filtered_words = [word.lower() for word in token_words if word.lower() not in stopwords_set]

    sentence = re.sub(r'http\S+', '', " ".join(filtered_words)) #remove url
    sentence = re.sub(r'[^\w\s]', '', sentence) #remove puntuation
    sentence = sentence.replace("  "," ")

    return sentence

In [5]:
def tokenize(corpus):
    corpus_tokenized = []
    for document in corpus:
        corpus_tokenized.append(document.split(" "))
    return corpus_tokenized

def get_vocab(corpus):
    vocab = set()
    for document in tokenize(corpus):
        for word in document:
            vocab.add(word)
    return vocab

In [6]:
dep_txt_read = []
non_dep_txt_read = []
for txt_file in dep_txt_list:
    with open(dep_data_path+txt_file) as f:
        txt_sample =[line.strip() for line in f.readlines()]
    dep_txt_read.append(" ".join(txt_sample))
    
for txt_file in non_dep_txt_list:
    with open(non_dep_data_path+txt_file) as f:
        txt_sample =[line.strip() for line in f.readlines()]
    non_dep_txt_read.append(" ".join(txt_sample))

dep_processed_data = []
non_dep_processed_data = []

for sentence in dep_txt_read:
    dep_processed_data.append(sentence_preprocess(sentence))
    
for sentence in non_dep_txt_read:
    non_dep_processed_data.append(sentence_preprocess(sentence))

print(f"# of dep posts: {len(dep_processed_data)}")
print(f"# of non-dep posts: {len(non_dep_processed_data)}")

# of dep posts: 1293
# of non-dep posts: 548


In [7]:
d = {'text': dep_processed_data + non_dep_processed_data,
     'class': [1 for i in range(len(dep_processed_data))] + [0 for i in range(len(non_dep_processed_data))]}
df = pd.DataFrame(data=d)
df.head()

Unnamed: 0,text,class
0,exercise make feel better everyone says listen...,1
1,anyone talk time trying stop,1
2,carpe diem today day change sick tire letting ...,1
3,today going tough okay bit really slump couple...,1
4,even want talk anymore ever say word would reg...,1


In [8]:
from collections import Counter
PADDING_VALUE = 0
UNK_VALUE     = 1

def split_train_val_test(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2
    train_df, test_df, val_df = None, None, None
    
    train_idx = int(len(df)*props[0])
    valid_idx = int(len(df)*(props[0]+props[1]))

    train_df = df.iloc[:train_idx]
    val_df = df.iloc[train_idx:valid_idx]
    test_df = df.iloc[valid_idx:]  
    
    return train_df, val_df, test_df

def generate_vocab_map(df, cutoff=2):
    vocab          = {"": PADDING_VALUE, "UNK": UNK_VALUE}
    reversed_vocab = None

    word_count = Counter()
    for row in df['tokenized']:
      word_count += Counter(row)

    unique_id = len(vocab)
    for word, count in word_count.items():
      if count > cutoff:
        vocab[word] = unique_id
        unique_id += 1

    reversed_vocab = dict()
    for word, unique_id in vocab.items():
      reversed_vocab[unique_id] = word

    return vocab, reversed_vocab

In [9]:
df["tokenized"] = df["text"].apply(lambda x: nltk.word_tokenize(x.lower()))

df                         = df.sample(frac=1) # random shuffling
train_df, val_df, test_df  = split_train_val_test(df, props=[.8, .1, .1])
train_vocab, reverse_vocab = generate_vocab_map(train_df)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

corpus = train_df["text"]
pipe = Pipeline([('count', CountVectorizer(ngram_range=(1, 2), max_features=6000)),
                 ('tfid', TfidfTransformer())]).fit(corpus)
# print(pipe['count'].transform(corpus).toarray().shape)
# print(pipe['tfid'].idf_)
# print(pipe.transform(corpus).shape)

In [11]:
train_df["ngram"] = train_df["text"].apply(lambda x: pipe.transform([x]).toarray()[0])
val_df["ngram"] = val_df["text"].apply(lambda x: pipe.transform([x]).toarray()[0])
test_df["ngram"] = test_df["text"].apply(lambda x: pipe.transform([x]).toarray()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["ngram"] = train_df["text"].apply(lambda x: pipe.transform([x]).toarray()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["ngram"] = val_df["text"].apply(lambda x: pipe.transform([x]).toarray()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["ngram"] = test_df["text"].

In [12]:
from torch.utils.data import Dataset
class HeadlineDataset(Dataset):    
    def __init__(self, vocab, df, max_length=50):
        self.vocab = vocab
        self.df = df
        self.max_length = max_length
        
    def __len__(self):
        df_len = len(self.df)

        return df_len

    def __getitem__(self, index: int):
        tokenized_word_tensor = list()

        for word in self.df.iloc[index]["tokenized"]:
          encoded_word = train_vocab.get(word) if train_vocab.get(word) != None else train_vocab.get("UNK")
          tokenized_word_tensor.append(encoded_word)

        tokenized_word_tensor = torch.LongTensor(tokenized_word_tensor[:self.max_length])
        curr_label = self.df.iloc[index]["class"]
        ngram_feature = self.df.iloc[index]["ngram"]

        return (tokenized_word_tensor,ngram_feature), curr_label

In [13]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import RandomSampler, DataLoader

def collate_fn(batch, padding_value=PADDING_VALUE):
    
    padded_tokens = [torch.LongTensor([PADDING_VALUE]*50)]
    ngram_features = []
    y_labels = []
    for row in batch:
      word_tensor, curr_label = row
      padded_tokens.append(word_tensor[0])
      ngram_features.append(word_tensor[1])
      y_labels.append(curr_label)
    
    padded_tokens = pad_sequence(padded_tokens,batch_first=True,padding_value=PADDING_VALUE)[1:]
    ngram_features = torch.FloatTensor(np.array(ngram_features))
    y_labels = torch.FloatTensor(y_labels)

    return (padded_tokens,ngram_features), y_labels

train_dataset = HeadlineDataset(train_vocab, train_df)
val_dataset   = HeadlineDataset(train_vocab, val_df)
test_dataset  = HeadlineDataset(train_vocab, test_df)

train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)
test_sampler  = RandomSampler(test_dataset)

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [14]:
def accuracy(true, pred):
    incorrect_ratio = float(torch.sum((true*1 + pred*1==1))/len(true))
    acc = 1 - incorrect_ratio
    return acc

In [15]:
def train_loop(model, criterion, optim, iterator):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        words, ngram = x
        optim.zero_grad()
        prediction = torch.squeeze(model(words.to(device), ngram.to(device)))
        loss = criterion(prediction, y.to(device))
        loss.backward()
        optim.step()
        total_loss += loss

    return total_loss

def val_loop(model, iterator):
    true, pred = [], []
    model.eval()
    for x, y in iterator:
        words, ngram = x
        prediction = torch.squeeze(model(words.to(device), ngram.to(device))).detach().cpu()
        pred.append(prediction)
        true.append(y)
    
    true = torch.hstack(true) >= 0.5
    pred = torch.hstack(pred) >= 0.5
    
    return true, pred

In [16]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, \
                 num_layers=1, bidirectional=True):
        super().__init__()

        self.lstm_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim,num_layers=num_layers,bidirectional=bidirectional,batch_first=True)

        self.fc_lstm = nn.Linear(hidden_dim*(bidirectional+1),256)

        self.fc1_ngrm = nn.Linear(6000,512)
        self.fc2_ngrm = nn.Linear(512,128)

        self.fc_final = nn.Linear(256+128,1)

        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, words, ngram):

        word_embeddings = self.lstm_embedding(words)
        out, hidden = self.lstm(word_embeddings)
        lstm_out = self.fc_lstm(out[:,-1,:])
        ngrm_out = self.fc2_ngrm(self.relu(self.fc1_ngrm(ngram)))

        final_out = self.fc_final(torch.cat((lstm_out,ngrm_out), dim=-1))
        pred_score = self.sigmoid(final_out)

        return pred_score

In [17]:
from torch.optim import Adam

model = LSTM(vocab_size=len(train_vocab.keys()), embedding_dim=256, hidden_dim=256, num_layers=1, bidirectional=True).to(device)

criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr = 1e-4)

In [18]:
TOTAL_EPOCHS = 5
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, optimizer, train_iterator)
    true, pred = val_loop(model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 92/92 [00:03<00:00, 24.06it/s]


EPOCH: 0
TRAIN LOSS: 55.50115966796875
VAL ACC: 0.695652186870575


100%|██████████| 92/92 [00:02<00:00, 35.33it/s]


EPOCH: 1
TRAIN LOSS: 46.935935974121094
VAL ACC: 0.75


100%|██████████| 92/92 [00:02<00:00, 30.87it/s]


EPOCH: 2
TRAIN LOSS: 27.417816162109375
VAL ACC: 0.8913043513894081


100%|██████████| 92/92 [00:01<00:00, 64.40it/s]


EPOCH: 3
TRAIN LOSS: 11.725215911865234
VAL ACC: 0.9293478280305862


100%|██████████| 92/92 [00:01<00:00, 72.96it/s]


EPOCH: 4
TRAIN LOSS: 5.549252510070801
VAL ACC: 0.9239130467176437


100%|██████████| 92/92 [00:01<00:00, 72.12it/s]


EPOCH: 5
TRAIN LOSS: 2.949536085128784
VAL ACC: 0.9130434766411781


100%|██████████| 92/92 [00:01<00:00, 75.35it/s]


EPOCH: 6
TRAIN LOSS: 1.8431024551391602
VAL ACC: 0.9184782579541206


100%|██████████| 92/92 [00:01<00:00, 74.06it/s]


EPOCH: 7
TRAIN LOSS: 1.2072057723999023
VAL ACC: 0.9184782579541206


100%|██████████| 92/92 [00:01<00:00, 76.05it/s]


EPOCH: 8
TRAIN LOSS: 0.8619803190231323
VAL ACC: 0.9184782579541206


100%|██████████| 92/92 [00:01<00:00, 74.92it/s]


EPOCH: 9
TRAIN LOSS: 0.6663330793380737
VAL ACC: 0.9184782579541206


In [19]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [20]:
#See how your model does on the held out data
true, pred = val_loop(model, test_iterator)
print(f"TEST ACC: {accuracy(true, pred)}")

print("Precision", precision_score(true, pred))
print("Recall", recall_score(true, pred))
print("F1", f1_score(true, pred))

TEST ACC: 0.9459459446370602
Precision 0.9465648854961832
Recall 0.9763779527559056
F1 0.9612403100775194


In [None]:
# LSTM (w/unigram and bigram)
# TEST ACC: 0.9459459446370602
# Precision 0.9465648854961832
# Recall 0.9763779527559056
# F1 0.9612403100775194

In [None]:
# LSTM (base)
# TEST ACC: 0.9027027040719986
# Precision 0.9253731343283582
# Recall 0.9393939393939394
# F1 0.9323308270676692