In [28]:
import pandas as pd


In [29]:
# Load the cleaned Reddit depression dataset and preview its first rows.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — confirm
# before running elsewhere.
DATA_PATH = "/content/depression_dataset_reddit_cleaned.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [30]:
# Sum the counts that DataFrame.value_counts() reports for each distinct row.
# NOTE(review): presumably this equals the number of rows without missing
# values — confirm; if a plain row count is intended, len(data) is more direct.
data.value_counts().sum()

7731

Stopword removal and stemming

In [31]:
import nltk
import re
import string
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus (a no-op when it is already up to date) and
# build the two shared helpers that clean() relies on.
nltk.download("stopwords")
stopword = set(stopwords.words("english"))  # set membership checks for filtering
stemmer = nltk.SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
def clean(text, stop_words=None, stem=None):
    """Normalise a raw post into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop ``[...]`` spans, URLs and ``<...>`` tags,
    strip ASCII punctuation, drop any token containing a digit, remove
    stopwords, then stem what is left.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stop_words: Optional container of words to drop. Defaults to the
            module-level ``stopword`` set.
        stem: Optional callable mapping a token to its stem. Defaults to
            ``stemmer.stem`` from the module-level stemmer.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    if stop_words is None:
        stop_words = stopword
    if stem is None:
        stem = stemmer.stem

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument treats newlines and tabs as separators and drops
    # empty tokens, so words on adjacent lines are no longer glued together and
    # the result has no runs of spaces.
    tokens = [stem(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)

In [33]:
# Overwrite the text column with its cleaned form, one element at a time.
# NOTE: this mutates `data` in place, so re-running the cell cleans text that
# has already been cleaned.
data["clean_text"] = data["clean_text"].map(clean)

In [34]:
# Features are the cleaned posts; labels are the is_depression column.
x, y = data["clean_text"], data["is_depression"]

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation.
# - The columns are read straight from `data` because the name `x` is rebound to
#   a Keras tensor in the model-building cells, which would break a re-run here.
# - random_state makes the split (and everything derived from it) reproducible.
# - stratify keeps the label proportions the same in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["clean_text"],
    data["is_depression"],
    test_size=0.2,
    random_state=42,
    stratify=data["is_depression"],
)

Vectorization

In [36]:
# Average number of whitespace-separated tokens per training text, rounded to
# the nearest integer.
token_counts = [len(text.split()) for text in X_train]
round(sum(token_counts) / len(X_train))

34

In [37]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Vectorizer settings: vocabulary cap and fixed output sequence length.
# NOTE(review): 34 matches the rounded average token count shown in the
# previous cell's output — confirm it is still right if the split changes.
max_vocab_length = 10_000
max_length = 34

# Maps raw strings to integer token ids, with sequences of length max_length.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

In [38]:
# Fit the vectorizer on the training texts only (X_test is not passed here).
text_vectorizer.adapt(X_train)

In [39]:
# Inspect the learned vocabulary: its size plus the first and last five entries.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]
print(
    f"Vocab size: {len(words_in_vocab)}",
    f"Top 5 word: {top_5_words}",
    f"Bottom 5 word: {bottom_5_words}",
    sep="\n",
)

Vocab size: 10000
Top 5 word: ['', '[UNK]', 'feel', 'wa', 'like']
Bottom 5 word: ['inkart', 'ink', 'injustic', 'inherit', 'inhalten']


Embedding

In [40]:
from tensorflow.keras import layers 

# Trainable embedding: one 128-dimensional vector per vocabulary id.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and is
# not needed here, because the layers that consume this embedding
# (GlobalAveragePooling1D, LSTM) do not require a statically declared length.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
)

In [41]:
from tensorflow.keras import layers
# Baseline model: raw string -> token ids -> embeddings -> average over the
# sequence -> single sigmoid unit.
inputs = layers.Input(shape=(1,), dtype=tf.string)
pooling = layers.GlobalAveragePooling1D()
x = pooling(embedding(text_vectorizer(inputs)))
classifier = layers.Dense(1, activation="sigmoid")
outputs = classifier(x)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_1_dense")

In [44]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model_1.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [45]:
# Train for 5 epochs; the held-out split doubles as the validation set here.
model_1.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    validation_data=(X_test, Y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcd46758a10>

With LSTM

In [48]:
from tensorflow.keras import layers
# LSTM model: raw string -> token ids -> embeddings -> LSTM -> sigmoid unit.
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
# Use a fresh embedding layer instead of the `embedding` instance that model_1
# already trained. Sharing one layer object would start this model from
# model_1's learned weights and let its training overwrite them, so the two
# models could not be compared fairly.
embedding_lstm = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    name="embedding_lstm",
)
x = embedding_lstm(x)
x = layers.LSTM(64, activation="tanh")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_lstm")


In [49]:
# Same training configuration as the baseline: cross-entropy, Adam, accuracy.
model_2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [50]:
# Train for 5 epochs; the held-out split doubles as the validation set here.
model_2.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    validation_data=(X_test, Y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcd4659aad0>