In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [7]:
# Load the labelled posts; the CSV carries a leftover index column
# ('Unnamed: 0') from an earlier export, which is dropped right away.
df = pd.read_csv('Dataset_LSTM2.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Text,Class
0,Ex Wife Threatening SuicideRecently I left my ...,depressed
1,Am I weird I don't get affected by compliments...,neutral
2,Finally 2020 is almost over... So I can never ...,neutral
3,i need helpjust help me im crying so hard,depressed
4,"I'm so lostHello, my name is Adam (16) and I'v...",suicide


In [8]:
# Keep only the first 900 posts for the rest of the notebook.
df = df.head(900)
df.head()

Unnamed: 0,Text,Class
0,Ex Wife Threatening SuicideRecently I left my ...,depressed
1,Am I weird I don't get affected by compliments...,neutral
2,Finally 2020 is almost over... So I can never ...,neutral
3,i need helpjust help me im crying so hard,depressed
4,"I'm so lostHello, my name is Adam (16) and I'v...",suicide


In [9]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    900 non-null    object
 1   Class   900 non-null    object
dtypes: object(2)
memory usage: 14.2+ KB


In [10]:
# Frequency of each label value; also surfaces stray spellings of a class.
df['Class'].value_counts()

neutral        467
suicide        257
depressed      173
depresssion      1
depresssed       1
Suicide          1
Name: Class, dtype: int64

In [5]:
# Vocabulary cap: the tokenizer keeps only the MAX_NB_WORDS most frequent words.
MAX_NB_WORDS = 50000

# Length (in tokens) that every post is padded or truncated to.
MAX_SEQUENCE_LENGTH = 250

# Size of each word-embedding vector learned by the model.
EMBEDDING_DIM = 100

# Raw string literal: the filter set contains a backslash followed by ']',
# which is an invalid escape sequence in a normal string literal. The
# characters stripped from the text are exactly the same as before.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9598 unique tokens.


In [6]:
# Turn every post into a sequence of integer word ids, then pad/truncate
# each sequence to MAX_SEQUENCE_LENGTH so they stack into one 2-D array.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['Text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (1034, 250)


In [None]:
# Peek at the first encoded post: one row of MAX_SEQUENCE_LENGTH integer word ids.
X[0]

In [7]:
# The raw 'Class' column contains case variants and misspellings of the
# three intended classes (e.g. 'Suicide', 'depresssed', 'depresssion').
# Left as-is, get_dummies would emit one column per spelling and the label
# width would no longer match the 3-unit softmax output of the model.
# NOTE(review): 'depresssion' is assumed to mean 'depressed' - confirm.
LABEL_FIXES = {'depresssion': 'depressed', 'depresssed': 'depressed'}
class_clean = df['Class'].str.strip().str.lower().replace(LABEL_FIXES)

# Columns come out in sorted label order; float32 avoids handing
# bool/uint8 targets to the loss function.
class_dummies = pd.get_dummies(class_clean, dtype='float32')
assert class_dummies.shape[1] == 3, (
    'Expected exactly 3 classes after cleaning, got: %s' % list(class_dummies.columns)
)

Y = class_dummies.values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (1034, 3)


In [8]:
# Hold out 10% of the samples for final evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

(930, 250) (930, 3)
(104, 250) (104, 3)


In [9]:
# Embedding -> spatial dropout -> single LSTM -> 3-way softmax classifier.
model = tf.keras.Sequential([
    # Maps each word id to an EMBEDDING_DIM-sized trainable vector.
    tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    # Drops whole embedding channels (rate 0.2) rather than single values.
    tf.keras.layers.SpatialDropout1D(0.2),
    # 100-unit LSTM with dropout on both inputs and recurrent connections.
    tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # One output probability per class.
    tf.keras.layers.Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 5,080,703
Trainable params: 5,080,703
Non-trainable params: 0
_________________________________________________________________


2022-06-27 14:33:45.870088: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-27 14:33:45.870217: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
# Training hyper-parameters.
epochs = 5
batch_size = 64

# Stop early once validation loss has failed to improve by at least
# min_delta for `patience` consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, min_delta=0.0001
)

# 10% of the training data is set aside as the validation set.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/5


2022-06-27 14:33:49.094303: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-06-27 14:33:50.046667: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-06-27 15:05:50.982754: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5

In [None]:
# Evaluate on the held-out test split; accr is [loss, accuracy], matching
# the loss and the single metric the model was compiled with.
accr = model.evaluate(X_test, Y_test)
# Use real newlines ('\n'); the previous '/n' was printed literally.
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
acc = history.history['accuracy']   # if you are using TF 1.x, use 'acc' instead of 'accuracy'
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Create both panels up front. Calling plt.subplots() with no grid and then
# plt.subplot(1, 2, k) would leave an extra full-figure Axes behind the two
# panels, so the grid is requested explicitly here.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(30, 10))

# Left panel: accuracy curves.
ax_acc.plot(acc)
ax_acc.plot(val_acc)
ax_acc.set_title('Training and Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(['Training Accuracy', 'Validation Accuracy'])

# Right panel: loss curves.
ax_loss.plot(loss)
ax_loss.plot(val_loss)
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend(['Training Loss', 'Validation Loss'])

plt.show()

In [None]:
# Classify one unseen post with the fitted tokenizer and trained model.
new_complaint = ['Iâ€™m trashLol I normally cringe at the self loathing posts here but honestly Iâ€™m such trash. Like literally everything about me. I just wish I could muster up the courage to just follow through. This is it and Iâ€™m okay with that that: everyday here is worst than the last. I appreciate this community for letting me know Iâ€™m not alone.']
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)

# The one-hot targets come from pd.get_dummies, which orders its columns by
# sorted label value: depressed, neutral, suicide. The display names must be
# listed in that same order, otherwise 'neutral' and 'suicide' are swapped.
# Assumes the label column holds exactly those three lowercase classes.
labels = ['Depressed', 'Neutral', 'Suicide']
print(pred, labels[np.argmax(pred[0])])