In [1]:
#Import all the libraries needed
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
#Preview dataset

In [2]:
# Mount Google Drive (this cell only works inside Google Colab) and read the
# train/test CSV files into the DataFrames used by the rest of the notebook.
import pathlib
from google.colab import drive
drive.mount('/content/drive')
# Locations of the two CSV files inside the mounted Drive.
data_train = pathlib.Path('/content/drive/My Drive/phm_train.csv')
data_test = pathlib.Path('/content/drive/My Drive/phm_test.csv')
dtrain = pd.read_csv(data_train)
dtest = pd.read_csv(data_test)
# NOTE(review): the preview below shows tweet_id as floats such as 6.43e+17;
# values that large are not exactly representable as float64, so do not rely
# on tweet_id as an exact key without confirming how the CSV stores it.
print(dtrain)
print(dtest)

Mounted at /content/drive
          tweet_id  label                                              tweet
0     6.430000e+17      0  user_mention all i can tell you is i have had ...
1     6.440000e+17      0  my doctor told me stop he gave me sum pop i mi...
2     8.150000e+17      1  i take tylenol and i wake up in the middle of ...
3     6.820000e+17      0  i got xans in an advil bottle i dont take them...
4     6.440000e+17      1  mom says i need to stop eating so much bc ive ...
...            ...    ...                                                ...
9986  6.480000e+17      1                          that vicodin messed me up
9987  5.710000e+17      0                  user_mention get some tylenol lol
9988  6.470000e+17      0                          like a walking tamiflu ad
9989  6.990000e+17      0                         klay and steph on steroids
9990  8.230000e+17      0                    horrible pops another xanax url

[9991 rows x 3 columns]
          tweet_id  label

In [None]:
#Declaring the english stop words

In [3]:
# Download the NLTK stop-word corpus and keep the English list as a set, which
# the preprocessing functions use for membership tests.
import nltk
nltk.download('stopwords')
# `stopwords` is already imported in the first cell; this re-import is redundant but harmless.
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Preprocessing and Encoding labels

In [4]:
def load_dataset(frame=None, stop_words=None):
  """Return the pre-processed training tweets and their labels.

  Each tweet is stripped of HTML-like tags, reduced to ASCII letters,
  lower-cased, split on whitespace and filtered against the stop-word list.
  Lower-casing happens *before* the stop-word filter so that capitalised
  stop words ("The", "I", ...) are removed as well.

  Args:
    frame: DataFrame with a 'tweet' and a 'label' column. Defaults to the
      notebook-level ``dtrain``.
    stop_words: collection of lower-case words to drop. Defaults to the
      notebook-level ``english_stops``.

  Returns:
    (x_train, y_train): a Series holding one list of tokens per tweet, and
    the untouched 'label' column.
  """
  if frame is None:
    frame = dtrain
  if stop_words is None:
    stop_words = english_stops

  y_train = frame['label']    # label/Output

  # PRE-PROCESS tweet
  x_train = (
      frame['tweet']                                  # tweets/Input
      .fillna('')                                     # a missing tweet becomes an empty token list
      .replace({'<.*?>': ''}, regex=True)             # remove html tag
      # Everything that is not a letter becomes a space; this also splits
      # placeholders such as 'user_mention' into two tokens.
      .replace({'[^A-Za-z]': ' '}, regex=True)
      .str.lower()                                    # lower case (before stop-word removal)
      .apply(lambda tweet: [w for w in tweet.split() if w not in stop_words])  # remove stop words
  )

  return x_train, y_train

# Build the tokenised training tweets and their labels, then preview both.
x_train, y_train = load_dataset()

# pandas abbreviates the printed Series, so only the first and last rows appear.
print('tweet')
print(x_train, '\n')
print('label')
print(y_train)


tweet
0       [user, mention, tell, relapses, cure, hear, do...
1       [doctor, told, stop, gave, sum, pop, mix, w, a...
2       [take, tylenol, wake, middle, night, put, ice,...
3       [got, xans, advil, bottle, dont, take, shits, ...
4       [mom, says, need, stop, eating, much, bc, ive,...
                              ...                        
9986                                    [vicodin, messed]
9987                   [user, mention, get, tylenol, lol]
9988                         [like, walking, tamiflu, ad]
9989                              [klay, steph, steroids]
9990                [horrible, pops, another, xanax, url]
Name: tweet, Length: 9991, dtype: object 

label
0       0
1       0
2       1
3       0
4       1
       ..
9986    1
9987    0
9988    0
9989    0
9990    0
Name: label, Length: 9991, dtype: int64


In [5]:
def load_dataset2(frame=None, stop_words=None):
  """Return the pre-processed test tweets and their labels.

  Applies the same cleaning steps as the training loader: strip HTML-like
  tags, keep ASCII letters only, lower-case, split on whitespace and drop
  stop words. Lower-casing happens *before* the stop-word filter so that
  capitalised stop words are removed as well.

  Args:
    frame: DataFrame with a 'tweet' and a 'label' column. Defaults to the
      notebook-level ``dtest``.
    stop_words: collection of lower-case words to drop. Defaults to the
      notebook-level ``english_stops``.

  Returns:
    (x_test, y_test): a Series holding one list of tokens per tweet, and
    the untouched 'label' column.
  """
  if frame is None:
    frame = dtest
  if stop_words is None:
    stop_words = english_stops

  y_test = frame['label']   # label/Output

  # PRE-PROCESS tweet
  x_test = (
      frame['tweet']                                  # tweets/Input
      .fillna('')                                     # a missing tweet becomes an empty token list
      .replace({'<.*?>': ''}, regex=True)             # remove html tag
      .replace({'[^A-Za-z]': ' '}, regex=True)        # remove non alphabet
      .str.lower()                                    # lower case (before stop-word removal)
      .apply(lambda tweet: [w for w in tweet.split() if w not in stop_words])  # remove stop words
  )

  return x_test, y_test

# Build the tokenised test tweets and their labels, then preview both.
x_test, y_test = load_dataset2()

# pandas abbreviates the printed Series, so only the first and last rows appear.
print('tweet')
print(x_test, '\n')
print('label')
print(y_test)

tweet
0       [try, run, away, iv, needle, doctor, drug, w, ...
1       [knew, took, ambien, sleep, early, im, ready, ...
2       [mean, get, celexa, reason, behind, lot, weigh...
3       [call, dumb, dumb, one, time, dont, care, many...
4       [want, go, grocery, store, cant, pay, anyone, ...
                              ...                        
3326                           [fina, take, xanax, knock]
3327               [user, mention, yr, citalopram, right]
3328              [user, mention, yeah, im, going, norco]
3329              [user, mention, tylenol, w, codin, lol]
3330                [thats, determination, steroids, url]
Name: tweet, Length: 3331, dtype: object 

label
0       0
1       1
2       1
3       0
4       0
       ..
3326    0
3327    0
3328    0
3329    0
3330    0
Name: label, Length: 3331, dtype: int64


In [None]:
#Function that derives the padding length (the "maximum" tweet length used below) as the mean token count of the training tweets, rounded up (using numpy.mean and numpy.ceil)

In [6]:
def get_max_length(tweets=None):
    """Return the sequence length used for padding/truncating.

    Despite the name, this is the *mean* number of tokens per tweet, rounded
    up to the next integer, not the longest tweet.

    Args:
        tweets: iterable of token sequences. Defaults to the notebook-level
            ``x_train`` so existing ``get_max_length()`` calls keep working.

    Returns:
        int: ceil(mean(len(tweet))) over all tweets.

    Raises:
        ValueError: if there are no tweets (the mean would be NaN).
    """
    if tweets is None:
        tweets = x_train

    tweet_length = [len(tweet) for tweet in tweets]
    if not tweet_length:
        # Without this guard np.mean([]) yields NaN and int() fails with an
        # opaque conversion error.
        raise ValueError('cannot compute a tweet length from an empty dataset')

    return int(np.ceil(np.mean(tweet_length)))

In [None]:
#Tokenize and Pad/Truncate tweets
#'post': pad or truncate at the end of a sequence
#'pre': pad or truncate at the beginning of a sequence

In [7]:
# ENCODE tweet
# Fit the vocabulary on the training tweets only, then map both splits to integer ids.
# NOTE(review): no oov_token is configured; presumably test words that were never seen
# during fit are silently dropped by texts_to_sequences - confirm against the Keras docs.
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_dataset()
token.fit_on_texts(x_train)
# x_train / x_test are overwritten from here on, so this cell is not idempotent:
# re-running it requires re-running the loading cells first.
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

# get_max_length() reads the global x_train, which at this point holds the encoded
# sequences; it must be called before padding makes every row the same length.
max_length = get_max_length()

# Pad with zeros / cut off at the end of each sequence so every row has max_length ids.
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum tweet length: ', max_length)

Encoded X Train
 [[    2     1   200 ...   944  3624  1952]
 [  115   122   147 ...   193    40   322]
 [    6     3   330 ...   626  1710    29]
 ...
 [    7   529  1739 ...     0     0     0]
 [12658 12659     8 ...     0     0     0]
 [  645  1436   174 ...     0     0     0]] 

Encoded X Test
 [[  98  606  109 ...  193    4  318]
 [ 585   11   56 ...   16  707   55]
 [ 327   12 1209 ...  778    5   88]
 ...
 [   2    1  126 ...    0    0    0]
 [   2    1    3 ...    0    0    0]
 [  59    8    9 ...    0    0    0]] 

Maximum tweet length:  10


In [None]:
#Build the LSTM model

In [8]:
# ARCHITECTURE
# Embedding -> single LSTM layer -> one sigmoid unit for binary classification.
EMBED_DIM = 64    # size of each word embedding vector
LSTM_OUT = 128    # number of units in the LSTM layer
model = Sequential()
model.add(Embedding(total_words, EMBED_DIM))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
# Build explicitly so the layer shapes are known before the summary is shown.
model.build(input_shape=(None, max_length))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the cell output.
model.summary()

None


In [9]:
#Set hyperparameters
# Adam optimizer with binary cross-entropy loss (the model ends in a single sigmoid unit);
# 'accuracy' is the metric reported during training.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
#Model Training

In [28]:
# Callback that writes the LSTM model to 'models/LSTM.keras' each time the monitored
# metric improves (save_best_only=True) and logs every save (verbose=1).
# NOTE(review): 'accuracy' is the training metric and the fit call below passes no
# validation data, so the "best" model is selected on training accuracy only.
checkpoint = ModelCheckpoint(
    'models/LSTM.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
#Model Training- LSTM

In [29]:
# Train the LSTM for 5 epochs in mini-batches of 128; the checkpoint callback saves the
# model whenever its monitored metric improves.
# NOTE(review): the captured log starts epoch 1 at ~0.994 accuracy and the execution
# counts are out of order, which suggests this cell was re-run on an already trained
# model - confirm the numbers with Restart & Run All.
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m77/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.9943 - loss: 0.0173
Epoch 1: accuracy improved from -inf to 0.99339, saving model to models/LSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9943 - loss: 0.0174
Epoch 2/5
[1m77/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 31ms/step - accuracy: 0.9947 - loss: 0.0176
Epoch 2: accuracy improved from 0.99339 to 0.99450, saving model to models/LSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9947 - loss: 0.0176
Epoch 3/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 33ms/step - accuracy: 0.9962 - loss: 0.0137
Epoch 3: accuracy improved from 0.99450 to 0.99530, saving model to models/LSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.9962 - loss: 0.0138
Epoch 4/5
[1m77/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m

<keras.src.callbacks.history.History at 0x78dbbb0ab5d0>

In [18]:
# Build the Bi-LSTM model
# Embedding -> bidirectional LSTM -> one sigmoid unit for binary classification.

model_bilstm = Sequential()
model_bilstm.add(Embedding(total_words, output_dim=128))
model_bilstm.add(Bidirectional(LSTM(64)))  # Bi-directional LSTM
model_bilstm.add(Dense(1, activation='sigmoid'))
# Build explicitly so the layer shapes are known before the summary is shown.
model_bilstm.build(input_shape=(None, max_length))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the cell output.
model_bilstm.summary()

None


In [22]:
#Set hyperparameters - Bi_LSTM
# Same training setup as the plain LSTM: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model_bilstm.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [26]:
# Callback that writes the Bi-LSTM model to 'models/BiLSTM.keras' each time the
# monitored metric improves (save_best_only=True) and logs every save (verbose=1).
# NOTE(review): 'accuracy' is the training metric and the fit call below passes no
# validation data, so the "best" model is selected on training accuracy only.
checkpoint1 = ModelCheckpoint(
    'models/BiLSTM.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
#model training- Bi-LSTM

In [27]:
# Train the Bi-LSTM for 5 epochs in mini-batches of 128; the checkpoint1 callback saves
# the model whenever its monitored metric improves.
# NOTE(review): the captured log starts epoch 1 at ~0.977 accuracy, which suggests this
# cell was re-run on an already trained model - confirm with Restart & Run All.
model_bilstm.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint1])

Epoch 1/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.9766 - loss: 0.0781
Epoch 1: accuracy improved from -inf to 0.97087, saving model to models/BiLSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.9764 - loss: 0.0784
Epoch 2/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.9765 - loss: 0.0735
Epoch 2: accuracy improved from 0.97087 to 0.97478, saving model to models/BiLSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.9764 - loss: 0.0735
Epoch 3/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.9859 - loss: 0.0548
Epoch 3: accuracy improved from 0.97478 to 0.98218, saving model to models/BiLSTM.keras
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.9858 - loss: 0.0549
Epoch 4/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0

<keras.src.callbacks.history.History at 0x78dbbb131910>

In [None]:
#Model testing-LSTM

In [30]:
# Score the LSTM on the held-out test set.
pred = model.predict(x=x_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels (shape stays (n, 1)).
y_pred = (pred >= 0.5).astype(int)

# Count the matches against the ground truth with one vectorised comparison.
true = int(np.count_nonzero(y_pred.ravel() == np.asarray(y_test)))

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Correct Prediction: 2569
Wrong Prediction: 762
Accuracy: 77.12398679075353


In [None]:
#Model testing- Bi-LSTM

In [31]:
# Score the Bi-LSTM on the held-out test set.
pred = model_bilstm.predict(x=x_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels (shape stays (n, 1)).
y_pred = (pred >= 0.5).astype(int)

# Count the matches against the ground truth with one vectorised comparison.
true = int(np.count_nonzero(y_pred.ravel() == np.asarray(y_test)))

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Correct Prediction: 2619
Wrong Prediction: 712
Accuracy: 78.62503752626839
