# LSTM Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [42]:
from src.helpers import *
import pickle

In [36]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Flatten

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_score, recall_score

import string

from nltk.corpus import stopwords
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [16]:
df_all = pd.read_csv('data/all_data.csv')
df_all = df_all[['text', 'label']]

In [17]:
# Text cleaning using imported functions
def clean(text):
    text = preprocessor(text)
    stop_words = stopwords_list()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [18]:
df_all_clean = df_all.copy()
df_all_clean['text'] = df_all_clean['text'].apply(lambda x: clean(x))

In [18]:
# # Save as CSV
# df_all_clean.to_csv('../df_all_clean.csv')

In [5]:
# # Load cleaned CSV
# df_all_clean = pd.read_csv('../df_all_clean.csv')

In [19]:
# splitting the data

#split data into target and features, stratify to maintain class balance
y = df_all_clean['label']
X = df_all_clean['text']

In [8]:
# 20000 words used like tfidf from before default
# padding required for LSTM to work, length being 50 max words default

def preprocessing(X, y, num_words=20000, max_len=50):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train)
    train_sequences = tokenizer.texts_to_sequences(X_train)
    padded_train = pad_sequences(train_sequences, maxlen=max_len,padding='post', truncating='post')
    test_sequences = tokenizer.texts_to_sequences(X_test)
    padded_test = pad_sequences(test_sequences, maxlen=max_len,
                               padding='post',
                               truncating='post')
    return padded_train, padded_test, y_train, y_test

In [20]:
X_train, X_test, y_train, y_test = preprocessing(X, y)

In [21]:
model1 = tf.keras.Sequential()

#Non-trainable embeddidng layer
model1.add(tf.keras.layers.Embedding(20000, output_dim=300))
    
model1.add(tf.keras.layers.LSTM(units=128, return_sequences = True))
model1.add(tf.keras.layers.Dropout(0.2))
model1.add(tf.keras.layers.LSTM(units=64))
model1.add(tf.keras.layers.Dropout(0.1))
model1.add(tf.keras.layers.Dense(units = 32 , activation = 'relu'))
model1.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'Precision', 'Recall'])

In [22]:
model1.fit(X_train, y_train, batch_size=64, epochs=15, verbose=1, validation_split = 0.2)


Epoch 1/15
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x16792ea90>

In [23]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         6000000   
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         219648    
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 3

In [24]:
y_hat1 = model1.predict(X_test)

In [25]:
y_hat1_classes = model1.predict_classes(X_test)
cf_matrix = confusion_matrix(y_hat1_classes, y_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [26]:
# [[TP FN
# [FP TN]]

cf_matrix

array([[7061,  613],
       [ 385, 5947]])

In [27]:
model2 = tf.keras.Sequential()

model2.add(tf.keras.layers.Embedding(20000, 300))
model2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300)))
model2.add(tf.keras.layers.Dense(300, activation='relu'))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'Precision', 'Recall'])

In [28]:
model2.fit(X_train, y_train, batch_size=64, epochs=15, verbose=1, validation_split = 0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x16a19d890>

In [29]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         6000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1442400   
_________________________________________________________________
dense_2 (Dense)              (None, 300)               180300    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 301       
Total params: 7,623,001
Trainable params: 7,623,001
Non-trainable params: 0
_________________________________________________________________


In [30]:
y_pred = model2.predict(X_test)

In [31]:
y_pred_classes = model2.predict_classes(X_test)

In [32]:
y_pred_classes

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int32)

In [34]:
accuracy_score(y_test, y_pred_classes)

0.931243752677424

In [37]:
precision_score(y_test, y_pred_classes)

0.9217784476262245

In [39]:
recall_score(y_test, y_pred_classes)

0.9323170731707318

In [40]:
# [[TP FN
# [FP TN]]

cf_matrix = confusion_matrix(y_pred_classes, y_test)
cf_matrix

array([[6927,  444],
       [ 519, 6116]])

In [44]:
def get_pred_output(text):
    sequences = tokenizer.texts_to_sequences([text])
    data = pad_sequences(sequences, maxlen=50)
    predicted_val = model.predict(data)
#     predicted_val = model.predict(data)    
#     if predicted_val.max() > 0.5:
#         output = 1
#     else:
#          output = 0
    
    return predicted_val

In [None]:
# # testing an article

# text_to_check = 'article here'
# pred = get_pred_output(text_to_check)