In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from src.helpers import *

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Flatten

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import string

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [6]:
# Load the combined dataset and keep only the two columns used below.
df_all = pd.read_csv('data/all_data.csv')[['text', 'label']]

In [7]:
def _default_stop_words():
    """Return the project stopword collection as a frozenset, built once per definition."""
    cached = getattr(_default_stop_words, "_cache", None)
    if cached is None:
        # NOTE(review): assumes stopwords_list() (from src.helpers) returns the same
        # iterable of hashable strings on every call -- confirm against the helper.
        cached = frozenset(stopwords_list())
        _default_stop_words._cache = cached
    return cached


def clean(text, stop_words=None):
    """Clean one document with the imported helpers and drop its stopwords.

    Parameters
    ----------
    text : str
        Raw document. It is first passed through ``preprocessor`` (imported from
        ``src.helpers``); the result is then split on whitespace.
    stop_words : collection of str, optional
        Words to remove. When omitted, the collection returned by ``stopwords_list()``
        is used; it is converted to a frozenset once and reused, so membership tests
        are O(1) and the helper is not re-run for every document.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    text = preprocessor(text)
    if stop_words is None:
        stop_words = _default_stop_words()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

In [8]:
# Clean every document on a copy so that df_all keeps the raw text.
df_all_clean = df_all.copy()
df_all_clean['text'] = df_all_clean['text'].apply(clean)

In [18]:
# Persist the cleaned frame one directory above the working directory.
# pandas writes the row index as an extra first column by default.
clean_csv_path = '../df_all_clean.csv'
df_all_clean.to_csv(clean_csv_path)

In [9]:
# Separate the cleaned data into text features (X) and target labels (y).
# The stratified train/test split itself is performed inside `preprocessing`.
X, y = df_all_clean['text'], df_all_clean['label']

In [10]:
def preprocessing(X, y, num_words=20000, max_len=40, test_size=0.2, random_state=42):
    """Split the data, then turn each text into a fixed-length integer sequence.

    The tokenizer vocabulary is fitted on the training split only, so no
    information from the test texts is used when building the word index.

    Parameters
    ----------
    X : sequence of str
        Documents to encode.
    y : array-like
        Labels aligned with ``X``; also used to stratify the split so both
        splits keep the same class balance.
    num_words : int, default 20000
        Vocabulary size cap passed to the Keras ``Tokenizer`` (per the original
        note, chosen to match the earlier TF-IDF setup).
    max_len : int, default 40
        Every sequence is padded or truncated (both at the end) to this length;
        the LSTM models need equal-length inputs.
    test_size : float, default 0.2
        Fraction of the data held out as the test split.
    random_state : int, default 42
        Seed for the split, for reproducibility.

    Returns
    -------
    tuple
        ``(padded_train, padded_test, y_train, y_test)``. The fitted tokenizer
        is not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train)

    def _encode(texts):
        # Same tokenizer and padding rules for both splits.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    return _encode(X_train), _encode(X_test), y_train, y_test

In [11]:
# Vocabulary cap and sequence length are spelled out (they equal the function
# defaults); the Embedding layers below are sized for the same 20000 tokens.
X_train, X_test, y_train, y_test = preprocessing(X, y, num_words=20000, max_len=40)

In [16]:
# Model 1: two stacked LSTM layers feeding a small dense head with a sigmoid output.
model1 = tf.keras.Sequential([
    # Embedding layer: no pretrained weights and no trainable=False are supplied,
    # so its 20000 x 300 weights are learned during training.
    tf.keras.layers.Embedding(20000, output_dim=300),
    # return_sequences=True so the second LSTM receives the full sequence.
    tf.keras.layers.LSTM(units=128, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision'],
)

In [17]:
# Train for 10 epochs, holding out 20% of the training data for validation.
model1.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)


Train on 44816 samples, validate on 11204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faffc048c50>

In [14]:
# Print each layer's output shape and parameter count for model1.
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         6000000   
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         219648    
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33

In [None]:
# Raw model1 outputs for the held-out split. The final layer is a single sigmoid
# unit, so each row is one score between 0 and 1.
y_hat1 = model1.predict(X_test)

In [21]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single sigmoid
# output it thresholded the predicted probability at 0.5, which is done explicitly here.
y_hat1_classes = (model1.predict(X_test) > 0.5).astype('int32')
# Counts of true vs. predicted labels on the test split.
cf_matrix = confusion_matrix(y_test, y_hat1_classes)

In [22]:
# scikit-learn layout: rows are true labels, columns are predicted labels.
cf_matrix

array([[6918,  528],
       [ 493, 6067]])

In [24]:
# Model 2: one bidirectional LSTM followed by a wide dense layer and a sigmoid output.
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(20000, 300),
    # Forward and backward LSTM outputs are combined, giving 600 features.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300)),
    tf.keras.layers.Dense(300, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision'],
)

In [25]:
# Same training setup as model1: 10 epochs, batches of 64, 20% held out for validation.
model2.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 44816 samples, validate on 11204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faf6058fc50>

In [26]:
# Print each layer's output shape and parameter count for model2.
model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         6000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600)               1442400   
_________________________________________________________________
dense_6 (Dense)              (None, 300)               180300    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 301       
Total params: 7,623,001
Trainable params: 7,623,001
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Raw model2 outputs for the held-out split (one sigmoid score per row).
y_pred = model2.predict(X_test)

In [29]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single sigmoid
# output it thresholded the predicted probability at 0.5, which is done explicitly here.
y_pred_classes = (model2.predict(X_test) > 0.5).astype('int32')

In [30]:
# Show the hard 0/1 predictions as a column vector; NumPy abbreviates long arrays.
y_pred_classes

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int32)