In [None]:
# Load the libraries used throughout the notebook

from google.colab import drive
import pandas as pd


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pprint import pprint

from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.compat.v1.keras.layers import CuDNNGRU


import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D, Bidirectional

# NLTK supplies the English stop-word list used for stop-word removal below.
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word corpus; this has to happen before the first
# stopwords.words() call.
nltk.download('stopwords')
# Print the list so it is visible which words the filter will drop.
print(stopwords.words('english'))
# NOTE(review): word_tokenize is not referenced in the visible cells — confirm
# it is still needed.
from nltk.tokenize import word_tokenize


In [63]:
# Mount Google Drive in the Colab runtime, then read the two CSV files from
# the Drive root: the training table (text + label) and the test table (text).
drive.mount('/content/drive')

xy_train_df, x_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/xy_train.csv',
        '/content/drive/My Drive/x_test.csv',
    )
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
# Separate the raw text (features) from the labels and hold out 20% of the
# rows for validation.
x = xy_train_df.text
y = xy_train_df.label

# A fixed random_state makes the shuffle reproducible, so every rerun of the
# notebook evaluates the models on the same train/validation partition.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Preprocessing hyper-parameters:
#   vocab_size - passed to the tokenizer as num_words, capping the vocabulary
#                used when texts are converted to integer sequences
#   max_len    - fixed number of token ids kept per text
vocab_size, max_len = 40000, 40

# Learn the word -> integer index from the training texts only. Indices are
# assigned by descending word frequency, so a smaller integer means a more
# frequent word.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)


def _preprocess(list_of_text):
    """Turn raw texts into a fixed-width matrix of token ids.

    Each text is mapped to its sequence of integer word indices with the
    notebook-level ``tokenizer``. Every sequence is then forced to exactly
    ``max_len`` entries: shorter ones get zeros appended at the end
    (``padding='post'``); longer ones are cut by ``pad_sequences`` using its
    default truncation setting.

    Args:
        list_of_text: iterable of strings to encode.

    Returns:
        The array returned by ``pad_sequences``: one row per text,
        ``max_len`` columns.
    """
    token_ids = tokenizer.texts_to_sequences(list_of_text)
    return pad_sequences(token_ids, maxlen=max_len, padding='post')
    

# Encode both splits. The names are rebound from raw text to padded id arrays,
# so this cell must not be re-run without re-running the split first.
x_train, x_valid = (_preprocess(texts) for texts in (x_train, x_valid))

# Sanity check: features and labels should have the same row count per split.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Stop word removal before passing it to preprocessing.
# Work on a copy of the text column: filter_stop_words rewrites its input in
# place, and filtering the column object itself would overwrite the raw text
# held in xy_train_df (and can trigger pandas' SettingWithCopyWarning).
x = xy_train_df.text.copy()
def filter_stop_words(x, stop_words):
    """Strip stop words from every sentence of ``x``.

    Each sentence is split on whitespace; a word is dropped when its
    lower-cased form is in ``stop_words`` (so "The" is removed along with
    "the"). The surviving words are re-joined with single spaces.

    Args:
        x: mutable sequence of strings, e.g. a list or a pandas Series. It is
            updated in place, so existing references to it see the filtered
            text.
        stop_words: collection of lower-case words to remove.

    Returns:
        The same object ``x``, now holding the filtered sentences.
    """
    filtered = [
        ' '.join(word for word in sentence.split() if word.lower() not in stop_words)
        for sentence in x
    ]
    # Whole-slice assignment writes by position. Element-wise x[i] = ... would
    # address a pandas Series by label and touch the wrong rows (or append new
    # ones) whenever the index is not 0..n-1.
    x[:] = filtered
    return x

# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = {*stopwords.words("english")}
# Filter the training texts and keep the result under its own name.
train_sentences = filter_stop_words(x, stop_words)

In [None]:
# Preprocessing to include stop word removal

y = xy_train_df.label
# Split the stop-word-filtered sentences explicitly instead of relying on `x`
# having been modified in place. The fixed random_state keeps the partition
# identical across reruns.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_sentences, y, test_size=0.2, random_state=42
)

vocab_size = 40000
max_len = 40


# Build the vocabulary from the training split only, so that no information
# from the validation rows leaks into the word index.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)


def _preprocess(train_sentences):
    """Encode texts as zero-padded rows of ``max_len`` token ids.

    Uses whichever ``tokenizer`` and ``max_len`` are bound in the notebook at
    call time. Despite the parameter name, the function is applied to the
    validation and test texts as well.

    Args:
        train_sentences: iterable of strings to encode.

    Returns:
        The array returned by ``pad_sequences``; sequences shorter than
        ``max_len`` have zeros appended at the end (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences(train_sentences)
    return pad_sequences(token_ids, maxlen=max_len, padding='post')
    

# Encode both splits. The names are rebound from text to padded id arrays, so
# this cell must not be re-run without re-running the split first.
x_train, x_valid = (_preprocess(texts) for texts in (x_train, x_valid))

# Sanity check: features and labels should have the same row count per split.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Spot-check the encoding: print the first four padded id rows, then decode the
# same rows back to words with the tokenizer so the two views can be compared.
# Rows shorter than max_len end in zeros, because _preprocess pads with
# padding='post'.
print(x_train[:4])
pprint(tokenizer.sequences_to_texts(x_train[:4]))


In [None]:
# FULLY CONNECTED NETWORK (template version with comments)

# Symbolic input: one row of max_len token ids per example; the batch
# dimension is left open (None).
seq_in = keras.Input(batch_shape=(None, max_len))
# Trainable lookup table that maps every token id to a 100-dimensional vector,
# giving a (batch, max_len, 100) tensor.
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)
# Average the word vectors over the sequence axis -> (batch, 100). A Keras
# pooling layer is used instead of calling tf.reduce_mean on the symbolic
# tensor, which Keras 3 rejects. The Embedding layer sets no mask, so padded
# positions are included in the mean exactly as with reduce_mean(axis=1).
averaged = keras.layers.GlobalAveragePooling1D()(embedded)
# Single sigmoid unit: squashes the output into (0, 1), as expected by the
# binary_crossentropy loss below.
pred = keras.layers.Dense(1, activation='sigmoid')(averaged)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=35,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

In [None]:
# FULLY CONNECTED NETWORK (without comments, easier to see)
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)
# Mean over the sequence axis via a Keras layer: same result as
# tf.reduce_mean(embedded, axis=1), but also accepted by Keras 3, which does
# not allow raw TensorFlow ops on symbolic tensors.
averaged = keras.layers.GlobalAveragePooling1D()(embedded)
# NOTE(review): the two hidden Dense layers have no activation, so together
# with the output layer they collapse to one linear map before the sigmoid —
# confirm whether a non-linearity (e.g. relu) was intended.
d1 = keras.layers.Dense(200)(averaged)
do1 = keras.layers.Dropout(0.2)(d1)
d2 = keras.layers.Dense(300)(do1)
pred = keras.layers.Dense(1, activation='sigmoid')(d2)

model = keras.Model(inputs=seq_in, outputs=pred)

# Adam is the optimizer in use; RMSprop and Adagrad were tried as alternatives.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=70,
    epochs=30,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Recurrent GRU
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)

# One GRU layer reads the embedded sequence and returns its final state.
gru1 = GRU(units=110, dropout=0.2, recurrent_dropout=0.2)(embedded)
do1 = Dropout(rate=0.2)(gru1)
# The output needs a sigmoid: binary_crossentropy (with its default
# from_logits=False) and the accuracy/AUC metrics expect probabilities in
# (0, 1), not the unbounded values a linear Dense(1) would emit.
pred = keras.layers.Dense(1, activation='sigmoid')(do1)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=44,
    epochs=30,
    validation_data=(x_valid, y_valid),
)



In [None]:
# Recurrent LSTM
# Architecture: embedding -> LSTM (final state only) -> dropout -> dense(100)
# -> sigmoid output.
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = Embedding(tokenizer.num_words, 100)(seq_in)

lstm1 = LSTM(units=100, dropout=0.2, recurrent_dropout=0.2)(embedded)
do1 = Dropout(rate=0.4)(lstm1)
d1 = Dense(100)(do1)
pred = Dense(1, activation='sigmoid')(d1)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

# Train and keep the per-epoch metrics in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=65,
    epochs=27,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

# Shapes of the arrays the model was trained and validated on.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Recurrent GRU Multi-layer
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)

# The first GRU returns the full sequence so the second GRU has one input per
# timestep.
gru1 = GRU(units=100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(embedded)
do1 = Dropout(rate=0.2)(gru1)
# The last recurrent layer returns only its final state (return_sequences is
# left at its default, False). With return_sequences=True the Dense head would
# run per timestep and the model would output (batch, max_len, 1), which does
# not line up with one label per example.
gru2 = GRU(units=100)(do1)
do2 = Dropout(rate=0.2)(gru2)
d1 = keras.layers.Dense(100)(do2)
do3 = Dropout(rate=0.2)(d1)
pred = keras.layers.Dense(1, activation='sigmoid')(do3)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=65,
    epochs=25,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Recurrent LSTM Multi-layer
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)

# The first LSTM returns the full sequence so the second LSTM has one input
# per timestep.
lstm1 = LSTM(units=100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(embedded)
do1 = Dropout(rate=0.2)(lstm1)
# The last recurrent layer returns only its final state (return_sequences is
# left at its default, False). With return_sequences=True the Dense head would
# run per timestep and the model would output (batch, max_len, 1), which does
# not line up with one label per example.
lstm2 = LSTM(units=100, dropout=0.2, recurrent_dropout=0.2)(do1)
do2 = Dropout(rate=0.2)(lstm2)
d1 = keras.layers.Dense(100)(do2)
do3 = Dropout(rate=0.2)(d1)
pred = keras.layers.Dense(1, activation='sigmoid')(do3)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=65,
    epochs=25,
    validation_data=(x_valid, y_valid),
)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Bi_directional Recurrent GRU
# Architecture: embedding -> bidirectional GRU (final states) -> dropout
# -> dense(100) -> sigmoid output.
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = Embedding(tokenizer.num_words, 100)(seq_in)

bgru1 = Bidirectional(
    GRU(units=100, dropout=0.2, recurrent_dropout=0.2)
)(embedded)
do1 = Dropout(rate=0.2)(bgru1)
d1 = Dense(100)(do1)
pred = Dense(1, activation='sigmoid')(d1)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

# Train and keep the per-epoch metrics in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=60,
    epochs=25,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

# Shapes of the arrays the model was trained and validated on.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [None]:
# Bi_directional Recurrent LSTM
# Architecture: embedding -> bidirectional LSTM (final states) -> dropout
# -> dense(100) -> sigmoid output.
seq_in = keras.Input(batch_shape=(None, max_len))
embedded = Embedding(tokenizer.num_words, 100)(seq_in)

# The name `bgru1` is carried over from the GRU cell; here it holds the
# output of the bidirectional LSTM.
bgru1 = Bidirectional(
    LSTM(units=100, dropout=0.2, recurrent_dropout=0.2)
)(embedded)
do1 = Dropout(rate=0.2)(bgru1)
d1 = Dense(100)(do1)
pred = Dense(1, activation='sigmoid')(d1)

model = keras.Model(inputs=seq_in, outputs=pred)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC'],
)

# Train and keep the per-epoch metrics in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=60,
    epochs=25,
    validation_data=(x_valid, y_valid),
    verbose=1,
)

# Shapes of the arrays the model was trained and validated on.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

In [72]:
# Put the test text through the same stop-word filter as the training text
# before encoding it, so the model sees inputs prepared the same way at
# prediction time. .tolist() hands the filter a fresh list, leaving x_test_df
# itself untouched.
x_test = _preprocess(filter_stop_words(x_test_df.text.tolist(), stop_words))
# Predict with whichever model was trained last; squeeze drops the size-1
# axis produced by the final Dense(1) layer.
y_predict = np.squeeze(model.predict(x_test))


# NOTE(review): the row index is written as `id` — confirm this matches the
# ids expected for the submission file.
pd.DataFrame(
    {'id': x_test_df.index,
     'label': y_predict}).to_csv('RNN_biLSTM_stopword.csv', index=False)