# Google Colab related imports

In [1]:
import nltk
nltk.download('stopwords')
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Core imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import string
from nltk.corpus import stopwords
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import auc, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Flatten, Dense, LSTM, Conv1D, MaxPooling1D
from keras.layers import Embedding

from tensorflow.keras.models import load_model# save entire model to HDF5 
import tensorflow as tf

# Loading the data

In [3]:
glove_path = '/content/drive/My Drive/Colab Notebooks/'
data_path = '/content/drive/My Drive/Colab Notebooks/imdb_master.csv'
dataframe = pd.read_csv(data_path, encoding='latin1',usecols=['review','label'])
dataframe = dataframe.loc[dataframe.label != 'unsup']
dataframe.label.replace({'neg':0,'pos':1},inplace=True)
X = dataframe[['review']]
y = dataframe[['label']]
vec_size = 300

# Pre-processing block

In [4]:
#1.Remove any punctuation

def remove_punct(text):
    table = str.maketrans("","", string.punctuation)
    return text.translate(table)

#2. Remove stopwords
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)

X.review = X.review.map(lambda x: remove_punct(x))
X.review = X.review.map(remove_stopwords)

# 3. Label Encoding
y = pd.DataFrame(to_categorical(np.asarray(y['label'])))

# 4. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=16)

one_d_y_test = [1 if item == 1 else 0 for item in y_test[1]] # 1d vector of predictions for classification report

X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


#Keras preparation
##1.Tokenizer 
##2.Padding

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.review)
word_index = tokenizer.word_index 
vocab_size = len(word_index)+1
max_length = 260  #!

# 1. Train tokenizer&padding
train_sequence = tokenizer.texts_to_sequences(X_train.review)
train_padded = pad_sequences(train_sequence, maxlen=max_length, padding='post',
                             truncating='post')
# 2. Test tokenizer&padding
test_sequence = tokenizer.texts_to_sequences(X_test.review)
test_padded = pad_sequences(test_sequence, maxlen=max_length, padding='post',
                            truncating='post')

# Making sure reverse operation produces the inverse of encoding
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode(text):
    return " ".join([reverse_word_index.get(i,"?") for i in text]) 

# Parse the glove file

In [None]:
embeddings_index = {}
f = open(os.path.join(glove_path, 'glove.42B.300d.txt'))
for line in f:
    # split every value on space
    values = line.split()
    # the word itself is the first item in every line
    word = values[0]
    # the actual vector is the vector that follows the word
    coefs = np.asarray(values[1:], dtype='float32')
    # dictionary mapping of the words to their vectors
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1917494 word vectors.


# Create Embedding Matrix

In [None]:
# np.array matrix that 
embedding_matrix = np.zeros((vocab_size, vec_size))
                # word index is a dictionary that contains tokenized words from the dataset
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [7]:
embedding_matrix = np.load('/content/drive/My Drive/Colab Notebooks/embedding_matrix.npy')

# Function to generate perforemance metrics stats

In [None]:
def performance_metrics(model, y_test):
    
    predicitons = model.predict_classes(test_padded)
    class_report = classification_report(y_test, predictions)
    conf_matrix = pd.DataFrame(confusion_matrix(one_d_y_test, predictions), 
                                    index=labels, columns=labels)
    print(class_report)

    return  conf_matrix

# DL Models

## 1.Plain vanilla LSTM 
- GloVe embeddings are trainable weights 

In [8]:
EPOCHS = 12
BATCH_SIZE = 128

In [10]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=vec_size,input_length=max_length,
weights=[embedding_matrix],trainable=False))
model.add(LSTM(64))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, 
                                      restore_best_weights=True,verbose=1)

model.fit(train_padded, y_train, validation_split=0.1,epochs=EPOCHS, 
          batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 260, 300)          44910300  
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 45,003,870
Trainable params: 93,570
Non-trainable params: 44,910,300
_________________________________________________________________
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f1d83616f28>

In [15]:
def collect_metrics(model):
    predictions = model.predict_classes(test_padded)
    class_report = classification_report(one_d_y_test, predictions)
    fpr, tpr, thresholds = roc_curve(one_d_y_test, predictions)
    auc_ = auc(fpr, tpr)
    
    print(class_report)
    print(auc_)

collect_metrics(model_3)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82      7494
           1       0.83      0.79      0.81      7506

    accuracy                           0.81     15000
   macro avg       0.81      0.81      0.81     15000
weighted avg       0.81      0.81      0.81     15000

0.8112875058906704


# Conv1D NN

In [12]:
model_2 = Sequential()
model_2.add(Embedding(input_dim=vocab_size, output_dim=vec_size,input_length=max_length,
weights=[embedding_matrix],trainable=False))
model_2.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model_2.add(MaxPooling1D(pool_size=2))
model_2.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model_2.add(MaxPooling1D(pool_size=2))
model_2.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model_2.add(MaxPooling1D(pool_size=2))
model_2.add(Flatten())
model_2.add(Dense(256, activation='relu'))
model_2.add(Dense(2, activation='softmax'))
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_2.summary()


es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, 
                                      restore_best_weights=True,verbose=1)

model_2.fit(train_padded, y_train, validation_split=0.1,epochs=EPOCHS, 
          batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 260, 300)          44910300  
_________________________________________________________________
conv1d (Conv1D)              (None, 260, 128)          153728    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 130, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 130, 64)           32832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 65, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 65, 32)            8224      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 32)           

<tensorflow.python.keras.callbacks.History at 0x7f1d81fb4978>

# Dense NN

In [14]:
model_3 = Sequential()
model_3.add(Embedding(input_dim=vocab_size, output_dim=vec_size,input_length=max_length,
weights=[embedding_matrix],trainable=False))
model_3.add(Flatten())
model_3.add(Dense(256, activation='relu'))
model_3.add(Dense(128, activation='relu'))
model_3.add(Dense(2, activation='softmax'))
model_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_3.summary()


es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, 
                                      restore_best_weights=True,verbose=1)

model_3.fit(train_padded, y_train, validation_split=0.1,epochs=EPOCHS, 
          batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 260, 300)          44910300  
_________________________________________________________________
flatten_1 (Flatten)          (None, 78000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               19968256  
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 258       
Total params: 64,911,710
Trainable params: 20,001,410
Non-trainable params: 44,910,300
_________________________________________________________________
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 00004: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f1d814fe7b8>