# Import Libraries

In [None]:
# Python
import sys
import os
import re
import numpy as np
import pandas as pd
from time import time
from sys import getsizeof
import itertools

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# TQDM
from tqdm import tqdm_notebook
from tqdm import tqdm

# sklearn
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Keras
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Model

# Keras Pre-processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keras Layers
from keras.layers import Input, Dense, Embedding, Conv1D, MaxPooling1D, Dropout, Concatenate, Flatten, Add

# Keras Optimizer
from keras.optimizers import Adadelta
from keras.constraints import max_norm

# Keras Model Saving / Loading
from keras.models import load_model
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Load Data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !ls

In [None]:
# Add path to your project directory
# DATA_DIR and MODEL_DIR are derived from this value; the empty string makes
# both of them relative to the current working directory.
PROJECT_DIR = ''

In [None]:
# Add the path to your data directory
# os.path.join inserts the separator after PROJECT_DIR when it is missing;
# the trailing '' keeps the final separator so that later code can still
# append sub-paths such as 'train/pos/' by plain concatenation.
DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'aclImdb', '')

In [None]:
# Add the path to your model directory
# os.path.join inserts the separator after PROJECT_DIR when it is missing;
# the trailing '' keeps the final separator so that file names can still be
# appended by plain concatenation.
MODEL_DIR = os.path.join(PROJECT_DIR, 'models', '')

In [None]:
def load_data(directory):
    """Read every file in `directory` and return the cleaned review texts.

    Each review has its HTML line breaks replaced by a space and every
    punctuation character surrounded by spaces, so that a later whitespace
    split yields punctuation marks as separate tokens.

    Args:
        directory: Path of the folder holding one review per file. A
            trailing path separator is accepted but not required.

    Returns:
        A list of cleaned review strings, ordered by file name.
    """
    # Characters that become stand-alone tokens. re.escape keeps '-', ']',
    # '\' and '^' literal inside the character class (an unescaped ' -.'
    # would be read as a range that also matches the space character).
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuation_pattern = re.compile('([' + re.escape(punctuation) + '])')

    corpus = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), 'r', encoding='latin1') as f:
            movie_review = f.read()

        # Replace the tag with a space so the words around it do not fuse
        movie_review = movie_review.replace('<br />', ' ')
        # Add space around punctuations
        movie_review = punctuation_pattern.sub(r' \1 ', movie_review)
        corpus.append(movie_review)

    return corpus

## Load Train Data

In [None]:
# Load Positive Movie Reviews
# Every file under train/pos/ is read and cleaned by load_data
positive_train_data = load_data(DATA_DIR + 'train/pos/')
print("Number of positive movie reviews in train data: {}".format(len(positive_train_data)))

In [None]:
# Load Negative Movie Reviews
# Every file under train/neg/ is read and cleaned by load_data
negative_train_data = load_data(DATA_DIR + 'train/neg/')
print("Number of negative movie reviews in train data: {}".format(len(negative_train_data)))

In [None]:
# Combine into train data (positive reviews first, then negative ones)
train_data = positive_train_data + negative_train_data

# Labels: 1 = positive, 0 = negative. The array is sized from the data
# instead of a hard-coded 25000, and the slice ends at
# len(positive_train_data) so the last positive review is labelled too.
train_labels = np.zeros(len(train_data))
train_labels[:len(positive_train_data)] = 1

## Load Test Data

In [None]:
# Load Positive Movie Reviews
# Every file under test/pos/ is read and cleaned by load_data
positive_test_data = load_data(DATA_DIR + 'test/pos/')
print("Number of positive movie reviews in test data: {}".format(len(positive_test_data)))

In [None]:
# Load Negative Movie Reviews
# Read from test/neg/ (not train/neg/) so that no training review leaks
# into the evaluation data
negative_test_data = load_data(DATA_DIR + 'test/neg/')
print("Number of negative movie reviews in test data: {}".format(len(negative_test_data)))

In [None]:
# Combine into test data (positive reviews first, then negative ones)
test_data = positive_test_data + negative_test_data

# Labels: 1 = positive, 0 = negative. The array is sized from the data
# instead of a hard-coded 25000, and the slice ends at
# len(positive_test_data) so the last positive review is labelled too.
test_labels = np.zeros(len(test_data))
test_labels[:len(positive_test_data)] = 1

## Test-Val Split

In [None]:
# Hold out 20% of the test reviews as a validation set. The inputs are the
# test_data / test_labels variables built above.
X_test, X_val, Y_test, Y_val = train_test_split(test_data, test_labels, test_size=0.2, random_state=42)

# Preprocess 

## Max Sequence Length

In [None]:
def find_max_seq_len(corpus):
    """Return the length, in whitespace-separated tokens, of the longest review.

    Args:
        corpus: Iterable of reviews. Each item is converted with str()
            before it is split on whitespace.

    Returns:
        The largest token count found, or 0 when `corpus` is empty.
    """
    return max((len(str(review).split()) for review in corpus), default=0)

In [None]:
# Longest review, in whitespace-separated tokens, across train and test data
MAX_SEQUENCE_LENGTH = max(map(find_max_seq_len, (train_data, test_data)))

print(MAX_SEQUENCE_LENGTH)

## Tokenizer

In [None]:
# Keep all the punctuations  -- Since, GloVe has embeddings for them
# An empty `filters` string means the tokenizer strips no characters
tokenizer = Tokenizer(filters='')

In [None]:
# Fit the tokenizer on entire train corpus
# (the training reviews are held in `train_data`)
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index
print("Found %s unique tokens: " % len(word_index))

# Generate Indices

In [None]:
def generate_indices(train_corpus, val_corpus, test_corpus, max_sequence_len):
    """Convert three text corpora into padded integer index arrays.

    Uses the module-level `tokenizer` to map every text to its sequence of
    word indices, then pads each sequence to `max_sequence_len` with
    `pad_sequences`.

    Args:
        train_corpus: Training texts.
        val_corpus: Validation texts.
        test_corpus: Test texts.
        max_sequence_len: Length every sequence is padded to.

    Returns:
        A tuple (train_indices, val_indices, test_indices).
    """
    def to_padded_indices(corpus):
        # Text -> list of word indices -> fixed-length array
        sequences = tokenizer.texts_to_sequences(corpus)
        return pad_sequences(sequences, maxlen=max_sequence_len)

    train_indices = to_padded_indices(train_corpus)
    val_indices = to_padded_indices(val_corpus)
    test_indices = to_padded_indices(test_corpus)

    return train_indices, val_indices, test_indices

In [None]:
# Index and pad the training reviews plus the validation / test splits
train_indices, val_indices, test_indices = generate_indices(train_data, X_val, X_test, MAX_SEQUENCE_LENGTH)

## Reverse Map the indices

In [None]:
# Invert word_index so that an integer index can be looked up back to its word
reverse_word_map = dict(zip(word_index.values(), word_index.keys()))

In [None]:
def indices_to_text(indices_list):
    """Map a sequence of word indices back to words, skipping padding.

    Args:
        indices_list: Iterable of integer indices (plain ints or numpy
            integers); entries equal to 0 are treated as padding.

    Returns:
        A list with the `reverse_word_map` entry of every non-zero index.
        An index missing from the map yields None.
    """
    # Compare by value: an identity test (`is not 0`) is always true for
    # numpy integers, so padding would never be skipped.
    return [reverse_word_map.get(i) for i in indices_list if i != 0]

In [None]:
# Sanity check on the first training review: cleaned text, padded indices,
# and the words recovered from those indices
print(train_data[0])
print(train_indices[0])
print(indices_to_text(train_indices[0]))

# Prepare embedding layer

## Load the GloVe embedding vector

In [None]:
# Add the path to your GloVe embedding vector directory
GLOVE_DIR = ''

# Maps each GloVe word to its vector (float32 numpy array)
glove_embedding_index = {}

# The context manager closes the file even if parsing fails. The encoding is
# given explicitly because the platform default may not be UTF-8
# (NOTE(review): the GloVe text files are expected to be UTF-8 -- confirm).
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line: the word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embedding_index[word] = coefs

print("Found %s word vectors" % len(glove_embedding_index))

## Create embedding matrix

In [None]:
# Vocabulary Size
# Number of distinct tokens in the fitted tokenizer's word_index
VOCAB_SIZE = len(word_index)
print("Vocabulary size is: {}".format(VOCAB_SIZE))

In [None]:
# Start from random rows so that words without a GloVe vector keep a
# random embedding; there is one extra row for index 0
embedding_matrix = np.random.rand(VOCAB_SIZE + 1, 300)

# Row 0 belongs to the padding index and stays all-zero
embedding_matrix[0] = 0

# Overwrite the row of every word that GloVe knows
for word, i in word_index.items():
    glove_embedding_vector = glove_embedding_index.get(word)
    if glove_embedding_vector is None:
        continue
    embedding_matrix[i] = glove_embedding_vector

# Training Model

In [None]:
# Mini-batch size and number of epochs passed to each model's fit() call
batch_size = 50
num_epochs = 100

## Random Model

In [None]:
def train_random_model():
    """Build (without compiling or fitting) the randomly initialised CNN.

    A trainable embedding layer, initialised from uniform random values,
    feeds three parallel Conv1D branches with kernel sizes 3, 4 and 5
    (100 filters each). Each branch is max-pooled over a window of
    MAX_SEQUENCE_LENGTH - kernel_size + 1 steps and flattened; the branches
    are concatenated, passed through dropout and a single sigmoid unit.

    Returns:
        The uncompiled Keras Model.
    """
    # Integer word indices, one row per padded review
    input_vec = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Input_Layer', dtype='int32')

    # Random start weights for the embedding, updated during training
    embeddings_random = np.random.rand(VOCAB_SIZE+1, 300)
    embedding_layer = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embeddings_random], trainable=True)
    embedding_out = embedding_layer(input_vec)

    # One convolution -> pooling -> flatten branch per kernel size
    branch_outputs = []
    for kernel_size in (3, 4, 5):
        conv = Conv1D(filters=100, kernel_size=kernel_size, activation='relu', kernel_constraint=max_norm(3))(embedding_out)
        pooled = MaxPooling1D(MAX_SEQUENCE_LENGTH - kernel_size + 1)(conv)
        branch_outputs.append(Flatten()(pooled))

    merged = Concatenate()(branch_outputs)
    regularised = Dropout(0.5)(merged)
    prediction = Dense(1, activation='sigmoid')(regularised)

    # Pack it all up into a model
    return Model(inputs=input_vec, outputs=prediction)

### Model Definition

In [None]:
# Build the model with the randomly initialised embedding layer
random_model = train_random_model()

In [None]:
# Print the model's layer summary
random_model.summary()

### Model Compilation

In [None]:
# Binary classification set-up: cross-entropy loss, Adam, accuracy metric
random_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model Fit

In [None]:
# Checkpoint callback: write the model to `filepath` only when the
# monitored validation accuracy reaches a new maximum
filepath = MODEL_DIR + 'random_model_best.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train the model; the returned object is kept in random_model_trained.
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras
# versions reject.
random_model_trained = random_model.fit(
    x=train_indices,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=callbacks_list,
    verbose=2,
    validation_data=(val_indices, Y_val),
)

### Save Model

In [None]:
# Serialize model to JSON
# Only the architecture is written here; this cell does not write weights
random_model_json = random_model.to_json()

with open(MODEL_DIR + 'random_model_json.json', 'w') as json_file:
    json_file.write(random_model_json)

print('Saved Random model architecture to the disk')

### Load Model

In [None]:
# Load architecture
# Rebuild the model from the JSON description saved earlier
with open(MODEL_DIR + 'random_model_json.json', 'r') as json_file:
    model_json = json_file.read()

random_model = model_from_json(model_json)

# Load weights
# Restored from the file written by the ModelCheckpoint callback
random_model.load_weights(MODEL_DIR + 'random_model_best.hdf5')
print('Loaded Random model from the disk')

## Static Model

In [None]:
def train_static_model(embedding_pretrained):
    """Build (without compiling or fitting) the CNN with frozen embeddings.

    The embedding layer is initialised from `embedding_pretrained` and is
    not trainable. It feeds three parallel Conv1D branches with kernel
    sizes 3, 4 and 5 (100 filters each); each branch is max-pooled over a
    window of MAX_SEQUENCE_LENGTH - kernel_size + 1 steps and flattened.
    The branches are concatenated, passed through dropout and a single
    sigmoid unit.

    Args:
        embedding_pretrained: Weight matrix handed to the Embedding layer.
            The layer is declared with VOCAB_SIZE + 1 rows and 300 columns.

    Returns:
        The uncompiled Keras Model.
    """
    # Integer word indices, one row per padded review
    input_vec = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Input_Layer', dtype='int32')

    # Pre-trained vectors, kept fixed during training
    embedding_layer = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embedding_pretrained], trainable=False)
    embedding_out = embedding_layer(input_vec)

    # One convolution -> pooling -> flatten branch per kernel size
    pooled_features = []
    for width in (3, 4, 5):
        feature_map = Conv1D(filters=100, kernel_size=width, activation='relu', kernel_constraint=max_norm(3))(embedding_out)
        strongest = MaxPooling1D(MAX_SEQUENCE_LENGTH - width + 1)(feature_map)
        pooled_features.append(Flatten()(strongest))

    features = Concatenate()(pooled_features)
    features = Dropout(0.5)(features)
    probability = Dense(1, activation='sigmoid')(features)

    # Pack it all up into a model
    return Model(inputs=input_vec, outputs=probability)

### Model Definition

In [None]:
# Build the model whose embedding layer is frozen at the GloVe-based matrix
static_model = train_static_model(embedding_matrix)

In [None]:
# Print the model's layer summary
static_model.summary()

### Model Compilation

In [None]:
# Binary classification set-up: cross-entropy loss, Adam, accuracy metric
static_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model Fit

In [None]:
# Checkpoint callback: write the model to `filepath` only when the
# monitored validation accuracy reaches a new maximum
filepath = MODEL_DIR + 'static_model_best.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train the model; the returned object is kept in static_model_trained.
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras
# versions reject.
static_model_trained = static_model.fit(
    x=train_indices,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=callbacks_list,
    verbose=2,
    validation_data=(val_indices, Y_val),
)

### Save Model

In [None]:
# Serialize model to JSON
# Only the architecture is written here; this cell does not write weights
static_model_json = static_model.to_json()

with open(MODEL_DIR + 'static_model_json.json', 'w') as json_file:
    json_file.write(static_model_json)

print('Saved Static model architecture to the disk')

### Load Model

In [None]:
# Load architecture
# Rebuild the model from the JSON description saved earlier
with open(MODEL_DIR + 'static_model_json.json', 'r') as json_file:
    model_json = json_file.read()

static_model = model_from_json(model_json)

# Load weights
# Restored from the file written by the ModelCheckpoint callback
static_model.load_weights(MODEL_DIR + 'static_model_best.hdf5')
print('Loaded Static model from the disk')

## Non-Static Model

In [None]:
def train_non_static_model(embedding_pretrained):
    """Build (without compiling or fitting) the CNN with tunable embeddings.

    The embedding layer is initialised from `embedding_pretrained` and stays
    trainable. It feeds three parallel Conv1D branches with kernel sizes 3,
    4 and 5 (100 filters each); each branch is max-pooled over a window of
    MAX_SEQUENCE_LENGTH - kernel_size + 1 steps and flattened. The branches
    are concatenated, passed through dropout and a single sigmoid unit.

    Args:
        embedding_pretrained: Weight matrix handed to the Embedding layer.
            The layer is declared with VOCAB_SIZE + 1 rows and 300 columns.

    Returns:
        The uncompiled Keras Model.
    """
    # Integer word indices, one row per padded review
    input_vec = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Input_Layer', dtype='int32')

    # Pre-trained vectors used as the starting point and updated in training
    embedding_layer = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embedding_pretrained], trainable=True)
    embedding_out = embedding_layer(input_vec)

    def conv_branch(kernel_size):
        # Convolution -> pooling -> flatten for one kernel size
        convolved = Conv1D(filters=100, kernel_size=kernel_size, activation='relu', kernel_constraint=max_norm(3))(embedding_out)
        pooled = MaxPooling1D(MAX_SEQUENCE_LENGTH - kernel_size + 1)(convolved)
        return Flatten()(pooled)

    # Branches are created in the order 3, 4, 5
    branches = [conv_branch(kernel_size) for kernel_size in (3, 4, 5)]

    hidden = Concatenate()(branches)
    hidden = Dropout(0.5)(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    # Pack it all up into a model
    return Model(inputs=input_vec, outputs=output)

### Model Definition

In [None]:
# Build the model whose embedding layer starts from the GloVe-based matrix
# and remains trainable
non_static_model = train_non_static_model(embedding_matrix)

In [None]:
# Print the model's layer summary
non_static_model.summary()

### Model Compilation

In [None]:
# Binary classification set-up: cross-entropy loss, Adam, accuracy metric
non_static_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model Fit

In [None]:
# Checkpoint callback: write the model to `filepath` only when the
# monitored validation accuracy reaches a new maximum
filepath = MODEL_DIR + 'non_static_model_best.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train the model; the returned object is kept in non_static_model_trained.
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras
# versions reject.
non_static_model_trained = non_static_model.fit(
    x=train_indices,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=callbacks_list,
    verbose=2,
    validation_data=(val_indices, Y_val),
)

### Save Model

In [None]:
# Serialize model to JSON
# Only the architecture is written here; this cell does not write weights
non_static_model_json = non_static_model.to_json()

with open(MODEL_DIR + 'non_static_model_json.json', 'w') as json_file:
    json_file.write(non_static_model_json)

print('Saved Non Static model architecture to the disk')

### Load Model

In [None]:
# Load architecture
# Rebuild the model from the JSON description saved earlier
with open(MODEL_DIR + 'non_static_model_json.json', 'r') as json_file:
    model_json = json_file.read()

non_static_model = model_from_json(model_json)

# Load weights
# Restored from the file written by the ModelCheckpoint callback
non_static_model.load_weights(MODEL_DIR + 'non_static_model_best.hdf5')
print('Loaded Non Static model from the disk')

## Multi-Channel Model

In [None]:
def train_multichannel_model(embedding_pretrained):
    """Build (without compiling or fitting) the two-channel CNN.

    Two embedding layers are initialised from `embedding_pretrained`: one
    frozen, one trainable. For each kernel size (3, 4, 5) a single Conv1D
    layer (100 filters) is applied to both embedding outputs, so its weights
    are shared across the channels, and the two results are added. The sum
    is max-pooled over a window of MAX_SEQUENCE_LENGTH - kernel_size + 1
    steps and flattened. The three branches are concatenated, passed
    through dropout and a single sigmoid unit.

    Args:
        embedding_pretrained: Weight matrix handed to both Embedding layers.
            The layers are declared with VOCAB_SIZE + 1 rows and 300 columns.

    Returns:
        The uncompiled Keras Model.
    """
    # Integer word indices, one row per padded review
    input_vec = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Input_Layer', dtype='int32')

    # Channel 1 keeps the pre-trained vectors fixed, channel 2 fine-tunes them
    embedding_layer1 = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embedding_pretrained], trainable=False)
    embedding_layer2 = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embedding_pretrained], trainable=True)

    # Embedded version of the inputs
    embedding_out1 = embedding_layer1(input_vec)
    embedding_out2 = embedding_layer2(input_vec)

    branch_outputs = []
    for kernel_size in (3, 4, 5):
        # One convolution layer applied to both channels (shared weights)
        conv = Conv1D(filters=100, kernel_size=kernel_size, activation='relu', kernel_constraint=max_norm(3))
        summed = Add()([conv(embedding_out1), conv(embedding_out2)])
        # The pool window must use this branch's own kernel size: the conv
        # output is only MAX_SEQUENCE_LENGTH - kernel_size + 1 steps long,
        # so reusing the size-3 window for kernels 4 and 5 would be too long.
        pooled = MaxPooling1D(MAX_SEQUENCE_LENGTH - kernel_size + 1)(summed)
        branch_outputs.append(Flatten()(pooled))

    final_out = Concatenate()(branch_outputs)

    final_out = Dropout(0.5)(final_out)

    final_out = Dense(1, activation='sigmoid')(final_out)

    # Pack it all up into a model
    model = Model(inputs=input_vec, outputs=final_out)

    return model

### Model Definition

In [None]:
# Build the two-channel model from the GloVe-based embedding matrix
multi_channel_model = train_multichannel_model(embedding_matrix)

In [None]:
# Print the model's layer summary
multi_channel_model.summary()

### Model Compilation

In [None]:
# Binary classification set-up: cross-entropy loss, Adam, accuracy metric
multi_channel_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model Fit

In [None]:
# Checkpoint callback: write the model to `filepath` only when the
# monitored validation accuracy reaches a new maximum
filepath = MODEL_DIR + 'multi_channel_model_best.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train the model; the returned object is kept in
# multi_channel_model_trained. `epochs` replaces the legacy `nb_epoch`
# keyword, which current Keras versions reject.
multi_channel_model_trained = multi_channel_model.fit(
    x=train_indices,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=callbacks_list,
    verbose=2,
    validation_data=(val_indices, Y_val),
)

### Save Model

In [None]:
# Serialize model to JSON
# Only the architecture is written here; this cell does not write weights
multi_channel_model_json = multi_channel_model.to_json()

with open(MODEL_DIR + 'multi_channel_model_json.json', 'w') as json_file:
    # Write the JSON string, not the model object itself
    json_file.write(multi_channel_model_json)

print('Saved Multi Channel model architecture to the disk')

### Load Model

In [None]:
# Load architecture
# Rebuild the model from the JSON description saved earlier
with open(MODEL_DIR + 'multi_channel_model_json.json', 'r') as json_file:
    model_json = json_file.read()

multi_channel_model = model_from_json(model_json)

# Load weights
# Restored from the file written by the ModelCheckpoint callback
multi_channel_model.load_weights(MODEL_DIR + 'multi_channel_model_best.hdf5')
print('Loaded Multi Channel model from the disk')