### Importing Libraries

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
import numpy as np
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing
from sklearn.model_selection import train_test_split
# Tools for building a model
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

# Modules for visualization
import matplotlib.pyplot as plt
import seaborn as sb

# Tools for assessing the quality of model prediction
from sklearn.metrics import accuracy_score, confusion_matrix

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

### Loading Dataset

In [None]:
# Read the encoded RUSA reviews from the Kaggle input directory.
dataset = pd.read_csv("../input/encoded-rusa/Cleaned_Encoded RUSA Dataset.csv")

# Discard the saved index column, the raw/cleaned text and the listed encoding columns.
dataset = dataset.drop(
    columns=[
        'Unnamed: 0', 'Review', 'Cleaned_Review', 'Soundex', 'RefinedSoundex',
        'MetaSoundex', 'FuzzySoundex', 'LEIN', 'NYSIIS', 'Caverphone',
        'SoundD', 'Metaphone',
    ]
)

# The standardized text takes over the 'Review' column name.
dataset = dataset.rename(columns={'Standardized_Review': 'Review'})

# Encode the labels as integers: positive -> 0, negative -> 1.
dataset['Sentiment'] = (
    dataset['Sentiment']
    .replace('positive', 0)
    .replace('negative', 1)
)
dataset.head()

### Preparing model for Training the Embeddings

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Split every review on whitespace, giving one list of tokens per review.
# NOTE(review): assumes every 'Review' value is a string; a missing value (NaN) has no .split().
sentences = [row.split() for row in dataset['Review']]

In [None]:
# Collect phrase (bigram) statistics over the tokenized reviews.
# min_count=5 and progress_per=100 are passed straight to gensim's Phrases.
phrases = Phrases(sentences, min_count=5, progress_per=100)

In [None]:
# Wrap the collected phrase statistics in a Phraser, which is used below to transform the sentences.
bigram = Phraser(phrases)

In [None]:
# Apply the phrase model to the corpus.
# NOTE(review): this overwrites `sentences` with its own transformation, so re-running
# the cell applies the phrase model again to already-transformed input.
sentences = bigram[sentences]

In [None]:
# Tally how often each token occurs in the phrase-transformed corpus.
word_freq = defaultdict(int)
for token in (tok for sentence in sentences for tok in sentence):
    word_freq[token] += 1

# Number of distinct tokens.
len(word_freq)

In [None]:
# The ten most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
import multiprocessing

from gensim.models import Word2Vec

In [None]:
# Number of CPU cores reported by the OS; used below to size the training worker pool.
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

In [None]:
from gensim.models import FastText

In [None]:
# Configure the embedding model (no vocabulary or training yet).
# NOTE: the variable is called `w2v_model`, but the model is a gensim FastText instance.
w2v_model = FastText(window=10,        # context window size
                     vector_size=300,  # embedding dimensionality
                     sample=6e-5,      # down-sampling threshold for frequent words
                     alpha=0.03,       # initial learning rate
                     min_alpha=0.0007, # final learning rate
                     negative=20,      # negative samples per positive example
                     # Leave one core free, but never request zero workers
                     # (cores - 1 would be 0 on a single-core machine).
                     workers=max(1, cores - 1))

In [None]:
# Build the model vocabulary from the phrase-transformed corpus and report how long it took.
t = time()

w2v_model.build_vocab(sentences, progress_per=1000)

# Elapsed wall-clock time, converted from seconds to minutes.
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Corpus size recorded on the model (read from w2v_model.corpus_count).
total_examples=w2v_model.corpus_count
print(total_examples)

In [None]:
# Train the embedding model for 100 epochs over the corpus and report how long it took.
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

# Elapsed wall-clock time, converted from seconds to minutes.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Sanity check of the learned vectors: nearest neighbours of the token "ghatia".
w2v_model.wv.most_similar(positive=["ghatia"])

### Some helper functions

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one document.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of a single document.
    vector : object
        Trained gensim model exposing ``.wv`` (keyed vectors with a
        ``key_to_index`` mapping and ``wv[word]`` lookup).
    generate_missing : bool, default False
        If True, out-of-vocabulary tokens get a random vector from
        ``np.random.rand(k)``; otherwise they contribute a zero vector.
    k : int, default 300
        Embedding dimensionality; must match the model's vector size.

    Returns
    -------
    numpy.ndarray
        Mean vector of length ``k``; all zeros when ``tokens_list`` is empty.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    keyed_vectors = vector.wv
    # Membership test against the existing vocabulary dict is O(1) per token.
    # The previous code rebuilt a set of the entire vocabulary on every call,
    # i.e. once per document.
    vocab = keyed_vectors.key_to_index
    # Out-of-vocabulary tokens still count towards the denominator of the mean.
    make_missing = np.random.rand if generate_missing else np.zeros
    vectorized = [
        keyed_vectors[word] if word in vocab else make_missing(k)
        for word in tokens_list
    ]
    return np.divide(np.sum(vectorized, axis=0), len(vectorized))

def get_word2vec_embeddings(vectors, data, generate_missing=False):
    """Compute one averaged embedding per row of ``data``.

    ``data`` is expected to support ``.apply`` (e.g. a pandas Series of token
    lists). Each element is passed to ``get_average_word2vec`` together with
    ``vectors`` and ``generate_missing``. The results are returned as a list.
    """
    def _embed(tokens):
        # Delegate to the per-document averaging helper.
        return get_average_word2vec(tokens, vectors, generate_missing=generate_missing)

    return list(data.apply(_embed))

### Preparing Data and getting Word Embeddings

In [None]:
# Features: a one-column DataFrame holding only 'Review' (boolean column mask keeps it 2-D).
X= dataset.loc[:,dataset.columns=='Review']
# Target: the integer-encoded sentiment labels.
y = dataset.Sentiment

In [None]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer whose tokens are the runs of word characters matched by \w+.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Work on a copy so the feature frame X is left untouched.
comments = X.copy()

# Force every review to a string, then tokenize it with the regexp tokenizer.
comments['Review'] = comments['Review'].astype('str')
comments['tokens'] = comments['Review'].map(tokenizer.tokenize)

comments.head()

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    comments, y, test_size=0.2, random_state=111)

In [None]:
# Preview the first few training reviews.
X_train['Review'].head()

In [None]:
# One averaged embedding vector per review, stored as one DataFrame row each.
# Tokens missing from the model vocabulary contribute zeros (generate_missing defaults to False).
X_train_embeddings = pd.DataFrame(get_word2vec_embeddings(w2v_model, X_train['tokens']))
X_test_embeddings = pd.DataFrame(get_word2vec_embeddings(w2v_model, X_test['tokens']))

In [None]:
# X_train_embeddings.shape

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# RNN

In [None]:
# Choose a tf.distribute strategy that matches the available hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle. A ValueError here is treated
    # as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Longest training review, measured in whitespace-separated tokens.
X_train['Review'].apply(lambda x:len(str(x).split())).max()

In [None]:
# Fit a Keras tokenizer with no vocabulary cap on the train and test reviews together.
# NOTE(review): the following cell rebuilds and refits an identical tokenizer,
# so `token` and `word_index` from this cell are overwritten there.
token = text.Tokenizer(num_words=None)
token.fit_on_texts(list(X_train['Review']) + list(X_test['Review']))
word_index = token.word_index

In [None]:
# Keras tokenizer with no vocabulary cap (num_words=None).
token = text.Tokenizer(num_words=None)
# Fixed sequence length used for padding and for the Embedding layer's input_length.
max_len = 450

# The vocabulary is fitted on train and test reviews together.
token.fit_on_texts(list(X_train['Review']) + list(X_test['Review']))
# Convert each review into a sequence of integer word ids.
xtrain_seq = token.texts_to_sequences(X_train['Review'])
xvalid_seq = token.texts_to_sequences(X_test['Review'])

# Pad (or truncate) every sequence to exactly max_len entries.
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# word -> integer id mapping; its size determines the Embedding layer's input dimension.
word_index = token.word_index

In [None]:
%%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model.summary()

In [None]:
# Train for 5 epochs; the held-out test split doubles as the validation set,
# so the reported val_* metrics are test-set metrics.
history = model.fit(xtrain_pad, y_train, epochs=5, 
          batch_size=64, validation_data=(xvalid_pad, y_test)) # NOTE(review): batch size is fixed at 64; it is not scaled by strategy.num_replicas_in_sync

In [None]:
def plot_confusion_matrix(y_true, y_pred, ax, class_names, vmax=None,
                          normed=True, title='Confusion matrix'):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap on ``ax``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed to ``confusion_matrix``.
    ax : matplotlib axes
        Axes to draw on.
    class_names : list of str
        Tick labels for both axes.
    vmax : float, optional
        Upper limit of the colour scale, forwarded to the heatmap.
    normed : bool, default True
        If True, each row is divided by its row total before plotting.
    title : str
        Axes title.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normed:
        # Row-wise normalisation: every row is divided by its own sum.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype('float') / row_totals

    heatmap_options = dict(
        vmax=vmax, annot=True, square=True, ax=ax,
        cmap=plt.cm.Blues_r, cbar=False, linecolor='black',
        linewidths=1, xticklabels=class_names,
    )
    sb.heatmap(cm, **heatmap_options)

    ax.set_title(title, y=1.20, fontsize=16)
    ax.set_xlabel('Predicted labels', y=1.10, fontsize=12)
    # Horizontal tick labels on the y axis.
    ax.set_yticklabels(class_names, rotation=0)

In [None]:
%%time
y_train_pred = model.predict_classes(xtrain_pad)
y_test_pred = model.predict_classes(xvalid_pad)

In [None]:
# Side-by-side confusion matrices for the training and the test split.
# Class names follow the label encoding used at load time (positive -> 0, negative -> 1);
# presumably this matches the row/column order of confusion_matrix — confirm.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(10,6))
plot_confusion_matrix(y_train, y_train_pred, ax=axis1,
                      title='Confusion matrix (train data)',
                      class_names=['Positive', 'Negative'])
plot_confusion_matrix(y_test, y_test_pred, ax=axis2,
                      title='Confusion matrix (test data)',
                      class_names=['Positive', 'Negative'])

In [None]:
# history.history

In [None]:
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

# One panel per metric: accuracy on the left, loss on the right.
# Each panel shows the training curve and the validation curve per epoch.
for ax, metric, panel_title, legend_loc in (
    (axis1, 'accuracy', 'Model accuracy', 'upper left'),
    (axis2, 'loss', 'Model loss', 'upper right'),
):
    ax.plot(history.history[metric], label='Train', linewidth=3)
    ax.plot(history.history['val_' + metric], label='Validation', linewidth=3)
    ax.set_title(panel_title, fontsize=16)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(loc=legend_loc)

# Save the figure to disk before displaying it.
plt.savefig("RNN_Graphs")
plt.show()

# LSTM

In [None]:
# Convert the embedding DataFrames into plain NumPy arrays (one row per review).
# NOTE(review): the names are rebound in place, so they no longer refer to DataFrames afterwards.
X_train_embeddings = np.array(X_train_embeddings)
X_test_embeddings = np.array(X_test_embeddings)

In [None]:
# type(X_train_embeddings)

In [None]:
# Append a trailing axis of size 1: (rows, features) -> (rows, features, 1).
# The recurrent models below therefore see each embedding dimension as one timestep
# with a single feature (their input_shape is (300, 1)).
X_train_embeddings = X_train_embeddings.reshape((X_train_embeddings.shape[0], X_train_embeddings.shape[1], 1))
X_test_embeddings = X_test_embeddings.reshape((X_test_embeddings.shape[0], X_test_embeddings.shape[1], 1))

In [None]:
# X_train_embeddings.shape

In [None]:
# X_test_embeddings.shape

In [None]:
%%time
with strategy.scope():
    
    # A simple LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(LSTM(150, dropout=0.3, recurrent_dropout=0.3, input_shape = (300, 1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
model.summary()

In [None]:
# Train for 50 epochs on the reshaped embeddings; the held-out test split doubles
# as the validation set, so the reported val_* metrics are test-set metrics.
history = model.fit(X_train_embeddings, y_train, epochs=50, 
          batch_size=64, validation_data=(X_test_embeddings, y_test))

In [None]:
%%time
y_train_pred = model.predict_classes(X_train_embeddings)
y_test_pred = model.predict_classes(X_test_embeddings)

In [None]:
# Side-by-side confusion matrices for the training and the test split.
# Class names follow the label encoding used at load time (positive -> 0, negative -> 1);
# presumably this matches the row/column order of confusion_matrix — confirm.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(10,6))
plot_confusion_matrix(y_train, y_train_pred, ax=axis1,
                      title='Confusion matrix (train data)',
                      class_names=['Positive', 'Negative'])
plot_confusion_matrix(y_test, y_test_pred, ax=axis2,
                      title='Confusion matrix (test data)',
                      class_names=['Positive', 'Negative'])

In [None]:
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

# One panel per metric: accuracy on the left, loss on the right.
# Each panel shows the training curve and the validation curve per epoch.
for ax, metric, panel_title, legend_loc in (
    (axis1, 'accuracy', 'Model accuracy', 'upper left'),
    (axis2, 'loss', 'Model loss', 'upper right'),
):
    ax.plot(history.history[metric], label='Train', linewidth=3)
    ax.plot(history.history['val_' + metric], label='Validation', linewidth=3)
    ax.set_title(panel_title, fontsize=16)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(loc=legend_loc)

# Save the figure to disk before displaying it.
plt.savefig("LSTM_Graphs")
plt.show()

# Bidirectional LSTM

In [None]:
%%time
with strategy.scope():
    # A simple bidirectional LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3, input_shape = (300, 1))))

    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
    
# model.summary()
history = model.fit(X_train_embeddings, y_train, epochs=50, 
          batch_size=64, validation_data=(X_test_embeddings, y_test))

In [None]:
%%time
y_train_pred = model.predict_classes(X_train_embeddings)
y_test_pred = model.predict_classes(X_test_embeddings)

In [None]:
# Side-by-side confusion matrices for the training and the test split.
# Class names follow the label encoding used at load time (positive -> 0, negative -> 1);
# presumably this matches the row/column order of confusion_matrix — confirm.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(10,6))
plot_confusion_matrix(y_train, y_train_pred, ax=axis1,
                      title='Confusion matrix (train data)',
                      class_names=['Positive', 'Negative'])
plot_confusion_matrix(y_test, y_test_pred, ax=axis2,
                      title='Confusion matrix (test data)',
                      class_names=['Positive', 'Negative'])

In [None]:
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

# One panel per metric: accuracy on the left, loss on the right.
# Each panel shows the training curve and the validation curve per epoch.
for ax, metric, panel_title, legend_loc in (
    (axis1, 'accuracy', 'Model accuracy', 'upper left'),
    (axis2, 'loss', 'Model loss', 'upper right'),
):
    ax.plot(history.history[metric], label='Train', linewidth=3)
    ax.plot(history.history['val_' + metric], label='Validation', linewidth=3)
    ax.set_title(panel_title, fontsize=16)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(loc=legend_loc)

# Save the figure to disk before displaying it.
plt.savefig("Bi_LSTM_Graphs")
plt.show()