# Sentiment Analysis using Deep Contextual Word Representations

Outputs have been removed intentionally because ELMo is compute-intensive. Training was carried out in a GPU-accelerated environment.
The model requires the ELMo representations, which are downloaded in the second cell.

# Import

In [None]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import tensorflow_hub as hub
import tensorflow as tf
import keras
import html
import nltk
from nltk.corpus import stopwords
import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import sklearn.metrics as sklm
from keras import optimizers

In [None]:
#Instantiating ELMo(Embeddings from Language Models)
# Creates the TF-Hub module object for ELMo v2 from the URL below; every
# embedding cell in this notebook calls this object. trainable=True is
# forwarded to hub.Module as-is.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [None]:
# Session used by every sess.run(...) call below; it is closed in the last cell.
sess = tf.InteractiveSession()

In [None]:
# Initialise all global variables currently in the graph (the ELMo module was
# instantiated before this cell).
sess.run(tf.global_variables_initializer())

In [None]:
#Testing ELMo
# Two toy sentences are pushed through the module's "default" signature and
# the "elmo" entry of the returned dict is kept for evaluation in the next cell.
sample_sentences = ["the cat is on the mat", "dogs are in the fog"]
elmo_outputs = elmo_model(sample_sentences, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]

In [None]:
# Evaluate the sample embedding tensor in the session and display the shape
# of the resulting array.
e = sess.run(embeddings)
e.shape

# Data Preprocessing

In [None]:
# Display-only: shows NLTK's English stopword set. The result is not stored;
# the data-preparation cell below deliberately keeps stopwords.
set(stopwords.words('english'))

In [None]:
#Regular Expressions
# Raw string literals keep the backslashes for the regex engine; the non-raw
# forms '\s+' / '\w+' are invalid string escapes and trigger warnings.
# whitespace_re: one or more whitespace characters (used to split label from text).
whitespace_re = re.compile(r'\s+')
# word_re: one or more word characters (used to extract tokens).
word_re = re.compile(r'\w+')

In [None]:
#Data Preparation begins from here
#Not removing stopwords to encompass semantics and polysemy using ELMo model
# x collects the cleaned review texts and y the label token that precedes
# each review on its line. Labels are converted to int in the next cell.
x = []
y = []
with open('amazon_reviews.txt', 'r', encoding='utf-8') as fr:
    # The first line of the file is skipped, exactly as before.
    next(fr, None)
    for line in fr:
        line = line.strip()
        if not line:
            # A blank line has no label; keeping it would put an empty
            # string into y and break the int() conversion later on.
            continue
        # First whitespace-delimited token is the label, the rest is the text.
        parts = whitespace_re.split(line, maxsplit=1)
        classification = parts[0]
        # A label-only line still yields a (empty) text instead of raising.
        text = parts[1] if len(parts) > 1 else ''
        text = html.unescape(text.lower())
        # Keep only word-character runs, joined by single spaces.
        text = ' '.join(word_re.findall(text))
        x.append(text)
        y.append(classification)

In [None]:
#Binarizing Labels
# Label tokens are parsed as integers, then a LabelBinarizer is fitted on them
# and used to transform them in a single step. lb is kept for later use.
lb = LabelBinarizer()
y = lb.fit_transform([int(label) for label in y])

In [None]:
# Size of the last axis of the binarized label array; used as the width of the
# model's output Dense layer.
num_classes = y.shape[-1]

In [None]:
#Always a good practice to validate code, therefore checking Y and X
# Display-only: first ten rows of the binarized labels.
y[:10]

In [None]:
# Display-only: first hundred cleaned review texts.
x[:100]

In [None]:
#Computing Max sentence length in order to enable padding
# Longest review, counted in whitespace-separated tokens; 0 when x is empty.
max_sentence_length = max((len(sentence.split()) for sentence in x), default=0)
max_sentence_length

In [None]:
#Padding Function Definition
def pad(e, sentence_length=None):
    """Zero-pad (or truncate) a batch of embeddings along its second axis.

    Args:
        e: 3-D array; its axes are unpacked as
            (num_sentences, old_sentence_length, embedding_length).
        sentence_length: Target size of the second axis. When omitted, the
            notebook-level ``max_sentence_length`` is read at call time, so
            re-running the cell that computes it is picked up automatically.

    Returns:
        A new array from ``np.zeros`` (float64) of shape
        (num_sentences, sentence_length, embedding_length). Positions beyond
        the input length are zero; input positions beyond ``sentence_length``
        are dropped instead of raising a broadcast error.
    """
    # https://stackoverflow.com/questions/35751306/python-how-to-pad-numpy-array-with-zeros
    if sentence_length is None:
        sentence_length = max_sentence_length
    num_sentences, old_sentence_length, embedding_length = e.shape
    # Number of token positions that fit into the target length.
    kept = min(old_sentence_length, sentence_length)
    e2 = np.zeros((num_sentences, sentence_length, embedding_length))
    e2[:, :kept, :] = e[:, :kept, :]
    return e2

# Embeddings for input data using ELMo

In [None]:
%%time
# Embed the training texts in batches of `step` sentences and pad every batch
# to the common sentence length so the batches can be concatenated.
all_embeddings = []
step = 500

# Apply the ELMo module once, on a string placeholder, and feed each batch
# into it. Calling elmo_model(...) inside the loop would add a fresh copy of
# the module's ops to the graph on every iteration.
sentences_ph = tf.placeholder(tf.string, shape=[None])
elmo_op = elmo_model(
    sentences_ph,
    signature="default",
    as_dict=True
)["elmo"]

# Stepping through the start offsets never produces an empty trailing batch.
for left in range(0, len(x), step):
    this_x = x[left:left + step]
    e = sess.run(elmo_op, feed_dict={sentences_ph: this_x})
    all_embeddings.append(pad(e))
all_embeddings = np.concatenate(all_embeddings)
all_embeddings.shape

In [None]:
# Display-only: shape of the concatenated training embeddings.
all_embeddings.shape

In [None]:
# Size of the last axis of the embeddings; used in the LSTM input_shape below.
embedding_vector_length = all_embeddings.shape[-1]

In [None]:
# Display-only: shape of the concatenated training embeddings (repeated check).
all_embeddings.shape

# Custom callback: per-epoch F1 score and confusion matrix

In [None]:
class Metrics(Callback):
    """Keras callback that scores the validation data after every epoch.

    After each epoch the model's predictions on ``self.validation_data[0]``
    are rounded and compared with ``self.validation_data[1]``. The
    micro-averaged F1, precision and recall and the confusion matrix are
    appended to the lists below, and F1 plus the confusion matrix are printed.
    """

    def __init__(self):
        # Let the Callback base class set up its own attributes first.
        super().__init__()
        self.val_f1s = []         # one micro-F1 value per epoch
        self.val_recalls = []     # one micro-recall value per epoch
        self.val_precisions = []  # one micro-precision value per epoch
        self.confusion = []       # one confusion matrix per epoch

    def on_train_begin(self, logs=None):
        pass

    def on_epoch_end(self, epoch, logs=None):
        # Predict once per epoch; the rounded scores serve as 0/1 indicators.
        predict = np.round(np.asarray(self.model.predict(self.validation_data[0])))
        targ = self.validation_data[1]

        self.val_f1s.append(sklm.f1_score(targ, predict, average='micro'))
        self.val_precisions.append(sklm.precision_score(targ, predict, average='micro'))
        self.val_recalls.append(sklm.recall_score(targ, predict, average='micro'))
        # The confusion matrix compares the arg-max column of targets and predictions.
        self.confusion.append(sklm.confusion_matrix(targ.argmax(axis=1),predict.argmax(axis=1)))
        print('F1: %s confusion:\n%s' % (self.val_f1s[-1], self.confusion[-1]))
        return


In [None]:
# Callback instance; the fit cell below only references it in a comment
# (callbacks=[metrics]), so it is not active during training.
metrics = Metrics()

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# Hold out 1% of the embedded reviews, stratified on the arg-max label column.
# Labels are cut to the number of embedded rows before splitting.
n_samples = all_embeddings.shape[0]
labels = y[:n_samples]
X_train, X_test, y_train, y_test = train_test_split(
    all_embeddings,
    labels,
    test_size=.01,
    stratify=labels.argmax(axis=1),
)

# Building the model

In [None]:
# https://keras.io/layers/core/
# https://keras.io/getting-started/sequential-model-guide/
# Architecture: one LSTM layer (100 units) over the padded ELMo sequences,
# followed by a softmax Dense layer with one unit per label column.
model = keras.Sequential()
model.add(keras.layers.LSTM(
    100,
    input_shape=(max_sentence_length,embedding_vector_length),
))
model.add(keras.layers.Dense(
    num_classes,
    # Hands-On Machine Learning with Scikit-Learn and TensorFlow, ed. 1 by Aurélien Géron
    activation='softmax',
))
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(
    # https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks
    # A softmax output over one-hot targets pairs with categorical
    # cross-entropy. With 'binary_crossentropy' every output unit is scored
    # as an independent yes/no decision and Keras 2.x resolves 'accuracy' to
    # binary accuracy, which overstates multi-class performance.
    # NOTE(review): assumes more than two label values, i.e. num_classes > 1;
    # confirm against the data.
    loss='categorical_crossentropy',
    optimizer=sgd,
    metrics=['accuracy']
)
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

In [None]:
# Stand-alone demonstration of to_categorical on the class indices 0, 1, 2.
# y_binary is only displayed in the next cell and not used for training.
from keras.utils import to_categorical
y_binary = to_categorical([0, 1, 2])

In [None]:
# Display-only: the to_categorical demo output.
y_binary

# Fitting the model

In [None]:
#Model training with batch size and epochs
# Trains on the training split and validates on the held-out split produced
# by train_test_split above.
# The Metrics callback is not attached; pass callbacks=[metrics] to enable it.
# Every fit() option not listed here is left at its Keras default.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
)

# Prediction

In [None]:
#testing data
# x_test receives exactly one cleaned text per input line, so row k of the
# predictions corresponds to line k of the file.
x_test = []

with open('clean_test.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        # The first whitespace-delimited token is dropped, the rest is the text.
        parts = whitespace_re.split(line, maxsplit=1)
        # A line without any whitespace yields an empty text instead of
        # failing on tuple unpacking.
        text = parts[1] if len(parts) > 1 else ''
        # Lower-case like the training texts: the model was trained on
        # embeddings of lower-cased input.
        text = text.lower().strip()
        text = html.unescape(text)
        # Keep only word-character runs, joined by single spaces.
        text = ' '.join(word_re.findall(text))
        x_test.append(text)

In [None]:
# Display-only: first ten cleaned test texts.
x_test[:10]

In [None]:
%%time
#Test Embeddings
# Embed the test texts in batches of `step` sentences and pad every batch via
# pad() so the batches can be concatenated.
test_embeddings = []
step = 500

# Apply the ELMo module once, on a string placeholder, and feed each batch
# into it. Calling elmo_model(...) inside the loop would add a fresh copy of
# the module's ops to the graph on every iteration.
test_sentences_ph = tf.placeholder(tf.string, shape=[None])
test_elmo_op = elmo_model(
    test_sentences_ph,
    signature="default",
    as_dict=True
)["elmo"]

# Stepping through the start offsets never produces an empty trailing batch.
for left in range(0, len(x_test), step):
    this_xtest = x_test[left:left + step]
    e_test = sess.run(test_elmo_op, feed_dict={test_sentences_ph: this_xtest})
    test_embeddings.append(pad(e_test))
test_embeddings = np.concatenate(test_embeddings)
test_embeddings.shape

In [None]:
# Display-only: shape of the concatenated test embeddings.
test_embeddings.shape

In [None]:
# NOTE: this rebinds y_test. The hold-out labels returned by train_test_split
# are replaced by the model's outputs for test_embeddings.
y_test = model.predict(test_embeddings)

In [None]:
# Index of the highest-scoring output column for every test row.
y_classes = y_test.argmax(axis=1)

In [None]:
#Generating Prediction File
# Translate each predicted column index back to the original label through the
# fitted LabelBinarizer. lb.classes_ lists the labels in column order, so this
# equals the former `index + 1` whenever the labels are exactly 1..k and stays
# correct for any other label set.
predicted_labels = lb.classes_[y_classes]
with open("prediction.txt", "w", encoding="utf-8") as file:
    # One label per line, in the order of the test rows.
    file.writelines(str(label) + "\n" for label in predicted_labels)

In [None]:
# Close the TensorFlow session opened at the top of the notebook.
sess.close()