# Problem 6 - Sentiment Analysis using recurrent models 20 points

In this problem, you will compare the performance of RNN, LSTM, GRU and BiLSTM for the task of sentiment analysis. You’ll use the IMDB sentiment analysis dataset for this task - Sentiment Analysis of IMDB Movie Reviews. For each model, use a single cell, and keep the number of units fixed to 256. Train each model for 10 epochs using the Adam optimizer, batch size of 256, and a learning rate of 0.01.

In [5]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

def process_tokens(text):
    """
    function to process tokens, replace any unwanted chars
    """
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

def preprocessing(data):
    """
    preprocessing data to list of tokens
    """
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    preprocessed_data = []
    for sentence in data:
        sentence = process_tokens(sentence)
        tokens = tokenizer(sentence)
        tlist = []
        for token in tokens:
            tlist.append(str(token))
        preprocessed_data.append(tlist)
    return preprocessed_data

1. Import the dataset and convert it into vector form using Bag of Words technique.(2) 

In [1]:
!gdown 1o5Hu9mOZsXxhIbPEov80LsxxCs5A2_7W -O './imdb.csv'

Downloading...
From: https://drive.google.com/uc?id=1o5Hu9mOZsXxhIbPEov80LsxxCs5A2_7W
To: /content/imdb.csv
100% 66.2M/66.2M [00:02<00:00, 32.8MB/s]


In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("imdb.csv", usecols=["review", "sentiment"], encoding='latin-1')
## 1 - positive, 0 - negative
df.sentiment = (df.sentiment == "positive").astype("int")
df.head()

val_size = int(df.shape[0] * 0.15)
test_size = int(df.shape[0] * 0.15)


def train_val_test_split(df=None, train_percent=0.7, test_percent=0.15, val_percent=0.15):
  df = df.sample(frac=1)
  train_df = df[: int(len(df)*train_percent)]
  test_df = df[int(len(df)*train_percent)+1 : int(len(df)*(train_percent+test_percent))]
  val_df = df[int(len(df)*(train_percent + test_percent))+1 : ]
  return train_df, test_df, val_df

train_df, test_df, val_df = train_val_test_split(df, 0.7, 0.15, 0.15)
train_labels, train_texts = train_df.values[:,1], train_df.values[:,0]
val_labels, val_texts = val_df.values[:,1], val_df.values[:,0]
test_labels, test_texts = test_df.values[:,1], test_df.values[:,0]
print(len(train_df), len(test_df), len(val_df))
print(len(train_texts), len(train_labels), len(val_df))


train_data = preprocessing(train_texts)
val_data = preprocessing(val_texts)
test_data = preprocessing(test_texts)

35000 7499 7499
35000 35000 7499


In [17]:
import numpy as np
import itertools

## Creating a vectorizer to vectorize text and create matrix of features
## Bag of words technique
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        word_dict = {}
        for sentence in dataset:
            for token in sentence:
                if token not in word_dict:
                    word_dict[token] = 1
                else:
                    word_dict[token] += 1
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
        end_to_slice = min(len(word_dict), self.max_features)
        word_dict = dict(itertools.islice(word_dict.items(), end_to_slice))
        self.vocab_list = list(word_dict.keys())
        self.token_to_index = {}
        counter = 0
        for token in self.vocab_list:
            self.token_to_index[token] = counter
            counter += 1


    def transform(self, dataset):
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i, sentence in enumerate(dataset):
            for token in sentence:
                if token in self.token_to_index:
                    data_matrix[i, self.token_to_index[token]] += 1
        return data_matrix

## max features - top k words to consider only
max_features = 2000 

vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)

## Checking if the len of vocab = k 
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [None]:
vocab

2. Define an RNN model and train it on the dataset (4)

In [18]:
from tensorflow.keras.utils import to_categorical

y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam

model = None
model = Sequential()
model.add(SimpleRNN(128, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_5 (SimpleRNN)    (None, 128)               272512    
                                                                 
 dense_10 (Dense)            (None, 2)                 258       
                                                                 
Total params: 272,770
Trainable params: 272,770
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [30]:
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.34322747588157654
Test accuracy: 0.8715828657150269


3. Define a LSTM model and train it on the dataset (4)


In [31]:
from tensorflow.keras.layers import LSTM

model = None
model = Sequential()
model.add(LSTM(128, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 128)               1090048   
                                                                 
 dense_11 (Dense)            (None, 2)                 258       
                                                                 
Total params: 1,090,306
Trainable params: 1,090,306
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [32]:
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.4387073218822479
Test accuracy: 0.8751833438873291


4. Define a GRU model and train it on the dataset (4)


In [33]:
from tensorflow.keras.layers import GRU

model = None
model = Sequential()
model.add(GRU(128, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 (None, 128)               817920    
                                                                 
 dense_12 (Dense)            (None, 2)                 258       
                                                                 
Total params: 818,178
Trainable params: 818,178
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.44197070598602295
Test accuracy: 0.8702493906021118


In [34]:
# check predictions
from tensorflow.keras.backend import argmax

y_pred = model.predict(X_test)
for i in range(5):
  print(f'Label predicted: {argmax(y_pred[i]).numpy()}, Actual label: {argmax(y_test[i]).numpy()}')
  print(f'text: {test_texts[i]}')

Label predicted: 1, Actual label: 1
text: Love Jones cleverly portrays young African-American men and women in a clear, positive, realistic sense. I feel that all of the actors and actresses were magnificent and really did a great job at capturing the mood. Nia Long and Larenz Tate worked well together and I hope to see more work from the two of them. As a matter of fact all of the actors/actresses did such a fine job it would be great to see another romantic-comedy from them. This movie can be compared to most any well-written, romantic comedy. If you have not seen this movie already I strongly recommend that you do, it can definitely give you another perspective on life and love.
Label predicted: 0, Actual label: 0
text: This version is very painful to watch. All of the acting is very stilted but especially that of Norma Shearer who is still acting as though she were in a silent movie instead of a talkie. Check out the 1937 version with Joan Crawford, Robert Montgomery and William Po

5. Define a BiLSTM model and train it on the dataset (4)


In [25]:
from tensorflow.keras.layers import Bidirectional

model = None
model = Sequential()
model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=(1, max_features)))
model.add(Bidirectional(LSTM(256)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1, 512)           4622336   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 512)              1574912   
 nal)                                                            
                                                                 
 dense_7 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 6,198,274
Trainable params: 6,198,274
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [27]:
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.3505217730998993
Test accuracy: 0.8695825934410095



6. Compare the performance of all the models. In which case do you get the best accuracy? (2)

Simple RNN:
Test loss: 0.34322747588157654
Test accuracy: 0.8715828657150269

LSTM:
Test loss: 0.4387073218822479
Test accuracy: 0.8751833438873291

GRU:
Test loss: 0.44197070598602295
Test accuracy: 0.8702493906021118

Bi LSTM
Test loss: 0.3505217730998993
Test accuracy: 0.8695825934410095

Best Accuracy was given by the simple RNN model. 