# Sentiment Classification with IMDB Data




## Setup

In [None]:
import numpy as np
from tensorflow import keras

import h5py
import matplotlib.pyplot as plt
import time
import scipy
from PIL import Image
from scipy import ndimage

max_features = 5000  # Vocabulary size: passed as num_words when loading, so only the 5k most frequent words keep their own id
maxlen = 200  # Every review is padded/truncated to 200 token ids; pad_sequences is called with its default truncating="pre", which keeps the LAST 200 tokens


## Load the IMDB movie review sentiment data

In [None]:
# Load the pre-tokenised reviews, restricting the vocabulary to `max_features` word ids
(x_train_orig, y_train_orig), (x_val_orig, y_val_orig) = (
    keras.datasets.imdb.load_data(num_words=max_features)
)
print(f"{len(x_train_orig)} Training sequences")
print(f"{len(x_val_orig)} Validation sequences")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 Training sequences
25000 Validation sequences


In [None]:
# Inputs and labels are both 1-D at this point: one entry per review, no fixed sequence axis yet
print(x_train_orig.shape, y_train_orig.shape)

(25000,) (25000,)


In [None]:
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()
# load_data() (with its defaults, as used above) shifts every word id by
# index_from = 3 and reserves 0 = padding, 1 = start-of-sequence,
# 2 = out-of-vocabulary. The reverse lookup has to apply the same shift,
# otherwise every id decodes to the wrong word.
INDEX_FROM = 3
inverted_word_index = {i + INDEX_FROM: word for word, i in word_index.items()}
inverted_word_index.update({0: "[PAD]", 1: "[START]", 2: "[OOV]"})
# Example: decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train_orig[0])
print(decoded_sequence)
# Sentiment label of that same review
print(y_train_orig[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s and with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over and for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but 

In [None]:
# Raw word ids of the first review, before any padding or truncation
print(x_train_orig[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [None]:
# Bring every review to exactly `maxlen` token ids so the data forms a 2-D array
pad = keras.preprocessing.sequence.pad_sequences
x_train = pad(x_train_orig, maxlen=maxlen)
x_val = pad(x_val_orig, maxlen=maxlen)

print(x_train.shape, x_val.shape)

(25000, 200) (25000, 200)


In [None]:
# Shapes of the original integer label vectors
print(y_train_orig.shape)
print(y_val_orig.shape)

# One-hot copies with two columns, for models whose last layer is a 2-way softmax
y_train_cat, y_val_cat = (
    keras.utils.to_categorical(labels, num_classes=2)
    for labels in (y_train_orig, y_val_orig)
)

print(y_train_cat.shape)
print(y_val_cat.shape)

In [None]:
# Inspect one padded review together with its one-hot label
print(x_train[2])
print(y_train_cat[2])

## Train and evaluate the model

In [None]:
# model0: identity-initialised 256-d embedding -> SimpleRNN(32) -> one sigmoid unit.
# The vocabulary size is taken from `max_features` defined in the Setup cell
# (the same value passed as `num_words` when loading the data) instead of being
# re-declared here, so the two can never silently disagree.
inputs = keras.Input(shape=(None,), dtype="int32")
x = keras.layers.Embedding(
    input_dim=max_features,
    output_dim=256,
    embeddings_initializer=keras.initializers.Identity(gain=0.5),
    trainable=True,
)(inputs)
x = keras.layers.SimpleRNN(units=32)(x)
# Single sigmoid output: pairs with binary cross-entropy on 0/1 labels
outputs = keras.layers.Dense(units=1, activation='sigmoid')(x)
model0 = keras.Model(inputs=inputs, outputs=outputs)
model0.summary()

In [None]:
# model1: Embedding(128) -> SimpleRNN(64) -> 2-way softmax
# Token ids arrive as int32 sequences of arbitrary length
inputs = keras.Input(shape=(None,), dtype="int32")
# Look up a 128-dimensional vector for every token id
x = keras.layers.Embedding(input_dim=max_features, output_dim=128)(inputs)
# Run a 64-unit simple recurrent layer over the embedded sequence
x = keras.layers.SimpleRNN(units=64)(x)
# Classifier head: two softmax outputs, one per class
outputs = keras.layers.Dense(units=2, activation="softmax")(x)
model1 = keras.Model(inputs=inputs, outputs=outputs)
model1.summary()

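Use `BinaryCrossentropy` when the label is a scalar $y \in \{0,1\}$ (a model ending in a single sigmoid unit). If $y$ is a two-dimensional one-hot vector, i.e. exactly one of its two entries is 1, use `CategoricalCrossentropy` (a model ending in a softmax layer). Pass the matching labels: `y_train_orig` / `y_val_orig` in the first case, `y_train_cat` / `y_val_cat` in the second.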

In [None]:
def train_model(model, epochs):
  """Compile `model` with Adam and fit it on the padded IMDB data.

  The loss and the label arrays are chosen from the width of the model's
  output layer, so that both kinds of model defined in this notebook train
  with a matching target:

  * 1 output unit (sigmoid)   -> BinaryCrossentropy on the integer labels
    `y_train_orig` / `y_val_orig`.
  * otherwise (2-way softmax) -> CategoricalCrossentropy on the one-hot
    labels `y_train_cat` / `y_val_cat`.

  Args:
    model: a Keras model exposing `output_shape`, `compile` and `fit`.
    epochs: number of passes over the training data.

  Returns:
    Whatever `model.fit` returns (its `history` is what `plot_training` reads).
  """
  optimizer = keras.optimizers.Adam(learning_rate=0.001)

  if model.output_shape[-1] == 1:
    # One probability per example: compare against scalar 0/1 labels
    loss = keras.losses.BinaryCrossentropy(from_logits=False)
    y_train, y_val = y_train_orig, y_val_orig
  else:
    # One probability per class: compare against one-hot labels
    loss = keras.losses.CategoricalCrossentropy(from_logits=False)
    y_train, y_val = y_train_cat, y_val_cat

  model.compile(optimizer=optimizer,
                loss=loss,
                metrics=['accuracy'])
  return model.fit(x_train, y_train, batch_size=256, epochs=epochs,
                   validation_data=(x_val, y_val))

def plot_training(trained_model, epochs):
  """Draw the training and validation loss curves of a finished run.

  Args:
    trained_model: object whose `history` mapping holds the per-epoch
      'loss' and 'val_loss' sequences (the value returned by `train_model`).
    epochs: number of epochs to show; the x-axis runs from 1 to `epochs`,
      so it should equal the length of the recorded curves.
  """
  curves = trained_model.history
  epoch_axis = range(1, epochs + 1)
  # A single call draws both curves: training in red, validation in blue
  plt.plot(epoch_axis, curves['loss'], 'r',
           epoch_axis, curves['val_loss'], 'b')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend(['Training Loss', 'Validation Loss'])

In [None]:
# Train the SimpleRNN model (model1) for 3 epochs; keep train_model's return value for plotting
trained_model1 = train_model(model1, 3)

In [None]:
# Loss curves for model1; the epoch count must match the 3 epochs used for training
plot_training(trained_model1, 3)

In [None]:
# model2: Embedding(128) -> two stacked bidirectional LSTMs -> one sigmoid unit
# Token ids arrive as int32 sequences of arbitrary length
inputs = keras.Input(shape=(None,), dtype="int32")
# Look up a 128-dimensional vector for every token id
x = keras.layers.Embedding(input_dim=max_features, output_dim=128)(inputs)
# First recurrent layer returns the whole sequence so a second one can be stacked on it
x = keras.layers.Bidirectional(
    keras.layers.LSTM(units=64, return_sequences=True)
)(x)
# Second recurrent layer returns only its final output
x = keras.layers.Bidirectional(keras.layers.LSTM(units=64))(x)
# Classifier head: a single sigmoid output
outputs = keras.layers.Dense(units=1, activation="sigmoid")(x)
model2 = keras.Model(inputs=inputs, outputs=outputs)
model2.summary()

In [None]:
# Train the bidirectional-LSTM model (model2) for 3 epochs; keep train_model's return value for plotting
trained_model2 = train_model(model2, 3)

In [None]:
# Loss curves for model2; the epoch count must match the 3 epochs used for training
plot_training(trained_model2, 3)