# Introduction

## IMDB Movie reviews sentiment classification

Dataset of 25,000 movie reviews from IMDB, labeled by sentiment (positive/negative). Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".

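As a convention, "0" does not stand for a specific word, but instead is used to encode any unknown word. Note that this notebook handles these cases differently: `pad_sequences` fills short reviews with 0 (which the LSTM model masks via `mask_zero=True`), while out-of-vocabulary words are mapped to index `vocabulary_size - 1` by `truncate_vocabulary`.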

Source: https://keras.io/datasets/

# Prepare Dataset

In [None]:
import numpy as np
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): this presumably does not seed the Keras backend's own random
# generator, so weight initialisation may still vary between runs -- confirm.
np.random.seed(1)

## Download Dataset

In [None]:
import keras

# Vocabulary lookup shipped with the IMDB dataset: word -> integer index.
word_to_idx = keras.datasets.imdb.get_word_index()

# Exact inverse of `word_to_idx`: integer index -> word.
# NOTE(review): `load_data()` is called with its defaults below; per the Keras
# documentation the loaded sequences are shifted by `index_from` (default 3),
# so this mapping likely needs that offset before it can decode `x_trn` /
# `x_tst` -- confirm before using it for decoding.
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

# Pre-encoded reviews and their sentiment labels, already split into train/test.
(x_trn, y_trn), (x_tst, y_tst) = keras.datasets.imdb.load_data()

## Dataset Statistics

In [None]:
# Number of tokens in each training review.
review_sizes = [len(review) for review in x_trn]

# Dataset-level counts.
print('  Vocabulary Size: %d' % len(word_to_idx))
print('Training Examples: %d' % len(x_trn))
print('    Test Examples: %d' % len(x_tst))

# Distribution of training review lengths.
print(' Min. Review Size: %0.2f' % np.min(review_sizes))
print(' Max. Review Size: %0.2f' % np.max(review_sizes))
print(' Avg. Review Size: %0.2f' % np.mean(review_sizes))
print(' Std. Review Size: %0.2f' % np.std(review_sizes))

## Truncate Vocabulary

In [None]:
def truncate_vocabulary(x, vocabulary_size):
    """Clamp every word index in ``x`` to the range below ``vocabulary_size``.

    Each review is converted to a NumPy array and any index greater than or
    equal to ``vocabulary_size`` is overwritten with ``vocabulary_size - 1``.
    That last index therefore serves both as a regular vocabulary entry and
    as the bucket for all out-of-vocabulary words.

    Args:
        x: Indexable, assignable collection of reviews, each a sequence of
            integer word indices. It is modified in place.
        vocabulary_size: Number of distinct indices to keep.

    Returns:
        The same ``x`` object, with every review replaced by its clamped
        NumPy array.
    """
    unk_index = vocabulary_size - 1
    for position, tokens in enumerate(x):
        tokens = np.array(tokens)  # copy, so the original review object is untouched
        tokens[tokens >= vocabulary_size] = unk_index
        x[position] = tokens
    return x

In [None]:
# Keep only the most frequent 5% of the full vocabulary.
vocabulary_perc = 0.05
vocabulary_size = int(len(word_to_idx) * vocabulary_perc)

# Clamp both splits to the reduced vocabulary (training split first, then test);
# truncate_vocabulary rewrites each collection in place and returns it.
x_trn, x_tst = (
    truncate_vocabulary(reviews, vocabulary_size) for reviews in (x_trn, x_tst)
)

In [None]:
# Report the reduced vocabulary size that the embedding layers will use.
print('Vocabulary Size: %d' % vocabulary_size)

## Truncate Reviews

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Fixed number of tokens per review expected by the models.
review_size = 500

# Bring every review to exactly `review_size` tokens: longer reviews are cut
# and shorter ones are padded (with pad_sequences' default settings).
x_trn, x_tst = (
    pad_sequences(reviews, maxlen=review_size) for reviews in (x_trn, x_tst)
)

In [None]:
# Display the shapes of the padded training and test arrays.
x_trn.shape, x_tst.shape

## Split Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the training examples as a validation set; the fixed
# random_state makes the split repeatable.
x_trn, x_val, y_trn, y_val = train_test_split(x_trn, y_trn, test_size=0.3, random_state=1)

In [None]:
# Display (features, labels) shapes for the train, validation and test splits.
(x_trn.shape, y_trn.shape), (x_val.shape, y_val.shape), (x_tst.shape, y_tst.shape)

# Train Model

In [None]:
# Import every layer from the public `keras.layers` namespace. The internal
# submodule paths (keras.layers.convolutional, .embeddings, .recurrent,
# .pooling, .core) are implementation details that are not available in
# newer Keras releases, whereas `keras.layers` exposes all of these names.
from keras.layers import (
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    LSTM,
    MaxPooling1D,
)

In [None]:
def evaluate(model, x_trn, y_trn, x_val, y_val, x_tst, y_tst):
    """Print the loss and accuracy of ``model`` on the three data splits.

    ``model.evaluate`` is called once per split, in the order train,
    validation, test. Each call must return exactly two values, which are
    taken to be (loss, accuracy).

    Args:
        model: Object exposing ``evaluate(x, y)``.
        x_trn, y_trn: Training inputs and labels.
        x_val, y_val: Validation inputs and labels.
        x_tst, y_tst: Test inputs and labels.

    Returns:
        None. The results are written to standard output.
    """
    losses = []
    accuracies = []
    for features, labels in ((x_trn, y_trn), (x_val, y_val), (x_tst, y_tst)):
        loss, accuracy = model.evaluate(features, labels)
        losses.append(loss)
        accuracies.append(accuracy)

    # Blank line to separate the summary from any progress output above it.
    print()
    print('    Train / Validation / Test Loss: %f / %f / %f' % tuple(losses))
    print('Train / Validation / Test Accuracy: %f / %f / %f' % tuple(accuracies))

## Neural Network (Single Hidden Layer)

In [None]:
# Fully connected baseline: embed each token, flatten the whole review into
# one vector, pass it through a single hidden layer, and output one sigmoid
# unit for the binary sentiment label.
nn_model = keras.models.Sequential([
    Embedding(vocabulary_size, 32, input_length=review_size),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7, seed=1),  # fixed seed keeps the dropout pattern repeatable
    Dense(1, activation='sigmoid'),
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.summary()

In [None]:
# Train for a single epoch in mini-batches of 32, evaluating on the
# validation split at the end of the epoch.
nn_model.fit(x_trn, y_trn, batch_size=32, epochs=1, validation_data=(x_val, y_val))

In [None]:
# Print loss and accuracy of the trained dense model on all three splits.
evaluate(nn_model, x_trn, y_trn, x_val, y_val, x_tst, y_tst)

## Convolutional Neural Network

In [None]:
# Convolutional variant: a 1-D convolution (64 filters, kernel width 7) over
# the embedded tokens, followed by max pooling, before the same dense head
# used by the baseline model.
cnn_model = keras.models.Sequential([
    Embedding(vocabulary_size, 32, input_length=review_size),
    Conv1D(64, 7, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.2, seed=1),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7, seed=1),
    Dense(1, activation='sigmoid'),
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.summary()

In [None]:
# Train for two epochs in mini-batches of 32, evaluating on the validation
# split after each epoch.
cnn_model.fit(x_trn, y_trn, batch_size=32, epochs=2, validation_data=(x_val, y_val))

In [None]:
# Print loss and accuracy of the trained convolutional model on all three splits.
evaluate(cnn_model, x_trn, y_trn, x_val, y_val, x_tst, y_tst)

## Long Short-Term Memory (LSTM)

In [None]:
# Recurrent variant: 64-dimensional embeddings feed a 100-unit LSTM, followed
# by a wide dense layer and a sigmoid output. `mask_zero=True` marks index 0
# as padding so that downstream layers can skip those timesteps.
lstm_model = keras.models.Sequential([
    Embedding(vocabulary_size, 64, input_length=review_size, mask_zero=True),
    LSTM(100, dropout=0.0, recurrent_dropout=0.0),
    Dense(500, activation='relu'),
    Dropout(0.5, seed=1),
    Dense(1, activation='sigmoid'),
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.summary()

In [None]:
# Train for a single epoch in mini-batches of 64, evaluating on the
# validation split at the end of the epoch.
lstm_model.fit(x_trn, y_trn, batch_size=64, epochs=1, validation_data=(x_val, y_val))

In [None]:
# Print loss and accuracy of the trained LSTM model on all three splits.
evaluate(lstm_model, x_trn, y_trn, x_val, y_val, x_tst, y_tst)