In [1]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)


In [2]:
#Refer: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

# load the dataset but will keep only the top 5000 words by frequency in each review, will keep zero for the rest words present in the review
# so if a review has any word from top 5000 most frequent words, the corresponding index will be placed. 
# if the word of the review is not in top 5000 words, we will skip its index
# a total of 25000 training data and 25000 test data
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
print(X_train[1])
print(type(X_train[1]))
print(len(X_train[1]))

[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]
<class 'list'>
189


In [4]:
# Number of training samples
X_train.shape

(25000,)

In [5]:
# Number of test samples
X_test.shape

(25000,)

In [18]:
import numpy as np
np.unique(y_train) # positive and negative reviews

array([0, 1])

In [6]:
max(numpy.max(X_test))
# as we have choosen top 5000 words no index is greater than 5000

4998

In [7]:
max(numpy.max(X_train))

4987

#### Purpose of Padding
* to convert each review, to 600 length vectors. 
  * case-1: if the number of words in the review is less than 600, pre-pad with 0's
  * case-2: if the number of words are more than 600, keep only first 600 word's indexes

* Padding helps to do SGD with batch size > 1. If we don't use Padding, we need to send one review at a time.
  * As padding converts all the reviews to same size, we can use the same network to train using multiple reviews

In [8]:
## Before padding
print(X_train.shape)
print(X_train[1])

(25000,)
[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]


In [9]:
# truncate and/or pad input sequences
max_review_length = 600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [12]:
print(X_train.shape)
print(len(X_train[1]))
print(X_train[1])

(25000, 600)
600
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0

In [11]:
# creating the model
embedding_vecor_length = 32
top_words = 5000 # top frequencty
max_review_length = 600 # padding length

model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length)) 
# Embedding layer to converts  positive integers (indexes) into dense vectors of fixed size. [https://keras.io/api/layers/core_layers/embedding/], [https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/]
# input_dim (first parameter, here top_words): Integer. Size of the vocabulary, in our case it is 600
# output_dim (second parameter, here embedding_vector_length): Integer. Dimension of the dense embedding i.e., length of each vector
# So each review {which has 600 indices} will be of size (600, 32) i.e., each word/word_index will become a 32 length vector. The resultant vector is a dense one with having real values instead of just 0's and 1's.

model.add(LSTM(100)) # adding number of layers in deep LSTM

model.add(Dense(1, activation='sigmoid')) # final output layer to take input from each LSTM (100 are there)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#Refer: https://datascience.stackexchange.com/questions/10615/number-of-parameters-in-an-lstm-model

# (None, 600, 32) --> Each of the 600 indexes are converted to 32-length vectors. None is the batch size, which we supply during training
# 160000 = 5000 * 32; for each 5000 n-frequent words, getting a vector of size 32
# 53200 = 4 * (n*m + n*n + n); where n-number of LSTM layers, m-length of vector; +n - represents biases, 1 bias for each LSTM layer
# number of parameters from LSTM = 4* (100 * 32 + 100 * 100 + 100)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 600, 32)           160000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe23a6d5c10>

In [14]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 85.68%
