### Binary classification on IMDB dataset using a basic LSTM

In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def batch_iter(data, labels, batch_size, shuffle=True):
    """Build an endless mini-batch generator over ``data`` and ``labels``.

    Args:
        data: Samples, indexed along the first axis. Must support NumPy
            fancy indexing (e.g. be an ``ndarray``) when ``shuffle`` is true.
        labels: Targets aligned with ``data``; same length and the same
            indexing requirements.
        batch_size: Positive number of samples per batch. The last batch of
            each pass is smaller when ``len(data)`` is not a multiple of it.
        shuffle: When true, a fresh random permutation is drawn at the start
            of every pass over the data.

    Returns:
        A ``(num_batches_per_epoch, generator)`` tuple. The generator never
        terminates: after ``num_batches_per_epoch`` batches it starts a new
        pass, so callers must bound iteration themselves (e.g. via
        ``steps_per_epoch``).

    Raises:
        ValueError: If ``batch_size`` is not positive, ``data`` is empty, or
            ``data`` and ``labels`` differ in length.
    """
    data_size = len(data)

    # Validate eagerly (not inside the generator) so that misuse fails at the
    # call site instead of on the first next() deep inside a training loop.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    if data_size == 0:
        raise ValueError("data must contain at least one sample")
    if len(labels) != data_size:
        raise ValueError(
            "data and labels must have the same length, got %d and %d"
            % (data_size, len(labels))
        )

    # Ceiling division in integer arithmetic (no float round-trip).
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    def data_generator():
        while True:
            # Re-shuffle at the start of every pass over the data.
            if shuffle:
                shuffle_indices = np.random.permutation(data_size)
                epoch_data = data[shuffle_indices]
                epoch_labels = labels[shuffle_indices]
            else:
                epoch_data = data
                epoch_labels = labels

            for start_index in range(0, data_size, batch_size):
                # Slicing clamps at the end, so the final batch may be short.
                end_index = start_index + batch_size
                yield epoch_data[start_index:end_index], epoch_labels[start_index:end_index]

    return num_batches_per_epoch, data_generator()

In [3]:
# Hyperparameters shared by the cells below.
max_features = 20000  # passed as `num_words` to imdb.load_data and as the Embedding input size
maxlen = 80  # length every review is padded/truncated to by pad_sequences
batch_size = 32  # number of samples per batch produced by batch_iter

In [4]:
# Load the IMDB train/test splits, passing max_features as the `num_words` limit.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [5]:
# Shapes of the four raw arrays, one per line: train x/y, then test x/y.
print(*(split.shape for split in (x_train, y_train, x_test, y_test)), sep="\n")

(25000,)
(25000,)
(25000,)
(25000,)


In [6]:
# Raw reviews are variable-length lists of word indices: show the lengths of
# the first ten, then the full token sequence of the first one.
print(list(map(len, x_train[:10])))
print(x_train[0])

[218, 189, 141, 550, 147, 43, 123, 562, 233, 130]
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32

In [7]:
# Training labels (0/1), followed by how many there are.
print(y_train, len(y_train), sep="\n")

[1 0 0 ... 0 1 0]
25000


In [8]:
# Same inspection for the test split: lengths of the first ten reviews,
# then the token sequence of the first one.
print(list(map(len, x_test[:10])))
print(x_test[0])

[68, 260, 603, 181, 108, 132, 761, 180, 134, 370]
[1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]


In [9]:
# Test labels (0/1), followed by how many there are.
print(y_test, len(y_test), sep="\n")

[0 1 1 ... 0 0 0]
25000


In [10]:
# Bring every review to exactly `maxlen` tokens so the batches are rectangular.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [11]:
# After padding, every training sequence should report length `maxlen`.
print(list(map(len, x_train[:10])))

[80, 80, 80, 80, 80, 80, 80, 80, 80, 80]


In [12]:
# After padding, every test sequence should report length `maxlen`.
print(list(map(len, x_test[:10])))

[80, 80, 80, 80, 80, 80, 80, 80, 80, 80]


In [13]:
# Embedding -> LSTM -> single sigmoid unit for binary classification.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Endless training-batch generator, plus the number of steps in one epoch.
train_steps, train_batches = batch_iter(data=x_train, labels=y_train, batch_size=batch_size)

In [15]:
# Number of batches batch_iter reports for one pass over the training data.
print(train_steps)

782


In [16]:
# See what a training batch looks like.
# Preview from a throwaway generator: pulling from `train_batches` itself would
# consume a batch from the generator that is later handed to fit_generator,
# leaving the training epoch offset by one batch.
_, preview_batches = batch_iter(x_train, y_train, batch_size)
print(next(preview_batches))

(array([[   4,   22,   43, ...,    8,   30,  110],
       [   4,  109,    7, ...,  212,  883,  512],
       [  63,    9,   55, ...,  228,    5, 4176],
       ...,
       [  51,   59,   16, ...,  215,   67,   20],
       [   4,   64, 1801, ...,   34,    4,   96],
       [  14,   22,  165, ...,   47,    8, 1464]], dtype=int32), array([0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1]))


In [17]:
# Endless validation-batch generator over the test split, plus its step count.
valid_steps, valid_batches = batch_iter(data=x_test, labels=y_test, batch_size=batch_size)

In [18]:
# Number of batches batch_iter reports for one pass over the test data.
print(valid_steps)

782


In [19]:
# See what a validation batch looks like.
# Preview from a throwaway generator: pulling from `valid_batches` itself would
# leave it one batch into its first shuffle, so the `valid_steps` batches used
# for validation would straddle two shuffles and not cover the test set once.
_, preview_batches = batch_iter(x_test, y_test, batch_size)
print(next(preview_batches))

(array([[   5,    5,   41, ...,   12,    9,  120],
       [ 243,    7, 2547, ...,   61,  375, 1189],
       [   0,    0,    0, ...,   24,  386,   12],
       ...,
       [ 567,   88,   48, ...,   32,  134,  153],
       [   6,  147, 1726, ...,  209, 6935,  105],
       [ 812,    7, 1268, ...,    7, 1751,  438]], dtype=int32), array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0]))


In [None]:
# Train for a single epoch, validating against the test-split generator.
model.fit_generator(
    generator=train_batches,
    steps_per_epoch=train_steps,
    epochs=1,
    validation_data=valid_batches,
    validation_steps=valid_steps,
)

### Output of the above training is:
Epoch 1/1
782/782 [==============================] - 119s 152ms/step - loss: 0.4733 - accuracy: 0.7738 - val_loss: 0.3023 - val_accuracy: 0.8216

In [21]:
# Evaluate the trained model on a hand-picked token sequence.
token_list = [ 1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5]
# Pad to `maxlen`, the sequence length the model was trained on, so the
# inference input is prepared the same way as the training data.
token_list = pad_sequences([token_list], maxlen=maxlen, padding='pre')
# `predict_classes` is deprecated (and removed in newer Keras/TensorFlow).
# For a single sigmoid output, thresholding the probability at 0.5 is the
# equivalent, forward-compatible form.
predicted = (model.predict(token_list, verbose=0) > 0.5).astype('int32')
print(predicted)

[[1]]


In [22]:
# Evaluate the trained model on a second hand-picked token sequence.
token_list = [ 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6,]
# Pad to `maxlen`, the sequence length the model was trained on, so the
# inference input is prepared the same way as the training data.
token_list = pad_sequences([token_list], maxlen=maxlen, padding='pre')
# `predict_classes` is deprecated (and removed in newer Keras/TensorFlow).
# For a single sigmoid output, thresholding the probability at 0.5 is the
# equivalent, forward-compatible form.
predicted = (model.predict(token_list, verbose=0) > 0.5).astype('int32')
print(predicted)

[[1]]
