In [None]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
import numpy as np

`imdb.load_data()` in Keras loads the IMDb dataset for binary sentiment classification: 25,000 highly polar movie reviews for training and 25,000 for testing. Each review is returned as a list of integer word indices, and each label is 0 or 1.

In [None]:
# Fetch the IMDb reviews; each split comes back as a (reviews, labels) pair
train_split, test_split = imdb.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# Number of training reviews; the array is 1-D because each entry is itself a list of token ids
X_train.shape

(25000,)

In [None]:
# One label per training review
y_train.shape

(25000,)

In [None]:
# Peek at the first ten binary sentiment labels
y_train[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

In [None]:
# First training review, shown as its list of integer token ids
# NOTE(review): this prints every token of the review; a slice such as
# X_train[0][:10] would keep the output short
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [None]:
# Force every review to exactly 50 tokens: shorter reviews get zeros appended
# (padding='post'); longer ones are truncated (pad_sequences defaults to
# truncating='pre', i.e. it keeps the last 50 tokens).
pad_kwargs = dict(padding='post', maxlen=50)
X_train = pad_sequences(X_train, **pad_kwargs)
X_test = pad_sequences(X_test, **pad_kwargs)


In [None]:
# Assemble the network: one recurrent layer followed by a sigmoid classifier.
# NOTE(review): the single feature per time step is the raw word index fed in
# as a number; an Embedding layer is the usual way to represent word ids -
# confirm this simplification is intentional.
model = Sequential([
    SimpleRNN(32, input_shape=(50, 1)),   # 32 units; input is 50 time steps x 1 feature
    Dense(1, activation='sigmoid'),       # one probability for the binary label
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

50 represents the number of time steps in each input sequence.

1 represents the number of features at each time step.

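For example, an illustrative padded review stored as a flat vector of 50 integers:

`[1, 14, 22, 16, 43, ..., 0, 0, 0]`

has to be presented to the RNN as 50 time steps with one feature each:

`[[1], [14], [22], [16], [43], ..., [0], [0], [0]]`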

In [None]:
# After padding, the training data is a 2-D (reviews, tokens) array
X_train.shape

(25000, 50)

So, the RNN layer's `input_shape` of `(50, 1)` tells the model that it will receive sequences of 50 time steps, where each time step consists of a single integer feature. The padded data above is still 2-D, `(25000, 50)`, so the next cell adds a trailing axis to give `(25000, 50, 1)`, which matches this input shape.

In [None]:
# Reshape the input data to match the input shape of the model:
# (samples, time steps) -> (samples, time steps, 1 feature).
# An explicit reshape is used instead of np.expand_dims so the cell is
# idempotent: re-running it on an array that is already 3-D leaves the shape
# unchanged rather than appending yet another axis (which would break fit()).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [None]:
# Training data should now be 3-D: (reviews, time steps, features)
X_train.shape

(25000, 50, 1)

In [None]:
# Test data should have the same (reviews, time steps, features) layout
X_test.shape

(25000, 50, 1)

In [None]:
# Fit for five epochs, evaluating on the test split at the end of each one
model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78d188262d70>

# Testing

In [None]:
# Two example reviews, each already encoded as a list of 50 integer token ids
first_review = [
    1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65,
    458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25,
    100, 43, 838, 112, 50, 670, 2, 9, 35, 29,
    2, 38, 34, 6, 2, 839, 5, 2, 121, 29,
    9, 108, 8, 12, 100, 24, 12, 4, 2, 3761,
]
second_review = [
    1, 591, 202, 14, 31, 6, 717, 10, 10, 2,
    2, 42, 30, 413, 2, 40, 2, 13, 161, 2,
    15, 2, 2, 6, 2, 19, 14, 22, 4, 192,
    15, 2, 2, 2, 2, 8, 67, 2, 5, 4,
    2, 2, 15, 2, 2, 2, 2, 2, 2, 2,
]
new_reviews = [first_review, second_review]

In [None]:
# Give the new reviews the same treatment as the training data:
# pad to 50 tokens, then append the single-feature axis
X_new = pad_sequences(new_reviews, maxlen=50, padding='post')[..., np.newaxis]

In [None]:
# Expect (number of reviews, time steps, features)
X_new.shape

(2, 50, 1)

In [None]:
# Make predictions
# model.predict returns one row per review; each row holds the single sigmoid output
predictions = model.predict(X_new)
print(predictions)

[[0.51440793]
 [0.5387626 ]]


In [None]:
# Turn each sigmoid output into a label: above 0.5 -> positive, otherwise negative
for idx, tokens in enumerate(new_reviews):
    if predictions[idx] > 0.5:
        sentiment = "positive"
    else:
        sentiment = "negative"
    print(f"Review: {tokens[:10]}... - Sentiment: {sentiment}")

Review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]... - Sentiment: positive
Review: [1, 591, 202, 14, 31, 6, 717, 10, 10, 2]... - Sentiment: positive
