End-to-End Deep Learning Project: Text Classification with a Simple RNN

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping   # callback that stops training when a monitored metric stops improving
from tensorflow.keras.datasets import imdb      # import the built-in imdb dataset in keras
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN     # import the necessary layers Dense for fully connected layer, Embedding for embedding layer, SimpleRNN for RNN layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence





In [2]:
## Load the IMDB dataset
max_features = 10000     # vocabulary size: only the 10,000 most frequent words are kept
# Each review arrives as a list of integer word indices; labels are 0/1 sentiment flags.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

# Report each split's data shape next to the shape of ITS OWN labels.
# (The training line previously printed y_test.shape, which only looked right
# because both splits happen to contain the same number of samples.)
print(f'Training data shape:{X_train.shape}, Training label shape:{y_train.shape}')
print(f'Testing data shape:{X_test.shape}, Testing label shape:{y_test.shape}')

Training data shape:(25000,), Training label shape:(25000,)
Testing data shape:(25000,), Testing label shape:(25000,)


In [3]:
## Peek at one training example together with its label
sample_review, sample_label = X_train[0], y_train[0]   # first review (integer-encoded) and its label

print(f'Sample review (as integers):{sample_review}')
print(f'Sample label:{sample_label}')

Sample review (as integers):[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample label:1


In [5]:
## Build an index -> word lookup by inverting the dataset's word -> index mapping
word_index = imdb.get_word_index()   # dict of word -> integer index
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))   # dict of integer index -> word

In [6]:
## Turn the sample review back into readable text (for our own understanding)
# Encoded indices are shifted by 3 relative to word_index (presumably because
# load_data reserves the lowest indices for padding / start / unknown tokens —
# see its `index_from` default). Anything without a matching word becomes '?'.
decoded_review = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in sample_review
)
print(f'Decoded review of the sample review:{decoded_review}')

Decoded review of the sample review:? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised f

In [7]:
## Bring every review to the same length so the RNN receives fixed-size input

max_len = 500   # number of tokens per review after padding

# Apply the same padding to both splits; the displayed array below shows
# shorter reviews being filled with zeros at the front.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)
X_train



array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]])

In [None]:
## Build the Simple RNN model (training happens in a later cell)
model = Sequential([
    # Map each of the max_features word indices to a 128-dimensional vector.
    Embedding(max_features, 128, input_length=max_len),
    # Recurrent layer with 128 units and ReLU activation.
    SimpleRNN(128, activation='relu'),
    # Single sigmoid unit for the binary sentiment output.
    Dense(1, activation='sigmoid'),
])




In [None]:
# Print an overview of the layers that make up the model built above.
model.summary()

In [None]:
## Create an EarlyStopping callback
# EarlyStopping is imported with the other imports at the top of the notebook.
# Watch the validation loss, allow 5 epochs without improvement before stopping,
# and restore the weights from the best epoch when training ends.
earlystopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
earlystopping

In [None]:
## Compile the model
model.compile(
    optimizer='adam',              # Adam optimizer
    loss='binary_crossentropy',    # loss for the single sigmoid output
    metrics=['accuracy'],          # report accuracy during training
)

In [12]:
## Train the model
# Up to 10 epochs with a batch size of 32; 20% of the training data is held out
# for validation, and the earlystopping callback may end training early.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[earlystopping],
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x2c5215b1f50>

In [13]:
## Save the trained model to disk
# NOTE(review): the `.h5` extension selects the HDF5 format; the truncated
# `saving_api.save_model(` output below looks like Keras's legacy-format
# warning — confirm before switching formats, since other code may load this file.
model.save('simple_rnn_imdb.h5')   # writes the model to simple_rnn_imdb.h5 in the working directory

  saving_api.save_model(
