Sentiment Analysis with an Recurrent Neural Networks (RNN)


In [2]:
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
import numpy as np

Getting reviews with words that come under 5000

most occuring words in the entire

corpus of textual review data

In [3]:
vocab_size = 5000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = vocab_size)
print(x_train[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25

getting all the words from word_index dictionary

In [4]:
word_idx = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


originally the index number of a value and not a key,

hence converting the index as key and the words as values

In [5]:
word_idx = {i: word for word, i in word_idx.items()}

again printing the review

In [6]:
print([word_idx[i] for i in x_train[0]])

['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'and', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'and', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 'have', 'critics', 'they'

get the minimum and the maximum length of reviews

In [7]:
print("Max length of a review :: ", len(max((x_train + x_test), key = len)))
print("Min length of a review :: ", len(min((x_train + x_test), key = len)))

Max length of a review ::  2697
Min length of a review ::  70


In [8]:
from tensorflow.keras.preprocessing import sequence
#keeping a fixed length of all reviews to max 400 words
max_words = 400

x_train = sequence.pad_sequences(x_train, maxlen = max_words)
x_test = sequence.pad_sequences(x_test, maxlen = max_words)

x_valid, y_valid = x_train[:64], y_train[:64]
x_train_, y_train_ = x_train[64:], y_train[64:]

Build SimpleRNN

In [11]:
#fixing every words embedding size to be 32
embd_len = 32

#creating a rnn model
RNN_model = Sequential(name = "Simple_RNN")
RNN_model.add(Embedding(vocab_size, embd_len, input_length = max_words))

#in case of a stacked (more than one layer of RNN)
#use return_sequences = true
RNN_model.add(SimpleRNN(128, activation = 'tanh', return_sequences = False))
RNN_model.add(Dense(1, activation = 'sigmoid'))

#printing model summary
print(RNN_model.summary())

#compiling model
RNN_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

#training the model
history = RNN_model.fit(x_train_, y_train_, batch_size = 64, epochs = 5, verbose =1, validation_data = (x_valid, y_valid))

#printing model score on test data
print()
print("Simple_RNN Score ---->", RNN_model.evaluate(x_test, y_test, verbose = 0))

None
Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 195ms/step - accuracy: 0.5321 - loss: 0.6871 - val_accuracy: 0.5938 - val_loss: 0.6658
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 197ms/step - accuracy: 0.7445 - loss: 0.5249 - val_accuracy: 0.7812 - val_loss: 0.4976
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 198ms/step - accuracy: 0.7145 - loss: 0.5398 - val_accuracy: 0.5781 - val_loss: 0.6648
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 199ms/step - accuracy: 0.6665 - loss: 0.6053 - val_accuracy: 0.5781 - val_loss: 0.6866
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 195ms/step - accuracy: 0.6980 - loss: 0.5675 - val_accuracy: 0.6562 - val_loss: 0.6400

Simple_RNN Score ----> [0.6024094820022583, 0.6680799722671509]


Gated recurrent units(gru)

In [12]:
# Defining GRU model
gru_model = Sequential(name="GRU_Model")
gru_model.add(Embedding(vocab_size, embd_len, input_length=max_words))
gru_model.add(GRU(128, activation='tanh', return_sequences=False))
gru_model.add(Dense(1, activation='sigmoid'))

# Printing the Summary
print(gru_model.summary())

# Compiling the model
gru_model.compile( loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

# Training the GRU model
history2 = gru_model.fit(x_train_, y_train_, batch_size=64, epochs=5, verbose=1, validation_data=(x_valid, y_valid))

# Printing model score on test data
print()
print("GRU model Score---> ", gru_model.evaluate(x_test, y_test, verbose=0))

None
Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 794ms/step - accuracy: 0.6517 - loss: 0.6005 - val_accuracy: 0.8906 - val_loss: 0.3184
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 794ms/step - accuracy: 0.8634 - loss: 0.3240 - val_accuracy: 0.9219 - val_loss: 0.2112
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 792ms/step - accuracy: 0.9060 - loss: 0.2344 - val_accuracy: 0.9062 - val_loss: 0.2196
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 794ms/step - accuracy: 0.9351 - loss: 0.1742 - val_accuracy: 0.8750 - val_loss: 0.2624
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 793ms/step - accuracy: 0.9487 - loss: 0.1380 - val_accuracy: 0.8906 - val_loss: 0.2074

GRU model Score--->  [0.3149426281452179, 0.8835999965667725]


Long short term memory(lstm)

In [13]:
#defining lstm model
lstm_model = Sequential(name="LSTM_Model")
lstm_model.add(Embedding(vocab_size, embd_len, input_length=max_words))
lstm_model.add(LSTM(128, activation='relu', return_sequences=False))
lstm_model.add(Dense(1, activation='sigmoid'))

# Printing Model Summary
print(lstm_model.summary())

# Compiling the model
lstm_model.compile( loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

# Training the model
history3 = lstm_model.fit(x_train_, y_train_, batch_size=64, epochs=5, verbose=2, validation_data=(x_valid, y_valid))

# Displaying the model accuracy on test data
print()
print("LSTM model Score---> ", lstm_model.evaluate(x_test, y_test, verbose=0))

None
Epoch 1/5
390/390 - 305s - 782ms/step - accuracy: 0.5061 - loss: nan - val_accuracy: 0.6094 - val_loss: nan
Epoch 2/5
390/390 - 303s - 776ms/step - accuracy: 0.4997 - loss: nan - val_accuracy: 0.6094 - val_loss: nan
Epoch 3/5
390/390 - 299s - 767ms/step - accuracy: 0.4997 - loss: nan - val_accuracy: 0.6094 - val_loss: nan
Epoch 4/5
390/390 - 321s - 824ms/step - accuracy: 0.4997 - loss: nan - val_accuracy: 0.6094 - val_loss: nan
Epoch 5/5
390/390 - 328s - 841ms/step - accuracy: 0.4997 - loss: nan - val_accuracy: 0.6094 - val_loss: nan

LSTM model Score--->  [nan, 0.5]
