# RNN Sentiment Analysis with Keras

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Integer Encoding and Padding

In [2]:
# Toy corpus: ten short feedback phrases used to demonstrate tokenization and padding.
docs = [
    'Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
    'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.',
]

In [3]:
# '<nothing>' is the out-of-vocabulary placeholder; it appears at index 1 in word_index.
tokenizer = Tokenizer(oov_token='<nothing>')

In [4]:
# Build the vocabulary and word statistics from the sample phrases.
tokenizer.fit_on_texts(docs)

In [5]:
# Mapping from each lower-cased, punctuation-free word to its integer index.
tokenizer.word_index

{'<nothing>': 1,
 'work': 2,
 'done': 3,
 'good': 4,
 'effort': 5,
 'poor': 6,
 'well': 7,
 'great': 8,
 'nice': 9,
 'excellent': 10,
 'weak': 11,
 'not': 12,
 'could': 13,
 'have': 14,
 'better': 15}

In [6]:
# How many times each word occurred across the fitted phrases.
tokenizer.word_counts

OrderedDict([('well', 1),
             ('done', 2),
             ('good', 2),
             ('work', 3),
             ('great', 1),
             ('effort', 2),
             ('nice', 1),
             ('excellent', 1),
             ('weak', 1),
             ('poor', 2),
             ('not', 1),
             ('could', 1),
             ('have', 1),
             ('better', 1)])

In [7]:
# Number of documents the tokenizer was fitted on.
tokenizer.document_count

10

In [8]:
# Integer-encode the phrases: every word is replaced by its index from word_index.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[7, 3],
 [4, 2],
 [8, 5],
 [9, 2],
 [10],
 [11],
 [6, 5],
 [12, 4],
 [6, 2],
 [13, 14, 3, 15]]

In [9]:
# Append zeros ('post' padding) so every row is as long as the longest phrase.
# Note: `sequences` changes from a list of lists to a 2-D array here.
sequences = pad_sequences(sequences, padding='post')
sequences

array([[ 7,  3,  0,  0],
       [ 4,  2,  0,  0],
       [ 8,  5,  0,  0],
       [ 9,  2,  0,  0],
       [10,  0,  0,  0],
       [11,  0,  0,  0],
       [ 6,  5,  0,  0],
       [12,  4,  0,  0],
       [ 6,  2,  0,  0],
       [13, 14,  3, 15]])

## Using Keras imdb dataset and SimpleRNN for Sentiment Analysis

In [10]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, SimpleRNN

In [None]:
# Load the IMDB reviews as train/test splits; reviews arrive already integer-encoded.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 6us/step


The IMDB dataset comes preprocessed: each review is already encoded as a sequence of integers, where each integer stands for a specific word in a dictionary.

In [12]:
# Sample counts per split; the feature arrays are 1-D because each entry is a variable-length list.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [13]:
# Peek at the first three encoded reviews.
X_train[:3]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [14]:
# 0/1 labels for the same three reviews.
y_train[:3]

array([1, 0, 0], dtype=int64)

In [15]:
# Review lengths before padding: they differ from sample to sample.
len(X_train[0]), len(X_train[1]), len(X_train[2])

(218, 189, 141)

The reviews have different lengths, so we need to pad (and truncate) the sequences to make them all the same length.

In [None]:
# Bring every review to exactly 50 tokens: the dataset is large, so short sequences
# keep training fast (drop maxlen to use the full review length).
X_train, X_test = [
    pad_sequences(reviews, padding='post', maxlen=50)
    for reviews in (X_train, X_test)
]

In [18]:
# After padding, the features are proper 2-D arrays of shape (samples, 50).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25000, 50), (25000, 50), (25000,), (25000,))

In [19]:
# Every review now has the same length.
len(X_train[0]), len(X_train[1]), len(X_train[2])

(50, 50, 50)

In [None]:
# Baseline: feed the raw integer word indices straight into a SimpleRNN,
# one scalar feature per time step (50 time steps per review).
# The input shape is declared with an explicit Input layer instead of the
# `input_shape=` layer argument, which recent Keras versions warn about.
model = Sequential([
    Input(shape=(50, 1)),
    SimpleRNN(32, return_sequences=False),  # only the final hidden state is passed on
    Dense(1, activation='sigmoid')          # single sigmoid unit for the binary label
])

# Parameter count:
# SimpleRNN: input kernel (1, 32) + recurrent kernel (32, 32) + 32 biases = 32 + 1024 + 32 = 1088
# Dense:     kernel (32, 1) + 1 bias = 32 + 1 = 33
# Trainable params = 1121

  super().__init__(**kwargs)


In [21]:
# Show the layers and their parameter counts.
model.summary()

In [22]:
# Train the baseline for 5 epochs, evaluating on the test split after every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5104 - loss: 0.6970 - val_accuracy: 0.5051 - val_loss: 0.6984
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5042 - loss: 0.6931 - val_accuracy: 0.5059 - val_loss: 0.6971
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5145 - loss: 0.6924 - val_accuracy: 0.5037 - val_loss: 0.6942
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5057 - loss: 0.6932 - val_accuracy: 0.5042 - val_loss: 0.6937
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5050 - loss: 0.6928 - val_accuracy: 0.5046 - val_loss: 0.6934


<keras.src.callbacks.history.History at 0x1d282dc1be0>

## Using Embedding Layer

In natural language processing, a word embedding is a representation of a word. The embedding is used in text analysis. Typically, the representation is a real-valued vector that encodes the meaning of the word in such a way that the words that are closer in the vector space are expected to be similar in meaning.

In [39]:
# Same toy phrases as in the first section, re-tokenized without an OOV token.
docs = [
    'Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
    'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.',
]

# Fresh tokenizer: with no OOV placeholder, the word indices start at 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
print(f'Unique words: {len(tokenizer.word_index)}')

Unique words: 14


In [42]:
# Integer-encode the phrases with the new tokenizer.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

In [43]:
# Zero-pad at the end so every phrase has 4 positions; index 0 marks padding.
sequences = pad_sequences(sequences, padding='post')
sequences

array([[ 6,  2,  0,  0],
       [ 3,  1,  0,  0],
       [ 7,  4,  0,  0],
       [ 8,  1,  0,  0],
       [ 9,  0,  0,  0],
       [10,  0,  0,  0],
       [ 5,  4,  0,  0],
       [11,  3,  0,  0],
       [ 5,  1,  0,  0],
       [12, 13,  2, 14]])

We need to send integer-encoded sequences to the Embedding layer.

In [None]:
# Stand-alone Embedding layer, used only to inspect what it outputs.
# input_dim = number of unique words + 1 = 15, because index 0 is taken by padding.
# The deprecated `input_length` argument is replaced by an explicit Input layer,
# so the model is built right away and summary() can report shapes and parameters.
model = Sequential()
model.add(Input(shape=(4,)))
model.add(Embedding(input_dim=15, output_dim=2))
model.summary()

# Every padded phrase has 4 positions, e.g. [6, 2, 0, 0] for 'Well done!'.
# The embedding table has shape (15, 2): one 2-number vector per index, 30 parameters in total.
# Each index is replaced by its 2-number vector,
# so [i1 i2 i3 i4] becomes [[a1 b1], [a2 b2], [a3 b3], [a4 b4]].

In [48]:
# Only predict() is used on this model, so no loss is needed.
# The second positional argument of compile() is `loss`, so 'accuracy' must be passed
# by keyword as a metric rather than positionally.
model.compile(optimizer='adam', metrics=['accuracy'])

We are not training the model here; we only run a prediction to see what the Embedding layer outputs.

In [50]:
# Run the padded phrases through the untrained Embedding layer.
pred = model.predict(sequences)
print(pred)

# For every word we get 2 numbers, so for 4 words we get 8 numbers.
# In the output shown, [-0.04132749 -0.02476964] is the vector for 'well',
# and the repeated [0.03472282 0.03190008] rows are the vector for padding index 0.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[[[-0.04132749 -0.02476964]
  [ 0.04586729 -0.02956395]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[ 0.01382596  0.03090755]
  [-0.04630166  0.03762997]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[ 0.011032    0.01981426]
  [ 0.01648802 -0.03180061]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[-0.03635175  0.04230921]
  [-0.04630166  0.03762997]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[-0.03837468 -0.02227776]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[-0.01129283  0.01839573]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[ 0.008283    0.04248099]
  [ 0.01648802 -0.03180061]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[ 0.03632057 -0.04154035]
  [ 0.01382596  0.03090755]
  [ 0.03472282  0.03190008]
  [ 0.03472282  0.03190008]]

 [[ 0.008283    

## Same imdb dataset but using Embedding layer

In [78]:
# Reload IMDB with num_words=10000, matching the Embedding layer's input_dim=10000 used below.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

In [79]:
# Bring both splits to a fixed length of 50 tokens, zero-padding at the end.
X_train, X_test = [
    pad_sequences(reviews, padding='post', maxlen=50)
    for reviews in (X_train, X_test)
]

In [80]:
# Sanity check: features are (samples, 50), labels are (samples,).
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000, 50), (25000,), (25000, 50), (25000,))

In [81]:
# Same RNN as the baseline, but word indices first pass through a trainable Embedding.
# The sequence length is declared with an explicit Input layer; the deprecated
# `input_length` argument of Embedding is no longer passed.
model = Sequential([
    Input(shape=(50,)),
    Embedding(input_dim=10000, output_dim=2),  # one 2-number vector per word index
    SimpleRNN(32, return_sequences=False),     # only the final hidden state is passed on
    Dense(1, activation='sigmoid')             # single sigmoid unit for the binary label
])

# Parameter count:
# Embedding: table (10000, 2) = 20000
# SimpleRNN: input kernel (2, 32) + recurrent kernel (32, 32) + 32 biases = 64 + 1024 + 32 = 1120
# Dense:     kernel (32, 1) + 1 bias = 32 + 1 = 33
# Trainable params = 21153

In [82]:
# Same training setup as the baseline model: Adam, binary cross-entropy, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
# Train for 5 epochs, evaluating on the test split after every epoch.
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step - accuracy: 0.5577 - loss: 0.6658 - val_accuracy: 0.7816 - val_loss: 0.4711
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - accuracy: 0.8296 - loss: 0.3984 - val_accuracy: 0.8115 - val_loss: 0.4199
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.8710 - loss: 0.3242 - val_accuracy: 0.8099 - val_loss: 0.4293
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.8901 - loss: 0.2776 - val_accuracy: 0.8015 - val_loss: 0.4530
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.9056 - loss: 0.2527 - val_accuracy: 0.7990 - val_loss: 0.4955


<keras.src.callbacks.history.History at 0x1d28d0ed1f0>

In [84]:
# Show the layers and their parameter counts.
model.summary()