<a href="https://colab.research.google.com/github/bhargav23/AIML-DL-Lab/blob/main/9_IMDB_Word_Embedding_with_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

9. Implement word embedding for IMDB dataset.

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
import numpy as np

In [2]:
# --- 1. Load the IMDB dataset ---
# The dataset is already preprocessed, and the reviews are converted to sequences of integers.
# Each integer represents a specific word in a dictionary.
# We'll only consider the top 10,000 most common words in the dataset.
vocab_size = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# --- 2. Explore the data ---
# The 'word_index' is a dictionary mapping words to an integer index.
word_index = imdb.get_word_index()

# The first indices are reserved for special characters.
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # Unknown
word_index["<UNUSED>"] = 3

# We can create a reverse word index to look up words from their integer representation.
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_review(text):
    """
    This function takes a sequence of integers and returns the decoded review as a string.
    """
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

# Let's see an example review
print("--- Example Decoded Review ---")
print(decode_review(train_data[0]))
print("Label:", train_labels[0])
print("-" * 30)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
--- Example Decoded Review ---
<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were 

In [4]:
# --- 3. Preprocess the data ---
# The reviews have different lengths. We need to pad them so they all have the same length.
# We'll set a maximum length of 256 words. Reviews longer than this will be truncated,
# and shorter reviews will be padded with the <PAD> token.
max_length = 256

train_data = pad_sequences(train_data, value=word_index["<PAD>"], padding='post', maxlen=max_length)
test_data = pad_sequences(test_data, value=word_index["<PAD>"], padding='post', maxlen=max_length)

In [5]:
# --- 4. Build the model ---
# The model is a simple sequential model with three layers:
# 1. Embedding layer: This layer takes the integer-encoded vocabulary and looks up the embedding vector for each word-index.
#    These vectors are learned as the model trains. The `input_length` argument is deprecated,
#    so we build the model with an explicit input shape.
# 2. GlobalAveragePooling1D: This layer returns a fixed-length output vector for each example by averaging over the sequence dimension.
#    This allows the model to handle input of variable length in a simple way.
# 3. Dense layer: This is the output layer with a single neuron and a sigmoid activation function.
#    The output is a float between 0 and 1, representing the probability of the review being positive.
embedding_dim = 16

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Explicitly build the model to see the summary correctly.
# This is necessary because the `input_length` argument in the Embedding layer is deprecated.
model.build(input_shape=(None, max_length))

model.summary()

In [6]:
# --- 5. Compile and train the model ---
# We'll use the 'adam' optimizer and 'binary_crossentropy' as the loss function,
# which is suitable for a binary classification problem.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# We'll train the model for 30 epochs with a batch size of 512.
# We'll also use a validation set to monitor the model's performance on unseen data during training.
history = model.fit(
    train_data,
    train_labels,
    epochs=30,
    batch_size=512,
    validation_split=0.2,
    verbose=1
)

Epoch 1/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.5363 - loss: 0.6920 - val_accuracy: 0.6894 - val_loss: 0.6844
Epoch 2/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.6935 - loss: 0.6800 - val_accuracy: 0.7140 - val_loss: 0.6583
Epoch 3/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7010 - loss: 0.6493 - val_accuracy: 0.7594 - val_loss: 0.6092
Epoch 4/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7636 - loss: 0.5968 - val_accuracy: 0.7822 - val_loss: 0.5464
Epoch 5/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8092 - loss: 0.5262 - val_accuracy: 0.8166 - val_loss: 0.4821
Epoch 6/30
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8346 - loss: 0.4593 - val_accuracy: 0.8378 - val_loss: 0.4301
Epoch 7/30
[1m40/40[0m [32m━━━━

In [7]:
# --- 6. Evaluate the model ---
# Finally, we'll evaluate the model's performance on the test set.
results = model.evaluate(test_data, test_labels, verbose=2)

print("\n--- Model Evaluation ---")
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")
print("-" * 30)

782/782 - 1s - 2ms/step - accuracy: 0.8737 - loss: 0.3058

--- Model Evaluation ---
Test Loss: 0.30575504899024963
Test Accuracy: 0.8737199902534485
------------------------------


In [8]:
# --- 7. Make predictions on new data ---
# You can use the trained model to make predictions on new reviews.
# Here's an example of how you would preprocess a new review and make a prediction.
def preprocess_text(text):
    words = text.lower().split()
    encoded_review = [word_index.get(word, 2) for word in words] # 2 is for <UNK>
    padded_review = pad_sequences([encoded_review], value=word_index["<PAD>"], padding='post', maxlen=max_length)
    return padded_review

# Example of a positive review
positive_review = "this movie was fantastic I really enjoyed it and would recommend it to everyone"
preprocessed_positive = preprocess_text(positive_review)
prediction_positive = model.predict(preprocessed_positive)
print(f"Prediction for positive review: {prediction_positive[0][0]}")

# Example of a negative review
negative_review = "it was a terrible movie I would not recommend it to anyone"
preprocessed_negative = preprocess_text(negative_review)
prediction_negative = model.predict(preprocessed_negative)
print(f"Prediction for negative review: {prediction_negative[0][0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Prediction for positive review: 0.6568941473960876
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction for negative review: 0.12901991605758667
