### Implement word embeddings for the IMDB dataset

In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the IMDB reviews, capping the vocabulary at the 10,000 most frequent words
num_words = 10_000  # Vocabulary size limit
train_split, test_split = imdb.load_data(num_words=num_words)
x_train, y_train = train_split
x_test, y_test = test_split

In [3]:
# Bring every review to the same fixed length so the splits can be fed to the
# network as rectangular arrays; both splits get exactly the same treatment.
max_length = 200
x_train, x_test = (
    pad_sequences(split, maxlen=max_length) for split in (x_train, x_test)
)

In [4]:
# Vocabulary size handed to the Embedding layer as input_dim; it mirrors the
# num_words cap that was applied when the dataset was loaded.
vocab_size = num_words

In [5]:
# Build a simple neural network with an embedding layer
embedding_dim = 50
model = Sequential([
    # Declare the input shape explicitly. Keras 3 deprecates Embedding's
    # `input_length` argument, and without a known input shape the model stays
    # unbuilt until it first sees data, so model.summary() cannot report output
    # shapes or parameter counts. Sequential.layers does not list the Input,
    # so model.layers[0] is still the Embedding layer.
    Input(shape=(max_length,)),
    # One trainable vector of size embedding_dim per token id in [0, vocab_size)
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Concatenate the max_length word vectors into one flat feature vector
    Flatten(),
    # Single sigmoid unit producing the probability of the positive class
    Dense(1, activation='sigmoid')
])



In [6]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Display the model summary (layers, output shapes and parameter counts)
model.summary()

In [8]:
# Train the model; a small number of epochs is enough for a demonstration
model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=32,
)

Epoch 1/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6799 - loss: 0.5662
Epoch 2/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9236 - loss: 0.2058
Epoch 3/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9770 - loss: 0.0938


<keras.src.callbacks.history.History at 0x2b57bd33da0>

In [9]:
# Evaluate the trained model on the test split and report accuracy in percent
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8677 - loss: 0.3245
Test Accuracy: 86.81%


In [19]:
# Pull the trained embedding matrix out of the first layer of the model
embedding_layer = model.layers[0]
embedding_layer_weights = embedding_layer.get_weights()
# First entry is the embedding matrix, shape (vocab_size, embedding_dim)
weights = embedding_layer_weights[0]

In [21]:
# Load the raw IMDB word -> index mapping. No offset is applied in this cell;
# the mapping is kept exactly as returned by get_word_index().
imdb_word_index = imdb.get_word_index()


In [23]:
# Rebuild the word -> id mapping the way load_data() encodes reviews: every raw
# index is shifted up by 3, which frees ids 0-3 for the reserved tokens.
index_offset = 3
special_tokens = {"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3}
word_index = {
    word: raw_index + index_offset
    for word, raw_index in imdb_word_index.items()
}
word_index.update(special_tokens)

In [25]:
# Invert the mapping (id -> word) so embedding rows can be labelled when saved
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

In [27]:
# Sanity check: ids 0-3 should resolve to the reserved tokens
print("\nPrinting reverse word index first few items\n")
leading_tokens = [reverse_word_index.get(token_id) for token_id in range(4)]
token_gap = " " * 3  # three-space gap between consecutive tokens
print(*leading_tokens, sep=token_gap)


Printing reverse word index first few items

<PAD>   <START>   <UNK>   <UNUSED>


In [29]:
# Save the learned word embeddings as text, one "<word> <v1> ... <vN>" line per
# token id. The path is held in a single variable so that the confirmation
# message always names the file that was actually written.
embeddings_path = "4_word_embeddings.txt"
with open(embeddings_path, "w", encoding="utf-8") as file:
    for i in range(1, vocab_size):  # Skip padding index (0)
        word = reverse_word_index.get(i, "<UNK>")  # Use <UNK> for missing words
        embedding = " ".join(map(str, weights[i]))  # Space-separated vector values
        file.write(f"{word} {embedding}\n")

print(f"Word embeddings saved to {embeddings_path}")

Word embeddings saved to word_embeddings.txt
