### 4.Implement word embeddings for IMDB dataset.

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [5]:
# Step 1: Load IMDB data
max_words = 10000
max_len = 200
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


In [7]:
# Step 2: Load GloVe word embeddings (glove.6B.50d.txt must be in your directory)
embeddings_index = {}
with open("glove.6B.50d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

In [9]:
# Step 3: Prepare embedding matrix
word_index = imdb.get_word_index()
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [11]:
embedding_dim = 50
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec

In [13]:
# Step 4: Build the model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, weights=[embedding_matrix],
                    input_length=max_len, trainable=True))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))



In [15]:
# Step 5: Compile and train
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=3, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6566 - loss: 0.6018 - val_accuracy: 0.8318 - val_loss: 0.3771
Epoch 2/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8930 - loss: 0.2737 - val_accuracy: 0.8440 - val_loss: 0.3620
Epoch 3/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9451 - loss: 0.1567 - val_accuracy: 0.8468 - val_loss: 0.3984


<keras.src.callbacks.history.History at 0x245b1d3ca40>

In [17]:
# Step 6: Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 872us/step - accuracy: 0.8464 - loss: 0.4009
Test Accuracy: 84.68%


In [19]:
# Step 7: Save updated word embeddings
updated_embeddings = model.layers[0].get_weights()[0]
inverse_word_index = {v: k for k, v in word_index.items()}

with open("updated_glove_embeddings.txt", "w", encoding="utf-8") as f:
    for i in range(1, max_words):  # skipping 0 (padding)
        word = inverse_word_index.get(i, "<UNK>")
        vector = updated_embeddings[i]
        vector_str = " ".join(map(str, vector))
        f.write(f"{word} {vector_str}\n")

print("Word embeddings saved to updated_glove_embeddings.txt")

Word embeddings saved to updated_glove_embeddings.txt
