In [1]:
import os, pathlib, shutil, random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [2]:

max_length = 150
max_tokens = 10000
embedding_dim = 100
batch_size = 32

In [20]:
# Download and extract IMDB dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_path = keras.utils.get_file("aclImdb_v1.tar.gz", url)
extracted_path = os.path.join(os.path.dirname(dataset_path), "aclImdb")

if not os.path.exists(extracted_path):
    with tarfile.open(dataset_path, "r:gz") as tar:
        tar.extractall(path=os.path.dirname(dataset_path))

base_dir = pathlib.Path(extracted_path)
train_dir = base_dir / "train"
test_dir = base_dir / "test"
val_dir = base_dir / "val"

# Clean up val_dir if it already exists (from previous runs)
if val_dir.exists():
    shutil.rmtree(val_dir)




In [21]:
# Create validation set (10,000 samples)
for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok=True)
files = os.listdir(train_dir / category)
random.Random(1337).shuffle(files)
val_files = files[-12500:]  # 12500 x 2 = 25,000 -> val = 10,000
for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)


In [22]:
# Create datasets
train_ds = keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    val_dir, batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    test_dir, batch_size=batch_size
)

Found 62500 files belonging to 3 classes.
Found 12500 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [23]:
# Training to 100 samples
train_ds = train_ds.unbatch().take(100).batch(batch_size)

# Vectorization
text_only_train_ds = train_ds.map(lambda x, y: x)
vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
vectorizer.adapt(text_only_train_ds)

# Vectorize datasets
int_train_ds = train_ds.map(lambda x, y: (vectorizer(x), y))
int_val_ds = val_ds.map(lambda x, y: (vectorizer(x), y))
int_test_ds = test_ds.map(lambda x, y: (vectorizer(x), y))

In [24]:
# 5a. With embedding layer
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=128)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [keras.callbacks.ModelCheckpoint("modified_model.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("modified_model.keras")
print(f"Test accuracy (Embedding): {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
      4/Unknown [1m4s[0m 101ms/step - accuracy: 0.0505 - loss: 0.5914



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4s/step - accuracy: 0.0464 - loss: 0.5776 - val_accuracy: 1.0000 - val_loss: 0.4264
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7s/step - accuracy: 0.0000e+00 - loss: -0.1104 - val_accuracy: 1.0000 - val_loss: 0.1231
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7s/step - accuracy: 0.0000e+00 - loss: -1.0288 - val_accuracy: 1.0000 - val_loss: 0.0152
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7s/step - accuracy: 0.0000e+00 - loss: -2.8811 - val_accuracy: 1.0000 - val_loss: 0.0039
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7s/step - accuracy: 0.0000e+00 - loss: -3.2506 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4s/step - accuracy: 0.0000e+00 - loss: -3.5893 - val_accuracy: 

In [27]:
import zipfile
import requests

glove_zip_path = "glove.6B.zip"
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"

if not os.path.exists("glove.6B.100d.txt"):
    # Download zip file
    with open(glove_zip_path, "wb") as f:
        response = requests.get(glove_url)
        f.write(response.content)
    # Extract only 100d file
    with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
        zip_ref.extract("glove.6B.100d.txt")
# 5b. Using pretrained word embeddings (e.g., GloVe)
# Load GloVe
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

# Prepare embedding matrix
vocab = vectorizer.get_vocabulary()
word_index = dict(zip(vocab, range(len(vocab))))
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i >= max_tokens:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build model with pretrained embeddings
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False
)

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [keras.callbacks.ModelCheckpoint("pretrained_model.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("pretrained_model.keras")
print(f"Test accuracy (Pretrained GloVe): {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8s/step - accuracy: 0.0927 - loss: 0.3918 - val_accuracy: 1.0000 - val_loss: 0.2648
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7s/step - accuracy: 0.0000e+00 - loss: -0.6000 - val_accuracy: 1.0000 - val_loss: 0.1242
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4s/step - accuracy: 0.0000e+00 - loss: -0.8672 - val_accuracy: 1.0000 - val_loss: 0.0651
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7s/step - accuracy: 0.0000e+00 - loss: -1.7810 - val_accuracy: 1.0000 - val_loss: 0.0317
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 8s/step - accuracy: 0.0000e+00 - loss: -1.9208 - val_accuracy: 1.0000 - val_loss: 0.0180
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4s/step - accuracy: 0.0000e+00 - loss: -2.6927 - val_accuracy: 1.0000 - val_loss: 0.0066
Epoch 7/10
[1m4/4[0m [32