In [1]:
!pip install keras-hub tensorflow-text -q

In [2]:
import keras_hub

tokenizer = keras_hub.models.Tokenizer.from_preset("roberta_base_en")
backbone = keras_hub.models.Backbone.from_preset("roberta_base_en")

In [3]:
tokenizer("The quick brown fox")

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([  133,  2119,  6219, 23602], dtype=int32)>

In [4]:
# Clean up and start fresh
!rm -rf aclImdb aclImdb_v1.tar.gz aclImdb_v1.tar.gz_archive

# Download the dataset
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

# Extract it
!tar -xzf aclImdb_v1.tar.gz

# Remove unsupervised folder
!rm -rf aclImdb/train/unsup

# Verify extraction worked
!ls -la aclImdb/
!ls -la aclImdb/train/

# Define paths
train_dir = "aclImdb/train"
test_dir = "aclImdb/test"
val_dir = "aclImdb/test"

print("\nDirectories defined successfully!")

--2025-11-29 04:34:24--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-11-29 04:34:27 (27.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]

total 1732
drwxr-xr-x 4 7297 1000   4096 Jun 26  2011 .
drwxr-xr-x 1 root root   4096 Nov 29 04:34 ..
-rw-r--r-- 1 7297 1000 903029 Jun 11  2011 imdbEr.txt
-rw-r--r-- 1 7297 1000 845980 Apr 12  2011 imdb.vocab
-rw-r--r-- 1 7297 1000   4037 Jun 26  2011 README
drwxr-xr-x 4 7297 1000   4096 Nov 29 04:34 test
drwxr-xr-x 4 7297 1000   4096 Nov 29 04:34 train
total 65200
drwxr-xr-x 4 7297 1000     4096 Nov 29 04:34 .
drwxr-xr-x 4 7297 1000     4096 Jun 26  2011 ..
-rw-r--r-- 1 7297 1000 21021197 Apr 12  2011 labeledBow.feat
drwxr-xr-x 2 7297 1000   356352 Nov 29 04

In [5]:
from keras.utils import text_dataset_from_directory

batch_size = 16
train_ds = text_dataset_from_directory(train_dir, batch_size=batch_size)
val_ds = text_dataset_from_directory(val_dir, batch_size=batch_size)
test_ds = text_dataset_from_directory(test_dir, batch_size=batch_size)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
def preprocess(text, label):
    packer = keras_hub.layers.StartEndPacker(
        sequence_length=512,
        start_value=tokenizer.start_token_id,
        end_value=tokenizer.end_token_id,
        pad_value=tokenizer.pad_token_id,
        return_padding_mask=True,
    )
    token_ids, padding_mask = packer(tokenizer(text))
    return {"token_ids": token_ids, "padding_mask": padding_mask}, label

preprocessed_train_ds = train_ds.map(preprocess)
preprocessed_val_ds = val_ds.map(preprocess)
preprocessed_test_ds = test_ds.map(preprocess)

In [7]:
next(iter(preprocessed_train_ds))

({'token_ids': <tf.Tensor: shape=(16, 512), dtype=int32, numpy=
  array([[    0,   713,    16, ...,     1,     1,     1],
         [    0,  7516,     6, ...,     1,     1,     1],
         [    0,   713,  1569, ...,     1,     1,     1],
         ...,
         [    0,  9335,    98, ...,     1,     1,     1],
         [    0,   565,  5593, ...,     1,     1,     1],
         [    0, 30115,  4186, ...,   160,    41,     2]], dtype=int32)>,
  'padding_mask': <tf.Tensor: shape=(16, 512), dtype=bool, numpy=
  array([[ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False],
         ...,
         [ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ..., False, False, False],
         [ True,  True,  True, ...,  True,  True,  True]])>},
 <tf.Tensor: shape=(16,), dtype=int32, numpy=array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1], dtype=int32)>)

In [8]:
from tensorflow import keras
from tensorflow.keras import layers
inputs = backbone.input
x = backbone(inputs)
# Uses the hidden representation of the first token
x = x[:, 0, :]
x = layers.Dropout(0.1)(x)
x = layers.Dense(768, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
classifier = keras.Model(inputs, outputs)

In [9]:
classifier.compile(
    optimizer=keras.optimizers.Adam(5e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
classifier.fit(
    preprocessed_train_ds,
    validation_data=preprocessed_val_ds,
)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3311s[0m 2s/step - accuracy: 0.8600 - loss: 0.3268 - val_accuracy: 0.9358 - val_loss: 0.1692


<keras.src.callbacks.history.History at 0x7ace0026e5d0>

In [10]:
classifier.evaluate(preprocessed_test_ds)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m841s[0m 537ms/step - accuracy: 0.9369 - loss: 0.1644


[0.1691877543926239, 0.9358400106430054]