In [9]:
#downloading and unzipping the IMDB reviews data

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

In [12]:
# There’s also a train/unsup subdirectory in there, which we don’t need. Let’sdelete it:
!rm -r aclImdb/train/unsup

Next, let’s prepare a validation set by setting apart 20% of the training text files in a
 new directory, aclImdb/val:

In [17]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir/'val'
train_dir = base_dir/'train'

for category in ("neg", "pos"):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1327).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples: ]
    for fname in val_files:
        shutil.move(train_dir/category/fname, 
                    val_dir/category/fname)

In [5]:
from tensorflow import keras
batch_size = 32

train_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/train", 
    batch_size=batch_size)

validation_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size)

test_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size)


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
for inputs, targets in train_dataset:
    print('shape', inputs.shape)
    print('datatype', inputs.dtype)
    print('targets shape', targets.shape)
    print('datatype', targets.shape)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets)
    break

shape (32,)
datatype <dtype: 'string'>
targets shape (32,)
datatype (32,)
inputs[0]: tf.Tensor(b'"9/11," hosted by Robert DeNiro, presents footage from outside and inside the Twin Towers in New York, on September 11, 2001.<br /><br />Never too grisly and gory, yet powerful and moving. "9/11" is a real treat. Anyone not moved by this television show is immune to anything.<br /><br />5/5 stars --<br /><br />', shape=(), dtype=string)
targets[0]: tf.Tensor([1 1 0 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 1 1 1], shape=(32,), dtype=int32)


**Preprocessing our datasets with a TextVectorization layer**

In [7]:
from keras.layers import TextVectorization

text_vectorization = TextVectorization(
    max_tokens=20000, output_mode='multi_hot')



In [8]:
text_only_train_ds = train_dataset.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_dataset.map(
        lambda x, y: (text_vectorization(x), y),
        num_parallel_calls=4
)

binary_1gram_validation_dataset = validation_dataset.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)

binary_1gram_test_dataset = test_dataset.map(
    lambda x, y: (text_vectorization(x), y)
)

In [11]:
for a, b in binary_1gram_test_dataset:
    print(a[0])
    print(b)
    break

tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
tf.Tensor([0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0], shape=(32,), dtype=int32)


In [12]:
#model building and compilation
#Training and testing the binary unigram model
from tensorflow import keras
from tensorflow.keras import layers


def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens, ))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='rmsprop',
                 loss='binary_crossentropy',
                 metrics=["accuracy"])
    return model

In [13]:
#instantiating the model from get_model 

model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
#Note: I was getting an error when adding callbacks. So, I decided to train the model without callbacks.


#callbacks = ModelCheckpoint("binary_1gram.keras", 
#                             save_best_only=True)

In [14]:
# from tensorflow.keras.callbacks import ModelCheckpoint


model.fit(binary_1gram_train_ds.cache(),
         validation_data=binary_1gram_validation_dataset.cache(),
         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22128a09790>

In [18]:
print(f"Test acc: {model.evaluate(binary_1gram_test_dataset)[1]:.3f}")

Test acc: 0.882


# **So, our model achieved the accuracy of 89%** 
We can imporove this applying some other techniques.