In [9]:
#downloading and unzipping the IMDB reviews data

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

In [12]:
# There’s also a train/unsup subdirectory in there, which we don’t need. Let’sdelete it:
!rm -r aclImdb/train/unsup

Next, let’s prepare a validation set by setting apart 20% of the training text files in a
 new directory, aclImdb/val:

In [17]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir/'val'
train_dir = base_dir/'train'

for category in ("neg", "pos"):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1327).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples: ]
    for fname in val_files:
        shutil.move(train_dir/category/fname, 
                    val_dir/category/fname)

In [1]:
from tensorflow import keras
batch_size = 32

train_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/train", 
    batch_size=batch_size)

validation_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size)

test_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size)


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [2]:
for inputs, targets in train_dataset:
    print('shape', inputs.shape)
    print('datatype', inputs.dtype)
    print('targets shape', targets.shape)
    print('datatype', targets.shape)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets)
    break

shape (32,)
datatype <dtype: 'string'>
targets shape (32,)
datatype (32,)
inputs[0]: tf.Tensor(b'As I am always looking for something new and unique, I watched this film online. I thought that it would be just another "B" rate movie but I was amazed at the acting by the two main characters. All of the actors in this film were very capable and well directed. The plot was wonderful and unique as well with an excellent moral to the story.<br /><br />This movie is definitely not for someone looking for a sex romp, "Dumb and Dumber" or blood and guts. This is a wonderfully poignant film showing some grim realities of life coupled with the kindness of the human heart and just enough frivolity to keep it interesting.<br /><br />I would prefer this movie to many "A" rate movies I have seen even a great number with high box office earnings.<br /><br />I highly recommend this movie.', shape=(), dtype=string)
targets[0]: tf.Tensor([1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1],

**Preprocessing our datasets with a TextVectorization layer**

In [3]:
from keras.layers import TextVectorization

text_vectorization = TextVectorization(
    max_tokens=20000, 
    ngrams=3,
    output_mode='multi_hot')

In [4]:
text_only_train_ds = train_dataset.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_3gram_train_ds = train_dataset.map(
        lambda x, y: (text_vectorization(x), y),
        num_parallel_calls=4
)

binary_3gram_validation_dataset = validation_dataset.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)

binary_3gram_test_dataset = test_dataset.map(
    lambda x, y: (text_vectorization(x), y)
)

In [6]:
for a, b in binary_3gram_test_dataset:
    print(a[1])
    print(b[0])
    break

tf.Tensor([1. 0. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
tf.Tensor(1, shape=(), dtype=int32)


In [7]:
#model building and compilation
#Training and testing the binary unigram model
from tensorflow import keras
from tensorflow.keras import layers


def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens, ))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='rmsprop',
                 loss='binary_crossentropy',
                 metrics=["accuracy"])
    return model

In [8]:
#instantiating the model from get_model 

model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
#callbacks = ModelCheckpoint("binary_1gram.keras", 
#                             save_best_only=True)


In [10]:
# from tensorflow.keras.callbacks import ModelCheckpoint


model.fit(binary_3gram_train_ds.cache(),
         validation_data=binary_3gram_validation_dataset.cache(),
         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2008be06520>

In [12]:
print(f"Test acc: {model.evaluate(binary_3gram_test_dataset)[1]:.3f}")

Test acc: 0.894


**So, our model achieved the highest accuracy of 90%** 
We can still imporove this applying some other techniques.