In [9]:
#downloading and unzipping the IMDB reviews data

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

In [12]:
# There’s also a train/unsup subdirectory in there, which we don’t need. Let’sdelete it:
!rm -r aclImdb/train/unsup

Next, let’s prepare a validation set by setting apart 20% of the training text files in a
 new directory, aclImdb/val:

In [17]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir/'val'
train_dir = base_dir/'train'

for category in ("neg", "pos"):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1327).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples: ]
    for fname in val_files:
        shutil.move(train_dir/category/fname, 
                    val_dir/category/fname)

In [15]:
from tensorflow import keras
batch_size=32

train_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/train", 
    batch_size=batch_size)
validation_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/val",
     batch_size=batch_size)
test_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/test",
     batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [17]:
for inputs, targets in train_dataset:
    print('shape of inputs', inputs.shape)
    print("data type of inputs", inputs.dtype)
    print("targets shape", targets.shape)
    print("targets datatype": targets.dtype)
    print("inputs[0]", inputs[0])
    print("inputs[0]", targets[0])

SyntaxError: invalid syntax (3491071444.py, line 5)

In [18]:
for inputs, targets in train_dataset:
    print('shape', inputs.shape)
    print('datatype', inputs.dtype)
    print('targets shape', targets.shape)
    print('datatype', targets.shape)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

shape (32,)
datatype <dtype: 'string'>
targets shape (32,)
datatype (32,)
inputs[0]: tf.Tensor(b"I had my doubts about another love story wherein disabled individuals find meaning and redemption through honest communication. And it's still not at the top of my list. But the performances from Helena Bonham Carter and Kenneth Branagh and exemplary, almost stunning, and rescue this from being just another tear-jerker. Carter's depiction of an ALS victim is strong, perhaps even overdone at times (sometimes her dialog dissolves into undistinguishable mutterings). But the overall effect is commendable and rewarding. Branagh may be the perfect compliment to her performance.<br /><br />", shape=(), dtype=string)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


**Preprocessing our datasets with a TextVectorization layer**

In [19]:
from keras.layers import TextVectorization

text_vectorization = TextVectorization(max_tokens=20000,
                                      ngrams=2,
                                      output_mode="tf_idf")

In [21]:
text_only_dataset = train_dataset.map(lambda x, y:x)
text_vectorization.adapt(text_only_dataset)

tf_idf_2gram_train_dataset = train_dataset.map(lambda x, y: (text_vectorization(x), y),
                                               num_parallel_calls=4)

tf_idf_2gram_validation_dataset = validation_dataset.map(lambda x, y: (text_vectorization(x), y),
                                                        num_parallel_calls=4)

tf_idf_2gram_test_dataset = test_dataset.map(lambda x, y: (text_vectorization(x), y),
                                                        num_parallel_calls=4)

In [24]:
for feature, target in tf_idf_2gram_train_dataset:
    print(feature[0])
    print(target[0])
    break

tf.Tensor(
[373.19913     6.9732323   2.8415277 ...   0.          0.
   0.       ], shape=(20000,), dtype=float32)
tf.Tensor(0, shape=(), dtype=int32)


In [36]:
#model building
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens, ))
    features = layers.Dense(hidden_dim, activation='relu')(inputs)
    features = layers.Dropout(.5)(features)
    outputs = layers.Dense(1, activation='sigmoid')(features)
    
    model = keras.Model(inputs, outputs)
    
    #compilation
    model.compile(optimizer='rmsprop',
                 loss='binary_crossentropy',
                 metrics=["accuracy"])
    return model

In [37]:
#instantiating the model from get_model 

model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_10 (Dense)            (None, 16)                320016    
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_11 (Dense)            (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
#callbacks = ModelCheckpoint("binary_1gram.keras", 
#                             save_best_only=True)


In [38]:
# from tensorflow.keras.callbacks import ModelCheckpoint


model.fit(tf_idf_2gram_train_dataset.cache(),
         validation_data=tf_idf_2gram_validation_dataset.cache(),
         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x200868e1550>

In [12]:
print(f"Test acc: {model.evaluate(binary_3gram_test_dataset)[1]:.3f}")

Test acc: 0.894


**So, our model achieved the highest accuracy of 89%** 
Not much improvement. But with other datasets it might be helpful in gaining more 1% accuracy.