In [9]:
#downloading and unzipping the IMDB reviews data

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

In [12]:
# There’s also a train/unsup subdirectory in there, which we don’t need. Let’sdelete it:
!rm -r aclImdb/train/unsup

Next, let’s prepare a validation set by setting apart 20% of the training text files in a
 new directory, aclImdb/val:

In [17]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir/'val'
train_dir = base_dir/'train'

for category in ("neg", "pos"):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1327).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples: ]
    for fname in val_files:
        shutil.move(train_dir/category/fname, 
                    val_dir/category/fname)

In [1]:
from tensorflow import keras
batch_size = 32

train_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/train", 
    batch_size=batch_size)

validation_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size)

test_dataset = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size)


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [2]:
for inputs, targets in train_dataset:
    print('shape', inputs.shape)
    print('datatype', inputs.dtype)
    print('targets shape', targets.shape)
    print('datatype', targets.shape)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets)
    break

shape (32,)
datatype <dtype: 'string'>
targets shape (32,)
datatype (32,)
inputs[0]: tf.Tensor(b'Another turgid action/adventure flick from the Quinn Martin Productions factory. Roy Thinnes plays undercover agent Diamond Head (Mr. Head, to you), working for his G-Man handler "Aunt Mary", looking for "Tree", who\'s on a mission to...well, just watch the movie. <br /><br />This one deserved and got the full MST3K sendup. As the boys and various reviewers have pointed out, the movie "Fargo" had more Hawaiian locations than this film. Apparently shot on a puny budget, this movie highlights Hawaii\'s broken-down dive shops, gas stations, and cheapo hotels. Zulu -- later to star as Kono in Hawaii-Five-O -- appears as Thinnes\' lumpy, inept sidekick, while France Nguyen models the Jenny Craig diet gone horribly wrong. Others sharing the flickering screen include a drunken Richard Harris knockoff, a George Takai imitator, a not-so-smart hit-man with sprayed-on Sansabelt slacks, and the villain

**Preprocessing our datasets with a TextVectorization layer**

In [3]:
from keras.layers import TextVectorization

max_tokens= 20000
max_length= 600

text_vectorization = TextVectorization(max_tokens=max_tokens,
                                      output_mode='int',
                                      output_sequence_length=max_length
                                      )

In [4]:
text_only_train_dataset = train_dataset.map(lambda x, y: x)

text_vectorization.adapt(text_only_train_dataset)

int_train_dataset = train_dataset.map(lambda x, y: (text_vectorization(x), y),
                                     num_parallel_calls=4)

int_validation_dataset = validation_dataset.map(lambda x, y: (text_vectorization(x),y),
                                               num_parallel_calls=4)

int_test_dataset = test_dataset.map(lambda x, y: (text_vectorization(x),y),
                                               num_parallel_calls=4)

In [6]:
for inputs, targets in int_train_dataset:
    print(inputs[0])
    print(targets)
    break

tf.Tensor(
[   10   204   252 14963  5928   237   940     3    11    19 15901    12
    15   155   731   292    11     7     4  5795    19    12   207    53
   166    10   282     9    78     9    44  4237     8     9    18    28
   982   202    98   290    63    30    57  2329    33  9864     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0    

In [7]:
# training model

import tensorflow as tf
from tensorflow.keras import layers

inputs = keras.Input(shape=(None, ), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs) #embedding layer to convert integer indices into vectors
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x= layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs, outputs)

#model compilation
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         5120000   
                                                                 
 bidirectional (Bidirection  (None, 64)                73984     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

In [8]:
#model training
model.fit(int_train_dataset,
         validation_data=int_validation_dataset,
         epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1fc95524c70>

**Maximum accuracy is 88%**

In [10]:
print(f"Test acc: {model.evaluate(int_test_dataset)[1]:.3f}")

Test acc: 0.842


**Okay, It's accuracy is low. We did not trained it on full data as we truncated each review after 600 words. However, thanks to embedding layer, now our model has to process only 256 dimentional vectors insteas of 20000 thousand as we saw in previous notebook where one-hot encoding was applied.**

In next notebook, we shall apply 'masking' so that LSTM only process non-zero indices and skip those indices where there is 0. 
It is done by setting "mask-zero=True" in embedding layer. **