### Imports

In [13]:
import tensorflow as tf
import numpy as np

import matplotlib.pyplot as plt

### Load data (raw imdb dataset)

In [14]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2023-10-12 06:16:34--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2023-10-12 06:16:39 (14.9 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [15]:
# Remove train/unsup subdirectory
!rm -r aclImdb/train/unsup

In [None]:
# Prepare validation data (50% of test data, so alltogether we have )
# Imports
import os, shutil, random

BASE_DIR_TEST = "aclImdb/test"
ALL_DIR_POS = os.listdir(BASE_DIR_TEST + '/pos')
ALL_DIR_NEG = os.listdir(BASE_DIR_TEST + '/neg')

# print(len(ALL_DIR_POS), len(ALL_DIR_NEG))

# Define number of validation samples
VAL_SAMPLES = int(0.5 * len(ALL_DIR_POS + ALL_DIR_NEG))

# Shuffle files
random.shuffle(ALL_DIR_POS)
random.shuffle(ALL_DIR_NEG)

# Pick appropriate number of validation files
val_paths_pos = ALL_DIR_POS[:VAL_SAMPLES//2] # half samples will be positive
val_paths_neg = ALL_DIR_NEG[:VAL_SAMPLES//2] # half of samples will be negative

# Create directories appropriate for validation files
os.makedirs('aclImdb/val/pos')
os.makedirs('aclImdb/val/neg')

# Move all validation files into val/pos and val/neg directories
for file in val_paths_pos:
  shutil.move(src = "aclImdb/test/pos/" + file, dst = "aclImdb/val/pos/" + file)

for file in val_paths_neg:
  shutil.move(src = "aclImdb/test/neg/" + file, dst = "aclImdb/val/neg/" + file)

### Create train, validation and test datasets using `text_dataset_from_directory`

In [5]:
BATCH_SIZE = 32

# Create train dataset
train_data = tf.keras.utils.text_dataset_from_directory("aclImdb/train", batch_size = BATCH_SIZE)

# Create validation dataset
val_data = tf.keras.utils.text_dataset_from_directory("aclImdb/val", batch_size = BATCH_SIZE)

# Create test dataset
test_data = tf.keras.utils.text_dataset_from_directory("aclImdb/test", batch_size = BATCH_SIZE)

Found 25000 files belonging to 2 classes.
Found 12500 files belonging to 2 classes.
Found 12500 files belonging to 2 classes.


# First attempt: bag of words model

### Preprocess data with TextVectorization layer

In [6]:
# Only consider single words (ngram = 1) and multi-hot encode each sentence
tv = tf.keras.layers.TextVectorization(max_tokens = 20000,
                                       ngrams = 1,
                                       output_mode = 'multi_hot')

# Extract all messages from training data (leave targets out)
train_data_text_only = train_data.map(lambda x, y: x)

# Build vocabulary from training data using TextVectorizer
tv.adapt(train_data_text_only)

# Prepare processed versions of datasets (text multi-hot encoded using vocabulary trained on training data)
train_data_preprocessed_1_gram = train_data.map(lambda x, y: (tv(x), y), num_parallel_calls = 4)
val_data_preprocessed_1_gram = val_data.map(lambda x, y: (tv(x), y), num_parallel_calls = 4)
test_data_preprocessed_1_gram = test_data.map(lambda x, y: (tv(x), y), num_parallel_calls = 4)


In [24]:
# Check if everything OK
for inputs, targets in train_data_preprocessed_1_gram:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])
  break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [26]:
### Build and train simple dense classification model

def build_model(num_tokens = 20000, hidden_units = 16):
  inputs = tf.keras.layers.Input(shape = (num_tokens, ))
  x = tf.keras.layers.Dense(hidden_units, activation = 'relu')(inputs)
  x = tf.keras.layers.Dropout(rate = 0.5)(x)
  outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

  model = tf.keras.models.Model(inputs = inputs, outputs = outputs)

  model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

  return model

model = build_model()

history = model.fit(train_data_preprocessed_1_gram,
          epochs = 10,
          validation_data = val_data_preprocessed_1_gram)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### We achieve ~ 89% accuracy. Let's try the same with 2-grams and 3-grams

In [28]:
# Build new TextVectorizers, one for 2-grams and one for 3-grams
tv2 = tf.keras.layers.TextVectorization(max_tokens = 20000,
                                     output_mode = 'multi_hot',
                                     ngrams = 2)

tv3 = tf.keras.layers.TextVectorization(max_tokens = 20000,
                                     output_mode = 'multi_hot',
                                     ngrams = 3)

# Train both both vectorizers on train data texsts
tv2.adapt(train_data_text_only)
tv3.adapt(train_data_text_only)

# Prepare preprocessed versions of all datasets
train_data_preprocessed_2_gram = train_data.map(lambda x, y: (tv2(x), y), num_parallel_calls = 4)
train_data_preprocessed_3_gram = train_data.map(lambda x, y: (tv3(x), y), num_parallel_calls = 4)

val_data_preprocessed_2_gram = val_data.map(lambda x, y: (tv2(x), y), num_parallel_calls = 4)
val_data_preprocessed_3_gram = val_data.map(lambda x, y: (tv3(x), y), num_parallel_calls = 4)

test_data_preprocessed_2_gram = test_data.map(lambda x, y: (tv2(x), y), num_parallel_calls = 4)
test_data_preprocessed_3_gram = test_data.map(lambda x, y: (tv3(x), y), num_parallel_calls = 4)

In [29]:
# Build and train models
model2 = build_model()

history2 = model.fit(train_data_preprocessed_2_gram,
          epochs = 10,
          validation_data = val_data_preprocessed_2_gram)

model3 = build_model()

history3 = model.fit(train_data_preprocessed_3_gram,
          epochs = 10,
          validation_data = val_data_preprocessed_3_gram)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### As we can see introduction of 2-grams and 3-grams did not help. It may be due to the fact, that for all experiments the same number of 20000 tokens was used

### Using tf-idf output mode instead of multi-hot encoding

In [31]:
# Create another TextVectorizer
tv4 = tf.keras.layers.TextVectorization(max_tokens = 20000,
                                        output_mode = 'tf-idf',
                                        ngrams = 1)

# Adapt TV to training dataset
tv4.adapt(train_data_text_only)

# Prepare preprocessed versions of all datasets
train_data_preprocessed_tfidf = train_data.map(lambda x, y: (tv4(x), y), num_parallel_calls = 4)
val_data_preprocessed_tfidf = val_data.map(lambda x, y: (tv4(x), y), num_parallel_calls = 4)
test_data_preprocessed_tfidf = test_data.map(lambda x, y: (tv4(x), y), num_parallel_calls = 4)

In [32]:
# Check if everything OK
for inputs, targets in train_data_preprocessed_tfidf:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])
  break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([19.388176   6.2761817  1.4221123 ...  0.         0.         0.       ], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [33]:
# Build and train model
model4 = build_model()

history4 = model.fit(train_data_preprocessed_tfidf,
          epochs = 10,
          validation_data = val_data_preprocessed_tfidf)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### We obtain ~85% accuracy

# Second attempt: sequence models

To implement sequence model, one has to do the following steps:
* represent samples as integer sequences (one integer for one word),
* map integer to a vector to obtain vector sequences,
* feed these vector sequences into 1-D CNN or RNN stack of layers

### Define TextVectorizer which for each sentence outputs sequence of integers

In [7]:
# Truncate each review to contain only 600 characters
tv5 = tf.keras.layers.TextVectorization(max_tokens = 20000,
                                        output_mode = 'int',
                                        output_sequence_length = 600)

# Adapting TextVectorizer
tv5.adapt(train_data_text_only)

# Transforming train, validation and test datasets
train_data_preprocessed_int = train_data.map(lambda x, y: (tv5(x), y), num_parallel_calls = 4)
val_data_preprocessed_int = val_data.map(lambda x, y: (tv5(x), y), num_parallel_calls = 4)
test_data_preprocessed_int = test_data.map(lambda x, y: (tv5(x), y), num_parallel_calls = 4)

# Check if everything OK
for inputs, targets in train_data_preprocessed_int:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])
  break

inputs.shape: (32, 600)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(
[   29     5    56   499    93     6  1321   500    15    34  1222   141
     2  1283   506     5     4   774     5   709   346    36 16439     2
   315     5   567     2     1  1818   156    30   676    78     4  2087
   257    36  2908  3429  2352    37 15857 14470    25 17791    78    65
   240  1687    19    45     2    86   357     5     2    20   717   239
    39    34   216    18     2   326   357  2809     2   507    78     4
   467   267    64    17   254   346   889     6    94     4  2122    12
    35   118    77   635    65   457  1447     8    53    84   763    31
     2   129     5     2    18   248   409     7  2046    28   889     6
   837    17     2 12067     5    49    67    41     6    28     4   176
  2345    21     2  1818    19  9332    78     4   537    16  4103     2
    18   542    47   179  3526  9368     7    53    74   521     3 

### Building RNN model

In [None]:
inputs = tf.keras.layers.Input(shape = (None, ), dtype = 'int64') # one input is sequence of integers
# Hot-encode each integer
encoded = tf.one_hot(inputs, depth = 20000)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(encoded)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.models.Model(inputs, outputs)

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

history5 = model.fit(train_data_preprocessed_int,
          epochs = 10,
          validation_data = val_data_preprocessed_int)

#### The above model is very slow, since each review is encoded as 600 x 20000 matrix, which has 12000000 elements. Better encoding would be necessary, such as word embedding

### Same model as above but with embedding layer

In [12]:
inputs = tf.keras.layers.Input(shape = (600, ))
x = tf.keras.layers.Embedding(input_dim = 20000, output_dim = 256, mask_zero = True)(inputs) # Notice mask_zero parameter
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(x)
x = tf.keras.layers.Dropout(rate = 0.5)(x)
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

model = tf.keras.models.Model(inputs = inputs, outputs = outputs)
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

history6 = model.fit(train_data_preprocessed_int,
          epochs = 10,
          validation_data = val_data_preprocessed_int)

Epoch 1/10

KeyboardInterrupt: ignored

The model with embedding layer is learns a lot faster, but still not fast and accurately enough to beat simple feed-forward model. Let's try transformers.

### Build transformer encoder

In [17]:
class TransformerEncoder(tf.keras.layers.Layer):
  def __init__(self, num_heads, embed_dim, hidden_units, **kwargs):
    super().__init__(**kwargs)
    # Assign attributes
    self.num_heads = num_heads
    self.embed_dim = embed_dim
    self.hidden_units = hidden_units

    # Define all model's layers
    self.mha = tf.keras.layers.MultiHeadAttention(num_heads = self.num_heads, key_dim = self.embed_dim)
    self.dense = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units = self.hidden_units, activation = 'relu'),
        tf.keras.layers.Dense(units = self.embed_dim)
    ])
    # Normalization layer normalizes each sequence independently, opposite to BatchNormalization
    self.layer_normalization1 = tf.keras.layers.LayerNormalization()
    self.layer_normalization2 = tf.keras.layers.LayerNormalization()

  # Define forward propagation
  def call(self, inputs, mask = None):
    if mask is not None:
      # Since mask generated by embedding layer will be 2D we must expand its dims to be 3D (MHA expects 3D or 4D inputs)
      mask = mask[:, tf.newaxis, :]
    attention_output = self.mha(inputs, inputs, attention_mask = mask)
    proj_input = self.layer_normalization1(inputs + attention_output)
    proj_output = self.dense(proj_input)
    outputs = self.layer_normalization2(proj_input + proj_output)
    return outputs

  # Define get_config method, to be able to save and load model using this layer
  def get_config(self):
    config = super().get_config()
    config.update({
                  "embed_dim": self.embed_dim,
                  "num_heads": self.num_heads,
                  "dense_dim": self.hidden_units,
                  })
    return config


### Build PositionalEncoding layer to take into account word positions in sequence

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
