# Introduction
In this project we will look at the IMDB movie reviews data set and apply an Easy Data Augmentation to see if it improves the accuracy from the sentiment analysis. The work is unpolished, in the sense that there are more augmentation techniques that can be used, but this project focuses on using swap augmentation. 

In [1]:
!pip install -q tensorflow-text


[K     |████████████████████████████████| 4.9 MB 4.0 MB/s 
[?25h

In [2]:
!pip install -q tf-models-official

[K     |████████████████████████████████| 1.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 352 kB 45.2 MB/s 
[K     |████████████████████████████████| 90 kB 10.1 MB/s 
[K     |████████████████████████████████| 99 kB 10.1 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 25.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 19.9 MB/s 
[K     |████████████████████████████████| 47.6 MB 41 kB/s 
[K     |████████████████████████████████| 596 kB 40.6 MB/s 
[K     |████████████████████████████████| 213 kB 46.2 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
try:
  import textaugment
except ModuleNotFoundError:
  !pip install textaugment
  import textaugment

import os
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import imdb

tf.get_logger().setLevel('ERROR')

Collecting textaugment
  Downloading textaugment-1.3.4-py3-none-any.whl (16 kB)
Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.0 MB/s 
Collecting hstspreload
  Downloading hstspreload-2021.11.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 8.7 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.7 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 

In [4]:
# set parameters:
max_features = 5000
batch_size = 32
epochs = 5

In [5]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [6]:
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [7]:
def decoding(dataset):
  decoded_dataset = []
  for text in dataset:
    decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in text])
    decoded_dataset.append(decoded_review)
  return decoded_dataset

In [8]:
x_train_unprocessed = decoding(x_train)
x_test_unprocessed = decoding(x_test)


In [9]:
train_ds = tf.data.Dataset.from_tensor_slices((x_train_unprocessed, y_train)).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((x_test_unprocessed, y_test)).batch(batch_size)

In [10]:
val_ds = train_ds.take(160)
real_train_ds = train_ds.skip(160)

In [11]:
AUTOTUNE = tf.data.AUTOTUNE
val_ds = val_ds.cache().prefetch(buffer_size = AUTOTUNE)
real_train_ds = real_train_ds.cache().prefetch(buffer_size = AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [12]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [13]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [14]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [15]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [16]:
classifier_model = build_classifier_model()

In [17]:
#Setting up the loss function
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [18]:
# Setting up the epochs and optimizer (AdamW optimizer) 
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [19]:
# Setting up the classifier model with the optimizer, loss and metrics that we decided on
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [20]:
# Training the model
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=real_train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.4512398838996887
Accuracy: 0.8636000156402588


We see that we get a baseline accuracy of 86%.  Now we will do the Easy Data Augmentation.  More specifically, we will apply swap augmentation, although it is easy to implement other augmentations as well, as they are still in the textaugment library. 

In [22]:
from textaugment import EDA
import nltk
nltk.download('stopwords')
t = EDA()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
#does the swap augmentation on an already decoded dataset
def swap_augment_ds(dataset):
  swapped = []
  for text in dataset:
    swapped.append(t.random_swap(text))
  return swapped
  

In [24]:
# Applying the swap augment twice because only once might not modify the training set enough
swapped_x_train_unprocessed = swap_augment_ds(swap_augment_ds(decoding(x_train)))

In [25]:
augmented_x_train = x_train_unprocessed + swapped_x_train_unprocessed
augmented_y_train = np.concatenate((y_train , y_train))

In [26]:
aug_train_ds = aug_train_ds = tf.data.Dataset.from_tensor_slices((augmented_x_train,augmented_y_train)).batch(batch_size)

In [27]:
aug_val_ds = aug_train_ds.take(300)
real_aug_train_ds = aug_train_ds.skip(300)

In [28]:
# Setting up the Bert Model again
aug_bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
aug_tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
aug_tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'


In [29]:
aug_bert_preprocess_model = hub.KerasLayer(aug_tfhub_handle_preprocess)

In [30]:
aug_bert_model = hub.KerasLayer(aug_tfhub_handle_encoder)

In [31]:
def build_aug_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(aug_tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(aug_tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)


In [32]:
#Setting up the loss function
aug_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
aug_metrics = tf.metrics.BinaryAccuracy()

In [33]:
# Setting up the epochs and optimizer (AdamW optimizer) 

steps_per_epoch = tf.data.experimental.cardinality(aug_train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [34]:
#Compiling the augmented classifier model
aug_classifier_model = build_aug_classifier_model()
aug_classifier_model.compile(optimizer=optimizer,
                         loss=aug_loss,
                         metrics=aug_metrics)


In [35]:
#Training the model to the augmented dataset 
history = aug_classifier_model.fit(x=real_aug_train_ds,
                               validation_data=aug_val_ds,
                               epochs= epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
aug_loss, aug_accuracy = aug_classifier_model.evaluate(test_ds)



In [37]:
print(f'Aug Loss: {aug_loss}')
print(f'Aug Accuracy: {aug_accuracy}')

Aug Loss: 0.7237421274185181
Aug Accuracy: 0.8645600080490112


So there is a slight improvement in accuracy, but not significant enough. This could be due to the fact that we only used one augmentation technique (swap augmentation) and these movie reviews are fairly long.  So swapping a pair of words in a long movie review might not change the data sufficiently enough. Also the dataset is already very large at 25000 items so augmenting it might not help much, espcecially if our augmentation is subtle. 