<a href="https://colab.research.google.com/github/ami-doshi/RNN-with-TensorFlow---Reviews-for-IMDB/blob/master/RNN_with_TF2_0_Reviews_for_iMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with TensorFlow 2.0 Dataset API

The plan is 

<ol>
  <li>   Download the IMDB Reviews dataset using TensorFlow Datasets APIs
  <li>   Count the number of unique words in the vocabulary using a tokenizer
  <li>   Convert the words to integers so that they can be used in numerical calculations
  <li>   Prepare your training and validation datasets using TF2.0 dataset APIs
  <li>   Use word embeddings to convert the integers to meaningful representations i.e. feature vectors that maximize interclass similarity and minimize intraclass similarity
  <li>   Design, train, and evaluate RNN Models
      <ol>
      <li>   DNN
      <li>   LSTM
      <li>   GRU
      <li>   Bidirectional LSTM
      </ol>
</ol>

In [0]:
# Global Variables
LR = 0.0005  # learning rate
REG_LAMBDA = 0.00001  # regularization lambda
BATCH_SZ = 32 # batch size
EPOCHS = 10  # number of epochs
TRAIN_STEPS = 45  # steps per train epoch
VAL_STEPS = 45  # steps per validation epoch
EMBED_SZ = 32  # embedding vector size
RNN_SZ = 16  # number of units in RNN layer

### 1. Download the IMDB Reviews dataset

In [0]:
# import both tensorflow and tensorflow_datasets

import tensorflow as tf
import tensorflow_datasets as tfds

In [0]:
# download the imdb review dataset and allocate the training and testing splits to new variables
# remark, for this exercise, we need a supervised dataset with both reviews and labels
# remark, the information object holds dataset size among many other details
# remark, the imdb reviews is balanced dataset of positive and negative reviews

(ds_train, ds_test), info = tfds.load(
    name= 'imdb_reviews/plain_text',
    with_info=True,
    as_supervised = True,
    download = True,
    split=(tfds.Split.TRAIN, tfds.Split.TEST)
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ9U0J5/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ9U0J5/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ9U0J5/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [0]:
# inspect the information object
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [0]:
# extract the number of reviews in the training set using object attributes and dictionary keys from the information object above
info.splits['train'].num_examples


25000

### 2. Create the Unique Vocabulary Set

In [0]:
# import the ceil rounding method from the math library

from math import ceil

# how many iterations does it it take to iterate over the complete training set?

total_size = info.splits['train'].num_examples

# remark use the batch size equals one for simplicity
batch_size = 1
num_takes = ceil(total_size / batch_size ) + 1

# create a tokenizer object from the tensorflow datasets text APIs
tokenizer = tfds.features.text.Tokenizer()

# create a unique set data structure to hold the tokenized words of the vocabulary
vocabulary = set()

# loop over the training data one batch at a time
for batch, label in ds_train.batch(batch_size).take(num_takes):

  # apply the tokenizer to the string reviews of the batch
  tokens = tokenizer.tokenize(str(batch[0].numpy()))

  # add the unique new words only to the vocabulary corpus
  vocabulary.update(tokens)

In [0]:
print(batch,tokenizer.tokenize(str(batch[0].numpy())))

tf.Tensor([b'First of all, Riget is wonderful. Good comedy and mystery thriller at the same time. Nice combination of strange \'dogma\' style of telling the story together with good music and great actors. But unfortunately there\'s no \'the end\'. As for me it\'s unacceptable. I was thinking... how it will be possible to continue the story without Helmer and Drusse? ...and I have some idea. I think Lars should make RIGET III a little bit different. I\'m sure that 3rd part without Helmer wouldn\'t be the same. So here\'s my suggestion. Mayble little bit stupid, maybe not. I know that Lars likes to experiment. So why not to make small experiment with Riget3? I think the only solution here is to create puppet-driven animation (like for example "team America" by Trey Parker) or even computer 3d animation. I know it\'s not the same as real actors, but in principle I believe it could work... only this way it\'s possible to make actors alive again. For Riget fans this shouldn\'t be so big di

In [0]:
# what is the vocabulary size?
len(vocabulary)


94406

In [0]:
# how many iterations does it it take to iterate over the complete testing set?

total_size = info.splits['test'].num_examples

# remark use a batch size larger than one to make your code more interesting
batch_size = 16
num_takes = ceil(total_size/batch_size) + 1

# loop over the testing data one batch at a time
for batch, label in ds_test.batch(batch_size).take(num_takes):
  
  # loop over the reviews in the batch one review at a time
  for i in range(batch.shape[0]):
    
    # apply the tokenizer to the string review
    tokens = tokenizer.tokenize(str(batch[i].numpy()))

    # add the unique new words only to the vocabulary corpus
    vocabulary.update(tokens)


In [0]:
# what is the vocabulary size?
len(vocabulary)

130846

In [0]:
# store vocabulary size for future usage

VOCAB_SZ = len(vocabulary)

### 3. Token Integer Indexer

In [0]:
# create a tokenizer and integer indexer object from the tensorflow datasets text APIs
# remark the encoder should take the vocabulary size as a constructor initializer
# remark feed the tokenizer from the previous step to the integer indexer
# wrap up, the encoder at this step will convert words to integers

encoder = tfds.features.text.TokenTextEncoder(
    vocab_list = vocabulary,
    tokenizer=tokenizer,
    lowercase= True 
)

In [0]:
# let's do a unit test

# loop over the batches in the training set and take only one batch for illustration purpose only
for batch, label in ds_train.take(1):

  # encode the text review using the encoder from the previous step\
  x = encoder.encode(batch.numpy())

  # print both the original review and the indexed review to observe what's happening here
  print(batch)
  print(x)

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
[122438, 78169, 90039, 92135, 120017, 89292, 74833, 107120, 109693, 95368, 119017, 129778, 114665, 127348, 129908, 83489, 89925, 92658, 118896, 83195, 109039, 125761, 122438, 102790, 91445, 109693, 60379, 124301, 64422, 119017, 95576, 127550, 60379, 83

### 4. TF2.0 Dataset APIs
####  Prepare the training and testing dataset iterators

**tf.py_funtion** allows you to run arbitrary python code, especially numpy, inside a graph session. It serves in the following context as a bridge between static graph of TFv1.0 and the eager execution mode of TFv2.0

You want to use **Dataset.map** to apply this function to each element of the dataset. *Dataset.map runs in graph mode.*

*   Graph tensors do not have a value.
*   In graph mode you can only use TensorFlow Ops and functions.

So you can't .map this function directly: You need to wrap it in a **tf.py_function**. The **tf.py_function** will pass regular tensors (with a value and a **.numpy()** method to access it), to the wrapped python function.

[Github Reference](https://github.com/tensorflow/tensorflow/issues/36979#issuecomment-594100272
)

In [0]:
# create a callback function to convert the string reviews to integer indices and return the X,y pairs
def preprocess(batch, labels):
  return encoder.encode(batch.numpy()), labels

# create a tf.py_function wrapper callback to inject into the underlying graph session
def encode_map_fn(batch, labels):
  # py_func doesn't set the shape of the returned tensors.
  # use tf.py_function to wrap the preprocess callback
  encoded_batch, labels = tf.py_function(
      preprocess,
      inp = [batch,labels],
      Tout = (tf.int64, tf.int64)
  ) 

  # `tf.data.Datasets` work best if all components have a shape set
  #  so set the shapes manually: 
  encoded_batch.set_shape([None])
  labels.set_shape([])

  # return the outputs of the processing step
  return encoded_batch, labels


# apply the integer indexer preprocessing callback to the train and test data set iterators
# remark use the map method to process the samples in the dataset one by one on the fetch call
tok_ds_train = ds_train.map(encode_map_fn)
tok_ds_test = ds_test.map(encode_map_fn)

In [0]:
# let's test and observe
# take a few batches and observe the batch dimensions

for batch, labels in tok_ds_train.take(3):
  print(batch.shape)

(121,)
(112,)
(132,)


In [0]:
# shuffle the data for better stochastic sampling during the training

fin_ds_train = tok_ds_train.shuffle(buffer_size=2048)
fin_ds_test = tok_ds_test.shuffle(buffer_size=2048)

In [0]:
# avoid running out of data by using repeat( number of epochs * number of steps per epoch )

fin_ds_train = fin_ds_train.repeat(EPOCHS*TRAIN_STEPS)
fin_ds_test = fin_ds_test.repeat(EPOCHS*VAL_STEPS)

In [0]:
# oh, yes, you are right, the batches are of different size
# each review has a different length i.e. number of words
# now use the padding batch method of the tensorflow 2.0 dataset APIs
# remark drop the remainders of uneven batch size at the tail of the iteration over the dataset for sake of simplicity

fin_ds_train = fin_ds_train.padded_batch(BATCH_SZ,drop_remainder=True)
fin_ds_test = fin_ds_test.padded_batch(BATCH_SZ, drop_remainder= True)

In [0]:
# now test and observe
# padded batches means samples in same batch are of equal length
# it does not mean that all batches are of equal length though
# this is absolutely fine for the next manipulation

for batch, labels in fin_ds_train.take(3):
  print(batch.shape)
  for line in batch:
    assert line.shape[0] == batch.shape[1]


(32, 887)
(32, 677)
(32, 551)


### 5. Word Embeddings

Convert the meaningless integers to useful vector representations

In [0]:
# create a one layer model with embedding for illustration purposes only
# remark use vocab size + 0 of padding + unk for new words
# remark the IMDB reviews length varies a lot, some reviews are a few hundred word, but others are many thousands
# that's why it is better to use mask_zero equals true of the embedding layer
# this way the model will learn to ignore the zero padding at the end of the reviews
# the embedding layer is a masking producer while the RNN layer is a masking consumer
# the mask is passed automatically between producers and consumers in both Sequentional and Functional APIs
# read more about padding and masking on TF official website

model = tf.keras.layers.Embedding(
    input_dim = (VOCAB_SZ + 2),
    output_dim = EMBED_SZ,
    mask_zero = True
)

In [0]:
# let's unit test once again by taking batches and compare the embeddings outputs to the original inputs

# iterate over the training set
for batch, labels in fin_ds_train.take(3):

  # pass the text reviews, of course they are integer indexed by now, to the embedding layer
  out = model(batch)
  #print(batch[-1])
  #print(out[-1])

  print(batch.shape)
  print(out.shape)
  # assert that the inputs and outputs have the same number of words
  assert batch.shape[1] == out.shape[1]
 

  # assert that the embeddings have the expected size of 10 elements feature vectors
  assert out.shape[2] == EMBED_SZ

  # assert that the batch size is untouched before and after passing the data to your model
  assert batch.shape[0] == out.shape[0]

(32, 1026)
(32, 1026, 32)
(32, 1029)
(32, 1029, 32)
(32, 1051)
(32, 1051, 32)


### 6. Design, Train, Evaluate RNN Models

#### 6.1 DNN

In [0]:
# create a simple sequential model
# make sure that the embedding layer is the first layer of the model
# use LSTM RNN layer
# design for single neuron output either positive or negative

model = tf.keras.Sequential(layers=[
                                    tf.keras.layers.Embedding(
                                        input_dim = (VOCAB_SZ+2),
                                        output_dim = EMBED_SZ,
                                        mask_zero = True,
                                        embeddings_regularizer = tf.keras.regularizers.l2(REG_LAMBDA)
                                    ),
                                    tf.keras.layers.GlobalAveragePooling1D(),
                                    tf.keras.layers.Dense(
                                        units = 16,
                                        activation = 'relu',
                                        activity_regularizer = tf.keras.regularizers.l2(REG_LAMBDA)
                                    ),
                                    tf.keras.layers.Dense(
                                        units = 1,
                                        activation = 'sigmoid'
                                    )                               
])

# compile the model using an optimizer, loss, and metrics
model.compile(
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR),
    loss='binary_crossentropy',
    metrics=[
             'accuracy'
    ]
)


# train the model using the fit method and mind the number of epochs and the steps per epoch
# for simplicity use small values otherwise it will take a very long time

history = model.fit(
    fin_ds_train,
    validation_data = fin_ds_test,
    epochs = 2, #EPOCHS,
    steps_per_epoch = 10,
    validation_steps = 8,
    verbose = 1
)

# evaluate the loss and the accuracy on the testing set

# use the evaluate method and save the results in a variable this time
test_loss, test_acc = list(
    model.evaluate(fin_ds_test, steps = 8)
)

# save the loss and accuracy in dictionary for comparison
report = {
    'DNN': (test_loss,test_acc)
}

Epoch 1/2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/2


In [0]:
report

{'DNN': (0.7280976176261902, 0.484375)}