# RNN, GRU, LSTM and Bidirectionality

## 1. Imports and Configuration

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Ask TensorFlow to grow GPU memory on demand instead of reserving all of it
# up front. Memory growth is applied to every visible GPU (TensorFlow requires
# the setting to be the same on all of them), and a machine without a GPU
# falls back to the CPU instead of aborting the notebook.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised, e.g. when this cell
        # is re-run in a live kernel; the setting can no longer be changed.
        print(f"Could not enable GPU memory growth: {err}")
else:
    print("No GPU detected; running on CPU.")

## 2. Data Loading and Preprocessing

In [7]:
from tensorflow.keras.datasets import mnist

# Fetch the MNIST train and test splits as (images, labels) pairs.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Cast both image arrays to float32 and divide by 255.0 so the pixel
# intensities are rescaled before being fed to the network.
x_train, x_test = (
    images.astype('float32') / 255.0 for images in (x_train, x_test)
)

## 3. Model Definition

### 3.1.1 RNN
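- `model.add(keras.Input(shape=(None, 28)))`: the first dimension is `None` because <ins>we don't have to fix the number of time steps</ins>. Each image is fed to the network one row per time step, so every time step has 28 features (the 28 pixels in one row of the image). We put `None` for the time-step dimension because it does not have to be specified.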

- `return_sequences=True` returns the output of the RNN layer for every time step. If we don't specify it, the layer returns only the output of the last time step. We set it so that we can stack multiple RNN layers. In the code below the first RNN layer has 512 units, so with `return_sequences=True` it returns a 512-dimensional output for each time step, in this case 28 time steps.
    - In the `model.summary()` of this RNN you will notice the output shape (None, None, 512): the first `None` is the batch size, the second `None` is the number of time steps (one hidden state per step), and 512 is the number of nodes in the hidden state.

- Notice in the code that the second RNN layer does not set `return_sequences=True`. This is because we only want the output of the last time step, which is passed to the Dense layer that has 10 nodes for the 10 classes.
    - In the `model.summary()` of this RNN you will notice the output shape (None, 512): `None` is the batch size and 512 is the size of the last hidden state (last time step) of the RNN layer.

In [3]:
# Two stacked SimpleRNN layers (ReLU activation) followed by a 10-unit output layer.
model = keras.Sequential()
# Input shape is (time steps, features): the number of time steps is left
# unspecified (None) and every step carries 28 features.
model.add(keras.Input(shape=(None, 28)))
# return_sequences=True emits the output at every time step so that the next
# recurrent layer receives a full sequence.
model.add(layers.SimpleRNN(512, return_sequences=True, activation="relu"))
# Without return_sequences only the output of the final time step is returned.
model.add(layers.SimpleRNN(512, activation="relu"))
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, None, 512)         276992    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 512)               524800    
_________________________________________________________________
dense (Dense)                (None, 10)                5130      
Total params: 806,922
Trainable params: 806,922
Non-trainable params: 0
_________________________________________________________________
None


### 3.1.2 RNN tanh
- The default activation function for `SimpleRNN` is **tanh**, so the second layer below, which does not pass `activation`, also uses tanh. We can change it to relu or sigmoid.

In [8]:
# Two stacked SimpleRNN layers with tanh activation followed by a 10-unit output layer.
model = keras.Sequential()
# Input shape is (time steps, features): unspecified number of steps, 28 features each.
model.add(keras.Input(shape=(None, 28)))
# First layer returns the output at every time step so it can feed another RNN layer.
model.add(layers.SimpleRNN(256, return_sequences=True, activation="tanh"))
# No activation argument here, so the layer's default activation is used;
# only the output of the final time step is returned.
model.add(layers.SimpleRNN(256))
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (None, None, 256)         72960     
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 256)               131328    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                2570      
Total params: 206,858
Trainable params: 206,858
Non-trainable params: 0
_________________________________________________________________
None


### 3.2 GRU

In [None]:
# Two stacked GRU layers followed by a 10-unit output layer.
# A fresh Sequential model must be assigned to `model` here; otherwise the
# layers below would be appended to whatever model an earlier cell created.
model = keras.Sequential()
# Input shape is (time steps, features): unspecified number of steps, 28 features each.
model.add(keras.Input(shape=(None, 28)))
# First layer returns the output at every time step so it can feed another GRU layer.
model.add(layers.GRU(256, return_sequences=True, activation="tanh"))
# Second layer returns only the output of the final time step.
model.add(layers.GRU(256))
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

### 3.3 LSTM

In [None]:
# Two stacked LSTM layers followed by a 10-unit output layer.
# A fresh Sequential model must be assigned to `model` here; otherwise the
# layers below would be appended to whatever model an earlier cell created.
model = keras.Sequential()
# Input shape is (time steps, features): unspecified number of steps, 28 features each.
model.add(keras.Input(shape=(None, 28)))
# First layer returns the output at every time step so it can feed another LSTM layer.
model.add(layers.LSTM(256, return_sequences=True, activation="tanh"))
# Second layer returns only the output of the final time step.
model.add(layers.LSTM(256))
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

### 3.4.1 Single Bidirectional LSTM

In the `model.summary()` of this Bidirectional LSTM you will notice that the output shape doubled from 256 to 512. This is because the output of the Bidirectional LSTM is the concatenation of the forward LSTM and the backward LSTM.

In [11]:
# One bidirectional LSTM layer, then a unidirectional LSTM and a 10-unit output layer.
model = keras.Sequential()
# Input shape is (time steps, features): unspecified number of steps, 28 features each.
model.add(keras.Input(shape=(None, 28)))
# The Bidirectional wrapper runs the LSTM over the sequence in both directions;
# return_sequences=True keeps the output at every time step for the next layer.
model.add(
    layers.Bidirectional(layers.LSTM(256, return_sequences=True, activation="relu"))
)
# Second LSTM returns only the output of the final time step.
model.add(layers.LSTM(256, name="lstm_layer2"))
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, None, 512)         583680    
_________________________________________________________________
lstm_layer2 (LSTM)           (None, 256)               787456    
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2570      
Total params: 1,373,706
Trainable params: 1,373,706
Non-trainable params: 0
_________________________________________________________________
None


### 3.4.2 Stacked Bidirectional LSTM

In [None]:
# Two stacked bidirectional LSTM layers followed by a 10-unit output layer.
model = keras.Sequential()
# Input shape is (time steps, features): unspecified number of steps, 28 features each.
model.add(keras.Input(shape=(None, 28)))
# First bidirectional layer keeps the output at every time step so the next
# recurrent layer receives a full sequence.
model.add(
    layers.Bidirectional(layers.LSTM(256, return_sequences=True, activation="relu"))
)
# Second bidirectional layer returns only its final output. Note that the
# name "lstm_layer2" is given to the wrapped LSTM, not to the Bidirectional wrapper.
model.add(
    layers.Bidirectional(layers.LSTM(256, name="lstm_layer2"))
)
# No activation on the output layer: it produces one raw score per class.
model.add(layers.Dense(10))

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

## 4. Compile Model

In [9]:
# Compile whichever `model` was built most recently; on a top-to-bottom run
# that is the last architecture defined in section 3.
model.compile(
    # Labels are integer class ids and the final Dense layer has no
    # activation, so the loss is told to expect raw logits.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

## 5. Model Training and Evaluation

In [10]:
# Train the compiled model on the training split, logging one line per epoch.
# NOTE(review): the stored output of this cell appears to come from an
# out-of-order run (execution counts 8 -> 9 -> 10), so it likely belongs to the
# tanh SimpleRNN model rather than the last model defined -- confirm by re-running.
print("Training model...")
model.fit(x=x_train, y=y_train, epochs=10, batch_size=64, verbose=2)

# Evaluate silently on the held-out test split; with the single "accuracy"
# metric the result holds the loss first and the accuracy second.
print("\nEvaluating model...")
results = model.evaluate(x=x_test, y=y_test, batch_size=64, verbose=0)
test_loss, test_accuracy = results[0], results[1]
print(f"Test loss: {test_loss:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

Training model...
Epoch 1/10
938/938 - 18s - loss: 0.2994 - accuracy: 0.9089
Epoch 2/10
938/938 - 20s - loss: 0.1846 - accuracy: 0.9466
Epoch 3/10
938/938 - 23s - loss: 0.1593 - accuracy: 0.9550
Epoch 4/10
938/938 - 23s - loss: 0.1478 - accuracy: 0.9570
Epoch 5/10
938/938 - 24s - loss: 0.1447 - accuracy: 0.9579
Epoch 6/10
938/938 - 25s - loss: 0.1403 - accuracy: 0.9598
Epoch 7/10
938/938 - 21s - loss: 0.1414 - accuracy: 0.9588
Epoch 8/10
938/938 - 19s - loss: 0.1323 - accuracy: 0.9617
Epoch 9/10
938/938 - 19s - loss: 0.1374 - accuracy: 0.9603
Epoch 10/10
938/938 - 19s - loss: 0.1702 - accuracy: 0.9498

Evaluating model...
Test loss: 0.1770
Test accuracy: 0.9464
