# 9 Adding a Custom Attention Layer to Recurrent Neural Network in Keras

In [18]:
import numpy as np
import tensorflow.keras.backend as K
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Layer, SimpleRNN
from tensorflow.keras.metrics import mean_squared_error
from tensorflow.keras.models import Sequential

# Seed for the NumPy RandomState that shuffles the samples in `get_fib_xy`.
# NOTE(review): nothing visible in this notebook seeds TensorFlow/Keras, so
# weight initialization (and hence the training results) may still vary
# between runs -- confirm whether that is intended.
seed = 13

## 9.2 The SimpleRNN Network

Our aim is to train an RNN on the Fibonacci numbers and get it to predict the next term given the first few.  
First, we need to construct the data for our contrived prediction problem...

In [19]:
def get_fib_seq(n, scale_data=True):
    """
    Build the first `n` Fibonacci terms that follow the initial (0, 1) pair.

    Args:
        n: Number of terms to generate.
        scale_data: If True, min-max-scale the terms to the (0, 1) range.

    Returns:
        A tuple `(seq, scaler)`: `seq` is a 1-D float array of length `n`
        and `scaler` is the fitted MinMaxScaler, or None when no scaling
        was requested.
    """
    seq = np.zeros(n)
    older, newer = 0.0, 1.0
    for idx in range(n):
        # Advance the pair one step; the new term is the sum of the last two
        older, newer = newer, older + newer
        seq[idx] = newer

    if not scale_data:
        return seq, None

    scaler = MinMaxScaler(feature_range=(0, 1))
    # The scaler works on a 2-D column, so reshape in and flatten back out
    seq = scaler.fit_transform(seq.reshape(n, 1)).flatten()
    return seq, scaler

In [20]:
# Sanity check: show the first ten unscaled terms
fib_seq, _ = get_fib_seq(n=10, scale_data=False)
print(fib_seq)

[ 1.  2.  3.  5.  8. 13. 21. 34. 55. 89.]


In [21]:
def get_fib_xy(total_fib_numbers, time_steps, train_frac, scale_data=True):
    """Builds shuffled train/test windows over the Fibonacci series.

    Every sample in X is a run of `time_steps` consecutive Fibonacci numbers;
    the matching y is the Fibonacci number that comes right after that run.

    Args:
        total_fib_numbers: How many terms of the Fibonacci series to generate
          as raw material for the X/y pairs. (Excludes the initial 0, 1)
        time_steps: Length of each X sample.
        train_frac: Fraction of the samples assigned to the training set.
        scale_data: Whether to min-max-scale the series to the (0, 1) range.

    Returns:
        `(X_train, y_train, X_test, y_test, scaler)`. The samples are randomly
        permuted (using the module-level `seed`) before being split. Both X
        arrays have shape (num_samples, time_steps, 1). `scaler` is the
        MinMaxScaler fitted on the whole series, or None when `scale_data` is
        False.
    """
    data, scaler = get_fib_seq(total_fib_numbers, scale_data)

    # Every term that has `time_steps` predecessors is a target
    y = data[time_steps:]
    num_samples = len(y)

    # Column k holds the k-th term of each window
    windows = [
        data[offset : num_samples + offset] for offset in range(time_steps)
    ]
    X = np.column_stack(windows)

    # Shuffle the sample order, then cut it into train and test parts
    shuffled = np.random.RandomState(seed).permutation(num_samples)
    cut = int(train_frac * num_samples)
    train_indices, test_indices = shuffled[:cut], shuffled[cut:]

    # Append a trailing feature axis of size 1
    X_train = np.expand_dims(X[train_indices], axis=-1)
    X_test = np.expand_dims(X[test_indices], axis=-1)

    return X_train, y[train_indices], X_test, y[test_indices], scaler

In [22]:
# Small unscaled example so the windows and targets are easy to read
get_fib_xy(
    total_fib_numbers=12,
    time_steps=3,
    train_frac=0.7,
    scale_data=False,
)

(array([[[ 8.],
         [13.],
         [21.]],
 
        [[ 5.],
         [ 8.],
         [13.]],
 
        [[ 2.],
         [ 3.],
         [ 5.]],
 
        [[13.],
         [21.],
         [34.]],
 
        [[21.],
         [34.],
         [55.]],
 
        [[34.],
         [55.],
         [89.]]]),
 array([ 34.,  21.,   8.,  55.,  89., 144.]),
 array([[[ 55.],
         [ 89.],
         [144.]],
 
        [[  1.],
         [  2.],
         [  3.]],
 
        [[  3.],
         [  5.],
         [  8.]]]),
 array([233.,   5.,  13.]),
 None)

Great! Now let us build a model...

In [23]:
# Set up parameters
# Number of consecutive terms per input sample (also the model's input length)
time_steps = 20
# Number of units in the SimpleRNN layer
hidden_units = 2
# Number of training epochs passed to `fit`
epochs = 30

# Create a traditional RNN network
def create_rnn(hidden_units, dense_units, input_shape, activations):
    """Builds and compiles a SimpleRNN -> Dense sequential model.

    Args:
        hidden_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the Dense output layer.
        input_shape: Shape of one input sample, passed to the SimpleRNN layer.
        activations: Two activations; element 0 is used by the SimpleRNN
          layer and element 1 by the Dense layer.

    Returns:
        The model, compiled with MSE loss and the Adam optimizer.
    """
    rnn_activation, dense_activation = activations[0], activations[1]
    stack = (
        SimpleRNN(
            hidden_units, input_shape=input_shape, activation=rnn_activation
        ),
        Dense(dense_units, activation=dense_activation),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss="mse", optimizer="adam")
    return model

In [24]:
# Baseline model: single-feature sequences of length `time_steps`, one output
# unit, and tanh in both the recurrent and the dense layer
model_rnn = create_rnn(
    hidden_units=hidden_units,
    dense_units=1,
    input_shape=(time_steps, 1),
    activations=["tanh", "tanh"],
)
# Print the per-layer output shapes and parameter counts
model_rnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (None, 2)                 8         
                                                                 
 dense_2 (Dense)             (None, 1)                 3         
                                                                 
Total params: 11
Trainable params: 11
Non-trainable params: 0
_________________________________________________________________


Question
> Why does the Simple RNN layer have 8 parameters?  

<details>
    <summary>Answer</summary>
    (2 hidden units + 1 input feature) × (2 hidden units) + 2 biases
</details>

<br>
Now we are ready to create a larger dataset and train the model on it...

In [25]:
# Generate the dataset: 1200 min-max-scaled terms, 70% of samples for training
X_train, y_train, X_test, y_test, scaler = get_fib_xy(
    total_fib_numbers=1200,
    time_steps=time_steps,
    train_frac=0.7,
)

In [26]:
# Train the baseline with one sample per batch; verbose=2 logs one line per epoch
model_rnn.fit(X_train, y_train, epochs=epochs, batch_size=1, verbose=2)

Epoch 1/30
826/826 - 1s - loss: 0.0021 - 953ms/epoch - 1ms/step
Epoch 2/30
826/826 - 1s - loss: 0.0019 - 668ms/epoch - 809us/step
Epoch 3/30
826/826 - 1s - loss: 0.0018 - 638ms/epoch - 772us/step
Epoch 4/30
826/826 - 1s - loss: 0.0017 - 651ms/epoch - 789us/step
Epoch 5/30
826/826 - 1s - loss: 0.0016 - 643ms/epoch - 779us/step
Epoch 6/30
826/826 - 1s - loss: 0.0015 - 627ms/epoch - 759us/step
Epoch 7/30
826/826 - 1s - loss: 0.0015 - 621ms/epoch - 752us/step
Epoch 8/30
826/826 - 1s - loss: 0.0014 - 629ms/epoch - 761us/step
Epoch 9/30
826/826 - 1s - loss: 0.0013 - 635ms/epoch - 769us/step
Epoch 10/30
826/826 - 1s - loss: 0.0013 - 618ms/epoch - 748us/step
Epoch 11/30
826/826 - 1s - loss: 0.0012 - 631ms/epoch - 764us/step
Epoch 12/30
826/826 - 1s - loss: 0.0011 - 620ms/epoch - 750us/step
Epoch 13/30
826/826 - 1s - loss: 0.0010 - 651ms/epoch - 788us/step
Epoch 14/30
826/826 - 1s - loss: 9.0076e-04 - 634ms/epoch - 767us/step
Epoch 15/30
826/826 - 1s - loss: 7.9481e-04 - 635ms/epoch - 768us/ste

<keras.callbacks.History at 0x17f4a4dc0>

In [27]:
# Evaluate the model
# `create_rnn` compiles with loss="mse" and no extra metrics, so the value
# returned by `evaluate` is reported here as the MSE (on the scaled data)
train_mse = model_rnn.evaluate(X_train, y_train)
test_mse = model_rnn.evaluate(X_test, y_test)
print("Train set MSE = ", train_mse)
print("Test set MSE = ", test_mse)

Train set MSE =  7.015452138148248e-05
Test set MSE =  9.266976121580228e-06


## 9.3 Adding a Custom Attention Layer to the Network

We will create the `Attention` class which inherits from Keras's `Layer` class. Our class will implement the Bahdanau attention mechanism.  
In order to build a custom layer, Keras requires us to implement the `__init__()`, `build()` and `call()` methods. The `build()` method "lazily" creates the weights and biases once the input shape is known. The `call()` method implements the forward pass, which is used for both training and inference. Everything else (computing gradients and tuning the weights via the backward pass) is taken care of by Keras.  

See the Keras guide on [Making new layers and models via subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/).

In [28]:
class Attention(Layer):
    """Attention pooling over the time axis of a sequence of hidden states.

    Each time step is scored with `tanh(x . w + b)`, the scores are turned
    into softmax weights, and the layer returns the weighted sum of the
    inputs over axis 1. `build` and `call` index the input as
    (batch, time steps, features).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Creates the trainable weight and bias once the input shape is known."""
        num_steps = input_shape[1]
        num_features = input_shape[-1]
        # One weight per input feature: projects every time step to a scalar
        self.w = self.add_weight(
            name="attention_weight",
            shape=(num_features, 1),
            initializer="random_normal",
            trainable=True,
        )
        # One bias per time step
        self.b = self.add_weight(
            name="attention_bias",
            shape=(num_steps, 1),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Returns the attention-weighted sum of `x` over the time axis."""
        # Alignment scores, squashed by tanh, with the trailing size-1 axis
        # dropped so that softmax runs across the time steps
        scores = K.squeeze(K.tanh(K.dot(x, self.w) + self.b), axis=-1)
        # Attention weights, with the size-1 axis restored so they broadcast
        # against the features of `x`
        alpha = K.expand_dims(K.softmax(scores), axis=-1)
        # Context vector: weighted sum of the inputs over the time axis
        return K.sum(x * alpha, axis=1)

We can now create an RNN with attention using the "Functional API" of Keras. Our attention layer expects a sequence as input, so we will have to make sure to return the entire sequence of hidden states from our SimpleRNN.  

In [29]:
def create_rnn_with_attention(
    hidden_units, dense_units, input_shape, activation
):
    """Builds and compiles a SimpleRNN -> Attention -> Dense model.

    Args:
        hidden_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the Dense output layer.
        input_shape: Shape of one input sample.
        activation: Activation used by both the SimpleRNN and Dense layers.

    Returns:
        A functional-API model compiled with MSE loss and the Adam optimizer.
    """
    inputs = Input(shape=input_shape)
    # The attention layer consumes the whole sequence of hidden states, so
    # the recurrent layer must return every step, not just the last one
    recurrent = SimpleRNN(
        hidden_units, return_sequences=True, activation=activation
    )
    hidden_states = recurrent(inputs)
    context = Attention()(hidden_states)
    head = Dense(dense_units, trainable=True, activation=activation)
    outputs = head(context)

    model = Model(inputs, outputs)
    model.compile(loss="mse", optimizer="adam")
    return model

In [30]:
# Attention model with the same sizes as the baseline: single-feature
# sequences of length `time_steps`, one output unit, tanh activations
model_attention = create_rnn_with_attention(
    hidden_units=hidden_units,
    dense_units=1,
    input_shape=(time_steps, 1),
    activation="tanh",
)
# Print the per-layer output shapes and parameter counts
model_attention.summary()

input_shape=TensorShape([None, 20, 2])
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 20, 2)             8         
                                                                 
 attention_1 (Attention)     (None, 2)                 22        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
Total params: 33
Trainable params: 33
Non-trainable params: 0
_________________________________________________________________


Question
> Why does the Attention layer have 22 parameters?  

<details>
    <summary>Answer</summary>
    2 "weights" (one per hidden unit, i.e. per feature of each hidden state) + 20 "biases" (one per time step of the sequence outputted via `return_sequences=True`)
</details>

<br>
We can now train and evaluate the model.

In [31]:
# Train with the same data and settings as the baseline model
model_attention.fit(X_train, y_train, epochs=epochs, batch_size=1, verbose=2)

Epoch 1/30
826/826 - 1s - loss: 0.0018 - 1s/epoch - 1ms/step
Epoch 2/30
826/826 - 1s - loss: 0.0017 - 680ms/epoch - 823us/step
Epoch 3/30
826/826 - 1s - loss: 0.0016 - 685ms/epoch - 830us/step
Epoch 4/30
826/826 - 1s - loss: 0.0016 - 691ms/epoch - 837us/step
Epoch 5/30
826/826 - 1s - loss: 0.0016 - 677ms/epoch - 819us/step
Epoch 6/30
826/826 - 1s - loss: 0.0015 - 697ms/epoch - 844us/step
Epoch 7/30
826/826 - 1s - loss: 0.0015 - 678ms/epoch - 821us/step
Epoch 8/30
826/826 - 1s - loss: 0.0015 - 690ms/epoch - 835us/step
Epoch 9/30
826/826 - 1s - loss: 0.0015 - 674ms/epoch - 816us/step
Epoch 10/30
826/826 - 1s - loss: 0.0015 - 696ms/epoch - 843us/step
Epoch 11/30
826/826 - 1s - loss: 0.0015 - 723ms/epoch - 875us/step
Epoch 12/30
826/826 - 1s - loss: 0.0015 - 699ms/epoch - 847us/step
Epoch 13/30
826/826 - 1s - loss: 0.0014 - 675ms/epoch - 817us/step
Epoch 14/30
826/826 - 1s - loss: 0.0014 - 688ms/epoch - 832us/step
Epoch 15/30
826/826 - 1s - loss: 0.0014 - 695ms/epoch - 841us/step
Epoch 16/

<keras.callbacks.History at 0x28eef66e0>

In [32]:
# Evaluate the model
# `create_rnn_with_attention` compiles with loss="mse" and no extra metrics,
# so the value returned by `evaluate` is reported here as the MSE (on the
# scaled data)
train_mse_attn = model_attention.evaluate(X_train, y_train)
test_mse_attn = model_attention.evaluate(X_test, y_test)
print("Train set MSE = ", train_mse_attn)
print("Test set MSE = ", test_mse_attn)

Train set MSE =  0.0013820648891851306
Test set MSE =  0.0012247657869011164


This is a very contrived, simple example and the model with attention may or may not beat the one without.  
We could potentially improve the model further by trying the following:
- Hyperparameter tuning and model selection
- Adding more layers to the network
- Using `LSTM` units instead of `SimpleRNN`s
- Building a network with convolution and pooling layers
- Switching to the encoder-decoder model architecture

**Note:** We can use the `scaler` object to convert back to the original values.

**Personal Note:** Actually this dataset is pretty useless and not suited to this kind of model. I found out as much while trying to convert the predictions back to the original range as per the above note:

In [33]:
# Hand-made example: the last `time_steps` terms of an unscaled 920-term series.
# Slicing relative to `time_steps` keeps the window the exact length that the
# reshape below (and the model input) requires.
fib_seq, _ = get_fib_seq(920, False)

example = fib_seq[-time_steps:]
print(f"Example test case (unscaled):\n{example}")

print("\n")

actual_next_term = example[-2] + example[-1]
print(
    f"The correct next term is: {example[-2]} + {example[-1]} ="
    f" {actual_next_term}"
)

print("\n")

# Scale with the scaler that was fitted when the dataset was generated, then
# add a leading batch axis of size 1 for `predict`
example_scaled = scaler.transform(np.reshape(example, (time_steps, 1)))
predicted_term = model_attention.predict(np.expand_dims(example_scaled, 0))
predicted_term_scaled = predicted_term[0][0]

# Map the scaled prediction back to the original value range
predicted_term_unscaled = scaler.inverse_transform([[predicted_term_scaled]])[
    0
][0]
print(f"The model predicted: {predicted_term_unscaled}")

Example test case (unscaled):
[1.43670136e+188 2.32463163e+188 3.76133300e+188 6.08596463e+188
 9.84729763e+188 1.59332623e+189 2.57805599e+189 4.17138221e+189
 6.74943820e+189 1.09208204e+190 1.76702586e+190 2.85910790e+190
 4.62613377e+190 7.48524167e+190 1.21113754e+191 1.95966171e+191
 3.17079925e+191 5.13046096e+191 8.30126022e+191 1.34317212e+192]


The correct next term is: 8.301260217870547e+191 + 1.34317211819719e+192 = 2.1732981399842448e+192


The model predicted: 7.105794145798422e+247


My first reaction was: "Why are they so far off?!" Then I looked at a few arbitrary 20-term sub-sequences of the Fibonacci series and got the exact same result. And by "exact same result" I don't mean "same huge error". No! I mean the exact same prediction!! As a matter of fact, let's take a look at the model's predictions on the entire test set:

In [34]:
# Show the model's predictions (still in the scaled range) for every test sample
model_attention.predict(X_test)



array([[0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161349],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00424584],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.00161043],
       [0.001

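Yup! They are mostly identical. Why? Because the $1200^{th}$ Fibonacci number has roughly 250 digits (‼️), so when a range as wide as $(1, F_{1200})$ is min-max-scaled to the (0, 1) range, all but the last few dozen terms of the series are squashed to values indistinguishable from 0. Almost every input window therefore looks the same to the model, and you can't expect a much better result!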

If I get a chance later on, I will try to re-do this chapter with the sunspot dataset (from ch. 7) instead of this contrived Fibonacci-based one.