# 19 Joining the Transformer Encoder and Decoder with Masking

Let's go over masking first and then stitch everything together: the encoder and the decoder, which we have already implemented and will simply import.

In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
from tensorflow import cast, float32, linalg, math, maximum, newaxis, ones
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

from xformer.decoder import Decoder
from xformer.encoder import Encoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 19.2 Masking

### Creating a Padding Mask

In [10]:
def padding_mask(input):
    """Build a mask that flags the zero-padding positions of a token sequence.

    Args:
        input: Array-like or tensor of token ids in which 0 denotes padding.

    Returns:
        A float32 tensor with the same shape as ``input`` holding 1.0 where
        the value equals 0 (i.e. "mask this position") and 0.0 elsewhere.
    """
    # Read the argument itself. The parameter used to be misspelled
    # ``inuput``, so the body silently fell back to whatever global (or
    # builtin) ``input`` happened to exist instead of the value passed in.
    is_padding = math.equal(input, 0)

    return cast(is_padding, float32)

In [11]:
# Sanity check: four real tokens followed by three zero-padding entries.
input = np.array([1, 2, 3, 4] + [0] * 3)
demo_padding_mask = padding_mask(input)
print(demo_padding_mask)

tf.Tensor([0. 0. 0. 0. 1. 1. 1.], shape=(7,), dtype=float32)


**Note:** 1 means mask it.

### Creating a Look-Ahead Mask

In [12]:
def lookahead_mask(n_tokens):
    """Build a square mask that hides "future" positions from each token.

    Args:
        n_tokens: Side length of the square mask.

    Returns:
        A float32 tensor of shape ``(n_tokens, n_tokens)`` with 1.0 strictly
        above the main diagonal (masked) and 0.0 on and below it.
    """
    all_ones = ones((n_tokens, n_tokens))
    # Keep the lower triangle, diagonal included: the entries that stay visible.
    visible = linalg.band_part(all_ones, -1, 0)

    # Invert, so that everything above the diagonal is marked with 1.0.
    return 1 - visible

In [13]:
# Try it out on a five-token sequence...
n_demo_tokens = 5
print(lookahead_mask(n_demo_tokens))

tf.Tensor(
[[0. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32)


**Note:** Once again 1 means it's masked.

## 19.3 Joining the Transformer Encoder and Decoder

In [14]:
class Xformer(Model):
    """Transformer model that joins an ``Encoder`` and a ``Decoder``.

    The forward pass builds the padding and look-ahead masks from the raw
    inputs, runs the encoder, feeds its output to the decoder, and passes the
    decoder output through a final ``Dense`` layer with ``dec_vocab_size``
    units. In every mask, 1.0 means "mask this position".
    """

    def padding_mask(self, input):
        """Flag the zero-padding positions of ``input`` with 1.0.

        Args:
            input: Tensor (or array-like) of token ids; 0 is treated as padding.

        Returns:
            A float32 tensor with the same shape as ``input`` holding 1.0
            where the value equals 0 and 0.0 elsewhere.
        """
        # Use the argument itself. The parameter was previously misspelled
        # ``inuput``, so this line resolved ``input`` from the global scope
        # instead of masking the tensor that was passed in.
        mask = math.equal(input, 0)
        mask = cast(mask, float32)

        return mask

    def lookahead_mask(self, n_tokens):
        """Build a square mask that hides "future" positions from each token.

        Args:
            n_tokens: Side length of the square mask.

        Returns:
            A float32 tensor of shape ``(n_tokens, n_tokens)`` with 1.0
            strictly above the main diagonal and 0.0 on and below it.
        """
        # band_part(..., -1, 0) keeps the lower triangle (diagonal included);
        # subtracting it from 1 marks everything above the diagonal.
        mask = 1 - linalg.band_part(ones((n_tokens, n_tokens)), -1, 0)

        return mask

    def __init__(
        self,
        enc_vocab_size,
        dec_vocab_size,
        enc_seq_len,
        dec_seq_len,
        n_heads,
        d_model,
        d_ff_inner,
        n_layers,
        dropout_rate,
        **kwargs
    ):
        """Create the encoder, the decoder and the final projection layer.

        Args:
            enc_vocab_size: Passed to ``Encoder`` as its first argument.
            dec_vocab_size: Passed to ``Decoder`` as its first argument; also
                the number of units of the final ``Dense`` layer.
            enc_seq_len: Passed to ``Encoder`` as its second argument.
            dec_seq_len: Passed to ``Decoder`` as its second argument.
            n_heads: Shared by the encoder and the decoder.
            d_model: Shared by the encoder and the decoder.
            d_ff_inner: Shared by the encoder and the decoder.
            n_layers: Shared by the encoder and the decoder.
            dropout_rate: Shared by the encoder and the decoder.
            **kwargs: Forwarded unchanged to ``Model.__init__``.
        """
        super().__init__(**kwargs)

        # Set up the encoder block
        self.encoder = Encoder(
            enc_vocab_size,
            enc_seq_len,
            n_heads,
            d_model,
            d_ff_inner,
            n_layers,
            dropout_rate,
        )

        # Set up the decoder block
        self.decoder = Decoder(
            dec_vocab_size,
            dec_seq_len,
            n_heads,
            d_model,
            d_ff_inner,
            n_layers,
            dropout_rate,
        )

        # Final Dense layer: projects the decoder output onto one unit per
        # entry of the target vocabulary
        self.final_layer = Dense(dec_vocab_size)

    def call(self, enc_input, dec_input, training):
        """Run the full encoder-decoder forward pass.

        Args:
            enc_input: Encoder token ids; entries equal to 0 are treated as
                padding.
            dec_input: Decoder token ids; entries equal to 0 are treated as
                padding, and ``dec_input.shape[1]`` sets the size of the
                look-ahead mask.
            training: Forwarded unchanged to the encoder and the decoder.

        Returns:
            The output of the final ``Dense`` layer applied to the decoder
            output.
        """
        # Create padding mask to mask the encoder inputs as well as the
        # encoder outputs (which are input to the decoder)
        enc_mask = self.padding_mask(enc_input)

        # Create and combine padding and look-ahead masks to be fed into the
        # decoder: a position is masked if either mask marks it.
        dec_padding_mask = self.padding_mask(dec_input)
        dec_lookahead_mask = self.lookahead_mask(dec_input.shape[1])
        # NOTE(review): the padding mask has the shape of ``dec_input`` while
        # the look-ahead mask is (n_tokens, n_tokens), so this relies on
        # broadcasting. Confirm the shapes line up for the intended batch size
        # and match what Encoder/Decoder expect (``newaxis`` is imported at
        # the top of the notebook but never used).
        dec_mask = maximum(dec_padding_mask, dec_lookahead_mask)

        # Feed inputs into the encoder. This previously passed the undefined
        # name ``enc_padding_mask``, which raised NameError on the first call.
        enc_output = self.encoder(enc_input, enc_mask, training)

        # Feed encoder output into the decoder
        dec_output = self.decoder(
            dec_input, dec_mask, enc_output, enc_mask, training
        )

        # Pass decoder output through a final Dense layer
        final_output = self.final_layer(dec_output)

        return final_output

## 19.4 Creating an Instance of the Transformer Model

As usual, we will stick to the parameter values used in *Attention Is All You Need* (AIAYN) and to dummy input values...

In [15]:
# Architecture parameters
h = 8  # attention heads
d_model = 512  # model dimensionality
d_ff = 2048  # dimensionality of the inner fully-connected layer
n = 6  # layers in each of the encoder and decoder stacks

dropout_rate = 0.1  # frequency of dropping input units in dropout layers

# Dummy vocabulary sizes and maximum sequence lengths
enc_vocab_size = 20  # encoder vocabulary
dec_vocab_size = 20  # decoder vocabulary
enc_seq_length = 5  # input sequence
dec_seq_length = 5  # target sequence

# Keyword arguments make the mapping onto Xformer's parameters explicit.
training_model = Xformer(
    enc_vocab_size=enc_vocab_size,
    dec_vocab_size=dec_vocab_size,
    enc_seq_len=enc_seq_length,
    dec_seq_len=dec_seq_length,
    n_heads=h,
    d_model=d_model,
    d_ff_inner=d_ff,
    n_layers=n,
    dropout_rate=dropout_rate,
)

### Printing Out a Summary of the Encoder and Decoder Layers

In order to do this, we need to add a few lines of code to our `EncoderLayer` and `DecoderLayer` classes, which I have done retroactively (and added the necessary imports and method arguments to make it all work).  

In the `__init__()` methods of both, we add:  
```python
self.build(input_shape=[None, sequence_length, d_model])
```

In the `EncoderLayer` class we add the following method:
```python
def build_graph(self):
    input_layer = Input(shape=(self.sequence_length, self.d_model))
    return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))
```

And in `DecoderLayer` we add:
```python
def build_graph(self):
    input_layer = Input(shape=(self.sequence_length, self.d_model))
    return Model(inputs=[input_layer], outputs=self.call(input_layer, None, input_layer, None, True))
```

Now we can build a single `EncoderLayer` or `DecoderLayer` as a `Model` and look at its summary (number of parameters, input/output shapes, etc.):

In [16]:
from xformer.encoder import EncoderLayer
from xformer.decoder import DecoderLayer

# Wrap a single encoder layer in a Model and print its summary.
enc_sub_layer = EncoderLayer(enc_seq_length, h, d_model, d_ff, dropout_rate)
enc_layer_graph = enc_sub_layer.build_graph()
enc_layer_graph.summary()

# Same for a single decoder layer.
dec_sub_layer = DecoderLayer(dec_seq_length, h, d_model, d_ff, dropout_rate)
dec_layer_graph = dec_sub_layer.build_graph()
dec_layer_graph.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 5, 512)]             0         []                            
                                                                                                  
 multi_head_attention_37 (M  (None, 5, 512)               1050624   ['input_2[0][0]',             
 ultiHeadAttention)                                                  'input_2[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 dropout_66 (Dropout)        (None, 5, 512)               0         ['multi_head_attention_37[0][0
                                                                    ]']                       