## Table of Contents

- [Import](#0)
- [1 - Positional Encoding](#1)
- [2 - Masking](#2)
- [3 - Self-Attention](#3)
- [4 - Encoder](#4)
    - [4.1 Encoder Layer](#4-1)
    - [4.2 - Full Encoder](#4-2)
- [5 - Decoder](#5)
    - [5.1 - Decoder Layer](#5-1)
    - [5.2 - Full Decoder](#5-2)
- [6 - Transformer](#6)

<a name='0'></a>
## Import

In [None]:
from os import environ
environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization
from transformers import DistilBertTokenizerFast #, TFDistilBertModel
from transformers import TFDistilBertForTokenClassification

<a id="1"></a>
## 1 - Positional Encoding

In [None]:
def get_angles(pos, k, d:int):
    """
    Compute the raw angles fed to sin/cos when building positional encodings
    
    Arguments:
        pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
        k --   Row vector containing the dimension span [[0, 1, 2, ..., d-1]]
        d -- Encoding size
    
    Returns:
        angles -- (pos, d) numpy array where entry [p, k] is p / 10000^(2*(k//2)/d)
    """
    # Consecutive dimensions (2i, 2i+1) share the same frequency index i
    pair_index = k // 2
    exponent = 2 * pair_index / d

    # Broadcasting the column of positions against the row of frequencies
    # produces the full (pos, d) grid in one step
    return pos / (10000 ** exponent)

In [None]:
def positional_encoding(positions:int, d:int):
    """
    Build the sinusoidal positional-encoding table once, up front
    
    Arguments:
        positions - Maximum number of positions to be encoded 
        d - Encoding size 
    
    Returns:
        pos_encoding - (1, positions, d) float32 tensor with the positional encodings
    """
    position_column = np.arange(positions)[:, np.newaxis]
    dimension_row = np.arange(d)[np.newaxis, :]
    table = get_angles(position_column, dimension_row, d)

    # Even columns (2i) hold the sine component
    table[:, 0::2] = np.sin(table[:, 0::2])

    # Odd columns (2i+1) hold the cosine component
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over a batch
    batched = table.reshape(1, positions, d)

    return tf.cast(batched, dtype=tf.float32)

In [None]:
# Precompute the encodings for 50 positions with an encoding size of 512.
pos_encoding = positional_encoding(50, 512)

# Shape is (1, 50, 512): a leading axis of size 1, then positions, then encoding size.
print (pos_encoding.shape)

# Heat map of the table (leading axis dropped): rows are positions,
# columns are encoding dimensions.
plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('d')
plt.xlim((0, 512))  # upper limit matches the encoding size used above
plt.ylabel('Position')
plt.colorbar()
plt.show()

<a name='2'></a>
## 2 - Masking

Two types of masks are useful when building a Transformer network: the *padding mask*, which hides padded positions, and the *look-ahead mask*, which hides future positions.

In [None]:
def create_padding_mask(decoder_token_ids):
    """
    Creates a mask that marks which cells hold real tokens and which hold padding.
    Token id 0 is treated as padding.
    
    Arguments:
        decoder_token_ids -- (n, m) matrix
    
    Returns:
        mask -- (n, 1, m) float32 tensor: 1.0 where the id is non-zero,
                0.0 where the id is 0 (padding)
    """
    # tf.math.equal compares element-wise; casting the booleans gives
    # 1.0 at padding cells and 0.0 everywhere else
    is_padding = tf.cast(tf.math.equal(decoder_token_ids, 0), tf.float32)

    # Flip it so that real tokens are 1.0 and padding is 0.0
    keep = 1 - is_padding

    # Insert a middle axis so the mask can broadcast against attention logits
    return keep[:, tf.newaxis, :]

In [None]:
def create_look_ahead_mask(sequence_length):
    """
    Returns a lower triangular matrix of ones (zeros above the diagonal),
    with a leading axis of size 1.
    Row i has ones in columns 0..i, i.e. position i can see itself and
    earlier positions but nothing that comes after it.
    
    Arguments:
        sequence_length -- matrix size (sequence length is the number of time steps per input
                           input.shape = [batch_size, sequence_length, num_features])
    
    Returns:
        mask -- (1, sequence_length, sequence_length) float32 tensor
    
    Example:
    create_look_ahead_mask(5)
    <tf.Tensor: shape=(1, 5, 5), dtype=float32, numpy=
    array([[[1., 0., 0., 0., 0.],
            [1., 1., 0., 0., 0.],
            [1., 1., 1., 0., 0.],
            [1., 1., 1., 1., 0.],
            [1., 1., 1., 1., 1.]]], dtype=float32)>
    """
    all_ones = tf.ones((1, sequence_length, sequence_length))
    # num_lower=-1 keeps the entire lower triangle; num_upper=0 zeroes
    # everything above the main diagonal
    return tf.linalg.band_part(all_ones, -1, 0)

<a name='3'></a>
## 3 - Self-Attention

$$
\text{Attention}(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}+M\right) V \tag{4}
$$

In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k) + M) V.
      q, k, v must have matching leading dimensions.
      k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
      The mask has different shapes depending on its type(padding or look ahead) 
      but it must be broadcastable for addition.

    Arguments:
        q -- query shape == (..., seq_len_q, depth)
        k -- key shape == (..., seq_len_k, depth)
        v -- value shape == (..., seq_len_v, depth_v)
        mask: Float tensor with shape broadcastable 
              to (..., seq_len_q, seq_len_k). 1 marks a position that may be
              attended to, 0 a position to hide. Defaults to None.

    Returns:
        output -- shape (..., seq_len_q, depth_v), the attention-weighted sum of v
        attention_weights -- shape (..., seq_len_q, seq_len_k), rows sum to 1
    """
    assert k.shape[-2] == v.shape[-2], "k and v must have the same sequence length"

    # transpose_b swaps only the last two axes of k, so any leading
    # batch/head axes are left in place
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Scale by sqrt(d_k), where d_k is the key depth (last axis of k)
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        # Hidden positions (mask == 0) get a large negative logit so their
        # softmax weight is effectively zero
        mask = tf.cast(mask, scaled_attention_logits.dtype)
        scaled_attention_logits += (1. - mask) * -1e9

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1; tf.nn.softmax is numerically stable for large logits
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

<a name='4'></a>
## 4 - Encoder

In [None]:
def FullyConnected(embedding_dim, fully_connected_dim):
    """
    Build the two-layer feed-forward network: a ReLU Dense layer of width
    fully_connected_dim followed by a linear Dense layer of width embedding_dim.
    """
    expand = tf.keras.layers.Dense(fully_connected_dim, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(embedding_dim)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

<a name='4-1'></a>
### 4.1 Encoder Layer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed of a multi-head self-attention mechanism,
    followed by a simple, position-wise fully connected feed-forward network. 
    This architecture includes a residual connection around each of the two 
    sub-layers, followed by layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.mha = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    
    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer
        
        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Attention mask passed to the multi-head attention layer so
                    that padding is not treated as part of the input
        Returns:
            encoder_layer_out -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        # Self attention: x is both query and value (key defaults to value).
        # training is forwarded explicitly so the attention dropout follows
        # the flag given to this layer.
        attn_output = self.mha(query=x,
                               value=x,
                               attention_mask=mask,
                               training=training)
        # Residual connection + layer norm around the attention sub-layer
        out1 = self.layernorm1(tf.add(x, attn_output))  # (batch_size, input_seq_len, embedding_dim)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, embedding_dim)

        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # Residual connection + layer norm around the feed-forward sub-layer
        encoder_layer_out = self.layernorm2(tf.add(ffn_output, out1))  # (batch_size, input_seq_len, embedding_dim)

        return encoder_layer_out
    

<a name='4-2'></a>
### 4.2 - Full Encoder

# Change the Vocab Size

In [None]:
class Encoder(tf.keras.layers.Layer):
    """
    The entire Encoder starts by passing the input to an embedding layer 
    and using positional encoding to then pass the output through a stack of
    encoder Layers
        
    """  
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        # Precomputed once; sliced to the actual sequence length in call()
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.embedding_dim)

        self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]

        self.dropout = Dropout(dropout_rate)
        
    def call(self, x, training, mask):
        """
        Forward pass for the Encoder
        
        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Attention mask handed to every encoder layer so that
                    padding is not treated as part of the input
        Returns:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        seq_len = tf.shape(x)[1]

        x = tf.cast(self.embedding(x), tf.float32)  # (batch_size, input_seq_len, embedding_dim)
        # Scale embedding by multiplying it by the square root of the embedding dimension
        x *= self.embedding_dim ** 0.5

        # Only the first seq_len rows of the precomputed table are needed;
        # seq_len must not exceed maximum_position_encoding
        x += self.pos_encoding[:, :seq_len, :]
        # training is forwarded explicitly so dropout follows the flag given here
        x = self.dropout(x, training=training)

        for layer in self.enc_layers:
            x = layer(x, training, mask)

        return x  # (batch_size, input_seq_len, embedding_dim)

<a name='5'></a>
## 5 - Decoder

<a name='5-1'></a>    
### 5.1 - Decoder Layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    The decoder layer is composed of two multi-head attention blocks, 
    one that takes the new input and uses self-attention, and the other 
    one that combines it with the output of the encoder, followed by a
    fully connected block. 
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.mha1 = MultiHeadAttention(num_heads=num_heads,
                                       key_dim=embedding_dim,
                                       dropout=dropout_rate)

        self.mha2 = MultiHeadAttention(num_heads=num_heads,
                                       key_dim=embedding_dim,
                                       dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder Layer
        
        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output --  Tensor of shape(batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Attention mask for the self-attention block (block 1)
            padding_mask -- Attention mask for the encoder-decoder attention block (block 2)
        Returns:
            out3 -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attn_weights_block1 -- Tensor of shape(batch_size, num_heads, target_seq_len, target_seq_len)
            attn_weights_block2 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
        """
        # BLOCK 1
        # Masked self-attention over the target sequence. training is
        # forwarded explicitly so attention dropout follows the given flag.
        mult_attn_out1, attn_weights_block1 = self.mha1(query=x,
                                                        value=x,
                                                        attention_mask=look_ahead_mask,
                                                        return_attention_scores=True,
                                                        training=training)

        # Residual connection + layer norm around block 1
        Q1 = self.layernorm1(tf.add(x, mult_attn_out1))

        # BLOCK 2
        # Encoder-decoder attention: queries come from block 1, keys and
        # values from the encoder output
        mult_attn_out2, attn_weights_block2 = self.mha2(query=Q1,
                                                        value=enc_output,
                                                        key=enc_output,
                                                        attention_mask=padding_mask,
                                                        return_attention_scores=True,
                                                        training=training)

        # The residual must use Q1 (the input to this sub-layer), not the
        # un-normalized attention output of block 1
        mult_attn_out2 = self.layernorm2(tf.add(Q1, mult_attn_out2))
        # (batch_size, target_seq_len, embedding_dim)

        # BLOCK 3
        # Sequential dense layers and dropout
        ffn_output = self.ffn(mult_attn_out2)  # (batch_size, target_seq_len, embedding_dim)
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # Layer normalization of the sum of the ffn output and the output of the second block
        out3 = self.layernorm3(tf.add(ffn_output, mult_attn_out2))

        return out3, attn_weights_block1, attn_weights_block2
    

<a name='5-2'></a> 
### 5.2 - Full Decoder

In [None]:
class Decoder(tf.keras.layers.Layer):
    """
    The entire Decoder starts by passing the target input to an embedding layer 
    and using positional encoding to then pass the output through a stack of
    decoder Layers
        
    """ 
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, self.embedding_dim)
        # Precomputed once; sliced to the actual sequence length in call()
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)
    
    def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder
        
        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len); it is passed
                 through the embedding layer first
            enc_output --  Tensor of shape(batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Attention mask for the self-attention block of each layer
            padding_mask -- Attention mask for the encoder-decoder attention block of each layer
        Returns:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attention_weights - Dictionary of tensors containing the attention weights
                                of both attention blocks of every decoder layer
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # create embeddings
        x = self.embedding(x)  # (batch_size, target_seq_len, embedding_dim)

        # scale embeddings by multiplying by the square root of their dimension
        x *= self.embedding_dim ** 0.5

        # positional encodings; seq_len must not exceed maximum_position_encoding
        x += self.pos_encoding[:, :seq_len, :]

        # training is forwarded explicitly so dropout follows the flag given here
        x = self.dropout(x, training=training)

        for layer_number, layer in enumerate(self.dec_layers, start=1):
            # pass x and encoder output through the decoder layer and keep
            # the attention weights of block 1 and block 2
            x, block1, block2 = layer(x, enc_output, training,
                                      look_ahead_mask, padding_mask)

            attention_weights[f'decoder_layer{layer_number}_block1_self_att'] = block1
            attention_weights[f'decoder_layer{layer_number}_block2_decenc_att'] = block2

        return x, attention_weights

<a name='6'></a> 
## 6 - Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Complete transformer with an Encoder and a Decoder, followed by a
    single-unit Dense output layer.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, num_features,
                 max_positional_encoding_input,
                 max_positional_encoding_target,
                 dropout_rate=0.1, layernorm_eps=1e-6,
                 target_vocab_size=None):
        """
        Arguments:
            num_layers -- Number of encoder layers and of decoder layers
            embedding_dim -- Width of the embeddings
            num_heads -- Number of attention heads
            fully_connected_dim -- Hidden width of the feed-forward networks
            num_features -- Size of the encoder's embedding table
                            (passed to Encoder as input_vocab_size)
            max_positional_encoding_input -- Number of positions precomputed for the encoder
            max_positional_encoding_target -- Number of positions precomputed for the decoder
            dropout_rate -- Dropout rate used throughout
            layernorm_eps -- Epsilon for the layer normalization layers
            target_vocab_size -- Size of the decoder's embedding table.
                                 Defaults to num_features when not given.
        """
        super().__init__()

        # NOTE(review): the previous code read an undefined name here; falling
        # back to num_features is an assumption -- confirm the intended size.
        if target_vocab_size is None:
            target_vocab_size = num_features

        self.encoder = Encoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               input_vocab_size=num_features,
                               maximum_position_encoding=max_positional_encoding_input,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.decoder = Decoder(num_layers=num_layers, 
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               target_vocab_size=target_vocab_size, 
                               maximum_position_encoding=max_positional_encoding_target,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.final_layer = Dense(1)
    
    def call(self, input_sentence, output_sentence, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """
        Forward pass for the entire Transformer
        Arguments:
            input_sentence -- Encoder input of shape (batch_size, input_seq_len)
            output_sentence -- Decoder input of shape (batch_size, target_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            enc_padding_mask -- Mask to ensure that the padding is not treated as part of the input
            look_ahead_mask -- Mask for the decoder's self-attention
            dec_padding_mask -- Mask for the decoder's encoder-decoder attention
        Returns:
            final_output -- Tensor of shape (batch_size, target_seq_len, 1)
            attention_weights - Dictionary of tensors containing all the attention weights for the decoder

        NOTE(review): both inputs go through Embedding layers, so they are
        expected to be integer ids. Earlier notes described the input as
        windowed voltage/current/time data and the output as an SOC
        prediction -- confirm how continuous data is meant to be fed in.
        """
        enc_output = self.encoder(input_sentence, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(output_sentence, enc_output, training,
                                                     look_ahead_mask, dec_padding_mask)

        # Dense(1) maps every target position to a single value
        final_output = self.final_layer(dec_output)  # (batch_size, target_seq_len, 1)

        return final_output, attention_weights

In [None]:
# num_layers = 6
# embedding_dim = 4
# num_heads = 4
# fully_connected_dim = 8
# input_vocab_size = 30
# target_vocab_size = 35
# max_positional_encoding_input = 5
# max_positional_encoding_target = 6

# transform = Transformer(num_layers, 
#                     embedding_dim, 
#                     num_heads, 
#                     fully_connected_dim, 
#                     input_vocab_size, 
#                     target_vocab_size, 
#                     max_positional_encoding_input,
#                     max_positional_encoding_target)
# sen_a = np.array([[2,1,4,3,0]])
# sen_b = np.array([[3,2,1,0,0]])

# enc_padding_mask = create_padding_mask(sen_a)
# dec_padding_mask = create_padding_mask(sen_b)
# look_ahead_mask = create_look_ahead_mask(sen_a.shape[1])

# translation, weights = transform(
#     sen_a,
#     sen_b,
#     False,  # Training
#     enc_padding_mask,
#     look_ahead_mask,
#     dec_padding_mask
# )