## Table of Contents

- [Import](#0)
- [Windowed Data](#win)
- [Encoder](#enc)
    - [Encoder Layer](#enc-lay)
    - [Full Encoder](#full-enc)
- [Decoder](#dec)
    - [Decoder Layer](#dec-lay)
    - [Full Decoder](#full-dec)
- [Transformer](#transform)

# Literature:


According to [A Transformer-based Framework for Multivariate Time Series Representation Learning](https://dl.acm.org/doi/abs/10.1145/3447548.3467401),
using **Batch Normalization is significantly more effective** for multivariate time series than the Layer Normalization traditionally used in NLP transformers.

In addition, according to [Deep learning approach towards accurate state of charge estimation for lithium-ion batteries using self-supervised transformer model](https://www.nature.com/articles/s41598-021-98915-8#Sec9),
using a transformer network while **forgoing the Decoder Layer** is more effective for State-of-Charge estimation.

<a name='0'></a>
## Import

In [None]:
from dataclasses import dataclass
from os import environ
environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
# removes tensorflow warnings triggered because of Tensorflow incompatibility with my Apple M1 chip.
# ignore this when using a non Apple Silicon device, ie. Google Colab or the likes.

import tensorflow as tf
# import time
import numpy as np
# import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.layers import LayerNormalization
# from transformers import DistilBertTokenizerFast #, TFDistilBertModel
# from transformers import TFDistilBertForTokenClassification

from transformer_helper import *

$\large{Self\ Attention}$
$$
\text { Attention }(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}+{M}\right) V
$$

<a id="win"></a>
## Windowed Data 

In [None]:
@dataclass
class G:
    """Global hyper-parameters for the windowed data pipeline.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    real field (un-annotated class attributes are ignored by the decorator).
    The defaults remain readable on the class itself, e.g. ``G.window_size``.
    """
#     split_time = None #for now, might be useless
    # sequence_length = 5
    # Features per time step: delta_t, current, voltage, and soc at t-1.
    num_features: int = 4
    # Default ``window_size`` of ``windowed_dataset``.
    window_size: int = 512
    # Default ``batch_size`` of ``windowed_dataset``.
    batch_size: int = 16
    # Not referenced in the visible cells; presumably the number of training epochs.
    epochs: int = 10
    # Default ``shuffle_buffer`` of ``windowed_dataset``.
    shuffle_buffer_size: int = 500

In [None]:
def windowed_dataset(series,
                     window_size=G.window_size,
                     batch_size=G.batch_size,
                     shuffle_buffer=G.shuffle_buffer_size):
    """
    Turn a series into a shuffled, batched dataset of (window, label) pairs.

    The series is sliced into overlapping runs of ``window_size + 1``
    consecutive elements (stride 1, incomplete trailing runs dropped).
    Each run is split into its first ``window_size`` elements (the input)
    and its last element (the label).

    Arguments:
        series -- data accepted by ``tf.data.Dataset.from_tensor_slices``
        window_size -- number of elements in each input window
        batch_size -- number of (window, label) pairs per batch
        shuffle_buffer -- buffer size handed to ``Dataset.shuffle``
    Returns:
        A ``tf.data.Dataset`` of batched (window, label) pairs, prefetching
        one batch.
    """
    span = window_size + 1  # input window plus the trailing label element

    def _materialize(window):
        # Each window is itself a dataset; batching it whole yields one tensor.
        return window.batch(span)

    def _split(chunk):
        # Everything but the last element is the input, the last is the label.
        return chunk[:-1], chunk[-1]

    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(_materialize)
    )
    examples = windows.shuffle(shuffle_buffer).map(_split)
    return examples.batch(batch_size).prefetch(1)


# Apply the transformation to the training set.
# NOTE: the string argument is a placeholder left in the notebook; replace it
# with the actual training series variable before running this cell.
train_set = windowed_dataset("Enter variable remove the string")

<a name='enc'></a>
## Encoder

In [None]:
def FullyConnected(embedding_dim, fully_connected_dim):
    """
    Build the two-layer feed-forward sub-network used inside each
    encoder/decoder layer.

    Arguments:
        embedding_dim -- number of units of the second (output) Dense layer
        fully_connected_dim -- number of units of the first, ReLU-activated
                               Dense layer
    Returns:
        A ``tf.keras.Sequential`` stacking the two Dense layers.
    """
    expand = tf.keras.layers.Dense(fully_connected_dim, activation='relu')
    project = tf.keras.layers.Dense(embedding_dim)

    ffn = tf.keras.Sequential()
    ffn.add(expand)   # -> (batch_size, seq_len, fully_connected_dim)
    ffn.add(project)  # -> (batch_size, seq_len, embedding_dim)
    return ffn

<a name='enc-lay'></a>
###  Encoder Layer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed of a multi-head self-attention mechanism,
    followed by a simple, position-wise fully connected feed-forward network.
    This architecture includes a residual connection around each of the two
    sub-layers, each followed by batch normalization (used here in place of
    the layer normalization of the original NLP transformer).
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        """
        Arguments:
            embedding_dim -- width of the layer input/output; also used as the
                             per-head ``key_dim`` of the attention layer
            num_heads -- number of attention heads
            fully_connected_dim -- hidden width of the feed-forward network
            dropout_rate -- dropout applied to the attention weights and to
                            the feed-forward output
            layernorm_eps -- epsilon passed to both BatchNormalization layers
                             (name kept for interface compatibility)
        """
        super().__init__()

        self.mha = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.batchnorm1 = BatchNormalization(epsilon=layernorm_eps)
        self.batchnorm2 = BatchNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate the training mode of
                        the dropout and batch-normalization layers
            mask -- Boolean mask to ensure that the padding is not
                    treated as part of the input
        Returns:
            encoder_layer_out -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        # `training` is forwarded explicitly so that attention dropout, the
        # feed-forward dropout and the batch-norm statistics all follow the
        # caller's mode rather than relying on implicit propagation.
        attn_output = self.mha(query=x,
                               value=x,
                               attention_mask=mask,
                               training=training)  # Self attention (key defaults to value)

        # Residual connection around the attention sub-layer, then normalize.
        out1 = self.batchnorm1(tf.add(x, attn_output), training=training)

        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # Residual connection around the feed-forward sub-layer, then normalize.
        encoder_layer_out = self.batchnorm2(tf.add(ffn_output, out1),
                                            training=training)

        return encoder_layer_out
    

<a name='full-enc'></a>
### Full Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
    """
    The entire Encoder starts by passing the input to an embedding layer
    and adding positional encoding, then passes the result through a stack
    of encoder layers.

    NOTE(review): the Embedding layer expects integer indices in
    [0, input_vocab_size). The windowed data built elsewhere in this
    notebook appears to be continuous features, which would presumably need
    a different input projection -- confirm before training.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        """
        Arguments:
            num_layers -- number of stacked EncoderLayer instances
            embedding_dim -- width of the embeddings and of every layer output
            num_heads -- attention heads per encoder layer
            fully_connected_dim -- hidden width of each feed-forward network
            input_vocab_size -- input dimension of the Embedding layer
            maximum_position_encoding -- number of positions requested from
                                         ``positional_encoding``
            dropout_rate -- dropout rate for the input and the encoder layers
            layernorm_eps -- epsilon forwarded to each encoder layer
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        # positional_encoding is not defined in this notebook (presumably it
        # comes from transformer_helper); call() slices it as a rank-3 tensor.
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.embedding_dim)

        self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps)
                           for _ in range(self.num_layers)]

        self.dropout = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Boolean mask to ensure that the padding is not
                    treated as part of the input
        Returns:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        seq_len = tf.shape(x)[1]

        x = tf.cast(self.embedding(x), tf.float32)  # (batch_size, input_seq_len, embedding_dim)
        # Scale embedding by multiplying it by the square root of the embedding dimension
        x *= np.power(self.embedding_dim, 0.5)

        # Only the first seq_len positions of the precomputed encoding are needed.
        x += self.pos_encoding[:, :seq_len, :]
        # Forward `training` explicitly so dropout is only active while training.
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)
        return x  # (batch_size, input_seq_len, embedding_dim)

<a name='dec'></a>
## Decoder

<a name='dec-lay'></a>    
### Decoder Layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    The decoder layer is composed of two multi-head attention blocks,
    one that takes the new input and uses self-attention, and the other
    one that combines it with the output of the encoder, followed by a
    fully connected block. Each block is wrapped in a residual connection
    followed by layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        """
        Arguments:
            embedding_dim -- width of the layer input/output; also used as the
                             per-head ``key_dim`` of both attention layers
            num_heads -- number of attention heads
            fully_connected_dim -- hidden width of the feed-forward network
            dropout_rate -- dropout applied to the attention weights and to
                            the feed-forward output
            layernorm_eps -- epsilon passed to the LayerNormalization layers
        """
        super().__init__()

        self.mha1 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.mha2 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output --  Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
        Returns:
            out3 -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attn_weights_block1 -- Tensor of shape (batch_size, num_heads, target_seq_len, target_seq_len)
            attn_weights_block2 -- Tensor of shape (batch_size, num_heads, target_seq_len, input_seq_len)
        """
        # The former shape assert compared against names that are not defined
        # in this scope and raised NameError on every call; it was removed.

        # BLOCK 1
        # Masked self-attention over the target sequence; attention dropout
        # is only active when `training` is true.
        mult_attn_out1, attn_weights_block1 = self.mha1(query=x,
                                                        value=x,
                                                        attention_mask=look_ahead_mask,
                                                        return_attention_scores=True,
                                                        training=training)

        Q1 = self.layernorm1(tf.add(x, mult_attn_out1))

        # BLOCK 2
        # Cross-attention using the Q from the first block and K and V from the encoder output
        mult_attn_out2, attn_weights_block2 = self.mha2(query=Q1,
                                                        value=enc_output,
                                                        key=enc_output,
                                                        attention_mask=padding_mask,
                                                        return_attention_scores=True,
                                                        training=training)

        # The residual connection wraps the sub-layer, so it must add the
        # sub-layer *input* (Q1), not the un-normalized block-1 attention output.
        mult_attn_out2 = self.layernorm2(tf.add(Q1, mult_attn_out2))
        # (batch_size, target_seq_len, embedding_dim)

        # BLOCK 3
        # Sequential dense layers and dropout
        ffn_output = self.ffn(mult_attn_out2, training=training)  # (batch_size, target_seq_len, embedding_dim)
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # Layer normalization of the sum of the ffn output and the output of the second block
        out3 = self.layernorm3(tf.add(ffn_output, mult_attn_out2))

        return out3, attn_weights_block1, attn_weights_block2
    

<a name='full-dec'></a> 
### Full Decoder

In [None]:
class Decoder(tf.keras.layers.Layer):
    """
    The entire Decoder starts by passing the target input to an embedding layer
    and adding positional encoding, then passes the result through a stack of
    decoder layers.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        """
        Arguments:
            num_layers -- number of stacked DecoderLayer instances
            embedding_dim -- width of the embeddings and of every layer output
            num_heads -- attention heads per decoder layer
            fully_connected_dim -- hidden width of each feed-forward network
            target_vocab_size -- input dimension of the Embedding layer
            maximum_position_encoding -- number of positions requested from
                                         ``positional_encoding``
            dropout_rate -- dropout rate for the input and the decoder layers
            layernorm_eps -- epsilon forwarded to each decoder layer
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, self.embedding_dim)
        # positional_encoding is not defined in this notebook (presumably it
        # comes from transformer_helper); call() slices it as a rank-3 tensor.
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps)
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len)
            enc_output --  Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
        Returns:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attention_weights -- Dictionary holding, for every decoder layer,
                                 the attention weights of its self-attention
                                 block and of its encoder-decoder block
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # create embeddings
        x = self.embedding(x)  # (batch_size, target_seq_len, embedding_dim)

        # scale embeddings by multiplying by the square root of their dimension
        x *= np.power(self.embedding_dim, 0.5)

        # positional encodings, truncated to the actual sequence length
        x += self.pos_encoding[:, :seq_len, :]

        # Forward `training` explicitly so dropout is only active while training.
        x = self.dropout(x, training=training)

        for layer_number, dec_layer in enumerate(self.dec_layers, start=1):
            # pass x and encoder output through the decoder layer and keep the
            # attention weights of block 1 and block 2
            x, block1, block2 = dec_layer(x, enc_output, training,
                                          look_ahead_mask, padding_mask)

            attention_weights[f'decoder_layer{layer_number}_block1_self_att'] = block1
            attention_weights[f'decoder_layer{layer_number}_block2_decenc_att'] = block2

        # The former shape assert compared against names that are not defined
        # in this scope and raised NameError on every call; it was removed.
        return x, attention_weights

<a name='transform'></a> 
## Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Complete transformer with an Encoder and a Decoder, topped by a
    single-unit Dense layer that produces the prediction.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, num_features,
                 max_positional_encoding_input,
                 max_positional_encoding_target,
                 dropout_rate=0.1, layernorm_eps=1e-6, target_vocab_size=None):
        """
        Arguments:
            num_layers -- number of layers in both the encoder and the decoder
            embedding_dim -- width of the embeddings and of every layer output
            num_heads -- attention heads per layer
            fully_connected_dim -- hidden width of each feed-forward network
            num_features -- forwarded to the Encoder as ``input_vocab_size``
            max_positional_encoding_input -- positional-encoding length for the encoder
            max_positional_encoding_target -- positional-encoding length for the decoder
            dropout_rate -- dropout rate used throughout the model
            layernorm_eps -- epsilon for the normalization layers
            target_vocab_size -- input dimension of the decoder's Embedding
                                 layer; defaults to ``num_features`` when None
        """
        super().__init__()

        # The decoder needs an embedding input size, but the original code
        # referenced an undefined `target_vocab_size`.
        # NOTE(review): falling back to num_features is an assumption -- confirm.
        if target_vocab_size is None:
            target_vocab_size = num_features

        # NOTE(review): Encoder has no `num_features` parameter; its Embedding
        # input size is `input_vocab_size`, so num_features is passed there.
        # Whether an Embedding is appropriate for continuous features should
        # be confirmed.
        self.encoder = Encoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               input_vocab_size=num_features,
                               maximum_position_encoding=max_positional_encoding_input,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.decoder = Decoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               target_vocab_size=target_vocab_size,
                               maximum_position_encoding=max_positional_encoding_target,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.final_layer = Dense(1)

    def call(self, input_sentence, output_sentence, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """
        Forward pass for the entire Transformer
        Arguments:
            input_sentence -- Encoder input; per the notebook, the windowed
                              voltage, current and time data
            output_sentence -- Decoder (target) input
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            enc_padding_mask -- Boolean mask to ensure that the padding is not treated as part of the input
            look_ahead_mask -- Boolean mask for the target_input
            dec_padding_mask -- Boolean mask for the second multihead attention layer
        Returns:
            final_output -- SOC prediction, Tensor of shape (batch_size, tar_seq_len, 1)
            attention_weights -- Dictionary of tensors containing all the attention weights of the decoder

        """
        enc_output = self.encoder(input_sentence, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(output_sentence, enc_output, training,
                                                     look_ahead_mask, dec_padding_mask)
        # The former shape assert compared against names that are not defined
        # in this scope and raised NameError on every call; it was removed.

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, 1)

        return final_output, attention_weights

In [None]:
# num_layers = 6
# embedding_dim = 4
# num_heads = 4
# fully_connected_dim = 8
# input_vocab_size = 30
# target_vocab_size = 35
# max_positional_encoding_input = 5
# max_positional_encoding_target = 6

# transform = Transformer(num_layers, 
#                     embedding_dim, 
#                     num_heads, 
#                     fully_connected_dim, 
#                     input_vocab_size, 
#                     target_vocab_size, 
#                     max_positional_encoding_input,
#                     max_positional_encoding_target)
# sen_a = np.array([[2,1,4,3,0]])
# sen_b = np.array([[3,2,1,0,0]])

# enc_padding_mask = create_padding_mask(sen_a)
# dec_padding_mask = create_padding_mask(sen_b)
# look_ahead_mask = create_look_ahead_mask(sen_a.shape[1])

# translation, weights = transform(
#     sen_a,
#     sen_b,
#     False,  # Training
#     enc_padding_mask,
#     look_ahead_mask,
#     dec_padding_mask
# )