<a name='0'></a>
## Import

In [None]:
# Mount Google Drive at /content/drive so later cells can read from and write to MyDrive.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the two helper modules and the dataset from the mounted Drive into the
# current working directory so they can be imported / read by relative name.
!cp /content/drive/MyDrive/transformer_soc/rolling_and_plot_tf.py .
!cp /content/drive/MyDrive/transformer_soc/sim_data.csv .
!cp /content/drive/MyDrive/transformer_soc/transformer_helper.py .

In [None]:
# from os import environ
# environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
# removes tensorflow warnings triggered because of Tensorflow incompatibility with my Apple M1 chip.
# ignore this when using a non Apple Silicon device, ie. Google Colab or the likes.

import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Input, Dropout, BatchNormalization
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from dataclasses import dataclass

The cell below is **only for TPUs**.

---



In [None]:
# import os
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# tf.config.experimental_connect_to_cluster(resolver)
# # This is the TPU initialization code that has to be at the beginning.
# tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))

# strategy = tf.distribute.TPUStrategy(resolver)



---



In [None]:
import numpy as np
import pandas as pd

!pip install jupyterplot
from jupyterplot import ProgressPlot as PP

from transformer_helper import *
from rolling_and_plot_tf import data_plot, rolling_split, normalize, validate

%reload_ext autoreload
%autoreload 2

**TODO:** figure out how to set the device to CUDA (GPU) in TensorFlow.

## Table of Contents

- [Import](#0)
- [JupyterPlot](#jup)
- [Preprocessing](#win)
- [Encoder](#enc)
    - [Encoder Layer](#enc-lay)
    - [Full Encoder](#full-enc)
- [Transformer](#transform)
- [Callbacks & Learn Rate Scheduler](#loss)
- [Training](#train)
- [Validate](#val)

# Literature:


According to [A Transformer-based Framework for Multivariate Time Series Representation Learning](https://dl.acm.org/doi/abs/10.1145/3447548.3467401):
Using **Batch Normalization is significantly more effective** for multivariate time-series than using the traditional Layer Normalization method found in NLP.

In addition, according to [Deep learning approach towards accurate state of charge estimation for lithium-ion batteries using self-supervised transformer model](https://www.nature.com/articles/s41598-021-98915-8#Sec9):
Using a transformer network while **forgoing the Decoder Layer** is more effective for the application of State-of-Charge estimation.

$\large{Self\ Attention}$
$$
\text { Attention }(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}+{M}\right) V
$$

$\large{Input}$

Voltage, Current, SOC at times:
$$t - window\_size \rightarrow t - 1 $$

**Note**

Embedding layers cannot be used with battery data because the inputs are continuous floating-point values (including negative values) rather than discrete token indices.

In [None]:
# Global hyperparameter container; every cell reads these as class attributes (G.window_size, ...).
# NOTE(review): none of the attributes are type-annotated, so `@dataclass` registers no
# fields here -- the values below are plain class attributes.
@dataclass
class G:
    #preprocess
    window_time = 96 #seconds
    window_size = 32
    slicing = window_time // window_size  # row stride used when subsampling the data (96 // 32 == 3)
    batch_size = 16
    #network
    dense_dim = 32
    model_dim = 128  # NOTE(review): not referenced by any cell visible in this notebook
    num_features = 3 # current, voltage, and soc at t minus G.window_size -> t minus 1
    num_heads = 16
    num_layers = 6
    #learning_rate_scheduler
    # T_i and T_cur are mutated in place by schedule() while training runs
    T_i = 1        # length of the current cosine cycle, multiplied by T_mult at each restart
    T_mult = 2
    T_cur = 0.0    # progress inside the current cycle, advanced once per batch
    #training
    epochs = 256 #should be a power of T_mult because of cosine annealing with warm restarts scheduler
    learning_rate = 0.0045       # upper bound of the cosine schedule
    min_learning_rate = 6e-11    # lower bound of the cosine schedule
#     weight_decay = 0.0 #No weight decay param in the the keras optimizers

<a id="win"></a>
# Preprocessing

In [None]:
# from google.colab import files
from pathlib import Path

# Prefer the copy of sim_data.csv that the earlier `!cp` cell places in the
# working directory; fall back to the original local path when it is absent.
_local_csv = Path("sim_data.csv")
_fallback_csv = Path("/Users/attar/Library/CloudStorage/OneDrive-UniversityofWaterloo/Simulated_Data/sim_data.csv")
file = pd.read_csv(_local_csv if _local_csv.exists() else _fallback_csv)
#if using sim_data.csv:
# rescale the soc column by 100 (applied once per fresh read of the file)
file["soc"] *= 100.0

In [None]:
# Plot the soc column against test time for the loaded frame
# (data_plot is imported from rolling_and_plot_tf).
data_plot(data = [file],
          title="OCV v SOC",
          x = ["test time (sec)"],
          y = ["soc"],
          markers = "lines",
          color = "darkorchid",
          x_title = "Test Time (sec)",
          y_title = "SOC"
         )

In [None]:
# Keep only the three model features, take every G.slicing-th row, and pass the result
# through normalize() (imported from rolling_and_plot_tf).
# NOTE(review): `file` is rebound from itself, so re-running this cell subsamples and
# normalizes the already-processed frame again -- re-run the load cell first.
file = normalize(file.loc[:,["current","voltage","soc"]].iloc[::G.slicing])
#uses sklearn.preprocessing

In [None]:
# Window the processed frame into train/test inputs and targets
# (rolling_split is imported from rolling_and_plot_tf).
x_train, x_test, y_train, y_test = rolling_split(file, G.window_size, train=True)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
#uses sklearn.model_selection

# Rebind every split to a tf.data.Dataset of per-sample slices.
# schedule() later relies on len(x_train) of this dataset.
x_train, y_train, x_test, y_test = [
    tf.data.Dataset.from_tensor_slices(split)
    for split in (x_train, y_train, x_test, y_test)
]

# Pair inputs with targets and group them into fixed-size batches;
# the trailing partial batch is dropped.
train_dataloader, test_dataloader = [
    tf.data.Dataset.zip(pair).batch(G.batch_size, drop_remainder=True)
    for pair in ((x_train, y_train), (x_test, y_test))
]

In [None]:
# Look at the first batch only, to confirm tensor shapes and the target dtype.
for x, y in train_dataloader.take(1):
    print(f"Shape of X [window, features]: {x.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")

<a name='enc'></a>
# Encoder

In [None]:
def FullyConnected():
    """
    Build the position-wise feed-forward sub-network of an encoder layer.

    Returns:
        tf.keras.Sequential -- Dense(relu) -> BatchNormalization ->
        Dense(relu) -> BatchNormalization, where both Dense layers have
        G.dense_dim units.
    """
    keras_layers = tf.keras.layers
    initializers = tf.keras.initializers

    # Layers are created in stacking order.
    hidden_dense = keras_layers.Dense(
        G.dense_dim,
        activation='relu',
        kernel_initializer=initializers.HeNormal(),
        bias_initializer=initializers.RandomUniform(minval=0.005, maxval=0.08),
    )  # (G.batch_size, G.window_size, G.dense_dim)
    hidden_norm = keras_layers.BatchNormalization(momentum=0.98, epsilon=5e-4)

    output_dense = keras_layers.Dense(
        G.dense_dim,
        activation='relu',
        kernel_initializer=initializers.HeNormal(),
        bias_initializer=initializers.RandomUniform(minval=0.001, maxval=0.01),
    )  # (G.batch_size, G.window_size, G.dense_dim)
    output_norm = keras_layers.BatchNormalization(momentum=0.95, epsilon=5e-4)

    return tf.keras.Sequential([hidden_dense, hidden_norm, output_dense, output_norm])

<a name='enc-lay'></a>
###  Encoder Layer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed of a multi-head self-attention mechanism,
    followed by a simple, position-wise fully connected feed-forward network.
    This architecture includes a residual connection around each of the two
    sub-layers, followed by batch normalization.

    Arguments:
        num_heads -- number of attention heads
        num_features -- accepted for interface compatibility; not used by this layer
        dense_dim -- per-head key dimension passed to MultiHeadAttention
        dropout_rate -- dropout rate for the attention weights and the
                        feed-forward output
        batchnorm_eps -- epsilon of the two residual BatchNormalization layers
    """
    def __init__(self,
                 num_heads,
                 num_features,
                 dense_dim,
                 dropout_rate,
                 batchnorm_eps):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(
            num_heads = num_heads,
            key_dim = dense_dim,
            dropout = dropout_rate,
            kernel_initializer = tf.keras.initializers.HeNormal(),
            # kernel_regularizer = tf.keras.regularizers.L2(1e-4),
            bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)
                                     )

        #feed-forward-network
        # FullyConnected() sizes its Dense layers from G.dense_dim, so the residual
        # add in call() only has matching shapes when the input width equals G.dense_dim.
        self.ffn = FullyConnected()

        self.batchnorm1 = BatchNormalization(momentum = 0.95, epsilon=batchnorm_eps)
        self.batchnorm2 = BatchNormalization(momentum = 0.95, epsilon=batchnorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, training=None):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.dense_dim)
            training -- Boolean, True runs dropout and batch normalization in
                        training mode. None (default) lets Keras decide from
                        the calling context.
        Returns:
            encoder_layer_out -- Tensor of shape (G.batch_size, G.window_size, G.dense_dim)
        """
        # The training flag is forwarded explicitly to every sub-layer so the
        # dropout / batch-norm mode never depends on implicit propagation.
        attn_output = self.mha(query = x,
                               value = x,
                               training = training) # Self attention

        # residual connection around the attention block
        out1 = self.batchnorm1(tf.add(x, attn_output),
                               training = training)  # (G.batch_size, G.window_size, G.dense_dim)

        ffn_output = self.ffn(out1, training = training)

        ffn_output = self.dropout_ffn(ffn_output,
                                      training = training) # (G.batch_size, G.window_size, G.dense_dim)

        # residual connection around the feed-forward block
        encoder_layer_out = self.batchnorm2(tf.add(ffn_output, out1),
                                            training = training)
        # (G.batch_size, G.window_size, G.dense_dim)
        return encoder_layer_out

<a name='full-enc'></a>
### Full Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
    """
    Projects the input features with a dense layer, adds the positional
    encoding, runs the result through a stack of EncoderLayer blocks and
    returns only the last time step of the window.
    """
    def __init__(self,
                 num_layers = G.num_layers,
                 num_heads = G.num_heads,
                 num_features = G.num_features,
                 dense_dim = G.dense_dim,
                 maximum_position_encoding = G.window_size,
                 dropout_rate=0.15,
                 batchnorm_eps=1e-4):

        super().__init__()

        self.num_layers = num_layers

        # linear input layer: lifts the raw features to dense_dim
        self.lin_input = tf.keras.layers.Dense(dense_dim, activation="relu")

        # positional_encoding comes from transformer_helper
        # assumes it returns a tensor indexable as (1, position, dense_dim) -- TODO confirm
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                dense_dim)

        layer_kwargs = dict(num_heads = num_heads,
                            num_features = num_features,
                            dense_dim = dense_dim,
                            dropout_rate = dropout_rate,
                            batchnorm_eps = batchnorm_eps)
        stack = []
        for _ in range(self.num_layers):
            stack.append(EncoderLayer(**layer_kwargs))
        self.enc_layers = stack

    def call(self, x, training):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)
            training -- Boolean, passed on to every encoder layer
        Returns:
            Tensor of shape (G.batch_size, G.dense_dim): the encoding of the
            last time step (t-1) of each window
        """
        projected = self.lin_input(x)
        window_len = tf.shape(projected)[1]
        hidden = projected + self.pos_encoding[:, :window_len, :]

        for encoder_layer in self.enc_layers:
            hidden = encoder_layer(hidden, training)

        # hidden is (G.batch_size, G.window_size, G.dense_dim); keep time t-1 only
        return hidden[:, -1, :]

<a name='transform'></a> 
# Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Encoder-only transformer: an Encoder followed by a small dense head that
    maps the encoding of the last time step to a single value in (0, 1).

    Arguments:
        num_layers -- number of encoder layers
        num_heads -- number of attention heads per encoder layer
        dense_dim -- width of the hidden Dense layer of the output head. The
                     encoder width stays at the Encoder default, because the
                     encoder's feed-forward blocks are sized from G.dense_dim.
        max_positional_encoding_input -- number of positions for which the
                     encoder's positional encoding is built
        max_positional_encoding_target -- unused (there is no decoder); kept so
                     existing callers keep working
    """
    def __init__(self,
                 num_layers = G.num_layers,
                 num_heads = G.num_heads,
                 dense_dim = G.dense_dim,
                 max_positional_encoding_input = G.window_size,
                 max_positional_encoding_target = G.window_size):
        super(Transformer, self).__init__()

        # Forward the constructor arguments; previously Encoder() was always
        # built with its defaults and these arguments were silently ignored.
        self.encoder = Encoder(num_layers = num_layers,
                               num_heads = num_heads,
                               maximum_position_encoding = max_positional_encoding_input)

        self.final_stack = tf.keras.Sequential([
            tf.keras.layers.Dense(
                dense_dim, activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.02)
                                  ),
            tf.keras.layers.BatchNormalization(momentum = 0.97, epsilon=5e-4),

            # sigmoid keeps the prediction inside (0, 1)
            tf.keras.layers.Dense(
                1, activation = "sigmoid",
                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.005)
                                 )
                                              ])

    def call(self, x, training=None):
        """
        Forward pass for the entire Transformer
        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features):
                 one batch of windowed voltage, current and soc data
            training -- Boolean, True runs the dropout and batchnorm layers in
                        training mode. None (default) lets Keras decide from
                        the calling context.
        Returns:
            final_output -- SOC prediction at time t, shape (G.batch_size, 1)

        """
        enc_output = self.encoder(x, training) # (G.batch_size, G.dense_dim)

        # forward the flag so the head's BatchNormalization runs in the same mode
        final_output = self.final_stack(enc_output, training = training) # (G.batch_size, 1)

        return final_output

## Note:

The `training` argument in the model and layer calls sets the `keras.backend.learning_phase()` value to the appropriate value for the use case, i.e.:
- When training (a `train_loop()`, or `model.fit()` as used below), `training` is set to True: Dropout is active and BatchNormalization normalizes with the current batch statistics while updating its moving averages.
- When evaluating (a `test_loop()`, or validation / `model.evaluate()`), `training` is set to False: Dropout is disabled and BatchNormalization normalizes with its stored moving averages.

If using **TPUs**, use the cell right below this text.

---



In [None]:
# tf.keras.backend.clear_session()
# with strategy.scope():
#     model = Transformer()



---



If **not using TPUs**:

---



In [None]:
# Reset the global Keras state, then create the model and build its weights for
# inputs of shape (batch, window, features) so that summary() can list them.
tf.keras.backend.clear_session()
model = Transformer()
model.build((G.batch_size, G.window_size, G.num_features))
model.summary(expand_nested=True)



---



In [None]:
# Restore previously saved weights from Drive; this cell needs that file to exist already.
# NOTE(review): the SaveModel callback in this notebook writes "model_weights.h5",
# not "model_weights.tf" -- confirm which file is meant to be restored.
model.load_weights("/content/drive/MyDrive/transformer_soc/model_weights.tf")

<a id = "loss"></a>
# Callbacks and Scheduler

**Learning Rate Scheduler**

Cosine Annealing with Warm Restarts proposed by Loshchilov et al. in [SGDR: Stochastic Gradient Descent with Warm Restarts](https://doi.org/10.48550/arXiv.1608.03983)

$$\mu_t = \mu_{min} + \frac{1}{2}(\mu_{max} - \mu_{min})\cdot (1 + \cos (\frac{T_{cur}}{T_i}\pi))$$

Where:
 - $\mu$ is the learning_rate, subscript $t$ is for time = $t$
 - $T_{cur}$ is the number of epochs since the last restart
 - $T_i$ is the number of epochs between two restarts

Note:
 - When $T_{cur} = T_i \rightarrow \mu_t = \mu_{min}$
 - When $T_{cur} = 0 \rightarrow \mu_t = \mu_{max}$

---
**The Cell below is for the LambdaCallback Class in keras in order to implement Cosine Annealing with Warm Restarts** ↓

Used with callbacks in model.fit()

---

In [None]:
def schedule(batch, logs):
    '''
    Cosine annealing with warm restarts, applied once per batch.

    Meant to be used as the `on_batch_end` hook of a keras LambdaCallback, so
    that the model.compile(), model.fit(), model.evaluate() trio can be used
    with this schedule.

    Arguments:
        batch -- index of the batch that just finished (unused)
        logs -- metrics dict supplied by Keras (unused)

    Side effects:
        - sets model.optimizer.learning_rate to the rate computed for the
          current position G.T_cur inside the cycle of length G.T_i
        - advances G.T_cur by the fraction of the training set covered by one
          batch and, once the cycle is complete, multiplies G.T_i by G.T_mult
          and resets G.T_cur to 0.0 (warm restart)
    '''
    # np.cos keeps the new rate a plain float instead of creating a tensor per batch
    mu_i = G.min_learning_rate + 0.5 * (
        G.learning_rate - G.min_learning_rate) * (
            1 + np.cos(np.pi * G.T_cur / G.T_i))

    G.T_cur += G.batch_size / len(x_train)
    # T_cur is an accumulated float and the per-batch step rarely lands exactly
    # on T_i, so restart as soon as the cycle length is reached OR passed; an
    # isclose-only test can be stepped over and then never fires again.
    if G.T_cur >= G.T_i or np.isclose(G.T_cur, G.T_i):
        G.T_i *= G.T_mult
        G.T_cur = 0.0
    K.set_value(model.optimizer.learning_rate, mu_i)

**Progress Plot Callback**

In [None]:
class ProgressCallback(tf.keras.callbacks.Callback):
    """
    Pushes the epoch's loss and (100 - MAPE) values, for both the training and
    the validation pass, to the notebook-level progress plot `pp`.
    """
    def on_epoch_end(self, epoch, logs = None):
        loss_train = logs["loss"]
        acc_train = 100.0 - logs["mean_absolute_percentage_error"]
        loss_val = logs["val_loss"]
        acc_val = 100.0 - logs["val_mean_absolute_percentage_error"]

        # one inner list per plot, ordered [train, validation]
        loss_series = [loss_train, loss_val]
        acc_series = [acc_train, acc_val]
        pp.update([loss_series, acc_series])

**Save Model Progress Callback**

Does not work with TPUs

In [None]:
class SaveModel(tf.keras.callbacks.Callback):
    """
    Writes the model weights to Google Drive whenever the (0-based) epoch
    index is a non-zero multiple of 15.
    """
    def on_epoch_end(self, epoch, logs = None):
        # nothing to do on the first epoch or between save points
        if epoch == 0 or epoch % 15 != 0:
            return
        self.model.save_weights("/content/drive/MyDrive/transformer_soc/model_weights.h5")

**Early Stopping and Saving Best Model checkpoint Callbacks**

In [None]:
# Save options shared by the checkpoint callback below.
# NOTE(review): presumably "/job:localhost" makes the checkpoint I/O run on the local host
# when training on a remote accelerator such as a TPU -- confirm against the TF docs.
model_options = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
# earlystopping = EarlyStopping(monitor='val_mean_absolute_percentage_error', patience=150, verbose=0, mode='min')
# Checkpoint callback that keeps only the model with the lowest validation MAPE.
# NOTE(review): mcp_save is not in the callbacks list passed to model.fit() in this notebook.
mcp_save = ModelCheckpoint('/content/drive/MyDrive/transformer_soc/tpu_model_weights', save_format = "tf", save_best_only=True, monitor='val_mean_absolute_percentage_error', mode='min', options = model_options)

In [None]:
# Loss, optimizer and callback instances consumed by the training cell.
loss_object = tf.keras.losses.LogCosh()

# Adam starts at G.learning_rate; schedule() overwrites the rate after every batch.
optimizer = tf.keras.optimizers.Adam(learning_rate = G.learning_rate,
                                     beta_1 = 0.9,
                                     beta_2 = 0.999
                                    )

#cos_anneal is for the model.fit() call
# runs schedule() at the end of every training batch
cos_anneal = tf.keras.callbacks.LambdaCallback(on_batch_end = schedule)

#progress plot callback
pp_update = ProgressCallback()

#model parameters save callback
model_save = SaveModel() #This is optional

<a id = "train"></a>
# Training

**There are two compile calls, one requires a TPU**

In [None]:
# Live plot updated by ProgressCallback at the end of every epoch.
pp = PP(plot_names = ["Mean Log Loss", "% Accuracy"],
        line_names = ["Train Loop", "Test Loop"],
        x_label = "epochs"
       )

# ##### if using a TPU:
# with strategy.scope():
#     model.compile(optimizer, loss_object, steps_per_execution = 3, metrics=["mean_absolute_percentage_error"])

##### else:
# Compile only while the model has no optimizer yet. A fresh kernel needs the
# compile step (model.fit() and schedule() both require model.optimizer), while
# an already-compiled / already-trained model is left untouched.
## Dont compile after training, it causes issues.
if getattr(model, "optimizer", None) is None:
    model.compile(optimizer, loss_object, metrics=["mean_absolute_percentage_error"])

#-----------------------------------------------------------------
#Note: can add `model_save` to the callbacks list in model.fit()
#      it saves the model params to the google drive every 15 epochs
#-------------------------------------------------------------------

# Every Keras "epoch" covers 1/G.epochs of the available batches; clamp to at
# least one step so a small dataset cannot produce zero steps.
steps_per_epoch = max(1, len(train_dataloader) // G.epochs)
validation_steps = max(1, len(test_dataloader) // G.epochs)

# batch_size is not passed: train_dataloader is a tf.data.Dataset that is
# already batched with G.batch_size.
history = model.fit(train_dataloader,
                    epochs = G.epochs,
                    verbose = 1,
                    steps_per_epoch = steps_per_epoch,
                    callbacks = [cos_anneal, pp_update],
                    validation_data = test_dataloader,
                    validation_steps = validation_steps
                    )

In [None]:
# `model` is a subclassed tf.keras.Model, and Keras cannot serialize subclassed
# models to HDF5 with model.save(); store the weights under the same .h5 path instead.
model.save_weights("/content/drive/MyDrive/transformer_soc/tpu_model.h5") #doesnt work with TPUs

In [None]:
#works with TPUs
# Save the model through a tf.train.Checkpoint under the "ckpt" prefix on Drive,
# using the "/job:localhost" I/O device option.
checkpoint = tf.train.Checkpoint(model = model)
options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
checkpoint.save("/content/drive/MyDrive/transformer_soc/tpu_model/ckpt", options=options)

<a id = "val"></a>
# Validate

**Dev Set**

In [None]:
# Run validate() (imported from rolling_and_plot_tf) on the held-out batches.
# NOTE(review): the meaning of `dev` and of the returned object is defined in that helper -- verify there.
visualize_dev = validate(model, test_dataloader, dev = True)

**Entire Dataset**

In [None]:
# Window the whole processed frame without a train/test split (train = False returns two values).
x_set, y_set = rolling_split(file, G.window_size, train = False)

# Rebind both arrays to tf.data.Dataset objects of per-sample slices.
x_set = tf.data.Dataset.from_tensor_slices(x_set)
y_set = tf.data.Dataset.from_tensor_slices(y_set)

# Pair inputs with targets and batch them; the trailing partial batch is dropped.
set_dataloader = tf.data.Dataset.zip((x_set, y_set)).batch(G.batch_size, drop_remainder=True)

# Run validate() (imported from rolling_and_plot_tf) over the full dataset.
visualize = validate(model, set_dataloader, dev = False)