<a name='0'></a>
## Import

In [None]:
# Mount Google Drive at /content/drive so the project files under MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the two helper modules and the data CSV from Drive into the current working directory
# so the imports and the CSV read in later cells can find them.
!cp /content/drive/MyDrive/transformer_soc/rolling_and_plot_tf.py .
!cp /content/drive/MyDrive/transformer_soc/sim_data.csv .
!cp /content/drive/MyDrive/transformer_soc/transformer_helper.py .

In [None]:
# from os import environ
# environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
# removes tensorflow warnings triggered because of Tensorflow incompatibility with my Apple M1 chip.
# ignore this when using a non Apple Silicon device, ie. Google Colab or the likes.

import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Input, Dropout, BatchNormalization

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from dataclasses import dataclass

In [None]:
import numpy as np
import pandas as pd
!pip install jupyterplot
from jupyterplot import ProgressPlot as PP

from transformer_helper import *
from rolling_and_plot_tf import data_plot, rolling_split, normalize, validate

%reload_ext autoreload
%autoreload 2

**TODO:** Figure out how to make TensorFlow run on the GPU (CUDA) device.

## Table of Contents

- [Import](#0)
- [Preprocessing](#win)
- [Model](#model)
 - [Encoder](#enc)
    - [Encoder Layer](#enc-lay)
    - [Full Encoder](#full-enc)
 - [Decoder](#dec)
    - [Decoder Layer](#dec-lay)
    - [Full Decoder](#full-dec)
 - [Transformer](#transform)
- [Loss and Learn Rate Scheduler](#loss)
- [Training](#train)
- [Validate](#val)

# Literature:


According to [A Transformer-based Framework for Multivariate Time Series Representation Learning](https://dl.acm.org/doi/abs/10.1145/3447548.3467401):
Using **Batch Normalization is significantly more effective** for multivariate time-series than using the traditional Layer Normalization method found in NLP.

In addition, according to [Deep learning approach towards accurate state of charge estimation for lithium-ion batteries using self-supervised transformer model](https://www.nature.com/articles/s41598-021-98915-8#Sec9):
Using a transformer network while **forgoing the Decoder Layer** is more effective for the application of State-of-Charge estimation.

$\large{Self\ Attention}$
$$
\text { Attention }(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}+{M}\right) V
$$

$\large{Input}$

Voltage, Current, SOC at times:
$$t - window\_size \rightarrow t - 1 $$

**Note**

Embedding layers cannot be used with battery data, because the inputs are continuous floating-point values (some of them negative) rather than discrete token indices.

In [None]:
@dataclass
class G:
    """
    Global hyper-parameters for preprocessing, the network and training.

    Every attribute is annotated so that @dataclass actually registers it as a
    field (un-annotated class attributes are ignored by the decorator). The
    values remain readable as plain class attributes, e.g. `G.window_size`.
    """
    #preprocess
    window_time: int = 96 #seconds
    window_size: int = 32
    slicing: int = window_time // window_size # row stride used when down-sampling the raw data
    batch_size: int = 16
    #network
    dense_dim: int = 32
    model_dim: int = 128
    num_features: int = 3 # current, voltage, and soc at t minus G.window_size -> t minus 1
    num_heads: int = 16
    num_layers: int = 6
    #training
    epochs: int = 100
    learning_rate: float = 0.0035
    min_learning_rate: float = 7e-11
#     weight_decay = 0.0 #No weight decay param in the the keras optimizers

<a id="win"></a>
# Preprocessing

In [None]:
# from google.colab import files
# Read the simulation data (presumably the copy made by the earlier `!cp` cell — verify the working directory).
file = pd.read_csv("/content/sim_data.csv")
#if using sim_data.csv:
# Scale the "soc" column by 100 in place (presumably fraction -> percent; verify against the CSV).
file["soc"] *= 100.0

In [None]:
# Plot the "soc" column against test time for the data as loaded above.
# The title now matches the plotted axes (it previously read "OCV v SOC").
data_plot(data = [file],
          title="SOC v Test Time",
          x = ["test time (sec)"],
          y = ["soc"],
          markers = "lines",
          color = "darkorchid",
          x_title = "Test Time (sec)",
          y_title = "SOC"
         )

In [None]:
# Keep only the three model features, take every G.slicing-th row, then normalize.
# NOTE(review): `file` is rebound here, so re-running this cell on its own slices and
# normalizes the already-processed frame again; re-run from the CSV load cell instead.
file = normalize(file.loc[:,["current","voltage","soc"]].iloc[::G.slicing])
#uses sklearn.preprocessing

In [None]:
# Window the data and split it into train/test inputs and targets
# (rolling_split comes from rolling_and_plot_tf; its split ratio is not visible in this notebook).
x_train, x_test, y_train, y_test = rolling_split(file, G.window_size, train=True)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
#uses sklearn.model_selection

# Wrap each split in a tf.data.Dataset. The x_*/y_* names are rebound from arrays to
# Datasets here, so this cell cannot be re-run without re-running the split above.
x_train = tf.data.Dataset.from_tensor_slices(x_train)
y_train = tf.data.Dataset.from_tensor_slices(y_train)
x_test = tf.data.Dataset.from_tensor_slices(x_test)
y_test = tf.data.Dataset.from_tensor_slices(y_test)

# Pair inputs with targets and batch them; drop_remainder=True discards a final partial batch
# so every batch has exactly G.batch_size elements.
train_dataloader = tf.data.Dataset.zip((x_train, y_train)).batch(G.batch_size, drop_remainder=True)
test_dataloader = tf.data.Dataset.zip((x_test, y_test)).batch(G.batch_size, drop_remainder=True)

In [None]:
# Inspect the first batch only, to confirm tensor shapes and the target dtype.
# The dataloader yields batched tensors, so the leading axis of x is the batch axis.
for x,y in train_dataloader.take(1):
    print(f"Shape of X [batch, window, features]: {x.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")

<a id ="model"></a>
# Model

In [None]:
def FullyConnected():
    """
    Build the feed-forward sub-network used inside the encoder/decoder layers.

    The stack is Dense(G.dense_dim, relu) -> BatchNorm -> Dense(G.num_features, relu)
    -> BatchNorm. Both widths are read from the global config `G`.

    Returns:
        a new, un-built tf.keras.Sequential model
    """
    keras_layers = tf.keras.layers
    initializers = tf.keras.initializers

    # Layers are created in the same order in which they are stacked.
    expand = keras_layers.Dense(
        G.dense_dim,
        activation='relu',
        kernel_initializer=initializers.HeNormal(),
        bias_initializer=initializers.RandomUniform(minval=0.005, maxval=0.08),
    )  # (G.batch_size, G.window_size, G.dense_dim)
    expand_norm = keras_layers.BatchNormalization(momentum=0.90, epsilon=5e-4)

    project = keras_layers.Dense(
        G.num_features,
        activation='relu',
        kernel_initializer=initializers.HeNormal(),
        bias_initializer=initializers.RandomUniform(minval=0.001, maxval=0.01),
    )  # (G.batch_size, G.window_size, G.num_features)
    project_norm = keras_layers.BatchNormalization(momentum=0.90, epsilon=5e-4)

    return tf.keras.Sequential([expand, expand_norm, project, project_norm])

<a name='enc'></a>
## Encoder

<a name='enc-lay'></a>
###  Encoder Layer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed of a multi-head self-attention mechanism,
    followed by a simple, position-wise fully connected feed-forward network.
    This architecture includes a residual connection around each of the two
    sub-layers, followed by batch normalization.

    Arguments:
        num_heads -- number of attention heads
        num_features -- passed to MultiHeadAttention as `key_dim`
        dense_dim -- accepted for interface compatibility; currently unused,
                     because FullyConnected() reads its width from G.dense_dim
        dropout_rate -- dropout probability for the attention and the ffn output
        batchnorm_eps -- epsilon for the two residual BatchNormalization layers
    """
    def __init__(self,
                 num_heads,
                 num_features,
                 dense_dim,
                 dropout_rate=0.3,
                 batchnorm_eps=1e-3):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(
            num_heads = num_heads,
            key_dim = num_features,
            dropout = dropout_rate,
            kernel_initializer = tf.keras.initializers.HeNormal(),
            kernel_regularizer = tf.keras.regularizers.L2(1e-4),
            bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)
                                     )
        #feed-forward-network
        self.ffn = FullyConnected()

        self.batchnorm1 = BatchNormalization(momentum = 0.9, epsilon=batchnorm_eps)
        self.batchnorm2 = BatchNormalization(momentum = 0.85, epsilon=batchnorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, training=None):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)
            training -- Boolean, set to true to activate the training mode of the
                        dropout and batchnorm layers. None lets Keras decide.
        Returns:
            encoder_layer_out -- Tensor of shape (G.batch_size, G.window_size, G.num_features)
        """
        # The training flag is forwarded explicitly to every sub-layer instead of
        # relying on Keras' implicit call-context propagation.
        attn_output = self.mha(query = x,
                               value = x,
                               training = training) # Self attention

        # residual connection around the attention block
        out1 = self.batchnorm1(tf.add(x, attn_output), training = training)
        # (G.batch_size, G.window_size, G.num_features)

        ffn_output = self.ffn(out1, training = training)

        # Dropout is the identity when training is False, so no Python-level branch is needed
        ffn_output = self.dropout_ffn(ffn_output, training = training)
        # (G.batch_size, G.window_size, G.num_features)

        # residual connection around the feed-forward block
        encoder_layer_out = self.batchnorm2(tf.add(ffn_output, out1), training = training)
        # (G.batch_size, G.window_size, G.num_features)
        return encoder_layer_out

<a name='full-enc'></a>
### Full Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
    """
    The full Encoder adds a positional encoding to the raw input window
    (there is no embedding layer), applies dropout, and passes the result
    through a stack of `num_layers` EncoderLayers. Only the last time step
    of the window is returned.

    Note: the default argument values are read from `G` once, when this class
    is defined; changing `G` afterwards does not change the defaults.
    """
    def __init__(self,
                 num_layers = G.num_layers,
                 num_heads = G.num_heads,
                 num_features = G.num_features,
                 dense_dim = G.dense_dim,
                 input_size = G.num_features,
                 maximum_position_encoding = G.window_size,
                 dropout_rate=0.35,
                 batchnorm_eps=1e-6):

        super(Encoder, self).__init__()

        self.num_layers = num_layers
        # positional_encoding comes from transformer_helper; it is sliced below as
        # [:, :seq_len, :], so it presumably has shape (1, positions, input_size) — TODO confirm
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                input_size)

        self.enc_layers = [EncoderLayer(num_heads = num_heads,
                                        num_features = num_features,
                                        dense_dim = dense_dim,
                                        dropout_rate = dropout_rate,
                                        batchnorm_eps = batchnorm_eps)
                           for _ in range(self.num_layers)]

        self.dropout = Dropout(dropout_rate)

    def call(self, x, training=None):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)
            training -- Boolean, set to true to activate the training mode of the
                        dropout and batchnorm layers. None lets Keras decide.
        Returns:
            Tensor of shape (G.batch_size, G.num_features): the encoding of the
            last time step (t-1) of each window
        """
        seq_len = tf.shape(x)[1]
        x += self.pos_encoding[:, :seq_len, :]
        # Dropout is the identity when training is False, so no Python-level branch is needed
        x = self.dropout(x, training = training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training = training)

        # only need the final time's data : time = t-1 from the window
        # x has shape (G.batch_size, G.window_size, G.num_features)
        # but I am only returning time t-1:
        return x[:, -1, :] # (G.batch_size, G.num_features)

<a name='dec'></a> 
## Decoder

<a name='dec-lay'></a> 
### Decoder Layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    The decoder layer is composed of two multi-head attention blocks,
    one that takes the new input and uses self-attention, and the other
    one that combines it with the output of the encoder, followed by a
    fully connected block.

    Arguments:
        num_heads -- number of attention heads in both attention blocks
        num_features -- passed to both MultiHeadAttention layers as `key_dim`
        dense_dim -- accepted for interface compatibility; currently unused,
                     because FullyConnected() reads its width from G.dense_dim
        dropout_rate -- dropout probability for the attention blocks and ffn output
        batchnorm_eps -- epsilon for the three BatchNormalization layers
    """
    def __init__(self,
                 num_heads,
                 num_features,
                 dense_dim,
                 dropout_rate=0.3,
                 batchnorm_eps=1e-3):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(
            num_heads = num_heads,
            key_dim = num_features,
            dropout = dropout_rate,
            kernel_initializer = tf.keras.initializers.HeNormal(),
            kernel_regularizer = tf.keras.regularizers.L2(1e-4),
            bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)
                                     )

        self.mha2 = MultiHeadAttention(
            num_heads = num_heads,
            key_dim = num_features,
            dropout = dropout_rate,
            kernel_initializer = tf.keras.initializers.HeNormal(),
            kernel_regularizer = tf.keras.regularizers.L2(1e-4),
            bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)
                                     )

        self.ffn = FullyConnected()

        self.batchnorm1 = BatchNormalization(momentum = 0.95, epsilon=batchnorm_eps)
        self.batchnorm2 = BatchNormalization(momentum = 0.9, epsilon=batchnorm_eps)
        self.batchnorm3 = BatchNormalization(momentum = 0.85, epsilon=batchnorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, y, enc_output, training=None):
        """
        Forward pass for the Decoder Layer

        Arguments:
            y -- Tensor of shape (G.batch_size, 1) for the first layer of the stack
                 (the soc values for the batch); later layers receive the previous
                 layer's output
            enc_output -- Tensor of shape (G.batch_size, G.num_features)
            training -- Boolean, set to true to activate the training mode of the
                        dropout and batchnorm layers. None lets Keras decide.
        Returns:
            out3 with its leading axis squeezed away
            attn_weights_block1
            attn_weights_block2
        """
        # np.newaxis adds a leading axis of size 1, so the attention layers treat
        # the batch axis as the sequence axis.
        # NOTE(review): this means samples of one batch attend to each other — confirm intended.

        # BLOCK 1
        # self-attention on y; the training flag is forwarded explicitly
        mult_attn_out1, attn_weights_block1 = self.mha1(query = y[np.newaxis,:],
                                                        value = y[np.newaxis,:],
                                                        return_attention_scores=True,
                                                        training = training)

        Q1 = self.batchnorm1(tf.add(y,mult_attn_out1), training = training)

        # BLOCK 2
        # attention using Q1 as the query and the encoder output as key and value
        mult_attn_out2, attn_weights_block2 = self.mha2(query = Q1,
                                                        value = enc_output[np.newaxis,:],
                                                        key = enc_output[np.newaxis,:],
                                                        return_attention_scores=True,
                                                        training = training)

        # NOTE(review): the residual adds mult_attn_out1 rather than Q1 (the normalized
        # block-1 output that was used as the query); kept as-is — confirm intended.
        mult_attn_out2 = self.batchnorm2( tf.add(mult_attn_out1, mult_attn_out2), training = training )

        #BLOCK 3
        # pass the output of the second block through a ffn
        ffn_output = self.ffn(mult_attn_out2, training = training)

        # Dropout is the identity when training is False, so no Python-level branch is needed
        ffn_output = self.dropout_ffn(ffn_output, training = training)

        out3 = self.batchnorm3( tf.add(ffn_output, mult_attn_out2), training = training )
        return tf.squeeze(out3,axis=0), attn_weights_block1, attn_weights_block2
    

<a name='full-dec'></a> 
### Full Decoder

In [None]:
class Decoder(tf.keras.layers.Layer):
    """
    The full Decoder applies dropout to the target input and passes it,
    together with the encoder output, through a stack of `num_layers`
    DecoderLayers. No embedding or positional encoding is applied.

    `target_size` and `maximum_position_encoding` are kept for interface
    compatibility only; they are unused because the positional encoding is
    disabled for a single-value label.

    `dropout_rate` and `batchnorm_eps` are now forwarded to every DecoderLayer,
    matching what Encoder does for its layers. Previously they were ignored and
    each DecoderLayer silently used its own defaults, so weights trained before
    this change were trained with those defaults.
    """
    def __init__(self,
                 num_layers = G.num_layers,
                 num_heads = G.num_heads,
                 num_features = G.num_features,
                 dense_dim = G.dense_dim,
                 target_size = G.num_features,
                 maximum_position_encoding = 1,
                 dropout_rate=0.35,
                 batchnorm_eps=1e-6):
        super(Decoder, self).__init__()

        self.num_layers = num_layers
#         self.pos_encoding = positional_encoding(maximum_position_encoding, 
#                                                 target_size)
        # I don't need positional encoding for a single value label

        self.dec_layers = [DecoderLayer(num_heads = num_heads,
                                        num_features = num_features,
                                        dense_dim = dense_dim,
                                        dropout_rate = dropout_rate,
                                        batchnorm_eps = batchnorm_eps)
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)

    def call(self, y, enc_output, training=None):
        """
        Forward pass for the Decoder

        Arguments:
            y -- Tensor of shape (G.batch_size, 1) #the SOC values for the batches
            enc_output -- Tensor of shape (G.batch_size, G.num_features)
            training -- Boolean, set to true to activate the training mode of the
                        dropout and batchnorm layers. None lets Keras decide.
        Returns:
            y -- output of the last decoder layer
            attention_weights -- Dictionary of tensors containing all the attention weights
        """
        attention_weights = {}

#         y += self.pos_encoding[:, :seq_len, :]

        # Dropout is the identity when training is False, so no Python-level branch is needed
        y = self.dropout(y, training = training)

        # pass y and the encoder output through the stack and record each layer's attention weights
        for i, dec_layer in enumerate(self.dec_layers, start=1):
            y, block1, block2 = dec_layer(y, enc_output, training = training)

            #update attention_weights dict
            attention_weights[f'decoder_layer{i}_block1_self_att'] = block1
            attention_weights[f'decoder_layer{i}_block2_decenc_att'] = block2

        return y, attention_weights

<a name='transform'></a> 
## Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Complete transformer with an Encoder and a Decoder, followed by a small
    dense head that maps the decoder output to a single sigmoid-activated value.

    `num_layers`, `num_heads`, `dense_dim` and `max_positional_encoding_input`
    are forwarded to the Encoder/Decoder (previously both were built with their
    own defaults and these arguments were ignored). The defaults are the same
    `G` values, so `Transformer()` builds the same network as before.
    `max_positional_encoding_target` is kept for interface compatibility only:
    the Decoder does not use a positional encoding.
    """
    def __init__(self,
                 num_layers = G.num_layers,
                 num_heads = G.num_heads,
                 dense_dim = G.dense_dim,
                 max_positional_encoding_input = G.window_size,
                 max_positional_encoding_target = G.window_size):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers = num_layers,
                               num_heads = num_heads,
                               dense_dim = dense_dim,
                               maximum_position_encoding = max_positional_encoding_input)
        self.decoder = Decoder(num_layers = num_layers,
                               num_heads = num_heads,
                               dense_dim = dense_dim)

        self.final_stack = tf.keras.Sequential([
            tf.keras.layers.Dense(
                dense_dim, activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.02)
                                  ),
            tf.keras.layers.BatchNormalization(momentum = 0.90, epsilon=5e-4),
            tf.keras.layers.Dense(
                1, activation = "sigmoid",
                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.005)
                                 )
                                              ])

    def call(self, x, y, training=None):
        """
        Forward pass for the entire Transformer
        Arguments:
            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)
                 An array of the windowed voltage, current and soc data
            y -- Tensor of shape (G.batch_size, 1)
                 An array of the SOC targets for each batch
            training -- Boolean, set to true to activate the training mode of the
                        dropout and batchnorm layers. None lets Keras decide.
        Returns:
            final_output -- SOC prediction at time t, shape (G.batch_size, 1)
            attention_weights -- dictionary of the decoder attention weights

        NOTE(review): y is fed to the decoder as an input and is also the value the
        loss compares the prediction against (see loss_fn) — confirm this does not
        leak the label into the prediction.
        """
        enc_output = self.encoder(x, training = training) # (G.batch_size, G.num_features)

        dec_output, attention_weights = self.decoder(y, enc_output, training = training) # (G.batch_size, G.num_features)

        final_output = self.final_stack(dec_output, training = training) # (G.batch_size, 1)

        return final_output, attention_weights

Build Model

In [None]:
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()
model = Transformer()
# Call the model once on random tensors of the expected input shapes so that all
# weights are created; a subclassed model has no variables until it is first called.
x_inst = tf.random.uniform((G.batch_size, G.window_size, G.num_features))
y_inst = tf.random.uniform((G.batch_size,1))
model(x_inst,y_inst,False)
model.summary(expand_nested=True)

**Loading Already Saved Progress**

In [None]:
# Restore previously saved weights into the model built above.
# NOTE(review): this reads "decoder_model_weights.tf", while the "Saving Progress" cell
# writes "model_weights.tf" — confirm which checkpoint is meant to be resumed.
# NOTE(review): this cell fails on a fresh run if the checkpoint does not exist yet.
model.load_weights("/content/drive/MyDrive/transformer_soc/decoder_model_weights.tf")

# attn_weights_load = pd.read_csv("/content/drive/MyDrive/transformer_soc/attn_weights.csv").to_dict("list")

# The lines below restore the scheduler state; `scheduler` is only created in a later
# cell, so they must be run after that cell if they are re-enabled.
# scheduler_state = np.load("/content/drive/MyDrive/transformer_soc/scheduler_state.npy")
# print(f"Saved learning_rate, T_cur, and T_i: {scheduler_state}")

# scheduler.learning_rate, scheduler.T_cur, scheduler.T_i = scheduler_state

<a id = "loss"></a>
# Loss and LR Scheduler

**Learning Rate Scheduler**

Cosine Annealing with Warm Restarts proposed by Loshchilov et al. in [SGDR: Stochastic Gradient Descent with Warm Restarts](https://doi.org/10.48550/arXiv.1608.03983)

$$\mu_t = \mu_{min} + \frac{1}{2}(\mu_{max} - \mu_{min})\cdot (1 + \cos (\frac{T_{cur}}{T_i}\pi))$$

Where:
 - $\mu$ is the learning_rate, subscript $t$ is for time = $t$
 - $T_{cur}$ is the number of epochs since the last restart
 - $T_i$ is the number of epochs between two restarts

Note:
 - When $T_{cur} = T_i \rightarrow \mu_t = \mu_{min}$
 - When $T_{cur} = 0 \rightarrow \mu_t = \mu_{max}$

In [None]:
class CosAnnealWarmRestarts():
    def __init__(self, T_0: float, T_mult: float):
        '''
        Cosine Annealing with Warm Restarts
        Computes a new learning rate on every call to `step` and writes it
        to the optimizer. The maximum and minimum learning rates are read
        from G.learning_rate and G.min_learning_rate.
        
        Parameters:
        `T_0` float
            the number of epochs until the first restart occurs (must be > 0)
        `T_mult` float
            the factor to increase T_i by after a restart, where T_i is the
            number of epochs between two restarts
        '''
        super(CosAnnealWarmRestarts, self).__init__()
        
        # both values must be floats (e.g. 1.0, not 1)
        assert isinstance(T_0, float) and isinstance(T_mult, float)
        assert T_0 > 0.0
        self.mu_max = G.learning_rate #initial and max learning_rate
        self.mu_min = G.min_learning_rate #minimum learning_rate
        self.T_i = T_0
        self.T_mult = T_mult
        self.T_cur = 0.0
        
        # plain float until the first step(); a tensor afterwards
        self.learning_rate = G.learning_rate
        
    def step(self, increment:float, optimizer):
        '''
        Cosine Annealing with Warm Restarts
        Sets the optimizer's learning rate according to the cosine schedule
        and advances the schedule by `increment`.
        
        Call after every batch

        Parameters:
        `increment` float
            1 batch / total number of batches
            !!!!! Not the current batch number, that would be a series summation
            every epoch, a total of 1.0 will be added to self.T_cur
        `optimizer`
            the optimizer for the neural network

        Raises:
            AttributeError if the optimizer has no `learning_rate` attribute
        '''
        # Fail before any state is mutated instead of printing and crashing later.
        if not hasattr(optimizer, "learning_rate"):
            raise AttributeError("Error: optimizer does not have a learning_rate parameter")
        
        mu_i = self.mu_min + 0.5 * (
                self.mu_max - self.mu_min) * (
                    1 + tf.math.cos(np.pi * self.T_cur / self.T_i))
        
        self.T_cur += increment
        
        # Restart when the period is reached OR overshot. Checking only for
        # closeness misses the restart forever when the increments do not land on T_i.
        if self.T_cur >= self.T_i or np.isclose(self.T_cur, self.T_i):
            self.T_i *= self.T_mult
            self.T_cur = 0.0
        
        #update the learning_rate accordingly:
        optimizer.learning_rate.assign(tf.cast(mu_i,tf.float32))
        
        self.learning_rate = mu_i
        #this is just so that you can find the current learning rate from the scheduler

In [None]:
# Log-cosh loss between the predicted and the target SOC.
loss_object = tf.keras.losses.LogCosh()

# Adam starts at the maximum learning rate; the scheduler overwrites it after every batch.
optimizer = tf.keras.optimizers.Adam(learning_rate = G.learning_rate,
                                     beta_1 = 0.9,
                                     beta_2 = 0.999
                                    )
# First restart after 1 epoch; each following period is 3x longer than the previous one.
scheduler = CosAnnealWarmRestarts(T_0 = 1.0, T_mult = 3.0)

## Custom Train and Test Loops

In [None]:
def loss_fn(model, x, y, training):
    """
    Run one forward pass and evaluate the global `loss_object` on the result.

    Arguments:
        model -- callable as model(x, y, training=...) returning (prediction, attention weights)
        x -- model input
        y -- targets; passed to the model and used as `y_true` for the loss
        training -- Boolean forwarded to the model
    Returns:
        (prediction, attention weights, loss)
    """
    prediction, attention = model(x, y, training=training)
    loss_value = loss_object(y_true = y, y_pred = prediction)
    return prediction, attention, loss_value

def grad(model, inputs, targets):
    """
    Run a training-mode forward pass and compute the gradients.

    The attention layers are built with a `kernel_regularizer`. Keras collects
    those penalties in `model.losses` but does not add them in a custom
    training loop, so they are added to the differentiated loss here.

    Arguments:
        model -- the model to differentiate
        inputs -- model input batch
        targets -- target batch
    Returns:
        (prediction, attention weights, data loss, gradients)
        The returned loss is the data loss only (without the regularization
        penalty), so reported values stay comparable with earlier runs.
    """
    with tf.GradientTape() as tape:
        y_hat, attn_weights, loss = loss_fn(model, inputs, targets, training=True)
        total_loss = loss
        regularization_losses = model.losses
        if regularization_losses:
            total_loss = loss + tf.add_n(regularization_losses)
    return y_hat, attn_weights, loss, tape.gradient(total_loss, model.trainable_variables)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, scheduler, training = True, max_batches = None):
    """
    Train the model for one epoch.

    Arguments:
        dataloader -- batched dataset yielding (x, y); must support len()
        model -- the model to train
        loss_fn -- kept for interface compatibility; gradients are computed by the
                   global `grad`, which calls the global `loss_fn`
        optimizer -- optimizer that applies the gradients
        scheduler -- learning-rate scheduler, stepped by 1/len(dataloader) per batch
        training -- kept for interface compatibility; unused
        max_batches -- optional cap on the number of batches (for quick smoke
                       runs); None processes the whole dataloader. This replaces a
                       hard-coded debug break that stopped the epoch around batch 3.
    Returns:
        (mean loss, average accuracy in percent, attention weights of the last batch)
        The attention weights are None when no batch was processed.
    """
    size = len(dataloader)
    log_every = max(1, size // 15) # avoids modulo-by-zero with fewer than 15 batches
    perc_error = 0.0
    batches_seen = 0
    attn_weights = None
    epoch_loss_avg = tf.keras.metrics.Mean()
    
    for batch, (x,y) in enumerate(dataloader):
        if max_batches is not None and batch >= max_batches:
            break
        
        predict, attn_weights, loss, grads = grad(model, x, y) # assert(loss.shape == [])
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        scheduler.step(1 / size, optimizer)
        
        perc_error += tf.math.reduce_mean(tf.abs(predict - y) / (y + 1e-2) * 100, [0,1])
        epoch_loss_avg.update_state(loss)
        batches_seen += 1
        if batch % log_every == 0:
            print(f"Mean loss: {epoch_loss_avg.result():>7f}  [{batch:4d}/{size:4d}]")

    # average over the batches actually processed, not over len(dataloader)
    perc_error /= max(batches_seen, 1)
    print(f"Train Error: \nAverage Accuracy: {100 - perc_error}%")
    return epoch_loss_avg.result(), (100. - perc_error), attn_weights

def test_loop(dataloader, model, loss_fn, training = False, max_batches = None):
    """
    Evaluate the model on a dataloader.

    Arguments:
        dataloader -- batched dataset yielding (x, y); must support len()
        model -- the model to evaluate
        loss_fn -- callable(model, x, y, training) returning
                   (prediction, attention weights, loss)
        training -- Boolean forwarded to loss_fn (False for evaluation)
        max_batches -- optional cap on the number of batches (for quick smoke
                       runs); None processes the whole dataloader. This replaces a
                       hard-coded debug break that stopped the evaluation at batch 3.
    Returns:
        (mean loss, average accuracy in percent)
    """
    size = len(dataloader)
    log_every = max(1, size // 2) # avoids modulo-by-zero with fewer than 2 batches
    perc_error = 0.0
    counter = 0
    epoch_loss_avg = tf.keras.metrics.Mean()
    
    for x,y in dataloader:
        if max_batches is not None and counter >= max_batches:
            break

        # loss_fn returns three values; the attention weights are not needed here
        predict, _, test_loss = loss_fn(model, x, y, training)
        
        if np.isnan(test_loss).any():
            print("Test Loss had a np.nan value")
            break
        
        epoch_loss_avg.update_state(test_loss)
        perc_error += tf.math.reduce_mean(tf.abs(predict - y) / (y + 1e-2) * 100, [0,1])

        counter += 1
        if counter % log_every == 0:
            print(f"{counter} / {size} tested")
            
    # average over the batches actually evaluated, not over len(dataloader)
    perc_error /= max(counter, 1)
    print(f"Test Error: \nAverage Accuracy: {100 - perc_error}%, Avg Loss: {epoch_loss_avg.result():>8f}\n")
    return epoch_loss_avg.result(), 100. - perc_error

<a id = "train"></a>
# Training

In [None]:
# Live plot with two panels (loss and accuracy), each with a train and a test line.
# NOTE(review): the first panel is labelled "Mean Log Loss" while loss_object is LogCosh.
pp = PP(plot_names = ["Mean Log Loss", "% Accuracy"],
        line_names = ["Train Loop", "Test Loop"],
        x_label = "epochs"
       )
## Note the y-axis gets cut off with numbers longer than three digits because the source code has a bug
## I checked the github repo for lr-curve and the issue has been raised but not closed

# One train pass and one test pass per epoch; both results are pushed to the live plot.
for epoch in range(1, G.epochs+1):
    print(f"Epoch {epoch}/{G.epochs}\n--------------------------------------")
    train_loss, train_acc, attn_weights = train_loop(train_dataloader, model, loss_fn, optimizer, scheduler)
    test_loss, test_acc = test_loop(test_dataloader, model, loss_fn)
    pp.update([[train_loss.numpy(), test_loss.numpy()], [train_acc, test_acc]])
    
    # Disabled periodic checkpoint.
    # NOTE(review): `epoch % 15` is truthy on every epoch that is NOT a multiple of 15;
    # use `epoch % 15 == 0` if this is re-enabled.
#     if epoch % 15:
#         model.save_weights("/content/drive/MyDrive/transformer_soc/decoder_model_weights.tf", overwrite = True)
#         pd.DataFrame(attn_weights).to_csv("/content/drive/MyDrive/transformer_soc/decoder_attn_weights.csv",index=False)
    
print("Completed!")

**Saving Progress**

In [None]:
# Persist the model weights, the last attention weights and the scheduler state to Drive.
model.save_weights("/content/drive/MyDrive/transformer_soc/model_weights.tf", overwrite = True)

pd.DataFrame(attn_weights).to_csv("/content/drive/MyDrive/transformer_soc/attn_weights.csv",index=False)

# scheduler.learning_rate is a plain float until the first scheduler.step() and a tensor
# afterwards; np.asarray handles both, whereas calling .numpy() fails on the float.
current_lr = np.asarray(scheduler.learning_rate, dtype=np.float32)

np.save(
    "/content/drive/MyDrive/transformer_soc/scheduler_state.npy",
    np.array([current_lr, scheduler.T_cur, scheduler.T_i])
       )

print(f'''
lr: {current_lr}
T_cur: {scheduler.T_cur}
T_i: {scheduler.T_i}
''')

<a id = "val"></a>
# Validate

**Dev Set**

In [None]:
# Run the imported `validate` helper (from rolling_and_plot_tf) on the test dataloader.
visualize_dev = validate(model, test_dataloader, dev = True)

**Entire Dataset**

In [None]:
# Window the whole dataset; with train = False rolling_split returns a single (x, y) pair.
x_set, y_set = rolling_split(file, G.window_size, train = False)

# The names are rebound from arrays to Datasets, so re-running needs the split line above.
x_set = tf.data.Dataset.from_tensor_slices(x_set)
y_set = tf.data.Dataset.from_tensor_slices(y_set)

# drop_remainder=True discards a final partial batch
set_dataloader = tf.data.Dataset.zip((x_set, y_set)).batch(G.batch_size, drop_remainder=True)

visualize = validate(model, set_dataloader, dev = False)