## These are cells that I am no longer using, now that I have adopted Keras more fully

---

In [None]:
class CosAnnealWarmRestarts():
    '''
    Cosine annealing learning-rate schedule with warm restarts.

    The schedule sweeps the learning rate from `G.learning_rate` down to
    `G.min_learning_rate` along half a cosine wave over `T_i` units of
    progress, then restarts at the maximum with the period multiplied by
    `T_mult`. `G` is read from the enclosing scope (defined outside this
    section).

    Attributes:
    `mu_max` / `mu_min`
        upper and lower bound of the learning rate
    `T_i`
        length of the current cycle
    `T_cur`
        progress accumulated inside the current cycle
    `learning_rate`
        the most recently computed learning rate; after the first `step`
        this is the TensorFlow value produced by the cosine expression
    '''
    def __init__(self, T_0: float, T_mult: float):
        '''
        Cosine Annealing with Warm Restarts

        Parameters:
        `T_0` int or float
            the amount of progress (see `step`) before the first restart;
            must be strictly positive
        `T_mult` int or float
            the factor to increase T_i by after a restart, where T_i is the
            length of the i^th cycle

        Raises:
        TypeError
            if `T_0` or `T_mult` is not a real number
        ValueError
            if `T_0` is not strictly positive
        '''
        super(CosAnnealWarmRestarts, self).__init__()

        # Explicit exceptions instead of `assert`: asserts vanish under
        # `python -O`, and the documented int inputs were being rejected.
        for name, value in (("T_0", T_0), ("T_mult", T_mult)):
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise TypeError(
                    f"{name} must be an int or float, got {type(value).__name__}"
                )
        if T_0 <= 0:
            raise ValueError(f"T_0 must be positive, got {T_0}")

        self.mu_max = G.learning_rate  # initial and max learning_rate
        self.mu_min = G.min_learning_rate  # minimum learning_rate
        self.T_i = float(T_0)
        self.T_mult = float(T_mult)
        self.T_cur = 0.0

        self.learning_rate = G.learning_rate

    def step(self, increment: float, optimizer):
        '''
        Advance the schedule and write the new learning rate to `optimizer`.

        Call after every batch.

        Parameters:
        `increment` float
            1 batch / total number of batches
            !!!!! Not the current batch number, that would be a series summation
            every epoch, a total of 1.0 will be added to self.T_cur
        `optimizer`
            the optimizer for the neural network; must expose a
            `learning_rate` attribute supporting `.assign(...)`

        Raises:
        AttributeError
            if `optimizer` has no `learning_rate` attribute
        '''
        # Fail before touching any state; previously the problem was only
        # printed and the method crashed later, after T_cur had been advanced.
        if not hasattr(optimizer, "learning_rate"):
            raise AttributeError(
                "Error: optimizer does not have a learning_rate parameter"
            )

        # The rate is evaluated at the progress reached *before* this call,
        # so the very first step applies mu_max.
        cosine_term = 1 + tf.math.cos(np.pi * self.T_cur / self.T_i)
        mu_i = self.mu_min + 0.5 * (self.mu_max - self.mu_min) * cosine_term

        self.T_cur += increment

        # Restart once the cycle is complete. `>=` covers increments that
        # step past T_i; isclose covers accumulated floating-point error
        # that leaves T_cur marginally below T_i.
        if self.T_cur >= self.T_i or np.isclose(self.T_cur, self.T_i):
            self.T_i *= self.T_mult
            self.T_cur = 0.0

        # update the learning_rate accordingly:
        optimizer.learning_rate.assign(tf.cast(mu_i, tf.float32))

        # this is just so that you can find the current learning rate from
        # the scheduler
        self.learning_rate = mu_i

In [None]:
# Log-cosh loss; loss_fn below evaluates predictions against it.
loss_object = tf.keras.losses.LogCosh()

# Adam starting at the configured learning rate. The scheduler's step()
# overwrites optimizer.learning_rate each time it is called.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=G.learning_rate,
    beta_1=0.9,
    beta_2=0.999,
)

# First cycle has length 1.0; each restart multiplies the cycle length by 3.0.
scheduler = CosAnnealWarmRestarts(T_0=1.0, T_mult=3.0)

In [None]:
def loss_fn(model, x, y, training):
    """Run `model` on `x` and score the output against `y`.

    Parameters:
        model: callable invoked as ``model(x, training=training)``.
        x: model input.
        y: ground-truth target passed to the loss as ``y_true``.
        training: forwarded unchanged to the model call.

    Returns:
        A ``(prediction, loss)`` pair, where ``loss`` is computed by the
        notebook-level ``loss_object``.
    """
    prediction = model(x, training=training)
    loss_value = loss_object(y_true=y, y_pred=prediction)
    return prediction, loss_value

def grad(model, inputs, targets):
    """Compute predictions, loss and gradients for one batch.

    Parameters:
        model: the network; its ``trainable_variables`` are differentiated.
        inputs: batch inputs forwarded to ``loss_fn``.
        targets: batch targets forwarded to ``loss_fn``.

    Returns:
        ``(y_hat, loss, gradients)`` where ``gradients`` is the result of
        ``tape.gradient(loss, model.trainable_variables)``.
    """
    with tf.GradientTape() as tape:
        # loss_fn returns exactly (prediction, loss). The previous
        # three-name unpacking (with attn_weights) raised ValueError
        # on every call.
        y_hat, loss = loss_fn(model, inputs, targets, training=True)
    return y_hat, loss, tape.gradient(loss, model.trainable_variables)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, scheduler, training = True):
    """Train `model` for one pass over `dataloader`.

    For every batch: compute gradients via ``grad``, apply them with
    `optimizer`, advance `scheduler` by ``1 / len(dataloader)``, and
    accumulate the loss and a mean absolute percentage error.

    Parameters:
        dataloader: sized iterable yielding ``(x, y)`` batches.
        model: the network being trained.
        loss_fn: accepted for signature symmetry with ``test_loop``; not
            used here because ``grad`` calls the notebook-level ``loss_fn``.
        optimizer: object providing ``apply_gradients``.
        scheduler: object providing ``step(increment, optimizer)``.
        training: unused; ``grad`` always runs the model with
            ``training=True``.

    Returns:
        ``(mean_loss, accuracy, attn_weights)`` where ``accuracy`` is
        ``100 - mean percentage error``. ``attn_weights`` is whatever
        notebook-level ``attn_weights`` already exists, or ``None``; this
        function does not compute attention weights itself.
    """
    size = len(dataloader)
    perc_error = 0.0
    epoch_loss_avg = tf.keras.metrics.Mean()
    # Aim for roughly 15 progress lines per epoch. max(1, ...) avoids a
    # modulo-by-zero when the loader has fewer than 15 batches.
    log_every = max(1, size // 15)

    for batch, (x, y) in enumerate(dataloader):

        predict, loss, grads = grad(model, x, y)  # assert(loss.shape == [])
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        scheduler.step(1 / size, optimizer)

        # Percentage error averaged over axes 0 and 1; the 1e-2 offset keeps
        # the division finite when a target is exactly zero.
        perc_error += tf.math.reduce_mean(tf.abs(predict - y) / (y + 1e-2) * 100, [0,1])
        epoch_loss_avg.update_state(loss)
        if batch % log_every == 0:
            print(f"Mean loss: {epoch_loss_avg.result():>7f}  [{batch:4d}/{size:4d}]")

    perc_error /= size
    print(f"Train Error: \nAverage Accuracy: {100 - perc_error}%")

    # `attn_weights` was never bound in this function, so the old return
    # raised NameError unless a notebook-level variable of that name existed.
    # Make that lookup explicit and fall back to None, keeping the
    # three-value return the caller unpacks.
    attn_weights = globals().get("attn_weights")
    return epoch_loss_avg.result(), (100. - perc_error), attn_weights

def test_loop(dataloader, model, loss_fn, training = False):
    """Evaluate `model` over `dataloader` without updating weights.

    Parameters:
        dataloader: sized iterable yielding ``(x, y)`` batches.
        model: the network being evaluated.
        loss_fn: callable ``loss_fn(model, x, y, training)`` returning
            ``(prediction, loss)``.
        training: forwarded to `loss_fn`; defaults to ``False``.

    Returns:
        ``(mean_loss, accuracy)`` where ``accuracy`` is
        ``100 - mean percentage error`` over the batches that were scored.
        Evaluation stops at the first batch whose loss contains NaN.
    """
    size = len(dataloader)
    perc_error = 0.0
    counter = 0
    epoch_loss_avg = tf.keras.metrics.Mean()
    # Report at the halfway point and at the end. max(1, ...) avoids a
    # modulo-by-zero for a single-batch loader.
    report_every = max(1, size // 2)

    for x, y in dataloader:
        predict, test_loss = loss_fn(model, x, y, training)

        if np.isnan(test_loss).any():
            print("Test Loss had a np.nan value")
            break

        epoch_loss_avg.update_state(test_loss)
        # Percentage error averaged over axes 0 and 1; the 1e-2 offset keeps
        # the division finite when a target is exactly zero.
        perc_error += tf.math.reduce_mean(tf.abs(predict - y) / (y + 1e-2) * 100, [0,1])

        counter += 1
        if counter % report_every == 0:
            print(f"{counter} / {size} tested")

    # Average over the batches actually scored, so an early NaN break does
    # not dilute the error with batches that were never evaluated. When
    # nothing was scored, fall back to `size` as before.
    perc_error /= (counter or size)
    print(f"Test Error: \nAverage Accuracy: {100 - perc_error}%, Avg Loss: {epoch_loss_avg.result():>8f}\n")
    return epoch_loss_avg.result(), 100. - perc_error

In [None]:
# Plot helper: one panel for loss and one for accuracy, each with a train
# and a test line, indexed by epoch.
pp = PP(
    plot_names=["Mean Log Loss", "% Accuracy"],
    line_names=["Train Loop", "Test Loop"],
    x_label="epochs",
)
## Note the y-axis gets cut off with numbers longer than three digits because the source code has a bug
## I checked the github repo for lr-curve and the issue has been raised but not closed

# NOTE(review): the loop runs a fixed 9 epochs while the header prints
# G.epochs as the total -- confirm which one is intended.
for epoch in range(1, 10):
    print(f"Epoch {epoch}/{G.epochs}\n--------------------------------------")

    train_loss, train_acc, attn_weights = train_loop(
        train_dataloader, model, loss_fn, optimizer, scheduler
    )
    test_loss, test_acc = test_loop(test_dataloader, model, loss_fn)

    # First row feeds the loss panel, second row the accuracy panel.
    pp.update([
        [train_loss.numpy(), test_loss.numpy()],
        [train_acc, test_acc],
    ])

#     if epoch % 15:
#         model.save_weights("/content/drive/MyDrive/transformer_soc/decoder/model_weights.tf", overwrite = True)

print("Completed!")

In [None]:
# Persist the trained weights.
model.save_weights("/content/drive/MyDrive/transformer_soc/decoder/model_weights.tf", overwrite = True)

# Pickle the attention weights. The `with` block guarantees the file is
# flushed and closed even if pickling fails; the handle was previously
# left open.
with open("/content/drive/MyDrive/transformer_soc/decoder/attn_weights", "wb") as filehandler: #write in binary
    pickle.dump(attn_weights, filehandler)

# Save the scheduler state as [learning rate, T_cur, T_i].
# `.numpy()` requires scheduler.learning_rate to be a TensorFlow value,
# i.e. scheduler.step() must have run at least once.
np.save(
    "/content/drive/MyDrive/transformer_soc/decoder/scheduler_state.npy",
    np.array([scheduler.learning_rate.numpy(), scheduler.T_cur, scheduler.T_i])
       )

print(f'''
lr: {scheduler.learning_rate.numpy()}
T_cur: {scheduler.T_cur}
T_i: {scheduler.T_i}
''')