# Cell below is need if we want to run this notebook on colab.

In [10]:
# code required for running this notebook on colab. 
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ML-optimisation

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing libraries

In [6]:
# importing the needed packages for this experiment
import pandas as pd
import numpy as np
import PIL
import os
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt
import nfft
from sklearn.decomposition import PCA
from pytorch_optimizer import load_optimizer
from determinestic import set_seed
from custome_dataset import CustomMnist
from SCRNOptimizer import SCRNOptimizer

# Reproducibility
In this part we set seeds (along with other commands) to make the code reproducible. Refer to `deterministic.py` for more info. We try with 5 random seeds in this experiment. 

In [7]:
set_seed(1)
# set_seed(44)
# set_seed(12)
# set_seed(15)
# set_seed(51)

  'Enabeling deterministic mode in PyTorch can have a performance '


# Preparing Data

The `MNIST` object is a subclass of a PyTorch [`torch.utils.data.Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) class, which as we'll see later can be used with a `DataLoader` object to provide streaming access to batches of data.

In [8]:
# load (download if needed) the MNIST dataset
mnist_train = CustomMnist(".", train=True, download=True,)
mnist_test = CustomMnist(".", train=False, download=True,)
print (mnist_train)

Dataset CustomMnist
    Number of datapoints: 60000
    Root location: .
    Split: Train


In [10]:
# flattening the data as we use a two layer MLP to learn the data
flat_mnist_train = mnist_train.data.reshape(mnist_train.data.shape[0],mnist_train.data.shape[1]*mnist_train.data.shape[2])
flat_mnist_test = mnist_test.data.reshape(mnist_test.data.shape[0],mnist_test.data.shape[1]*mnist_test.data.shape[2])
flat_mnist_train.shape, flat_mnist_test.shape

(torch.Size([60000, 784]), torch.Size([10000, 784]))

## Doing the dimensionality reduction using PCA

In [11]:
# get PCA in this cell to be able to do non uniform fast Fourier transform
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(flat_mnist_train)

# Apply transform to both the training set and the test set.
mnist_train.data = scaler.transform(flat_mnist_train)
mnist_test.data = scaler.transform(flat_mnist_test)

print(mnist_train.data.shape, mnist_test.data.shape)

pca = PCA(n_components=1)
pca.fit(mnist_train.data)

(60000, 784) (10000, 784)


PCA(n_components=1)

In [12]:
# Here we split the train dataset to validation and train sets
total_count=len(mnist_train)
print(total_count)
train_count = int(0.80 * total_count)
val_count = total_count - train_count

train_set, val_set = torch.utils.data.random_split(
    mnist_train, (train_count,val_count)
)
len(train_set), len(val_set)

60000


(48000, 12000)

In [13]:
# create data loaders
trainloader = DataLoader(train_set, batch_size=128, shuffle=True)
valloader = DataLoader(val_set, batch_size=128, shuffle=True)
testloader = DataLoader(mnist_test, batch_size=128, shuffle=True)

# Defining the neural network

In [14]:
# define baseline model which is a two layer fully connected network with ReLU activation.
class BaselineModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BaselineModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        # self.bn= nn.BatchNorm1d(64),
        self.fc2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        return out

In [15]:
# setting the device for execution
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

# Defining functions to train and evaluate the models

In [16]:
# define the to train models
def train(model, data_loader, optimizer):
    model.train()
    train_loss = 0
    x_pca=[]
    y_hat=[]
    y_true=[]
    correct=0
    # main training loop for one epoch
    for batch, tensor in enumerate(data_loader):
        data, target = tensor
        # doing the PCA analysis
        x_pca+=np.squeeze(pca.transform(data)).tolist()
        y_true+=target.tolist()
        data=data.to(device)
        target=target.to(device)
        optimizer.zero_grad()
        # forward + loss + backward + optimise (update weights)
        out = model(data)
        predicted = torch.argmax(out, dim=1)
        y_hat += predicted.tolist()
        correct += torch.sum(target==predicted).item()

        loss = loss_function(out,target)
        train_loss += loss.item()
        loss.backward(create_graph=True, retain_graph=True)
        optimizer.step()
    x_pca=torch.tensor(x_pca)
    y_true=torch.tensor(y_true)
    y_hat=torch.tensor(y_hat)
    
    # print(x_pca.shape)
    # Reporting the average loss, average accuracy and the error of Fourier coefficients as explained in the report.
    avg_accuracy = correct / (1*len(data_loader.dataset))
    avg_loss = train_loss / len(data_loader.dataset)
    nfft_out_hat = nfft.nfft_adjoint(x_pca, y_hat , 20)
    nfft_out_true = nfft.nfft_adjoint(x_pca, y_true, 20)
    avg_nfft = np.abs((nfft_out_hat-nfft_out_true)/nfft_out_true)
    return avg_loss,avg_accuracy, avg_nfft
           
# define the to evaluate models            
def test(model, data_loader,):
    model.eval()
    test_loss = 0
    correct = 0
    nfft_out=0
    x_pca=[]
    y_hat=[]
    y_true=[]
    with torch.no_grad():
        for batch, tensor in enumerate(data_loader):
            data, target = tensor
            x_pca+=np.squeeze(pca.transform(data)).tolist()
            y_true+=target.tolist()
            data=data.to(device)
            target=target.to(device)
            # forward + loss + backward + optimise (update weights)
            out = model(data)
            test_loss += loss_function(out, target).item()
            predicted = torch.argmax(out, dim=1)
            y_hat+=predicted.tolist()
            correct += torch.sum(target==predicted).item()

    x_pca=torch.tensor(x_pca)
    y_true=torch.tensor(y_true)
    y_hat=torch.tensor(y_hat) 

    # Reporting the average loss, average accuracy and the error of Fourier coefficients as explained in the report.
    avg_accuracy = correct / (1*len(data_loader.dataset))
    avg_loss = test_loss / (1*len(data_loader.dataset))
    nfft_out_hat = nfft.nfft_adjoint(x_pca, y_hat , 20)
    nfft_out_true = nfft.nfft_adjoint(x_pca, y_true , 20)
    avg_nfft = np.abs((nfft_out_hat-nfft_out_true)/nfft_out_true)

    return avg_loss, avg_accuracy, avg_nfft

# Results for SGD 

In [None]:
# build the model 
model = BaselineModel(784, 784, 10).to(device)

# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()


learning_rate = 1e-2
# learning_momentum = 0.9
optimiser = optim.SGD(model.parameters(), lr=learning_rate,)

In [None]:
SGD_epoch_nums = []
SGD_training_loss = []
SGD_training_acc = []
SGD_training_nfft = []
SGD_validation_loss = []
SGD_validation_acc = []
epochs = 50

for epoch in range(1, epochs + 1):
    
    train_loss,train_accuracy, train_nfft = train(model, trainloader, optimiser,)
    # print(train_nfft)
    val_loss, val_accuracy, val_nfft = test(model, valloader,)
    test_loss, test_accuracy, test_nfft = test(model, testloader,)
    SGD_epoch_nums.append(epoch)
    SGD_training_loss.append(train_loss)
    SGD_training_nfft.append(train_nfft)
    SGD_validation_loss.append(val_loss)
    SGD_validation_acc.append(val_accuracy)
    SGD_training_acc.append(train_accuracy)
    
    # torch.save(model.state_dict(), save_path)
    print('Epoch {:d}: Training loss= {:.4f}, val loss= {:.4f}, val_Accuracy={:.4%}, test loss= {:.4f}, test_Accuracy={:.4%}'.format(epoch, train_loss,
                                                                                            val_loss, val_accuracy,test_loss, test_accuracy,))
    

print('**** Finished Training ****')

  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


Epoch 1: Training loss= 0.0080, val loss= 0.0041, val_Accuracy=87.4917%, test loss= 0.0041, test_Accuracy=87.8300%
Epoch 2: Training loss= 0.0034, val loss= 0.0029, val_Accuracy=90.4250%, test loss= 0.0028, test_Accuracy=90.3700%
Epoch 3: Training loss= 0.0026, val loss= 0.0024, val_Accuracy=91.5167%, test loss= 0.0024, test_Accuracy=91.5400%
Epoch 4: Training loss= 0.0023, val loss= 0.0022, val_Accuracy=92.1750%, test loss= 0.0021, test_Accuracy=92.2200%
Epoch 5: Training loss= 0.0020, val loss= 0.0020, val_Accuracy=92.6917%, test loss= 0.0020, test_Accuracy=92.7400%
Epoch 6: Training loss= 0.0019, val loss= 0.0019, val_Accuracy=93.0917%, test loss= 0.0018, test_Accuracy=93.2200%
Epoch 7: Training loss= 0.0017, val loss= 0.0018, val_Accuracy=93.4000%, test loss= 0.0017, test_Accuracy=93.5200%
Epoch 8: Training loss= 0.0016, val loss= 0.0017, val_Accuracy=93.7167%, test loss= 0.0016, test_Accuracy=93.8600%
Epoch 9: Training loss= 0.0015, val loss= 0.0016, val_Accuracy=93.9583%, test lo

In [None]:
np.save("./SGDdata1",np.array([SGD_epoch_nums,SGD_training_loss, SGD_training_acc,SGD_training_nfft,
                                                                   SGD_validation_loss,SGD_validation_acc]) )

  This is separate from the ipykernel package so we can avoid doing imports until


# Results for Adam

In [None]:
# build the model 
model = BaselineModel(784, 784, 10).to(device)

# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()


learning_rate = 1e-2
optimiser = optim.Adam(model.parameters(),lr=learning_rate)

In [None]:
Adam_epoch_nums = []
Adam_training_loss = []
Adam_training_acc = []
Adam_training_nfft = []
Adam_validation_loss = []
Adam_validation_acc = []

epochs = 50

for epoch in range(1, epochs + 1):
    
    train_loss,train_accuracy, train_nfft = train(model, trainloader, optimiser,)
    # print(train_nfft)
    val_loss, val_accuracy, val_nfft = test(model, valloader,)
    test_loss, test_accuracy, test_nfft = test(model, testloader,)
    Adam_epoch_nums.append(epoch)
    Adam_training_loss.append(train_loss)
    Adam_training_nfft.append(train_nfft)
    Adam_validation_loss.append(val_loss)
    Adam_validation_acc.append(val_accuracy)
    Adam_training_acc.append(train_accuracy)
    # torch.save(model.state_dict(), save_path)
    print('Epoch {:d}: Training loss= {:.4f}, val loss= {:.4f}, val_Accuracy={:.4%}, test loss= {:.4f}, test_Accuracy={:.4%}'.format(epoch, train_loss,
                                                                                            val_loss, val_accuracy,test_loss, test_accuracy,))
    

print('**** Finished Training ****')


Epoch 1: Training loss= 0.0455, val loss= 0.0510, val_Accuracy=92.6750%, test loss= 0.0331, test_Accuracy=93.0500%
Epoch 2: Training loss= 0.0390, val loss= 0.0968, val_Accuracy=94.4167%, test loss= 0.0477, test_Accuracy=94.4600%
Epoch 3: Training loss= 0.0280, val loss= 0.0695, val_Accuracy=95.8250%, test loss= 0.0420, test_Accuracy=95.6700%
Epoch 4: Training loss= 0.0127, val loss= 0.0964, val_Accuracy=95.0083%, test loss= 0.0432, test_Accuracy=95.2800%
Epoch 5: Training loss= 0.0162, val loss= 0.0649, val_Accuracy=95.3083%, test loss= 0.0424, test_Accuracy=96.2200%
Epoch 6: Training loss= 0.0151, val loss= 0.0576, val_Accuracy=96.3417%, test loss= 0.0583, test_Accuracy=96.4000%
Epoch 7: Training loss= 0.0082, val loss= 0.0557, val_Accuracy=96.5833%, test loss= 0.0500, test_Accuracy=96.5900%
Epoch 8: Training loss= 0.0057, val loss= 0.0670, val_Accuracy=96.3750%, test loss= 0.0523, test_Accuracy=96.4000%
Epoch 9: Training loss= 0.0063, val loss= 0.0663, val_Accuracy=96.5500%, test lo

In [None]:
np.save("./Adamdata1",np.array([Adam_epoch_nums,Adam_training_loss,
                                                                   Adam_training_acc,Adam_training_nfft,
                                                                   Adam_validation_loss,Adam_validation_acc]) )

  This is separate from the ipykernel package so we can avoid doing imports until


# Results for AdaHessian

In [None]:
# build the model 
model = BaselineModel(784, 784, 10).to(device)

# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()


learning_rate = 1e-2
import torch_optimizer as optimadahessian
optimiser = optimadahessian.Adahessian(
    model.parameters(), lr=learning_rate
)

In [None]:
AdaHess_epoch_nums = []
AdaHess_training_loss = []
AdaHess_training_acc = []
AdaHess_training_nfft = []
AdaHess_validation_loss = []
AdaHess_validation_acc = []
epochs = 50

for epoch in range(1, epochs + 1):
    
    train_loss,train_accuracy, train_nfft = train(model, trainloader, optimiser,)
    # print(train_nfft)
    val_loss, val_accuracy, val_nfft = test(model, valloader,)
    test_loss, test_accuracy, test_nfft = test(model, testloader,)
    AdaHess_epoch_nums.append(epoch)
    AdaHess_training_loss.append(train_loss)
    AdaHess_training_nfft.append(train_nfft)
    AdaHess_validation_loss.append(val_loss)
    AdaHess_validation_acc.append(val_accuracy)
    AdaHess_training_acc.append(train_accuracy)


    
    
    # torch.save(model.state_dict(), save_path)
    print('Epoch {:d}: Training loss= {:.4f}, val loss= {:.4f}, val_Accuracy={:.4%}, test loss= {:.4f}, test_Accuracy={:.4%}'.format(epoch, train_loss,
                                                                                            val_loss, val_accuracy,test_loss, test_accuracy,))
    

print('**** Finished Training ****')

Epoch 1: Training loss= 0.0052, val loss= 0.0025, val_Accuracy=91.1750%, test loss= 0.0025, test_Accuracy=91.2900%
Epoch 2: Training loss= 0.0021, val loss= 0.0019, val_Accuracy=93.2833%, test loss= 0.0018, test_Accuracy=93.2000%
Epoch 3: Training loss= 0.0016, val loss= 0.0016, val_Accuracy=94.0917%, test loss= 0.0015, test_Accuracy=94.2700%
Epoch 4: Training loss= 0.0013, val loss= 0.0014, val_Accuracy=94.8083%, test loss= 0.0014, test_Accuracy=94.9100%
Epoch 5: Training loss= 0.0012, val loss= 0.0013, val_Accuracy=95.2833%, test loss= 0.0012, test_Accuracy=95.4000%
Epoch 6: Training loss= 0.0010, val loss= 0.0012, val_Accuracy=95.6083%, test loss= 0.0011, test_Accuracy=95.7800%
Epoch 7: Training loss= 0.0009, val loss= 0.0011, val_Accuracy=95.9583%, test loss= 0.0010, test_Accuracy=96.1400%
Epoch 8: Training loss= 0.0008, val loss= 0.0010, val_Accuracy=96.2583%, test loss= 0.0010, test_Accuracy=96.3300%
Epoch 9: Training loss= 0.0007, val loss= 0.0010, val_Accuracy=96.3833%, test lo

In [None]:
np.save("./AdaHessdata1",np.array([AdaHess_epoch_nums,AdaHess_training_loss,
                                                                  AdaHess_training_acc, AdaHess_training_nfft,
                                                                   AdaHess_validation_loss, AdaHess_validation_acc]) )

  This is separate from the ipykernel package so we can avoid doing imports until


# Results for SCRN

In [None]:
# build the model 
model = BaselineModel(784, 784, 10).to(device)

# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()

learning_rate = 1e-2
optimiser = SCRNOptimizer(model.parameters(), ro=1, l=100, inner_itr=100)

In [None]:
SCRN_epoch_nums = []
SCRN_training_loss = []
SCRN_training_acc = []
SCRN_training_nfft = []
SCRN_validation_loss = []
SCRN_validation_acc = []
epochs = 50

for epoch in range(1, epochs + 1):
    
    train_loss,train_accuracy, train_nfft = train(model, trainloader, optimiser,)
    # print(train_nfft)
    val_loss, val_accuracy, val_nfft = test(model, valloader,)
    test_loss, test_accuracy, test_nfft = test(model, testloader,)
    SCRN_epoch_nums.append(epoch)
    SCRN_training_loss.append(train_loss)
    SCRN_training_nfft.append(train_nfft)
    SCRN_validation_loss.append(val_loss)
    SCRN_validation_acc.append(val_accuracy)
    SCRN_training_acc.append(train_accuracy)


    
    
    # torch.save(model.state_dict(), save_path)
    print('Epoch {:d}: Training loss= {:.4f}, val loss= {:.4f}, val_Accuracy={:.4%}, test loss= {:.4f}, test_Accuracy={:.4%}'.format(epoch, train_loss,
                                                                                            val_loss, val_accuracy,test_loss, test_accuracy,))
    

print('**** Finished Training ****')

Epoch 1: Training loss= 0.0037, val loss= 0.0020, val_Accuracy=92.7250%, test loss= 0.0020, test_Accuracy=92.8000%
Epoch 2: Training loss= 0.0017, val loss= 0.0016, val_Accuracy=94.1250%, test loss= 0.0015, test_Accuracy=94.1400%
Epoch 3: Training loss= 0.0013, val loss= 0.0013, val_Accuracy=94.9833%, test loss= 0.0013, test_Accuracy=95.0700%
Epoch 4: Training loss= 0.0011, val loss= 0.0012, val_Accuracy=95.6417%, test loss= 0.0012, test_Accuracy=95.7000%
Epoch 5: Training loss= 0.0009, val loss= 0.0011, val_Accuracy=95.8833%, test loss= 0.0011, test_Accuracy=95.9900%
Epoch 6: Training loss= 0.0008, val loss= 0.0011, val_Accuracy=96.0417%, test loss= 0.0011, test_Accuracy=96.2200%
Epoch 7: Training loss= 0.0007, val loss= 0.0010, val_Accuracy=96.2583%, test loss= 0.0010, test_Accuracy=96.4000%
Epoch 8: Training loss= 0.0006, val loss= 0.0010, val_Accuracy=96.4500%, test loss= 0.0009, test_Accuracy=96.6600%
Epoch 9: Training loss= 0.0006, val loss= 0.0009, val_Accuracy=96.5667%, test lo

In [None]:
np.save("./SCRN1",np.array([SCRN_epoch_nums,SCRN_training_loss,
                                                                  SCRN_training_acc, SCRN_training_nfft,
                                                                   SCRN_validation_loss, SCRN_validation_acc]) )

# Plotting the results
This results are based on the averaged results over 5 different seeds

In [24]:
# here you can find the plot functions 

import seaborn as sns; sns.set_theme() 
sns.set_context("notebook",  rc={"lines.linewidth": 1}) 
def plot_Fmap(FP, epoch_tr, ax, cbar=False, title=''): 
 
  if cbar: 
    sns.heatmap(FP[:,:10].transpose(),  ax=ax, vmin=0, vmax=1,cbar=cbar, label='normal', cbar_kws={'label': '$\Delta$F'}) 
    cbar = ax.collections[0].colorbar 
    cbar.ax.tick_params(labelsize=30) 
    ax.figure.axes[-1].yaxis.label.set_size(50) 
 
  else: 
    sns.heatmap(FP[:,:10].transpose(),  ax=ax, vmin=0, vmax=1,cbar=False, label='small') 
 
  ax.invert_yaxis()
  ax.set_title(title, fontsize = 70, pad= 60) 
  ax.set_xlabel('epoch', fontsize = 60)  
  ax.set_ylabel('frequency index', fontsize = 60) 
  ax.tick_params(labelsize = 30) 
 
  xticks=ax.xaxis.get_major_ticks() 
  for i in range(len(xticks)): 
      if i%5==0: 
        continue 
      else: 
          xticks[i].set_visible(False) 
 
 
import matplotlib.pyplot as plt 
# from google.colab import files 
 
 
 
def plot_barplots(models_name, models_frequencies, epochs, savepath='./HEATMAPS.svg',): 
 
  ''' 
  models_name: a list with the models titles i.e "[(a) Adam ...,] " 
  moels_frequencies: a list with numpy arrays storing the training frequencies per epochs for each model (i.e Adam... ) 
  epochs: a list with numpy arrays storing the training epochs for each model (i.e Adam... ) 
  ''' 
 
  fig, axs = plt.subplots(1, 4,  figsize=(100,20)) 
 
  for k in range(len(models_name)): 
 
    name = models_name[k] 
    FP = models_frequencies[k] 
    epoch_tr = epochs[k] 
 
    plot_Fmap(FP, epoch_tr, axs[k],  cbar=(k==3), title=name) 
    plt.subplots_adjust(wspace=0.08, hspace=0.00) 
    plt.grid('on') 
 
  plt.savefig(savepath, bbox_inches='tight', pad_inches=0)


def plot_corrolation(x, y, title):
  #create scatterplot
  plt.scatter(x, y, label=title)

  #calculate equation for trendline
  z = np.polyfit(x, y, 1)
  p = np.poly1d(z)

  #add trendline to plot
  plt.plot(x, p(x))

In [None]:
# here we load the data and plot them (for the average of different seeds)
scrn=np.load("./SCRNdata.npy",allow_pickle=True)
adahess=np.load("./AdaHessdata.npy",allow_pickle=True)
adam=np.load("./Adamdata.npy",allow_pickle=True)
sgd=np.load("./SGDdata.npy",allow_pickle=True)
model_names=["SGD","Adam","AdaHessian","SCRN"]
plot_barplots(model_names,np.array([sgd[3].tolist(),adam[3].tolist(),adahess[3].tolist(),scrn[3].tolist()]),[i+1 for i in range (50)])

In [None]:
## plot corrolation map
plot_corrolation(np.array(sgd[3].tolist()).mean(axis=1),sgd[5].astype(float),model_names[0])
plot_corrolation(np.array(adam[3].tolist()).mean(axis=1),adam[5].astype(float),model_names[1])
plot_corrolation(np.array(adahess[3].tolist()).mean(axis=1),adahess[5].astype(float),model_names[2])
plot_corrolation(np.array(scrn[3].tolist()).mean(axis=1),scrn[5].astype(float),model_names[3])
plt.legend()
# plt.title("Validation Corrolation")
plt.xlabel('average frequency error',)  
plt.ylabel('validation accuracy',) 
plt.savefig('./val_corr.svg', bbox_inches='tight', pad_inches=0)
plt.show()