# Report project: AdaGrad Optimizer Notebook


### Import useful libraries

In [8]:
# Import useful libraries for computation
import numpy as np

# Import torch and libraries to deal with NN
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from pyhessian import hessian # Hessian computation
#from density_plot import get_esd_plot # ESD plot

# Import useful library to visualize results
import matplotlib.pyplot as plt

# Importing the LeNet5 architecture we are going to use for our study and comparisons
from cnn_architectures import *

# Importing parameters to use with different optimizers before comparing them
import params

# Importing useful functions
from helpers import *

# Ignoring warnings to make the code more readable
import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Setting the parameters and additional variables

In [9]:
# Defining the neural network's parameters and the seed used for reproducibility
RANDOM_SEED = 42
IMG_SIZE = 32
N_CLASSES = 10

# Apply the seed: without this RANDOM_SEED is only declared, never used, and
# NumPy / PyTorch driven randomness differs between runs of the notebook.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Checking device: use the GPU when one is available, otherwise the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

### Loading, reshaping and plotting data (AdaGrad)

In [10]:
# Pre-processing pipeline: resize every image to IMG_SIZE and convert it to a tensor.
# The pipeline gets its own name so that the imported `transforms` module is not
# overwritten; rebinding `transforms` makes this cell fail when it is run twice.
mnist_transform = transforms.Compose([transforms.Resize(IMG_SIZE),
                                      transforms.ToTensor()])

# Load the MNIST dataset (downloaded into ./data when not already present)
raw_mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=mnist_transform)
raw_mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=mnist_transform)

# Passing train data to the dataloader (shuffled at every epoch)
train_loader = DataLoader(dataset=raw_mnist_trainset,
                          batch_size=params.ADAGRAD_BATCH_SIZE,
                          shuffle=True)

# Passing test data to the dataloader (kept in a fixed order)
test_loader = DataLoader(dataset=raw_mnist_testset,
                         batch_size=params.ADAGRAD_BATCH_SIZE,
                         shuffle=False)

In [11]:
# Reshaping train data (from 28*28 to 32*32) for visualization purposes
train_data, train_target = reshape_train_data(raw_mnist_trainset, DEVICE)
# Reshaping test data (from 28*28 to 32*32) for visualization purposes.
# The held-out test split is passed here, not the training split.
test_data, test_target = reshape_test_data(raw_mnist_testset, DEVICE)

## Model training and Model Evaluation using AdaGrad

First, we train our model using LeNet5. The model was trained using batches of size 50 and 15 epochs.



In [12]:
# Initializing the model we are going to use in our study.
# It is moved to DEVICE *before* the optimizer is created: PyTorch recommends
# placing the model on its final device before constructing an optimizer for it,
# so the optimizer is built over the parameters that will actually be trained.
model = LeNet5(num_classes=N_CLASSES).to(DEVICE)

# Defining the criterion (loss function) to be used during the training procedure
criterion = nn.CrossEntropyLoss()

# Defining and initializing the optimizer (AdaGrad in this notebook);
# every hyperparameter is read from the shared `params` module.
optimizer = torch.optim.Adagrad(
    model.parameters(),
    lr=params.ADAGRAD_LEARNING_RATE,
    weight_decay=params.ADAGRAD_DECAY,
    initial_accumulator_value=params.ADAGRAD_INITIAL_ACCUMULATOR_VALUE,
)

Let's train and test our first model

In [13]:
# Train for params.ADAGRAD_N_EPOCHS epochs on DEVICE and keep everything the
# training helper returns, so later cells can reuse the model and its history.
model, optimizer, losses, grad_norms = training_loop(
    model,
    criterion,
    optimizer,
    train_loader,
    test_loader,
    params.ADAGRAD_N_EPOCHS,
    DEVICE,
)

15:01:13 --- Epoch: 0	Train loss: 0.2773	Valid loss: 0.1552	Train accuracy: 95.28	Valid accuracy: 95.45
15:01:57 --- Epoch: 1	Train loss: 0.1333	Valid loss: 0.1093	Train accuracy: 96.82	Valid accuracy: 96.78
15:02:37 --- Epoch: 2	Train loss: 0.0996	Valid loss: 0.0905	Train accuracy: 97.50	Valid accuracy: 97.25
15:02:57 --- Epoch: 3	Train loss: 0.0817	Valid loss: 0.0806	Train accuracy: 97.86	Valid accuracy: 97.61
15:03:22 --- Epoch: 4	Train loss: 0.0697	Valid loss: 0.0689	Train accuracy: 98.25	Valid accuracy: 97.92
15:03:46 --- Epoch: 5	Train loss: 0.0611	Valid loss: 0.0649	Train accuracy: 98.48	Valid accuracy: 98.10
15:04:06 --- Epoch: 6	Train loss: 0.0546	Valid loss: 0.0615	Train accuracy: 98.64	Valid accuracy: 98.20
15:04:27 --- Epoch: 7	Train loss: 0.0499	Valid loss: 0.0579	Train accuracy: 98.80	Valid accuracy: 98.25
15:04:46 --- Epoch: 8	Train loss: 0.0457	Valid loss: 0.0536	Train accuracy: 98.90	Valid accuracy: 98.28
15:05:07 --- Epoch: 9	Train loss: 0.0422	Valid loss: 0.0516	Trai

Plot the gradient norm, which has been proven to be an important factor related to generalization properties of the architecture

In [None]:
# Only the last 30 recorded gradient-norm entries are plotted
recent_grad_norms = grad_norms[-30:]
plot_gradient_norm(recent_grad_norms, method='AdaGrad')

In [None]:
# Confusion matrix of the trained model over the test loader, for N_CLASSES classes
# (helper is not defined in this notebook; presumably it comes from `helpers` — confirm)
compute_confusion_matrix(test_loader, model, N_CLASSES)

We now start looking at the eigenvalues in order to see whether we've reached a flat or sharp minimum

In [None]:
# Take a single mini-batch from the training loader: the Hessian of the loss is
# evaluated on this one batch only, at the parameters reached by training.
inputs, targets = next(iter(train_loader))

# We move everything to the device
inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

# We now build the Hessian helper, to later retrieve the spectral norm and the eigenvalues
device_flag = torch.cuda.is_available()
hessian_comp = hessian(model, criterion, data=(inputs, targets), cuda=device_flag)

# Now let's compute the top eigenvalue. This only takes a few seconds.
# eigenvalues() is called with its defaults, so the returned list is expected to
# hold a single entry (TODO confirm against the installed PyHessian version).
top_eigenvalues, top_eigenvector = hessian_comp.eigenvalues()
print("The top Hessian eigenvalue of this model is %.4f"%top_eigenvalues[-1])