In [None]:
import numpy as np
from activations import ReLU, LeakyReLU, Tanh, Softmax, Sigmoid
from losses import CrossEntropy, MSELoss
from layers import Linear
from layers import L2regularization, Dropout
from model import Model

import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

You don't have to strictly follow the cell structure, but please keep the overall organisation of the notebook the same.

## 8.2.1 Stochastic Gradient Descent (1.5 points)
In this exercise you are given the PyTorch model and the training loop which uses Stochastic Gradient Descent. We train the model for 10 epochs with batch size equal to 4.   
Your task is to implement the same model using the `Model` class from the previous assignment. Use the same hyperparameters (batch size, number of epochs, learning rate) for training. Adapt the training process from the previous assignment so that it uses mini-batches instead of loading the whole training set at once. You are expected to achieve the same performance with your model as with the PyTorch model (around 80% accuracy on the test data after training for 10 epochs).   
Additionally, record both the training and test loss every 2000 minibatches for both the PyTorch model and your model. Plot the loss graphs and comment on the differences between them (if any). For each model, the graphs of train and test loss should be displayed on one plot.

In [None]:
# ToTensor converts each PIL image into a float tensor; no further
# normalisation is applied.
transform = transforms.Compose(
    [transforms.ToTensor(),])
batch_size = 4


# Training split: reshuffled every epoch so that each mini-batch is a fresh
# random sample, as SGD expects.
trainset = datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

# Held-out test split. The loader must wrap `testset` (not `trainset`),
# otherwise the reported "test" metrics are computed on the training data.
# Shuffling is unnecessary for evaluation, so the order is kept fixed to make
# repeated evaluations comparable.
testset = datasets.MNIST(root='./data', train=False,
                                        download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)

In [None]:
class Net(nn.Module):
    """Bias-free fully connected classifier: 784 -> 200 -> 80 -> 10.

    The two hidden layers use a sigmoid activation. The output layer is left
    linear, so `forward` returns unnormalised class scores (no softmax).
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine
        # the state-dict keys and the order in which weights are initialised.
        self.layer1 = nn.Linear(in_features=784, out_features=200, bias=False)
        self.layer2 = nn.Linear(in_features=200, out_features=80, bias=False)
        self.layer3 = nn.Linear(in_features=80, out_features=10, bias=False)

    def forward(self, x):
        """Return the 10 class scores for each sample in the batch `x`."""
        # Collapse everything except the batch dimension into one vector.
        hidden = torch.flatten(x, start_dim=1)
        for hidden_layer in (self.layer1, self.layer2):
            hidden = torch.sigmoid(hidden_layer(hidden))
        return self.layer3(hidden)


net = Net()

In [None]:
# Cross-entropy on the raw scores returned by the network.
criterion = nn.CrossEntropyLoss()

# Plain SGD: momentum is explicitly zero.
optimizer = optim.SGD(
    params=net.parameters(),
    lr=0.001,
    momentum=0,
)

In [None]:
# Train for 10 epochs; report the mean training loss over every window of
# 2000 mini-batches.
for epoch in range(10):

    # Loss accumulated since the last report (reset at the start of each epoch).
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        # Forward pass.
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Clear the gradients of the previous step, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 2000 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

In [None]:
# Count how many samples from `testloader` the network classifies correctly.
correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in testloader:
        outputs = net(images)
        # The predicted class is the index of the largest score in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# `%d` truncates the percentage to an integer.
print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))

In [None]:
# NOTE(review): `Model` is already imported in the first cell of the notebook;
# this re-import is redundant but harmless.
from model import Model
# Seed NumPy's global random number generator so repeated runs are comparable.
# Presumably `Model` draws its initial weights from `np.random` — confirm in model.py.
np.random.seed(123)

# Your code for defining a model goes here

In [None]:
# Train the model for 10 epochs using SGD

In [None]:
# Evaluate the performance of the model by computing the accuracy on test data

In [None]:
# Plot the loss graphs for PyTorch model and your model (train and test losses should be on the same plot) 
# and comment on differences (if any)

## 8.2.2 Stochastic Gradient Descent with Momentum (1.5 points)
As discussed in the lecture, momentum helps to accelerate the gradient updates in the right direction and helps to solve some problems related to optimization.  
Train the PyTorch model using SGD with momentum while keeping the other hyperparameters the same. Try to find the optimal momentum value for the given problem (you can use the test data as validation data, since this is more or less a toy exercise).  
Implement the training with momentum for your Model class. Your code must be contained in the `sgd_momentum` method in model.py. You can change the arguments of the function according to your needs, but please keep the implementation there. 
Evaluate the performance of both models on test data. Did the accuracy improve for the same number of epochs trained?  
Again, keep a record of the train loss and test loss every 2000 minibatches for both models, plot them, and comment on the differences between the two models and between SGD with and without momentum.

In [None]:
# incorporate momentum for training the PyTorch model

In [None]:
# incorporate momentum for training your model

In [None]:
# Plot the loss graphs for PyTorch model and your model (train and test losses should be on the same plot) 
# and comment on differences (if any) between two models and between SGD with Momentum and without Momentum 

## 8.2.3 AdaGrad (1.5 points)
Train the PyTorch model using AdaGrad while keeping the other hyperparameters the same.  
Implement the training with AdaGrad for your Model class. Your code must be contained in the `ada_grad` method in model.py. You can change the arguments of the function according to your needs, but please keep the implementation there. 
Evaluate the performance of both models on test data. Did the accuracy improve for the same number of epochs trained?  
Again, keep a record of the train loss and test loss every 2000 minibatches for both models, plot them, and comment on the differences between the two models and between AdaGrad and the previous optimization techniques.

In [None]:
# incorporate AdaGrad for training the PyTorch model

In [None]:
# incorporate AdaGrad for training your model

In [None]:
# Plot the loss graphs for PyTorch model and your model (train and test losses should be on the same plot) 
# and comment on differences (if any) between two models and between AdaGrad and the previous optimization techniques.

## 8.2.4 Adam (1.5 points)
Train the PyTorch model using Adam while keeping the other hyperparameters the same.  
Implement the training with Adam for your Model class. Your code must be contained in the `adam` method in model.py. You can change the arguments of the function according to your needs, but please keep the implementation there. 
Evaluate the performance of both models on test data. Did the accuracy improve for the same number of epochs trained?
Again, keep a record of the train loss and test loss every 2000 minibatches for both models, plot them, and comment on the differences between the two models and between Adam and the previous optimization techniques.

In [None]:
# incorporate Adam for training the PyTorch model

In [None]:
# incorporate Adam for training your model

In [None]:
# Plot the loss graphs for PyTorch model and your model (train and test losses should be on the same plot) 
# and comment on differences (if any) between two models and between Adam and the previous optimization techniques.