<center><img src='https://drive.google.com/uc?id=1_utx_ZGclmCwNttSe40kYA6VHzNocdET' height="60"></center>

AI TECH - Akademia Innowacyjnych Zastosowań Technologii Cyfrowych. Program Operacyjny Polska Cyfrowa na lata 2014-2020
<hr>

<center><img src='https://drive.google.com/uc?id=1BXZ0u3562N_MqCLcekI-Ens77Kk4LpPm'></center>

<center>
Projekt współfinansowany ze środków Unii Europejskiej w ramach Europejskiego Funduszu Rozwoju Regionalnego
Program Operacyjny Polska Cyfrowa na lata 2014-2020,
Oś Priorytetowa nr 3 "Cyfrowe kompetencje społeczeństwa" Działanie  nr 3.2 "Innowacyjne rozwiązania na rzecz aktywizacji cyfrowej"
Tytuł projektu:  „Akademia Innowacyjnych Zastosowań Technologii Cyfrowych (AI Tech)”
    </center>

Code based on https://github.com/pytorch/examples/blob/master/mnist/main.py

In this exercise we use high-level abstractions from torch.nn, such as nn.Linear.
Note: during the next lab session we will go one level deeper and implement more things
by hand.

Tasks:

    1. Read the code.

    2. Check that the given implementation reaches 95% test accuracy for architecture input-128-128-10 after a few epochs.

    3. Add the option to use SGD with momentum instead of ADAM.

    4. Experiment with different learning rates. Use the provided TrainingVisualizer
    to plot the learning curves and gradient-to-weight ratios. Compare the visualizations
    obtained with different learning rates for both ADAM and SGD with momentum.

    5. Parameterize the constructor by a list of sizes of hidden layers of the MLP.
    Note that this requires creating a list of layers as an attribute of the Net class,
    and one can't use a standard Python list containing nn.Modules (why?).
    Check torch.nn.ModuleList.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import plotly.graph_objects as go
import sys
if 'google.colab' in sys.modules:
    from google.colab import output
    output.enable_custom_widget_manager()

In [None]:
# @title Visualize gradients

class TrainingVisualizer:
    """Live plotly dashboard for comparing several training runs.

    On construction three figure widgets are created and displayed:

    * training loss per logged step,
    * learning-rate-scaled ratio of gradient std to weight std for the first
      linear layer of the tracked model (log-scale y axis),
    * test accuracy per epoch.

    Every call to :meth:`track_model` claims the next pre-allocated trace in
    all three figures, so consecutive runs show up as separate curves.

    Args:
        log_interval: A loss / ratio point is recorded every ``log_interval``
            batches.
        max_traces: Number of empty traces pre-allocated per figure, i.e. the
            maximum number of runs that can be tracked by one visualizer.
    """

    def __init__(self, log_interval=10, max_traces=25):
        self.log_interval = log_interval
        self.max_traces = max_traces
        self.train_loss_fig = self.init_line_plot(title='Training loss', xaxis_title='Step')
        self.grad_to_weight_fig = self.init_line_plot(
            title='Gradient standard deviation to weight standard deviation ratio at 1st layer',
            xaxis_title='Step',
            yaxis_title='Gradient to weight ratio (log scale)',
            yaxis_type='log'
        )
        self.test_acc_fig = self.init_line_plot(
            title='Test accuracy', x=[], xaxis_title='Epoch', mode='lines+markers'
        )

        # Parameters related to current tracked model and its training
        self.first_linear_layer = None
        self.lr = None
        self.trace_idx = -1

    def init_line_plot(
        self,
        title,
        x=None, xaxis_title=None,
        yaxis_title=None, yaxis_type='linear',
        mode='lines',
        num_traces=None
    ):
        """Create, display and return a figure widget holding empty traces.

        Args:
            title: Figure title (centered).
            x: Initial x data of every trace; ``None`` lets plotly use the
                point index as x coordinate.
            xaxis_title: Label of the x axis.
            yaxis_title: Label of the y axis.
            yaxis_type: Plotly axis type of the y axis, e.g. ``'linear'`` or
                ``'log'``.
            mode: Plotly scatter mode of every trace.
            num_traces: How many empty traces to pre-allocate; defaults to
                ``self.max_traces``.

        Returns:
            The displayed ``go.FigureWidget``.
        """
        if num_traces is None:
            num_traces = self.max_traces

        fig = go.Figure()
        fig.update_layout(
            title=title, title_x=0.5,
            xaxis_title=xaxis_title, yaxis_title=yaxis_title,
            height=400, width=1500, margin=dict(b=10, t=60)
        )
        fig.update_yaxes(type=yaxis_type)
        # We cannot add new traces dynamically because Colab has problem with widgets
        # from plotly (traces added dynamically are rendered twice).
        # As an ugly workaround we create a lot of empty traces and update them later
        # with actual data. Empty traces are not plotted.
        for _ in range(num_traces):
            fig.add_trace(go.Scatter(x=x, y=[], showlegend=True, mode=mode))

        fig_widget = go.FigureWidget(fig)
        display(fig_widget)
        return fig_widget

    def track_model(self, model, optimizer, lr):
        """
        Start tracking training metrics for a new model.

        The first ``nn.Linear`` found by ``model.modules()`` (which also looks
        inside nested containers) becomes the layer whose gradient-to-weight
        ratio is plotted. The next free trace of every figure is labelled
        ``"<optimizer class name>, <lr>"``.

        Args:
            model: The ``nn.Module`` about to be trained.
            optimizer: Optimizer used for the run; only its class name is used.
            lr: Learning rate of the run, used for the label and for scaling
                the plotted ratio.

        Raises:
            ValueError: If ``model`` contains no ``nn.Linear`` layer.
            IndexError: If all pre-allocated traces are already in use.
        """
        first_linear = next(
            (module for module in model.modules() if isinstance(module, nn.Linear)),
            None,
        )
        # Validate everything before touching the state, so that a failed call
        # neither consumes a trace nor leaves the previous model's layer paired
        # with the new learning rate.
        if first_linear is None:
            raise ValueError(
                'TrainingVisualizer needs a model with at least one nn.Linear layer'
            )
        next_idx = self.trace_idx + 1
        available = len(self.train_loss_fig.data)
        if next_idx >= available:
            raise IndexError(
                f'all {available} pre-allocated traces are in use; '
                'create a new TrainingVisualizer (optionally with a larger max_traces)'
            )

        self.first_linear_layer = first_linear
        self.lr = lr
        self.trace_idx = next_idx

        label = f'{type(optimizer).__name__}, {lr}'
        for fig in (self.train_loss_fig, self.grad_to_weight_fig, self.test_acc_fig):
            fig.data[next_idx].name = label

    def plot_gradients_and_loss(self, batch_idx, loss):
        """Record the loss and the gradient-to-weight ratio for one batch.

        Nothing is recorded unless ``batch_idx`` is a multiple of
        ``log_interval``. Must be called after ``backward()`` so that the
        tracked layer's ``weight.grad`` is populated.

        Args:
            batch_idx: Index of the batch within the current epoch.
            loss: Training loss of the batch as a Python number.
        """
        if batch_idx % self.log_interval == 0:
            self.train_loss_fig.data[self.trace_idx].y += (loss, )

            layer = self.first_linear_layer
            # Plotted value: lr * std(gradient) / std(weight)
            grad_to_weight_ratio = (self.lr * layer.weight.grad.std() / layer.weight.std()).item()

            self.grad_to_weight_fig.data[self.trace_idx].y += (grad_to_weight_ratio, )

    def plot_accuracy(self, epoch, accuracy):
        """Append one ``(epoch, accuracy)`` point to the current accuracy trace."""
        self.test_acc_fig.data[self.trace_idx].x += (epoch, )
        self.test_acc_fig.data[self.trace_idx].y += (accuracy, )

In [None]:
class Net(nn.Module):
    """Fully connected classifier with layout 784-128-128-10.

    The input is flattened from dimension 1 onwards, passed through two
    ReLU-activated linear layers and a final linear layer, and returned as
    log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # A flattened 28x28 image yields 784 input features.
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of inputs ``x``."""
        flat = torch.flatten(x, 1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)


def train(model, device, train_loader, optimizer, epoch, log_interval, visualizer, verbose=False):
    """Run one training epoch over ``train_loader``.

    For every batch the negative log-likelihood loss is back-propagated, the
    loss and gradients are handed to ``visualizer`` (before the optimizer
    step, while the gradients are still those of the current weights), and
    the optimizer is stepped.

    Args:
        model: Network to train; expected to output log-probabilities.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches exposing
            ``dataset`` and ``__len__`` (used for progress output).
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the progress message.
        log_interval: Progress is printed every ``log_interval`` batches.
        visualizer: Object providing ``plot_gradients_and_loss(batch_idx, loss)``.
        verbose: Whether to print progress messages.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()

        loss_value = batch_loss.item()
        visualizer.plot_gradients_and_loss(step, loss_value)
        optimizer.step()

        # The modulo is evaluated first on purpose so that an invalid
        # log_interval surfaces regardless of the verbose flag.
        if step % log_interval == 0 and verbose:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(inputs), len(train_loader.dataset),
                100. * step / len(train_loader), loss_value))


def test(model, device, test_loader, visualizer, verbose=False):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    if verbose:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
    visualizer.plot_accuracy(epoch, 100. * correct / len(test_loader.dataset))



In [None]:
# --- Experiment configuration -------------------------------------------
# Mini-batch sizes for the training and the test loader.
batch_size, test_batch_size = 256, 1000

# Optimisation settings: number of passes over the training set and step size.
epochs = 5
lr = 0.01

# Seed passed to torch.manual_seed for reproducibility.
seed = 1

# Log / plot every `log_interval` training batches.
log_interval = 10

# Train on the GPU whenever PyTorch reports one as available.
use_cuda = torch.cuda.is_available()

In [None]:
# Seed PyTorch's RNG so weight initialisation and batch order are reproducible.
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")

# Shuffling is a property of the split, not of the device: training batches
# are reshuffled every epoch on CPU and GPU alike, while the test set is
# always read in its original order.
train_kwargs = {'batch_size': batch_size, 'shuffle': True}
test_kwargs = {'batch_size': test_batch_size, 'shuffle': False}
if use_cuda:
    # Loader options that only make sense when feeding a GPU.
    cuda_kwargs = {'num_workers': 1,
                   'pin_memory': True}
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)

In [None]:
# Preprocessing applied to every image: conversion to a tensor followed by
# normalisation (the constants are presumably the MNIST mean and std — confirm).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Training split; downloaded into '../data' when not present yet.
dataset1 = datasets.MNIST(root='../data', train=True, download=True, transform=transform)
# Test split; no download is requested here.
dataset2 = datasets.MNIST(root='../data', train=False, transform=transform)

train_loader = torch.utils.data.DataLoader(dataset=dataset1, **train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset=dataset2, **test_kwargs)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [None]:
# Create (and display) the plots once; every run tracked afterwards adds its
# own curve to these same figures, so re-run this cell only to start over.
visualizer = TrainingVisualizer(log_interval=log_interval)

In [None]:
# Fresh network and Adam optimizer for this run; registering them with the
# visualizer gives the run its own curve in every plot.
model = Net().to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)
visualizer.track_model(model=model, optimizer=optimizer, lr=lr)

# NOTE: the loop variable must stay named `epoch` — test() reads it from
# module scope when plotting the accuracy.
for epoch in range(1, epochs + 1):
    train(
        model=model,
        device=device,
        train_loader=train_loader,
        optimizer=optimizer,
        epoch=epoch,
        log_interval=log_interval,
        visualizer=visualizer,
    )
    test(
        model=model,
        device=device,
        test_loader=test_loader,
        visualizer=visualizer,
    )


Test set: Average loss: 0.2348, Accuracy: 9296/10000 (93%)


Test set: Average loss: 0.1406, Accuracy: 9624/10000 (96%)


Test set: Average loss: 0.1440, Accuracy: 9624/10000 (96%)


Test set: Average loss: 0.1549, Accuracy: 9575/10000 (96%)


Test set: Average loss: 0.1535, Accuracy: 9638/10000 (96%)

