# Seminar 2. PyTorch
Hi! Today we are going to study PyTorch. We'll compare NumPy and PyTorch commands and then rewrite our previous neural network in two ways.

**Note:** turn the GPU runtime ON before running this notebook; the CUDA cells below require it.

In [None]:
!pip install mnist

In [None]:
from IPython import display
import numpy as np
import random
import torch

In [None]:
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

## NumPy vs PyTorch
### Initialization

In [None]:
a = [1. , 1.4 , 2.5]
print(f"Simple way: {torch.tensor(a)}")
print(f"Zeros:\n {torch.zeros((2,3))}")
print(f"Range: {torch.arange(0, 10)}")
print(f"Complicated range: {torch.arange(4, 12, 2)}")
print(f"Space: {torch.linspace(1, 4, 6)}")
print(f"Identity matrix:\n {torch.eye(4)}")

### Random

In [None]:
# torch.rand draws uniformly from the half-open interval [0, 1).
print(f"From 0 to 1: {torch.rand(1)}")
print(f"Vector from 0 to 1: {torch.rand(5)}")
# torch.randint's upper bound is exclusive, so these integers lie in [0, 9].
print(f"Vector from 0 to 10: {torch.randint(10, size=(5,))}")

### Matrix Operations

In [None]:
# Two float vectors of length 10 used to demonstrate element-wise operations,
# reductions and shape manipulation.
a = torch.arange(10).type(torch.FloatTensor)
b = torch.linspace(-10, 10, 10)
print(f"a: {a}\nshape: {a.size()}")
# Fixed: this line used to print `a` under the label "b".
print(f"b: {b}\nshape: {b.size()}")
print(f"a + b: {a + b},\n a * b: {a * b}")
print(f"Dot product: {a.dot(b)}")
print(f"Mean: {a.mean()}, STD: {a.std()}")
print(f"Sum: {a.sum()}, Min: {a.min()}, Max: {a.max()}")
print(f"Reshape:\n{a.reshape(-1, 1)}\nshape: {a.reshape(-1, 1).size()}")
# Turn `a` into a column and tile it 5 times along the second dimension.
c = a.reshape(-1, 1).repeat(1, 5)
print(f"Repeat:\n{c}\nshape: {c.size()}")
print(f"Transpose:\n{c.T}\nshape: {c.T.size()}")
print(f"Unique items: {torch.unique(c)}")

### Indexing

In [None]:
# A 10x10 matrix holding 0..99 row by row, used to demonstrate slicing.
a = torch.arange(100).reshape(10, 10)
print(f"Array:\n{a}\nshape: {a.size()}")
print(f"Get first column: {a[:, 0]}")
print(f"Get last row: {a[-1, :]}")
# np.newaxis is None, so this inserts a length-1 dimension at position 1.
# Fixed the "awis" typo in the printed label; the view is built only once.
expanded = a[:, np.newaxis]
print(f"Add new axis:\n{expanded}\nshape: {expanded.size()}")
print(f"Specific indexing:\n{a[4:6, 7:]}")

### NumPy <-> PyTorch

In [None]:
a = torch.normal(mean=torch.zeros(2,4))
a.numpy()

In [None]:
b = np.random.normal(size=(2, 4))
torch.from_numpy(b)

### CUDA

In [None]:
a = torch.normal(mean=torch.zeros(2,4))
b = torch.normal(mean=torch.zeros(2, 4))
print(f"a:\n{a}\nb:\n{b}")

In [None]:
a = a.cuda()

In [None]:
# Intentional failure: `a` was moved to the GPU in the previous cell while `b`
# was created on the CPU, and PyTorch does not mix devices in one operation,
# so this line is expected to raise a RuntimeError.
a + b

In [None]:
(a + b.cuda()).cpu()

### Autograd

In [None]:
a = torch.randn(2, requires_grad=True)
b = torch.normal(mean=torch.zeros(2))

c = torch.dot(a, b)
print(f'a:\n{a}\nb:\n{b}\n(a,b): {c}')

In [None]:
c.backward()
print(f'a:\n{a}\nb:\n{b}\n(a,b): {c}')

In [None]:
print(f"Grad a: {a.grad}")

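Now let's add a function on top: we wrap the dot product (plus a bias term) in a sigmoid and differentiate through it.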

In [None]:
a = torch.randn(2, requires_grad=True)
b = torch.normal(mean=torch.zeros(2))
c = torch.ones(1, requires_grad=True)

d = torch.sigmoid(torch.dot(a, b) + c)
print(f'a:\n{a}\nb:\n{b}\nSigmoid( (a,b) ): {d}')

In [None]:
print(f"Grad a: {a.grad}\nGrad c: {c.grad}")

In [None]:
d.backward()
print(f"Grad a: {a.grad}\nGrad c: {c.grad}")

Okay, what about vectors?

In [None]:
a = torch.randn(2, requires_grad=True)
b = torch.normal(mean=torch.zeros(2))

# `c` is a 2-element vector (element-wise product), not a scalar.
c = a * b
# NOTE: backward() without an explicit `gradient` argument only works for
# scalar outputs, so this call is expected to raise a RuntimeError -- that is
# the point of the demonstration. To make it work, reduce to a scalar first
# (e.g. c.sum().backward()) or pass gradient=torch.ones_like(c).
c.backward()

## Neural Network. Rewind

In [None]:
from copy import deepcopy

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns

import mnist

sns.set()

In [None]:
images = mnist.train_images() / 255
labels = mnist.train_labels()

X_train, X_valid, y_train, y_valid = train_test_split(images, labels)

In [None]:
def get_batches(dataset, batch_size):
    """Yield shuffled mini-batches of flattened images with their labels.

    Args:
        dataset: an ``(images, labels)`` pair of array-likes; the images are
            reshaped so that every sample becomes a row of 28 * 28 values.
        batch_size: maximum number of samples per batch; the final batch may
            be shorter.

    Yields:
        ``(torch.FloatTensor, torch.LongTensor)`` pairs of inputs and targets.
        The sample order is re-drawn with ``np.random.shuffle`` on each call.
    """
    features, targets = dataset[0].reshape(-1, 28 * 28), dataset[1]
    total = features.shape[0]

    order = np.arange(total)
    np.random.shuffle(order)

    for lo in range(0, total, batch_size):
        # Slicing clamps at the end of the array, so the tail batch is
        # simply shorter.
        chosen = order[lo:lo + batch_size]
        yield torch.FloatTensor(features[chosen]), torch.LongTensor(targets[chosen])

In [None]:
class CustomLinear:
    """Affine layer ``x @ w + b`` backed by raw autograd-tracked tensors."""

    def __init__(self, in_size, out_size):
        """Create normally-initialised weights and bias.

        Args:
            in_size: number of input features.
            out_size: number of output features.
        """
        self.in_size, self.out_size = in_size, out_size

        # Weights are drawn before the bias so seeded runs stay reproducible.
        self.w = torch.randn((in_size, out_size), requires_grad=True)
        self.b = torch.randn((1, out_size), requires_grad=True)

    def __call__(self, x):
        """Apply the layer to a 2-D batch ``x`` of shape (batch, in_size)."""
        projected = torch.mm(x, self.w)
        return projected + self.b

    def zero_grad(self):
        """Drop the accumulated gradients of both parameters."""
        for name in ("w", "b"):
            getattr(self, name).grad = None

In [None]:
class CustomNeuralNetwork:
    """Feed-forward network: a chain of CustomLinear layers with an activation
    applied after every layer except the last one."""

    def __init__(self, dims, activation="sigmoid"):
        """Build one linear layer per consecutive pair of sizes in ``dims``.

        Args:
            dims: sequence of layer widths, input size first.
            activation: ``"sigmoid"`` or ``"relu"``.

        Raises:
            NotImplementedError: for any other activation name.
        """
        self.dims = dims

        self.linears = [
            CustomLinear(n_in, n_out) for n_in, n_out in zip(dims[:-1], dims[1:])
        ]

        supported = {"sigmoid": torch.sigmoid, "relu": torch.relu}
        if activation not in supported:
            raise NotImplementedError
        self.activation = supported[activation]

    def __call__(self, x):
        """Run ``x`` through the network and return the raw last-layer output."""
        hidden = x
        for layer in self.linears[:-1]:
            hidden = self.activation(layer(hidden))
        # The final layer is left without an activation.
        return self.linears[-1](hidden)

    def zero_grad(self):
        """Reset the gradients of every layer."""
        for layer in self.linears:
            layer.zero_grad()

In [None]:
class CustomCrossEntropy:
    """Cross-entropy loss computed from raw (unnormalised) class scores."""

    def __call__(self, x, target):
        """Return the mean negative log-likelihood of the target classes.

        Args:
            x: scores of shape (batch, n_classes); softmax is taken over dim 1.
            target: integer class index for every row of ``x``.
        """
        log_probs = torch.log_softmax(x, 1)
        rows = torch.arange(target.size(0))
        # Pick, for each sample, the log-probability of its true class.
        picked = log_probs[rows, target]
        return -torch.mean(picked)

In [None]:
def simple_sgd(model, config):
    """Perform one vanilla SGD step on every layer in ``model.linears``.

    Each layer's ``w`` and ``b`` are updated in place by subtracting
    ``config['learning_rate']`` times their current ``.grad``.
    """
    # The update must not be recorded by autograd.
    with torch.no_grad():
        for layer in model.linears:
            for param in (layer.w, layer.b):
                param -= config['learning_rate'] * param.grad

In [None]:
net = CustomNeuralNetwork((28 * 28, 10, 10))
criterion = CustomCrossEntropy()

In [None]:
optimizer_config = {
    "learning_rate": 1e-1
}

In [None]:
def train(model, optimizer_config, n_epoch=20, batch_size=256):
    """Train ``model`` with the hand-written SGD step and plot the curves.

    Relies on the notebook globals ``X_train``/``y_train``,
    ``X_valid``/``y_valid`` and ``criterion``.

    Args:
        model: callable network exposing ``zero_grad()`` and ``linears``.
        optimizer_config: dict passed to ``simple_sgd``.
        n_epoch: number of passes over the training split.
        batch_size: mini-batch size for both training and validation.

    Returns:
        ``(best_model, train_logs, valid_logs)`` where ``best_model`` is a deep
        copy taken at the epoch with the lowest mean validation loss. Both
        log dicts start with a placeholder entry at step 0.
    """
    train_logs = {"Train Loss": [0], "Steps": [0]}
    valid_logs = {"Valid Loss": [0], "Valid Accuracy": [0], "Steps": [0]}
    step = 0
    best_valid_loss, best_model = np.inf, None

    for _ in range(n_epoch):
        # ---- training pass ----
        for x_batch, y_batch in get_batches((X_train, y_train), batch_size):
            model.zero_grad()

            loss = criterion(model(x_batch), y_batch)
            loss.backward()

            simple_sgd(model, optimizer_config)

            step += 1
            train_logs["Train Loss"].append(loss.detach().item())
            train_logs["Steps"].append(step)

        # ---- validation pass (no gradients needed) ----
        batch_losses, batch_accuracies = [], []
        with torch.no_grad():
            for x_batch, y_batch in get_batches((X_valid, y_valid), batch_size):
                predictions = model(x_batch)
                batch_losses.append(criterion(predictions, y_batch).item())
                batch_accuracies.append(
                    accuracy_score(y_batch, np.argmax(predictions.numpy(), axis=1))
                )

        # Metrics are averaged over batches, not over individual samples.
        epoch_loss = sum(batch_losses) / len(batch_losses)
        epoch_accuracy = sum(batch_accuracies) / len(batch_accuracies)

        valid_logs["Valid Loss"].append(epoch_loss)
        valid_logs["Valid Accuracy"].append(epoch_accuracy)
        valid_logs["Steps"].append(step)

        if epoch_loss < best_valid_loss:
            best_valid_loss = epoch_loss
            best_model = deepcopy(model)

    fig, axes = plt.subplots(1, 3, figsize=(20, 5))
    panels = (
        ("Train Loss", train_logs),
        ("Valid Loss", valid_logs),
        ("Valid Accuracy", valid_logs),
    )
    for axis, (metric, logs) in zip(axes, panels):
        sns.lineplot(x="Steps", y=metric, data=logs, ax=axis)
    plt.plot()

    return best_model, train_logs, valid_logs

In [None]:
net, _, _ = train(net, optimizer_config)

## Neural Network. Rewind #2

In [None]:
import torch.nn as nn


class Linear(nn.Module):
    """Fully connected layer ``x @ w + b`` with parameters registered on the module."""

    def __init__(self, in_size, out_size):
        """Create normally-initialised ``w`` (in_size, out_size) and ``b`` (out_size)."""
        super().__init__()

        self.in_size, self.out_size = in_size, out_size

        # Weights are sampled before the bias so seeded runs stay reproducible.
        weight_init = torch.randn(in_size, out_size)
        bias_init = torch.randn(out_size)
        self.w = nn.Parameter(weight_init)
        self.b = nn.Parameter(bias_init)

    def forward(self, x):
        """Apply the layer to a 2-D batch ``x`` of shape (batch, in_size)."""
        return x.mm(self.w) + self.b


class BatchNorm(nn.Module):
    """Batch normalisation over the batch dimension (dim 0) with a learnable
    scale (``gamma``) and shift (``beta``).

    In training mode each feature is normalised with the statistics of the
    current batch, and exponential moving averages of those statistics are
    maintained. In eval mode the stored running averages are used instead, so
    the output no longer depends on the composition of the batch and a
    single-sample batch yields finite values.

    Args:
        in_size: number of features (size of dim 1 of the input).
        alpha: momentum of the running averages,
            ``running = (1 - alpha) * running + alpha * batch_statistic``.
    """

    def __init__(self, in_size, alpha=0.1):
        super().__init__()

        self.in_size = in_size
        self.alpha = alpha

        self.beta = nn.Parameter(torch.zeros(in_size))
        self.gamma = nn.Parameter(torch.ones(1, in_size))

        # Buffers follow the module across devices and are stored in the
        # state dict, but are not trained by the optimizer.
        self.register_buffer("running_mean", torch.zeros(in_size))
        self.register_buffer("running_var", torch.ones(in_size))

        self.epsilon = 1e-5

    def forward(self, x):
        """Normalise ``x`` of shape (batch, in_size), then scale and shift it."""
        if self.training:
            mean = x.mean(dim=0)
            # torch's default variance is the unbiased estimate; it is kept
            # so training-mode outputs match the previous implementation.
            var = x.var(dim=0)
            # A single-sample batch has an undefined (NaN) unbiased variance;
            # do not let it poison the running statistics.
            if x.size(0) > 1:
                with torch.no_grad():
                    self.running_mean.mul_(1 - self.alpha).add_(self.alpha * mean.detach())
                    self.running_var.mul_(1 - self.alpha).add_(self.alpha * var.detach())
        else:
            mean, var = self.running_mean, self.running_var

        x = (x - mean) / torch.sqrt(var + self.epsilon)
        return x * self.gamma + self.beta


class Dropout(nn.Module):
    """Inverted dropout: during training each element is zeroed with
    probability ``p`` and the survivors are scaled by ``1 / (1 - p)``; in eval
    mode the input is returned unchanged.

    Args:
        p: probability of dropping an element, in ``[0, 1]``.

    Raises:
        ValueError: if ``p`` lies outside ``[0, 1]``.
    """

    def __init__(self, p=0.5):
        super().__init__()

        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, x):
        """Apply dropout to ``x`` (any shape)."""
        if not self.training:
            return x
        if self.p == 1:
            # Everything is dropped; the 1 / (1 - p) rescale would otherwise
            # compute 0 * inf = NaN.
            return torch.zeros_like(x)

        keep_prob = 1.0 - self.p
        # Sample the keep-mask directly on the input's device instead of
        # drawing it on the CPU and copying it over on every call.
        mask = torch.rand(x.shape, device=x.device) < keep_prob
        return x * mask * (1.0 / keep_prob)


class Block(nn.Module):
    """Hidden block built from the custom layers:
    Linear -> BatchNorm -> Dropout -> activation."""

    def __init__(self, in_size, out_size, activation="sigmoid", p=0.5):
        """Assemble the block.

        Args:
            in_size: number of input features.
            out_size: number of output features.
            activation: ``"sigmoid"`` or ``"relu"``.
            p: dropout probability.

        Raises:
            NotImplementedError: for any other activation name.
        """
        super().__init__()

        self.linear = Linear(in_size, out_size)
        self.dropout = Dropout(p)
        self.batch_norm = BatchNorm(out_size)

        supported = {"sigmoid": torch.sigmoid, "relu": torch.relu}
        if activation not in supported:
            raise NotImplementedError
        self.activation = supported[activation]

    def forward(self, x):
        """Transform a batch ``x`` of shape (batch, in_size)."""
        normalised = self.batch_norm(self.linear(x))
        return self.activation(self.dropout(normalised))


class Net(nn.Module):
    """Classifier: a stack of hidden ``Block`` modules followed by a final
    ``Linear`` layer that produces the raw class scores."""

    def __init__(self, dims, activation="relu", p=0.5):
        """Build the network.

        Args:
            dims: layer widths, input size first and number of classes last;
                every consecutive pair except the last becomes a ``Block``.
            activation: activation name forwarded to every block.
            p: dropout probability forwarded to every block.
        """
        super().__init__()

        hidden_pairs = zip(dims[:-2], dims[1:-1])
        self.blocks = nn.ModuleList(
            [Block(n_in, n_out, activation=activation, p=p) for n_in, n_out in hidden_pairs]
        )
        self.cl = Linear(dims[-2], dims[-1])

    def forward(self, x):
        """Return class scores for a batch ``x`` of shape (batch, dims[0])."""
        hidden = x
        for block in self.blocks:
            hidden = block(hidden)
        return self.cl(hidden)

In [None]:
net = Net((28 * 28, 100, 10), p=0.1).cuda()
criterion = nn.CrossEntropyLoss()

In [None]:
optimizer = torch.optim.SGD(net.parameters(), lr=1e-1)

In [None]:
def train(model, optimizer, n_epoch=20, batch_size=256, device="cpu"):
    """Train ``model`` with a ``torch.optim`` optimizer and plot the curves.

    Relies on the notebook globals ``X_train``/``y_train``,
    ``X_valid``/``y_valid`` and ``criterion``.

    Args:
        model: ``nn.Module`` to optimise; it must already live on ``device``.
        optimizer: optimizer constructed over ``model.parameters()``.
        n_epoch: number of passes over the training split.
        batch_size: mini-batch size for both training and validation.
        device: device every batch is moved to before the forward pass.

    Returns:
        ``(best_model, train_logs, valid_logs)`` where ``best_model`` is a deep
        copy of ``model`` (in eval mode) taken at the epoch with the lowest
        mean validation loss. Both log dicts start with a placeholder entry at
        step 0.
    """
    train_logs = {"Train Loss": [0,], "Steps": [0,]}
    valid_logs = {"Valid Loss": [0,], "Valid Accuracy": [0,], "Steps": [0,]}
    step = 0
    best_valid_loss = np.inf
    best_model = None

    for i in range(n_epoch):
        # Enable training-time behaviour (dropout, batch statistics).
        model.train()
        for x_batch, y_batch in get_batches((X_train, y_train), batch_size):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()

            predictions = model(x_batch)
            loss = criterion(predictions, y_batch)

            loss.backward()

            optimizer.step()

            step += 1
            train_logs["Train Loss"].append(loss.detach().item())
            train_logs["Steps"].append(step)

        sum_loss = 0
        sum_acc = 0
        count_valid_steps = 0
        # Validation must run in eval mode, otherwise dropout keeps randomly
        # zeroing activations and distorts the measured loss and accuracy.
        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in get_batches((X_valid, y_valid), batch_size):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                predictions = model(x_batch)
                loss = criterion(predictions, y_batch)
                sum_loss += loss.item()
                sum_acc += accuracy_score(y_batch.cpu().numpy(), np.argmax(predictions.cpu().numpy(), axis=1))
                count_valid_steps += 1

        # Metrics are averaged over batches, not over individual samples.
        valid_loss = sum_loss / count_valid_steps
        valid_logs["Valid Loss"].append(valid_loss)
        valid_logs["Valid Accuracy"].append(sum_acc / count_valid_steps)
        valid_logs["Steps"].append(step)

        if best_valid_loss > valid_loss:
            best_valid_loss = valid_loss
            # Snapshot the model being trained (the `model` argument), not
            # whatever the global `net` happens to point to.
            best_model = deepcopy(model)

    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    sns.lineplot(x="Steps", y="Train Loss", data=train_logs, ax=ax[0])
    sns.lineplot(x="Steps", y="Valid Loss", data=valid_logs, ax=ax[1])
    sns.lineplot(x="Steps", y="Valid Accuracy", data=valid_logs, ax=ax[2])
    plt.plot()

    return best_model, train_logs, valid_logs

In [None]:
net, _, _ = train(net, optimizer, device="cuda:0")

## Neural Network. Rewind #3. Logging

Logging systems:
- [Tensorboard](https://pytorch.org/docs/stable/tensorboard.html)
- [WandB](https://www.wandb.com/)

In [None]:
class Block(nn.Module):
    """Hidden block assembled from ``torch.nn`` layers:
    Linear -> BatchNorm1d -> Dropout -> activation."""

    def __init__(self, in_size, out_size, activation="relu", p=0.5):
        """Assemble the block.

        Args:
            in_size: number of input features.
            out_size: number of output features.
            activation: ``"sigmoid"`` or ``"relu"``.
            p: dropout probability.

        Raises:
            NotImplementedError: for any other activation name.
        """
        super().__init__()

        self.in_size, self.out_size = in_size, out_size

        # `self.activation` stores the activation *class*; an instance is
        # created when the Sequential is assembled below.
        activation_classes = {"sigmoid": nn.Sigmoid, "relu": nn.ReLU}
        if activation not in activation_classes:
            raise NotImplementedError
        self.activation = activation_classes[activation]

        layers = [
            nn.Linear(self.in_size, self.out_size),
            nn.BatchNorm1d(self.out_size),
            nn.Dropout(p),
            self.activation(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Transform a batch ``x`` of shape (batch, in_size)."""
        return self.fc(x)


net = nn.Sequential(Block(28 * 28, 100, p=0.2), nn.Linear(100, 10)).cuda()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-1)
criterion = nn.CrossEntropyLoss()

In [None]:
%load_ext tensorboard

In [None]:
from datetime import datetime
from pathlib import Path

from torch.utils.tensorboard import SummaryWriter

In [None]:
def train(model, optimizer, n_epoch=20, batch_size=256, device="cpu"):
    """Train ``model`` and log the metrics to TensorBoard.

    Relies on the notebook globals ``X_train``/``y_train``,
    ``X_valid``/``y_valid`` and ``criterion``. Scalars are written to a fresh
    timestamped sub-directory of ``logs``.

    Args:
        model: ``nn.Module`` to optimise; it must already live on ``device``.
        optimizer: optimizer constructed over ``model.parameters()``.
        n_epoch: number of passes over the training split.
        batch_size: mini-batch size for both training and validation.
        device: device every batch is moved to before the forward pass.

    Returns:
        A deep copy of ``model`` (in eval mode) taken at the epoch with the
        lowest mean validation loss, or ``None`` if no epoch improved on the
        initial infinite loss.
    """
    writer = SummaryWriter(Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S"))
    step = 0
    best_valid_loss = np.inf
    best_model = None

    try:
        for i in range(n_epoch):
            # Enable training-time behaviour (dropout, batch statistics).
            model.train()
            for x_batch, y_batch in get_batches((X_train, y_train), batch_size):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad()

                predictions = model(x_batch)
                loss = criterion(predictions, y_batch)

                loss.backward()

                optimizer.step()

                step += 1
                writer.add_scalar("Train Loss", loss.detach().item(), step)

            sum_loss = 0
            sum_acc = 0
            count_valid_steps = 0
            # Validation must run in eval mode so that dropout is disabled and
            # batch norm uses its running statistics.
            model.eval()
            with torch.no_grad():
                for x_batch, y_batch in get_batches((X_valid, y_valid), batch_size):
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)

                    predictions = model(x_batch)
                    loss = criterion(predictions, y_batch)
                    sum_loss += loss.item()
                    sum_acc += accuracy_score(y_batch.cpu().numpy(), np.argmax(predictions.cpu().numpy(), axis=1))
                    count_valid_steps += 1

            # Metrics are averaged over batches, not over individual samples.
            valid_loss = sum_loss / count_valid_steps
            writer.add_scalar("Valid Loss", valid_loss, step)
            writer.add_scalar("Valid Accuracy", sum_acc / count_valid_steps, step)

            if best_valid_loss > valid_loss:
                best_valid_loss = valid_loss
                # Snapshot the model being trained (the `model` argument), not
                # whatever the global `net` happens to point to.
                best_model = deepcopy(model)
    finally:
        # Flush pending events and release the log file even if training is
        # interrupted.
        writer.close()

    return best_model

In [None]:
net = train(net, optimizer, device="cuda:0")

In [None]:
%tensorboard --logdir logs

## Spoiler - train loop with [Catalyst](https://github.com/catalyst-team/catalyst)

- [A comprehensive step-by-step guide to basic and advanced features](https://github.com/catalyst-team/catalyst#step-by-step-guide)
- [Docs](https://catalyst-team.github.io/catalyst/)
- [What is Runner?](https://catalyst-team.github.io/catalyst/api/core.html#runner)

In [None]:
!pip install catalyst

In [None]:
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from catalyst import dl
from catalyst.data.cv import ToTensor
from catalyst.contrib.datasets import MNIST
from catalyst.utils import metrics

net = nn.Sequential(Block(28 * 28, 100, p=0.2), nn.Linear(100, 10))
optimizer = torch.optim.Adam(net.parameters(), lr=0.02)
criterion = torch.nn.CrossEntropyLoss()

loaders = {
    "train": DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=ToTensor()), batch_size=32),
    "valid": DataLoader(MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()), batch_size=32),
}

class CustomRunner(dl.Runner):
    """Catalyst runner that feeds flattened images to ``self.model``.

    ``predict_batch`` is the inference entry point; ``_handle_batch`` performs
    one train/valid step and records the batch metrics.
    """

    def predict_batch(self, batch):
        # model inference step
        # Only the inputs (batch[0]) are used: they are moved to the runner's
        # device and flattened to (batch_size, -1) before the forward pass.
        return self.model(batch[0].to(self.device).view(batch[0].size(0), -1))

    def _handle_batch(self, batch):
        # model train/valid step
        # NOTE(review): unlike predict_batch, nothing here moves x/y to a
        # device -- presumably the runner does that before this hook; confirm.
        x, y = batch
        # Flatten every sample to a vector before the forward pass.
        y_hat = self.model(x.view(x.size(0), -1))

        loss = self.criterion(y_hat, y)
        # Top-1 and top-3 accuracy for this batch.
        accuracy01, accuracy03 = metrics.accuracy(y_hat, y, topk=(1, 3))
        self.batch_metrics.update(
            {"loss": loss, "accuracy01": accuracy01, "accuracy03": accuracy03}
        )

        # Parameters are only updated while iterating over the train loader.
        # NOTE(review): runner.train() below also registers an
        # OptimizerCallback keyed on "loss"; confirm that this manual
        # backward/step does not duplicate (or conflict with) the callback.
        if self.is_train_loader:
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

runner = CustomRunner()
runner.train(
    model=net,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S"),
    num_epochs=5,
    verbose=True,
    load_best_on_end=True,
    callbacks={
    "optimizer": dl.OptimizerCallback(
      metric_key="loss",
      accumulation_steps=1,
      grad_clip_params=None,
    )
  }
)

traced_model = runner.trace(loader=loaders["valid"])