## Lab 05 Paradigm of deep learning code

#### 1. Hyperparameter.

In [2]:
# Option 1: define the hyperparameters directly in the script
learning_rate = 0.01
epochs = 1000

# Option 2: pass the hyperparameters via the command line
# (the parsed values below overwrite the ones defined above)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--learning_rate', type=float, default=0.01)
parser.add_argument('--epochs', type=int, default=1000)
# parse_known_args returns unrecognized arguments separately instead of
# erroring out on them; they are discarded here
args, _ = parser.parse_known_args()
learning_rate = args.learning_rate
epochs = args.epochs
# Backward-compatible alias for the previously misspelled variable name
learining_rate = learning_rate
# cmd/bash: python main.py --learning_rate 0.01 --epochs 1000

#### 2. Model.

In [3]:
# custom module
import torch
import torch.nn as nn
import torch.nn.functional as F


class Linear(nn.Module):
    """Hand-written fully connected layer computing ``x @ w + b``.

    Both the weight ``w`` (shape ``(in_dims, out_dims)``) and the bias ``b``
    (shape ``(out_dims,)``) are drawn from a standard normal distribution and
    registered as trainable parameters.
    """

    def __init__(self, in_dims, out_dims):
        super().__init__()
        # The weight is sampled before the bias, so the random stream is
        # consumed in the same order as the parameters are declared.
        initial_weight = torch.randn(in_dims, out_dims)
        initial_bias = torch.randn(out_dims)
        # nn.Parameter tensors require gradients by default
        self.w = nn.Parameter(initial_weight)
        self.b = nn.Parameter(initial_bias)

    def forward(self, x):
        """Apply the affine map to ``x`` (last dimension must equal ``in_dims``)."""
        return x @ self.w + self.b


class MLP(nn.Module):
    """Two-layer perceptron: ``in_dims -> hidden_dims -> out_dims``.

    Args:
        in_dims: number of input features.
        hidden_dims: width of the hidden layer.
        out_dims: number of output units.
    """

    def __init__(self, in_dims=2, hidden_dims=10, out_dims=2):
        super(MLP, self).__init__()
        self.in_dims = in_dims
        self.hidden_dims = hidden_dims
        self.out_dims = out_dims
        # Build the layers from the constructor arguments rather than from
        # hard-coded sizes, so non-default dimensions actually take effect.
        self.fc1 = nn.Linear(in_dims, hidden_dims)
        self.fc2 = nn.Linear(hidden_dims, out_dims)
        # wrong example: self.list_net = [nn.Linear(2, 10), nn.Linear(10, 2)]
        # right example: self.seq_net = nn.Sequential(nn.Linear(2, 10), nn.Linear(10, 2))
        # or self.mlist_net = nn.ModuleList([nn.Linear(2, 10), nn.Linear(10, 2)])

    def forward(self, x):
        """Return sigmoid(relu(fc2(relu(fc1(x))))) for ``x`` of shape ``(..., in_dims)``."""
        x = F.relu(self.fc1(x))
        # NOTE(review): ReLU makes its output non-negative, so the sigmoid
        # below can only produce values in [0.5, 1] -- confirm this is intended.
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(x)
        return x

- the difference between `nn.Sequential` and `nn.ModuleList`:

  - `Sequential` implements the `forward` function, so it can be called directly, e.g. `self.seq_net(x)`. `ModuleList` does not implement the `forward` function; it is just a list for storing `Module`s.
  - an OrderedDict of modules can be passed in `Sequential`.


- the difference between `nn.ModuleList` and `list`.
  
  - modules in `nn.ModuleList` are properly registered and will be visible by all `Module` methods.
  - modules in a `list` are not registered, e.g., the parameters of modules in a list are not visible to the `Optimizer`.

#### 3. Dataset and Dataloader


A custom dataloader, take assignment 1 part 2/3 as example:

We want to iterate over the dataset via a `for` loop, e.g. `for input, label in train_dloader`. How do we implement this?

In [4]:
import numpy as np

# Toy dataset: 100 samples with 2 standard-normal features each
x = np.random.randn(100, 2)
# One binary label (0 or 1) per sample; `high` is exclusive
y = np.random.randint(low=0, high=2, size=100)


# yield
def data_generator(x, y, batch_size=32, shuffle=True):
    """Yield ``(x_batch, y_batch)`` pairs covering the dataset exactly once.

    Args:
        x: array whose first axis indexes the samples.
        y: array of labels aligned with ``x`` along the first axis.
        batch_size: maximum number of samples per batch; the final batch is
            smaller when the sample count is not a multiple of it.
        shuffle: when true, samples are visited in a random permutation,
            otherwise in their stored order.
    """
    num_samples = x.shape[0]
    order = np.random.permutation(num_samples) if shuffle else np.arange(num_samples)
    for start in range(0, num_samples, batch_size):
        stop = start + batch_size
        # Slicing past the end simply truncates, which gives the short last batch
        selected = order[start:stop]
        yield x[selected], y[selected]


# class 
class Dataloader(object):
    """Minimal mini-batch loader over two aligned arrays, usable in ``for`` loops.

    The object is its own iterator: ``__iter__`` rewinds the batch cursor (and
    reshuffles when requested), so the same instance can be looped over once
    per epoch.

    Args:
        x: array whose first axis indexes the samples.
        y: array of labels aligned with ``x`` along the first axis.
        batch_size: number of samples per batch.
        shuffle: reshuffle the sample order at the start of every pass.
        drop_last: when true, a trailing batch smaller than ``batch_size`` is
            skipped; when false (default) it is yielded, matching
            ``data_generator``.
    """

    def __init__(self, x, y, batch_size=32, shuffle=True, drop_last=False):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.n = x.shape[0]
        self.idx = np.arange(self.n)
        self.i = 0  # index of the next batch to return

    def __len__(self):
        """Return the number of batches produced by one full pass."""
        full_batches, remainder = divmod(self.n, self.batch_size)
        if remainder and not self.drop_last:
            return full_batches + 1
        return full_batches

    def __iter__(self):
        # Rewind the cursor; without this a second `for` loop over the same
        # loader would stop immediately.
        self.i = 0
        if self.shuffle:
            np.random.shuffle(self.idx)
        return self

    def __next__(self):
        if self.i >= len(self):
            raise StopIteration
        start = self.i * self.batch_size
        # Slicing past the end truncates, giving the short final batch
        batch_idx = self.idx[start:start + self.batch_size]
        self.i += 1
        return self.x[batch_idx], self.y[batch_idx]

Dataset and Dataloader in pytorch:

We don't need to implement `Dataloader` ourselves; we just need to provide a custom `Dataset` implementation.

For example, we have a dataset with the following folder structure:

- train
  - data 
    - sample 1
    - sample 2
    - ...
  - label
    - sample 1
    - sample 2
    - ...
- val
  - data 
    - sample 1
    - sample 2
    - ...
  - label
    - sample 1
    - sample 2
    - ...
  

In [5]:
from torch.utils.data import Dataset
import os


class CustomDataset(Dataset):
    """Dataset template for a folder with parallel ``data`` and ``label`` subfolders.

    Each entry of ``<dataset_path>/data`` is one sample; its label is looked
    up under the same file name in ``<dataset_path>/label``.

    Args:
        dataset_path: root folder of one split (e.g. ``'train'`` or ``'val'``).
        transform: optional callable applied to every loaded image. ``None``
            (default) or any other non-callable value such as ``False``
            disables the transformation.
    """

    def __init__(self, dataset_path, transform=None):
        self.dataset_path = dataset_path
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sort them so that an
        # index always refers to the same sample across runs and machines.
        self.image_paths = sorted(os.listdir(os.path.join(self.dataset_path, 'data')))

    def __len__(self):
        """Return the number of samples found in the ``data`` folder."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair of sample ``idx``."""
        image_path = os.path.join(self.dataset_path, 'data', self.image_paths[idx])
        # read image
        image = ...
        label_path = os.path.join(self.dataset_path, 'label', self.image_paths[idx])
        # read label
        label = ...

        # Only call the transform when one was actually supplied; the former
        # `is not None` test tried to call the `False` default.
        if callable(self.transform):
            image = self.transform(image)

        return image, label

What is `transform` in `__init__` function? Format conversion and Dataset augmentation!

For example, suppose we are asked to classify images. The `transform`s usually differ depending on how the dataset is used.

For training dataset, we expect that it contains dataset augmentation and necessary format conversion.

In [6]:
from torchvision import transforms

# Per-channel (mean, std) used to normalize image tensors after ToTensor
_normalize = transforms.Normalize(
    (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

# Training pipeline: augmentations on the image first, then conversion to a
# tensor, normalization, and finally tensor-only augmentation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    _normalize,
    # RandomErasing operates on tensors only (it does not accept PIL images),
    # so it must be placed after ToTensor.
    transforms.RandomErasing(),
])

While for test/validation dataset, it just contains necessary format conversion.

In [7]:
# Evaluation pipeline: no augmentation, only tensor conversion followed by
# the same normalization that is used for training.
_test_steps = [transforms.ToTensor(), _normalize]
transform_test = transforms.Compose(_test_steps)

Normalization is the process of modifying the data of each channel/tensor so that the mean is zero and the standard deviation is one. (ref: [Why and How to normalize data – Object detection on image in PyTorch Part 1](https://inside-machinelearning.com/en/why-and-how-to-normalize-data-object-detection-on-image-in-pytorch-part-1/#Normalizing_data))

two main reasons

- normalizing data brings it into the same range as our activation functions, usually between 0 and 1. This makes vanishing (near-zero) gradients less frequent during training, and therefore the neurons in our network will learn faster.
- by normalizing each channel so that they have the same distribution, we ensure that the channel information can be mixed and updated during the gradient descent (back propagation) using the same learning rate.

Now we have implemented our `Dataset`, so how do we load this dataset?

In [None]:
from torch.utils.data import DataLoader

# One dataset object per split, each with its own transform pipeline
train_dataset = CustomDataset(dataset_path='train', transform=transform_train)
test_dataset = CustomDataset(dataset_path='val', transform=transform_test)

# Only the training loader shuffles; both use batches of 32 and 4 worker processes
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

# Each iteration yields one batch as an (image, label) pair
for image, label in train_loader:
    # do something
    pass

#### 4. Loss Function

There are many loss functions for different tasks. But in fact, a loss function is just a `Module` that receives the output of the last layer and outputs a scalar.

Here we implement an MSELoss function.

In [8]:
class MSELoss(nn.Module):
    """Mean squared error: the average of ``(y_pred - y_true) ** 2`` over all elements."""

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return the scalar mean of the element-wise squared differences."""
        squared_error = (y_pred - y_true) ** 2
        return squared_error.mean()


# Loss instance shared by the training and validation code
criterion = MSELoss()

Pytorch also provides many implemented loss, e.g., `L1Loss`, `MSELoss`, `CrossEntropyLoss`. You can refer to the official [documentation](https://pytorch.org/docs/stable/nn.html#loss-functions).

#### 5. Optimizer

PyTorch implements various optimization algorithms. Most commonly used methods are already supported. You can refer to [`torch.optim`](https://pytorch.org/docs/stable/optim.html).

`Optimizer`s implemented by pytorch have two commonly used instance methods: `zero_grad` and `step`. `zero_grad` is used to clear gradient of all parameters it manages. And `step` updates parameters it manages according to its own rules. 

Here we only demonstrate how to instantiate an `Optimizer` and what arguments it takes.

In [9]:
from torch.optim import SGD

model = MLP()

# Variant 1: a single set of options applied to every parameter of the model
optimizer = SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4,
    nesterov=False,
)

# or
# Variant 2: per-group options. A group's own keys (here 'lr' for fc2)
# override the keyword defaults given after the list.
optimizer = SGD(
    [
        {'params': model.fc1.parameters()},
        {'params': model.fc2.parameters(), 'lr': 0.01},
    ],
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4,
)

The `momentum` is consistent with the slides, the `weight_decay` is the coefficient of L2 regularization, and `nesterov` controls whether Nesterov momentum is enabled.

How to adjust learning rate?

`torch.optim.lr_scheduler` provides several methods to adjust the learning rate based on the number of epochs.

Here we show `CosineAnnealingLR` and how schedulers work with optimizers.

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR

# optimizer: instanced before
# optimizer: instantiated before
# T_max is set to the number of epochs because the scheduler is stepped once
# per epoch below; eta_min is the learning rate reached at the end.
scheduler = CosineAnnealingLR(optimizer, eta_min=0, T_max=epochs)
for epoch in range(epochs):
    # train part
    # (`inputs` rather than `input`, to avoid shadowing the builtin)
    for inputs, target in train_loader:
        optimizer.zero_grad()
        ...
        optimizer.step()

    # test part
    for inputs, target in test_loader:
        ...
    # Update the learning rate once per epoch, after the optimizer steps
    scheduler.step()

#### 6. Train

Here we only show a template of a training script.

In [None]:
def train(train_loader, model, criterion, optimizer, device=None):
    """Run one training epoch over ``train_loader``.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: the module to optimize; it is switched to training mode.
        criterion: callable computing the loss from ``(output, target)``.
        optimizer: optimizer managing the parameters of ``model``.
        device: device the batches are moved to. When ``None`` (default), the
            device of the model's first parameter is used, falling back to
            the CPU for a model without parameters.
    """
    if device is None:
        # Keep the batches on the same device as the model instead of
        # unconditionally assuming a GPU is available.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for images, target in train_loader:
        images = images.to(device)
        target = target.to(device)

        output = model(images)
        # Compare against the labels of this batch
        loss = criterion(output, target)

        # TODO calculate performance

        # Clear stale gradients, back-propagate, then update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


@torch.no_grad()
def validate(val_loader, model, criterion, device=None):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: the module to evaluate; it is switched to evaluation mode.
        criterion: callable computing the loss from ``(output, target)``.
        device: device the batches are moved to. When ``None`` (default), the
            device of the model's first parameter is used, falling back to
            the CPU for a model without parameters.

    Returns:
        The performance value of the last batch (still a placeholder in this
        template), or ``None`` when ``val_loader`` yields no batches.
    """
    if device is None:
        # Keep the batches on the same device as the model instead of
        # unconditionally assuming a GPU is available.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    # Defined up front so an empty loader does not leave the name unbound
    performance = None
    for images, target in val_loader:
        images = images.to(device)
        target = target.to(device)

        output = model(images)
        # Compare against the labels of this batch
        loss = criterion(output, target)

        # TODO calculate performance
        performance = ...
    return performance


# Alternate one training pass and one evaluation pass, `epochs` times
for epoch in range(epochs):
    train(train_loader, model, criterion, optimizer)
    performance = validate(test_loader, model, criterion)
    # TODO compare the best performance with the current one, and save the current checkpoint
    # Only the weights (state dict) are written, not the whole module object
    checkpoint = model.state_dict()
    torch.save(checkpoint, 'save_path')

Why `model.train()` and `model.eval()`?

Some `Module`s behave differently in training mode and in test/evaluation mode, e.g. `nn.Dropout`, `nn.BatchNorm2d`.

#### 7. BatchNorm

In [None]:
class BatchNorm2(nn.Module):
    """Hand-written batch normalization over dimension 1 of a 4-D input.

    In training mode the input is normalized with the statistics of the
    current batch, and exponential moving averages of the mean and of the
    unbiased variance are accumulated. In evaluation mode those running
    statistics are used instead.

    Args:
        num_features: size of dimension 1 of the input (the channels).
        eps: constant added to the variance before taking the square root.
        momentum: weight of the new batch statistic in the running averages.
        affine: when true, a learnable per-channel scale ``gamma`` and shift
            ``beta`` are applied after normalization.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        if affine:
            # Identity scale and zero shift at initialization
            self.gamma = nn.Parameter(torch.ones(num_features))
            self.beta = nn.Parameter(torch.zeros(num_features))
        # Buffers are part of the module state but are not trained
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))

    def forward(self, x):
        """Normalize ``x`` per channel; statistics are reduced over dims 0, 2 and 3."""
        reduce_dims = (0, 2, 3)
        # Index that turns a per-channel vector into a broadcastable (1, C, 1, 1) view
        per_channel = (None, slice(None), None, None)

        if not self.training:
            mean, var = self.running_mean, self.running_var
        else:
            mean = x.mean(reduce_dims)
            var = x.var(reduce_dims, unbiased=False)
            # Number of values that contributed to each channel statistic
            count = x.numel() / x.size(1)
            with torch.no_grad():
                keep = 1 - self.momentum
                self.running_mean = keep * self.running_mean + self.momentum * mean
                # The factor count / (count - 1) converts the biased batch
                # variance into the unbiased estimate that is stored
                self.running_var = keep * self.running_var + self.momentum * var * count / (count - 1)
                self.num_batches_tracked += 1

        x = (x - mean[per_channel]) / torch.sqrt(var[per_channel] + self.eps)
        if self.affine:
            x = x * self.gamma[per_channel] + self.beta[per_channel]
        return x

#### 8. Autograd

In [None]:
from torch.autograd import Function


class Sigmoid(Function):
    """Custom autograd function for the logistic sigmoid ``1 / (1 + exp(-x))``.

    Call it through ``Sigmoid.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Compute the sigmoid and stash the result for the backward pass."""
        denominator = 1 + torch.exp(-input)
        output = 1 / denominator
        # The derivative can be written in terms of the output alone, so the
        # output is all that needs to be saved.
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Chain rule: d sigmoid / dx = sigmoid * (1 - sigmoid)."""
        (saved_output,) = ctx.saved_tensors
        return grad_output * saved_output * (1 - saved_output)