# 残差网络

构建深度神经网络时会遇到两个主要问题：梯度爆炸/消散和网络退化问题。
* 梯度爆炸/消散，是由于随着层数的增加，在网络反向传播过程中梯度会随着级乘运算变得特别大或特别小，即梯度变得不稳定。可以通过BatchNrom技术解决。
* 网络退化，通常认为，随着网络深度的增加，网络的性能会获得相应的提升。**但是，我们发现当网络增加到一定程度后继续增加，网络的性能会变得越來越差，直接体现为训练集上的准确率会下降。** 我们假设通过简单地对叠方式得到很深的网络，网络内部的特征在某一层（浅层）已经达到最佳的性能，此时该网络的浅层形式的解空间是深层模型解空间的子空间。也就是说，如果我们能够将达到最佳性能的层之后的层训练成恒等映射，且深层网络可能得出更优的解来拟合训练集，因此深层网络能够更容易地降低训练误差。**但是，由于网络退化的问题，这一假设并不成立。**
* 通过分析，我们退而求其次，在已知深层网络存在退化的的情况下，寻求方法解决深层网络的退化问题，使得网络至少实现深层网络和浅层网络具有一样的性能。即让深层网络后面的部分即使不提升网络性能的情况下，至少能够实现恒等映射的作用，使得网络的性能不会随着深度的增加而出现退化。Residual模块被提出来解决这一问题。

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

## dataset

In [2]:
def load_dataset(batch_size, size=None, num_workers=4):
    # dataset process
    trans = []
    if size:
        trans.append(torchvision.transforms.Resize(size=size))
    trans.append(torchvision.transforms.ToTensor())

    transform = torchvision.transforms.Compose(trans)

    # load
    mnist_train = torchvision.datasets.FashionMNIST(root='../Datasets/FashionMNIST', train=True, download=True,
                                                    transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root='../Datasets/FashionMNIST', train=False, download=True,
                                                   transform=transform)
    # generate
    train_generator = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_generator = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_generator, test_generator

## ResNet

### residual block
![ResNet](../Docs/residual_block.png)
### bottleneck block 
![ResNet](../Docs/bottleneck_block.png)
### Architecture for ImageNet
![ResNet](../Docs/resnet_architecture.png)

In [3]:
# residual block

class Residual(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(Residual, self).__init__()


        # main pass
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu1 =  nn.ReLU(inplace=True)

        # shortcut pass
        if stride != 1:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.downsample = None

    def forward(self, x):

        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        # add main-pass and shortcut-pass  
        out += identity

        out = self.relu1(out)

        return out

In [4]:
# Flatten layer
class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):

        return x.view(x.shape[0], -1)

In [5]:
# resnet_block
def resnet_block(in_channels, out_channels, stride, num_residuals, first_block=False):

    if first_block:
        assert out_channels == out_channels

    blk = []

    # first Residual execute down-sample
    blk.append(Residual(in_channels, out_channels, stride=stride))

    # others Residual do not down-sample
    for  _ in range(1, num_residuals):
        blk.append(Residual(out_channels, out_channels, stride=1))

    return nn.Sequential(*blk)

## renet_18 model

In [6]:
# base modules
model = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

In [7]:
# residule module
model.add_module("resnet_block1", resnet_block(64, 64, 1, 2, first_block=True))
model.add_module("resnet_block2", resnet_block(64, 128, 2, 2))
model.add_module("resnet_block3", resnet_block(128, 256, 2, 2))
model.add_module("resnet_block4", resnet_block(256, 512, 2, 2))

In [8]:
# classfify module
model.add_module('global_avgpool', nn.AdaptiveAvgPool2d((1, 1)))
model.add_module('fc', nn.Sequential(Flatten(), nn.Linear(512, 10)))

In [9]:
x = torch.rand(10, 1, 224, 224)
for name, layer in model.named_children():
    x = layer(x)
    print(name, x.shape)

0 torch.Size([10, 64, 112, 112])
1 torch.Size([10, 64, 112, 112])
2 torch.Size([10, 64, 112, 112])
3 torch.Size([10, 64, 56, 56])
resnet_block1 torch.Size([10, 64, 56, 56])
resnet_block2 torch.Size([10, 128, 28, 28])
resnet_block3 torch.Size([10, 256, 14, 14])
resnet_block4 torch.Size([10, 512, 7, 7])
global_avgpool torch.Size([10, 512, 1, 1])
fc torch.Size([10, 10])


In [10]:
def test(model, test_loader, epoch, device=None):
    """

    """
    model.eval()  # convert to eval(model)

    if device is None and isinstance(model, torch.nn.Module):
        # if device is None, use the net device
        device = list(model.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)  # load data to device
            acc_sum += (model(x).argmax(dim=1) == y).float().sum().cpu().item()
            n += x.shape[0]

    print('Eval epoch {} => acc {:.4f}'.format(epoch, acc_sum / n))

In [11]:
def train(model, train_loader, loss, optimizer, epoch, device=None):
    """
    convert train model
    """
    model.train()
    
    train_acc, train_loss, num_samples = 0, 0.0, 0
    num_batch = 0
    
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        pred_y = model(x)
        l = loss(pred_y, y)
        # grad clearing
        optimizer.zero_grad()
        # computer grad
        l.backward()
        # update grad
        optimizer.step()
        
        train_loss += l.cpu().item()
        train_acc += (pred_y.argmax(dim=1) == y).float().sum().cpu().item()
        
        num_samples += x.shape[0]
        num_batch += 1
        
    print('Train epoch {} => loss {:.4f}, acc {:.4f}'.
          format(epoch, train_loss / num_batch, train_acc / num_samples))

In [12]:
# Adam
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)  

num_epochs = 10
batch_size = 256
lr, gamma = 0.001, 0.9
model = model.to(device)
loss = nn.CrossEntropyLoss()
# optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)  # SGDM

optimizer = optim.Adam(params=model.parameters(), lr=lr) # Adam
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size, size=(96, 96))

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    scheduler.step(epoch)

cuda
Train epoch 1 => loss 0.4289, acc 0.8433
Eval epoch 1 => acc 0.8667
Train epoch 2 => loss 0.2657, acc 0.9024
Eval epoch 2 => acc 0.8947
Train epoch 3 => loss 0.2255, acc 0.9163
Eval epoch 3 => acc 0.9141
Train epoch 4 => loss 0.1918, acc 0.9274
Eval epoch 4 => acc 0.9098
Train epoch 5 => loss 0.1763, acc 0.9349
Eval epoch 5 => acc 0.9006
Train epoch 6 => loss 0.1530, acc 0.9431
Eval epoch 6 => acc 0.9171
Train epoch 7 => loss 0.1379, acc 0.9489
Eval epoch 7 => acc 0.9171
Train epoch 8 => loss 0.1160, acc 0.9566
Eval epoch 8 => acc 0.9145
Train epoch 9 => loss 0.1036, acc 0.9614
Eval epoch 9 => acc 0.9112
Train epoch 10 => loss 0.0823, acc 0.9695
Eval epoch 10 => acc 0.9181
