# 概述

添加了残差连接，使得前向传播时能够把原始输入 x 直接传递到输出。

In [2]:
from torch.nn import functional as F
import torch
from torch import nn

In [13]:
class Residual(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers with a skip connection.

    The main branch computes ``conv1 -> bn1 -> ReLU -> conv2 -> bn2``. The
    input is then added to that result (optionally after a 1x1 convolution)
    and a final ReLU is applied to the sum.

    Args:
        input_channels: Number of channels of the input tensor.
        num_channels: Number of channels produced by both 3x3 convolutions
            (and by the 1x1 convolution, when enabled).
        use_1x1conv: If True, pass the skip path through a 1x1 convolution
            with stride ``strides``. Needed when the main branch changes the
            channel count or the spatial size, otherwise the two operands of
            the addition do not have matching shapes.
        strides: Stride of the first 3x3 convolution and of the 1x1
            convolution on the skip path.
    """

    def __init__(
        self,
        input_channels: int,
        num_channels: int,
        use_1x1conv: bool = False,
        strides: int = 1,
    ) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(
            input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(
            num_channels, num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(
                input_channels, num_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None

        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Return ``relu(main_branch(X) + skip(X))``."""
        Y = F.relu(self.bn1(self.conv1(X)))
        # No ReLU before the addition: the residual branch must be able to
        # contribute negative values; the only activation after bn2 is the
        # one applied to the sum below.
        Y = self.bn2(self.conv2(Y))
        # Project the input only when a 1x1 convolution was configured;
        # otherwise the skip path is the identity.
        skip = X if self.conv3 is None else self.conv3(X)
        # Out-of-place addition: an in-place `+=` on a tensor that autograd
        # has saved for the backward pass (e.g. a ReLU output) makes
        # `.backward()` raise a RuntimeError.
        return F.relu(Y + skip)

In [15]:
# Identity-skip block: channel count and stride are unchanged, so the output
# shape is expected to match the input shape.
blk = Residual(input_channels=3, num_channels=3)
X = torch.rand(4, 3, 6, 6)
Y = blk(X)
Y.shape

torch.Size([4, 3, 6, 6])

In [16]:
# Downsampling block: doubles the channels and uses stride 2, so the skip
# path needs the 1x1 convolution for the shapes to match at the addition.
blk = Residual(
    input_channels=3,
    num_channels=6,
    use_1x1conv=True,
    strides=2,
)
blk(X).shape

torch.Size([4, 6, 3, 3])

总结：


避免梯度消失：

1. 乘法变加法

大数 + 小数 => 大数

大数 * 小数 => 小数

普通网络中，底层的梯度是各层梯度的连乘，容易变得很小；引入残差后（y = f(x) + x），梯度中多了一条经由恒等映射直接相加的路径，使得最底层（靠近数据的层）在训练开始的时候也能拿到较大的梯度。