**第5章 深度学习计算**

@ Date: 2025-03-31<br>
@ Author: Rui Zhu

In [143]:
import torch
from torch import nn
from torch.nn import functional as F

---
# 块(block)
- 块: 可以描述单个层、由多个层组成的组件, 或整个模型本身
- 使用块进行抽象的好处是: 可以将一些块组合成更大的组件
- 编程的角度看, 块由类表示
- pytorch中由Module(模块)表示块
- 块必须提供的基本功能:
    1. 将输入数据作为其前向传播函数的参数(即`net(X)`用法, 实际上调用`net.__call__(X)`, 它会执行`net.forward(X)`以及一些内部操作, 如钩子函数)
    2. 通过前向传播函数生成输出
    3. 计算输出关于输入的梯度
    4. 存储和访问所需参数
    5. 可以初始化

## 顺序块(nn.Sequential)
- 按输入的各个层的顺序组成块

In [144]:
# Sequential block: Linear(20 -> 256), ReLU, Linear(256 -> 10), applied in
# the order in which they are listed.
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
net  # display the block structure

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)

In [145]:
X = torch.rand(2, 20)
X

tensor([[0.0503, 0.6799, 0.6930, 0.3698, 0.9758, 0.4033, 0.8261, 0.4749, 0.5420,
         0.5808, 0.3703, 0.6608, 0.6753, 0.0940, 0.5640, 0.7127, 0.4017, 0.2146,
         0.2014, 0.1142],
        [0.6852, 0.1228, 0.3651, 0.1883, 0.7049, 0.6834, 0.6705, 0.1299, 0.8873,
         0.1896, 0.0326, 0.2032, 0.0810, 0.6917, 0.5289, 0.9858, 0.9337, 0.7571,
         0.8813, 0.4752]])

In [146]:
net(X)  # 调用模型获得输出

tensor([[-0.0664, -0.0897, -0.3093, -0.1777, -0.0228,  0.0743, -0.1496, -0.0826,
         -0.0046,  0.0664],
        [-0.2194, -0.1063, -0.3389, -0.2302,  0.1186,  0.1686, -0.1135, -0.0627,
          0.1381,  0.0696]], grad_fn=<AddmmBackward0>)

In [147]:
net._modules

OrderedDict([('0', Linear(in_features=20, out_features=256, bias=True)),
             ('1', ReLU()),
             ('2', Linear(in_features=256, out_features=10, bias=True))])

## 自定义块

In [148]:
# Hand-written MLP block
class MLP(nn.Module):
    """Two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        # Hidden layer first, output layer second (creation order is kept so
        # that parameter registration order stays the same).
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Apply the hidden layer, a ReLU, and then the output layer to X."""
        hidden_pre = self.hidden(X)
        hidden_act = F.relu(hidden_pre)
        return self.out(hidden_act)

In [149]:
net = MLP()  # 不用担心反向传播函数或初始化, 自动完成
net(X)

tensor([[ 0.0761,  0.2876,  0.1934, -0.0321, -0.0074, -0.0612,  0.2141, -0.0675,
         -0.1378,  0.1146],
        [ 0.1247,  0.3288,  0.0373, -0.1043, -0.0032, -0.0869,  0.2477,  0.0424,
         -0.0485,  0.0034]], grad_fn=<AddmmBackward0>)

## 手动实现顺序块

In [150]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: The modules to chain. They are registered under the names
            ``"0"``, ``"1"``, ... in the order given and applied in that
            order by ``forward``.

    Raises:
        TypeError: If one of the arguments is neither an ``nn.Module`` nor
            ``None``.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # add_module is the public registration API; unlike writing to
            # self._modules directly it rejects values that are not Modules.
            self.add_module(str(idx), module)

    def forward(self, X):
        """Feed X through every registered module, in registration order."""
        # Iterate the registry itself rather than self.children():
        # children() yields each module *instance* only once, so a layer
        # that was passed several times (parameter sharing) would otherwise
        # be applied only a single time.
        for block in self._modules.values():
            if block is not None:
                X = block(X)
        return X
# Build the same 20 -> 256 -> 10 MLP with the hand-written container and
# run it on X.
net = MySequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
net(X)

tensor([[ 0.0054, -0.1423,  0.0762,  0.1640, -0.0375, -0.1534, -0.1280, -0.0080,
          0.1053,  0.0697],
        [-0.1190, -0.0507,  0.0269,  0.2294, -0.0397, -0.2902, -0.0873, -0.0166,
          0.2833,  0.0576]], grad_fn=<AddmmBackward0>)

In [151]:
net._modules

OrderedDict([('0', Linear(in_features=20, out_features=256, bias=True)),
             ('1', ReLU()),
             ('2', Linear(in_features=256, out_features=10, bias=True))])

## 在forward函数中执行代码

In [152]:
class FixedHiddenMLP(nn.Module):
    """MLP that combines a trainable layer with a constant random projection.

    A single ``nn.Linear(20, 20)`` layer is applied twice in ``forward`` (so
    both applications share the same parameters), with a fixed, untrained
    20x20 random matrix multiplied in between.
    """

    def __init__(self):
        super().__init__()
        # Constant random weights. Registering them as a buffer keeps them out
        # of parameters() (they are never trained) while making them follow
        # the module through .to()/.double()/.cuda() and appear in
        # state_dict(), so a reloaded model reproduces the same outputs.
        self.register_buffer("rand_weight", torch.rand((20, 20)))
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from X.

        Args:
            X: 2-D input whose second dimension is 20 (required by
                ``nn.Linear(20, 20)`` and ``torch.mm``).

        Returns:
            The sum of the final activations, after they have been halved
            until their absolute sum is at most 1.
        """
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the very same layer: its parameters are shared.
        X = self.linear(X)

        # Arbitrary Python control flow is allowed inside forward.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = FixedHiddenMLP()
net(X)

tensor(0.0886, grad_fn=<SumBackward0>)

## 组合块

In [153]:
class NestMLP(nn.Module):
    """Block that nests a Sequential trunk (20 -> 64 -> 32) before a 32 -> 16 linear head."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as they are applied.
        trunk_layers = [
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*trunk_layers)
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run X through the nested trunk and then through the linear head."""
        features = self.net(X)
        return self.linear(features)

# Blocks can be nested freely: NestMLP ends in Linear(32, 16), which feeds
# Linear(16, 20), whose 20 output features match the Linear(20, 20) used
# inside FixedHiddenMLP.
chimera = nn.Sequential(
    NestMLP(), 
    nn.Linear(16, 20),
    FixedHiddenMLP()
)
chimera(X)

tensor(0.0520, grad_fn=<SumBackward0>)

---
# 参数管理

In [154]:
# Create an MLP: Linear(4 -> 8), ReLU, Linear(8 -> 1)
net = nn.Sequential(
    nn.Linear(4, 8), 
    nn.ReLU(), 
    nn.Linear(8, 1)
)
# Random batch of 2 samples with 4 features each
X = torch.rand(size=(2, 4))
net(X)

tensor([[-0.2293],
        [-0.1451]], grad_fn=<AddmmBackward0>)

## 参数访问

In [155]:
net._modules  # 查看模型的所有层

OrderedDict([('0', Linear(in_features=4, out_features=8, bias=True)),
             ('1', ReLU()),
             ('2', Linear(in_features=8, out_features=1, bias=True))])

In [156]:
net[0].state_dict()  # 查看模型的参数

OrderedDict([('weight',
              tensor([[ 0.4699, -0.3585, -0.1865,  0.3454],
                      [ 0.4991, -0.1852,  0.2873, -0.1153],
                      [-0.3224,  0.3879,  0.4573,  0.1807],
                      [ 0.0139, -0.0044,  0.4673,  0.4174],
                      [-0.1147,  0.3683,  0.3931,  0.4315],
                      [-0.0214,  0.2101, -0.4896,  0.2749],
                      [ 0.0464,  0.4439,  0.4932,  0.1798],
                      [-0.3920, -0.1576,  0.3213,  0.4544]])),
             ('bias',
              tensor([-0.2147,  0.4471, -0.0628,  0.3382,  0.1391,  0.3587,  0.1207, -0.4256]))])

In [157]:
net[0].bias  # 查看模型的偏置

Parameter containing:
tensor([-0.2147,  0.4471, -0.0628,  0.3382,  0.1391,  0.3587,  0.1207, -0.4256],
       requires_grad=True)

In [158]:
net[0].bias.data  # 查看模型的偏置数据

tensor([-0.2147,  0.4471, -0.0628,  0.3382,  0.1391,  0.3587,  0.1207, -0.4256])

In [159]:
net[0].weight.grad is None  # check the weight gradient (identity test, per PEP 8)
# No backward pass has been run yet, so the gradient is still None.

True

In [160]:
net.state_dict()['0.weight']  # 一种访问模型参数的方法

tensor([[ 0.4699, -0.3585, -0.1865,  0.3454],
        [ 0.4991, -0.1852,  0.2873, -0.1153],
        [-0.3224,  0.3879,  0.4573,  0.1807],
        [ 0.0139, -0.0044,  0.4673,  0.4174],
        [-0.1147,  0.3683,  0.3931,  0.4315],
        [-0.0214,  0.2101, -0.4896,  0.2749],
        [ 0.0464,  0.4439,  0.4932,  0.1798],
        [-0.3920, -0.1576,  0.3213,  0.4544]])

In [161]:
# Access every parameter of the whole network; names are prefixed with the
# index of the layer that owns them (e.g. "0.weight").
for name, param in net.named_parameters():
    print(name, param.size())  # print the parameter name and its size

0.weight torch.Size([8, 4])
0.bias torch.Size([8])
2.weight torch.Size([1, 8])
2.bias torch.Size([1])


In [162]:
# Access only the parameters of the first layer (net[0]); here the names
# carry no layer-index prefix.
for name, param in net[0].named_parameters():
    print(name, param.size())  # print the parameter name and its size

weight torch.Size([8, 4])
bias torch.Size([8])


## 嵌套块的参数访问

In [163]:
def block1():
    """Return a fresh Sequential: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four block1() instances named "block 0" .. "block 3"."""
    child_names = [f"block {k}" for k in range(4)]
    stack = nn.Sequential()
    for child_name in child_names:
        # Each iteration builds a new, independently initialised block1().
        stack.add_module(child_name, block1())
    return stack

# Nest the four stacked blocks and finish with a Linear(4, 1) head, then run
# the network on X.
rgnet = nn.Sequential(
    block2(), 
    nn.Linear(4, 1)
)
rgnet(X)

tensor([[0.1392],
        [0.1391]], grad_fn=<AddmmBackward0>)

In [164]:
rgnet

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [165]:
rgnet[0][1][0].bias.data

tensor([ 0.3418, -0.1416, -0.2427, -0.2503,  0.0218,  0.2951, -0.4175, -0.2851])

## 参数初始化
- 默认情况下, pytorch会根据一个范围均匀地初始化权重和偏置矩阵, 这个范围是根据输入维度和输出维度计算出的
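- 对于nn.Linear, 默认的初始化方法是`kaiming_uniform_`(取$a=\sqrt{5}$): 权重服从均匀分布$U\left(-\frac{1}{\sqrt{n_{in}}}, \frac{1}{\sqrt{n_{in}}}\right)$, 即$Var(W) = \frac{1}{3 n_{in}}$ (并非标准He初始化的$\frac{2}{n_{in}}$); bias同样服从$U\left(-\frac{1}{\sqrt{n_{in}}}, \frac{1}{\sqrt{n_{in}}}\right)$, 而不是0初始化 (上面`net[0].bias`的输出也不为0)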

In [166]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

### 内置初始化方法

In [167]:
def init_normal(m):
    """Initialise a linear layer with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``Module.apply``; modules that are not
    ``nn.Linear`` (or a subclass of it) are left untouched.

    Args:
        m: The module to initialise in place.
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Layers created with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# apply() calls init_normal on every submodule of net (and on net itself).
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]  # inspect the first weight row and first bias entry of layer 0

(tensor([ 0.0134,  0.0102, -0.0028,  0.0220]), tensor(0.))

In [168]:
def init_constant(m):
    """Initialise a linear layer with all-ones weights and a zero bias.

    Meant to be passed to ``Module.apply``; modules that are not
    ``nn.Linear`` (or a subclass of it) are left untouched.

    Args:
        m: The module to initialise in place.
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 1)
        # Layers created with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# apply() calls init_constant on every submodule of net (and on net itself).
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]  # inspect the first weight row and first bias entry of layer 0

(tensor([1., 1., 1., 1.]), tensor(0.))

In [169]:
# Use a different initialiser for different layers
def init_xavier(m):
    """Initialise the weight of a linear layer with Xavier (Glorot) uniform.

    The bias is left unchanged. Modules that are not ``nn.Linear`` (or a
    subclass of it) are left untouched.

    Args:
        m: The module to initialise in place.
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Set every weight of a linear layer to the constant 42.

    The bias is left unchanged. Modules that are not ``nn.Linear`` (or a
    subclass of it) are left untouched.

    Args:
        m: The module to initialise in place.
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)

# Calling apply() on an individual layer restricts the initialiser to it:
# Xavier for the first linear layer, the constant 42 for the last one.
net[0].apply(init_xavier)
net[2].apply(init_42)

# Show the resulting weights of both layers.
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[-0.2316, -0.4256, -0.1055, -0.1780],
        [ 0.5858, -0.6287, -0.2829, -0.4852],
        [ 0.1387, -0.1904,  0.3128, -0.4191],
        [-0.2791, -0.5202, -0.2405, -0.5866],
        [ 0.3185, -0.1187, -0.4437, -0.6686],
        [ 0.5539, -0.4810,  0.4688, -0.1337],
        [ 0.0854, -0.4808, -0.4596,  0.5794],
        [-0.6199,  0.5872, -0.0100,  0.3264]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


### 自定义初始化

In [170]:
def my_init(m):
    """Custom initialiser: U(-10, 10) weights with small magnitudes zeroed.

    For a linear layer, print the name and size of each of its parameters,
    draw the weights uniformly from [-10, 10], and then set every weight
    whose absolute value is below 5 to zero. The bias is left unchanged.
    Modules that are not ``nn.Linear`` (or a subclass of it) are left
    untouched.

    Args:
        m: The module to initialise in place.
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        print("Init", *[(name, param.size()) for name, param in m.named_parameters()])
        nn.init.uniform_(m.weight, -10, 10)
        # Multiply by the boolean mask to zero the small entries. Doing it
        # under no_grad keeps the in-place edit out of the autograd graph
        # without going through the legacy .data attribute.
        with torch.no_grad():
            m.weight.mul_(m.weight.abs() >= 5)


# Run the custom initialiser over the network, then show the weights of
# layer 0.
net.apply(my_init)
net[0].weight

Init ('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
Init ('weight', torch.Size([1, 8])) ('bias', torch.Size([1]))


Parameter containing:
tensor([[-0.0000, -7.6155, -9.7038, -0.0000],
        [-8.3351,  0.0000,  8.7073,  6.6785],
        [ 7.5402,  0.0000, -0.0000,  0.0000],
        [ 6.2078, -6.2822, -0.0000, -0.0000],
        [-5.4853, -0.0000, -0.0000,  0.0000],
        [ 0.0000,  5.7375, -5.0802, -0.0000],
        [ 5.1756,  5.6453, -0.0000, -7.7877],
        [-8.1221, -7.7689,  0.0000,  8.5102]], requires_grad=True)

## 修改权重

In [171]:
net[0].weight.data

tensor([[-0.0000, -7.6155, -9.7038, -0.0000],
        [-8.3351,  0.0000,  8.7073,  6.6785],
        [ 7.5402,  0.0000, -0.0000,  0.0000],
        [ 6.2078, -6.2822, -0.0000, -0.0000],
        [-5.4853, -0.0000, -0.0000,  0.0000],
        [ 0.0000,  5.7375, -5.0802, -0.0000],
        [ 5.1756,  5.6453, -0.0000, -7.7877],
        [-8.1221, -7.7689,  0.0000,  8.5102]])

In [172]:
# Parameters can be overwritten directly through .data: shift every weight
# of layer 0 by 1, then set the single entry [0, 0] to 42.
net[0].weight.data += 1
net[0].weight.data[0, 0] = 42

In [173]:
net[0].weight.data

tensor([[42.0000, -6.6155, -8.7038,  1.0000],
        [-7.3351,  1.0000,  9.7073,  7.6785],
        [ 8.5402,  1.0000,  1.0000,  1.0000],
        [ 7.2078, -5.2822,  1.0000,  1.0000],
        [-4.4853,  1.0000,  1.0000,  1.0000],
        [ 1.0000,  6.7375, -4.0802,  1.0000],
        [ 6.1756,  6.6453,  1.0000, -6.7877],
        [-7.1221, -6.7689,  1.0000,  9.5102]])

## 共享层的参数绑定

In [175]:
# One Linear(8, 8) instance is placed at two positions (indices 2 and 4) of
# the network, so both positions refer to the same layer object.
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(), 
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1)
)
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=8, bias=True)
  (5): ReLU()
  (6): Linear(in_features=8, out_features=1, bias=True)
)

In [176]:
net(X)

tensor([[0.0936],
        [0.1038]], grad_fn=<AddmmBackward0>)

In [177]:
# Check element-wise that the weights at positions 2 and 4 are equal
print(net[2].weight.data == net[4].weight.data)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


In [180]:
# Make sure the shared layers are really one and the same object rather than
# two layers that merely hold equal values: a write through net[2] must also
# be visible through net[4].
net[2].weight.data[0, 0] = 100
print(net[2].weight.data == net[4].weight.data)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


---
# 层