**第5章 深度学习计算**

@ Date: 2025-03-31<br>
@ Author: Rui Zhu<br>
@ Note: 使用astro环境以保证使用最新的pytorch版本

In [268]:
import torch
from torch import nn
from torch.nn import functional as F

---
# 块(block)
- 块: 可以描述单个层、由多个层组成的组件, 或整个模型本身
- 使用块进行抽象的好处是: 可以将一些块组合成更大的组件
- 编程的角度看, 块由类表示
- pytorch中由Module(模块)表示块
- 块必须提供的基本功能:
    1. 将输入数据作为其前向传播函数的参数(即`net(X)`用法, 实际上调用`net.__call__(X)`, 它会在内部调用`net.forward(X)`并执行一些额外的内部操作)
    2. 通过前向传播函数生成输出
    3. 计算输出关于输入的梯度
    4. 存储和访问所需参数
    5. 可以初始化

## 顺序块(nn.Sequential)
- 按输入的各个层的顺序组成块

In [269]:
# Chain two fully connected layers with a ReLU in between;
# nn.Sequential runs the layers in the order they are listed.
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
net

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)

In [270]:
# Random input of shape (2, 20); the last dimension matches the first layer's in_features.
X = torch.rand(2, 20)
X

tensor([[0.1352, 0.5201, 0.8918, 0.0411, 0.8182, 0.0069, 0.5662, 0.3699, 0.0806,
         0.4544, 0.0214, 0.0541, 0.0650, 0.7913, 0.1111, 0.1528, 0.0482, 0.1007,
         0.9258, 0.7067],
        [0.6322, 0.1852, 0.8061, 0.1109, 0.0708, 0.7868, 0.9678, 0.1769, 0.1047,
         0.5166, 0.4200, 0.6454, 0.6484, 0.8026, 0.7675, 0.7060, 0.4776, 0.1926,
         0.9065, 0.0157]])

In [271]:
net(X)  # call the model to obtain its output

tensor([[ 0.0018, -0.0990,  0.0108,  0.1761,  0.3643,  0.0541, -0.0111,  0.0722,
          0.0359, -0.0259],
        [-0.0936, -0.1328,  0.0822,  0.0439,  0.3430, -0.0354,  0.0339,  0.1095,
         -0.0946,  0.0232]], grad_fn=<AddmmBackward0>)

In [272]:
net._modules

{'0': Linear(in_features=20, out_features=256, bias=True),
 '1': ReLU(),
 '2': Linear(in_features=256, out_features=10, bias=True)}

## 自定义块

In [273]:
# A multilayer perceptron written as a custom block
class MLP(nn.Module):
    """Two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        # The hidden layer is created first, then the output layer.
        self.hidden = nn.Linear(20, 256)  # hidden layer
        self.out = nn.Linear(256, 10)  # output layer

    def forward(self, X):
        """Return the output-layer values computed from input ``X``."""
        hidden_activation = F.relu(self.hidden(X))
        return self.out(hidden_activation)

In [274]:
net = MLP()  # no need to write backward-propagation or initialization code; both are handled automatically
net(X)

tensor([[ 0.0269, -0.0152,  0.0758,  0.1535,  0.1294,  0.0723, -0.0340, -0.0219,
         -0.0129, -0.0445],
        [-0.0626,  0.0412,  0.0689,  0.2269,  0.2851, -0.0695,  0.0508, -0.0397,
         -0.0174,  0.0271]], grad_fn=<AddmmBackward0>)

## 手动实现顺序块

In [275]:
class MySequential(nn.Module):
    """Minimal re-implementation of a sequential container.

    Each positional argument is stored in ``self._modules`` under its
    position (as a string key) and applied in that order by ``forward``.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self._modules[str(idx)] = module

    def forward(self, X):
        """Feed ``X`` through every registered module in insertion order."""
        # Iterate over ``_modules.values()`` rather than ``children()``:
        # ``children()`` yields each distinct module object only once, so a
        # module passed in twice (e.g. a shared layer) would be applied once.
        for block in self._modules.values():
            X = block(X)
        return X
# Build the same 20 -> 256 -> 10 MLP with the hand-written container and run it on X.
net = MySequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
net(X)

tensor([[ 0.1050,  0.0060,  0.1853, -0.0542,  0.0855, -0.1577,  0.1147,  0.0351,
          0.1566, -0.0883],
        [-0.0170,  0.0967,  0.0883, -0.0262,  0.0456, -0.1873, -0.0819, -0.0151,
          0.2610, -0.1860]], grad_fn=<AddmmBackward0>)

In [276]:
net._modules

{'0': Linear(in_features=20, out_features=256, bias=True),
 '1': ReLU(),
 '2': Linear(in_features=256, out_features=10, bias=True)}

## 在forward函数中执行代码

In [277]:
class FixedHiddenMLP(nn.Module):
    """MLP whose forward pass mixes a trainable layer, constant weights and Python control flow."""

    def __init__(self):
        super().__init__()
        # Constant random weights that are never trained. Registering them as
        # a non-persistent buffer keeps them out of ``parameters()`` and
        # ``state_dict()`` while letting ``.to()`` / ``.double()`` etc. move
        # or cast them together with the rest of the module.
        self.register_buffer("rand_weight", torch.rand((20, 20)), persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar: the sum of the rescaled activations for ``X``."""
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # The same fully connected layer is applied a second time,
        # so both applications share one set of parameters.
        X = self.linear(X)

        # Extra control flow: halve the activations until their L1 norm is at most 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

# Instantiate the block and run it on X; the result is a scalar tensor.
net = FixedHiddenMLP()
net(X)

tensor(-0.1555, grad_fn=<SumBackward0>)

## 组合块

In [278]:
class NestMLP(nn.Module):
    """Block that nests a Sequential stack (20 -> 64 -> 32) under a linear head (32 -> 16)."""

    def __init__(self):
        super().__init__()
        stages = [
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*stages)
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run ``X`` through the nested stack, then through the linear head."""
        features = self.net(X)
        return self.linear(features)

# Blocks compose freely: a custom block, a plain layer and another custom block in one Sequential.
chimera = nn.Sequential(
    NestMLP(), 
    nn.Linear(16, 20),
    FixedHiddenMLP()
)
chimera(X)

tensor(0.0723, grad_fn=<SumBackward0>)

---
# 参数管理

In [279]:
# Build an MLP (4 -> 8 -> 1) and run it on a random (2, 4) input
net = nn.Sequential(
    nn.Linear(4, 8), 
    nn.ReLU(), 
    nn.Linear(8, 1)
)
X = torch.rand(size=(2, 4))
net(X)

tensor([[0.3976],
        [0.3495]], grad_fn=<AddmmBackward0>)

## 参数访问

In [280]:
net._modules  # all layers of the model

{'0': Linear(in_features=4, out_features=8, bias=True),
 '1': ReLU(),
 '2': Linear(in_features=8, out_features=1, bias=True)}

In [281]:
net[0].state_dict()  # parameters (weight and bias) of the first layer

OrderedDict([('weight',
              tensor([[ 0.4308,  0.0258, -0.4830, -0.4100],
                      [ 0.1067, -0.3336,  0.1812,  0.3307],
                      [ 0.0948,  0.1487, -0.4548, -0.4789],
                      [ 0.2638, -0.1602,  0.4781, -0.2466],
                      [ 0.4083, -0.2553,  0.4273, -0.0307],
                      [ 0.1976, -0.4209,  0.3862,  0.3559],
                      [-0.1568, -0.0467, -0.0026,  0.4222],
                      [ 0.2289,  0.4882,  0.3886,  0.3086]])),
             ('bias',
              tensor([ 0.0929,  0.4731, -0.4559, -0.0874, -0.0534,  0.0716, -0.3312,  0.0290]))])

In [282]:
net[0].bias  # bias of the first layer, as a Parameter

Parameter containing:
tensor([ 0.0929,  0.4731, -0.4559, -0.0874, -0.0534,  0.0716, -0.3312,  0.0290],
       requires_grad=True)

In [283]:
net[0].bias.data  # underlying tensor values of the bias

tensor([ 0.0929,  0.4731, -0.4559, -0.0874, -0.0534,  0.0716, -0.3312,  0.0290])

In [284]:
net[0].weight.grad is None  # gradient of the first layer's weight (identity test against None)
# NOTE: no backward pass has been run yet, so the gradient is still None

True

In [285]:
net.state_dict()['0.weight']  # another way to access a parameter: by its name in the state dict

tensor([[ 0.4308,  0.0258, -0.4830, -0.4100],
        [ 0.1067, -0.3336,  0.1812,  0.3307],
        [ 0.0948,  0.1487, -0.4548, -0.4789],
        [ 0.2638, -0.1602,  0.4781, -0.2466],
        [ 0.4083, -0.2553,  0.4273, -0.0307],
        [ 0.1976, -0.4209,  0.3862,  0.3559],
        [-0.1568, -0.0467, -0.0026,  0.4222],
        [ 0.2289,  0.4882,  0.3886,  0.3086]])

In [286]:
# Iterate over all parameters of the model
for name, param in net.named_parameters():
    print(name, param.size())  # print the parameter name and size

0.weight torch.Size([8, 4])
0.bias torch.Size([8])
2.weight torch.Size([1, 8])
2.bias torch.Size([1])


In [287]:
# Iterate over all parameters of the first layer only
for name, param in net[0].named_parameters():
    print(name, param.size())  # print the parameter name and size

weight torch.Size([8, 4])
bias torch.Size([8])


## 嵌套块的参数访问

In [288]:
def block1():
    """Build a 4 -> 8 -> 4 MLP block with a ReLU after each linear layer."""
    layers = (
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    )
    return nn.Sequential(*layers)

def block2():
    """Stack four ``block1()`` instances in a Sequential under the names "block 0" .. "block 3"."""
    stacked = nn.Sequential()
    for idx in range(4):
        child_name = f"block {idx}"
        stacked.add_module(child_name, block1())
    return stacked

# Nest the stacked blocks inside another Sequential and add a final 4 -> 1 linear layer.
rgnet = nn.Sequential(
    block2(), 
    nn.Linear(4, 1)
)
rgnet(X)

tensor([[0.1964],
        [0.1965]], grad_fn=<AddmmBackward0>)

In [289]:
rgnet

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [290]:
rgnet[0][1][0].bias.data

tensor([ 0.0645,  0.2344, -0.2525,  0.4483, -0.2970,  0.0964, -0.2381,  0.1010])

## 参数初始化
- 默认情况下, pytorch会根据一个范围均匀地初始化权重和偏置矩阵, 这个范围是根据输入维度和输出维度计算出的
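- 对于nn.Linear, 默认的权重初始化是`kaiming_uniform_(a=sqrt(5))`, 等价于从均匀分布$U(-\frac{1}{\sqrt{n_{in}}}, \frac{1}{\sqrt{n_{in}}})$中采样, 即$Var(W) = \frac{1}{3 n_{in}}$; bias同样从$U(-\frac{1}{\sqrt{n_{in}}}, \frac{1}{\sqrt{n_{in}}})$中采样, 而不是初始化为0(上面`net[0].bias`的输出也不为0)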

In [291]:
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

### 内置初始化方法

In [292]:
def init_normal(m):
    """
    Initialize an ``nn.Linear`` module with normally distributed weights
    (mean 0, std 0.01) and a zero bias; any other module type is left as is.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    nn.init.zeros_(m.bias)

net.apply(init_normal)  # apply() calls the function on every submodule of net
net[0].weight.data[0], net[0].bias.data[0]  # inspect the first weight row and first bias entry

(tensor([ 0.0122, -0.0087, -0.0002,  0.0211]), tensor(0.))

In [293]:
def init_constant(m):
    """
    Initialize an ``nn.Linear`` module with all weights set to 1 and a zero
    bias; any other module type is left as is.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    nn.init.zeros_(m.bias)

net.apply(init_constant)  # apply() calls the function on every submodule of net
net[0].weight.data[0], net[0].bias.data[0]  # inspect the first weight row and first bias entry

(tensor([1., 1., 1., 1.]), tensor(0.))

In [294]:
## Use different initializers for different layers
def init_xavier(m):
    """Apply Xavier-uniform initialization to the weight of an ``nn.Linear``; skip other modules."""
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Set every weight of an ``nn.Linear`` to the constant 42; skip other modules."""
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Calling apply() on an individual layer initializes only that layer.
net[0].apply(init_xavier)
net[2].apply(init_42)

print(net[0].weight.data)
print(net[2].weight.data)

tensor([[-0.6014,  0.0755,  0.5984, -0.4266],
        [-0.4349,  0.4243, -0.5063, -0.0501],
        [-0.6736, -0.4159,  0.3296, -0.6554],
        [-0.0694,  0.4402,  0.3145,  0.4364],
        [ 0.4501, -0.0775, -0.0040, -0.5507],
        [-0.3460, -0.6581,  0.1722, -0.1373],
        [-0.5771, -0.0448, -0.6864,  0.0262],
        [-0.2901, -0.4147, -0.0960,  0.2585]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


### 自定义初始化

In [295]:
def my_init(m):
    """Custom init for ``nn.Linear``: draw weights from U(-10, 10), then zero those with magnitude below 5.

    Prints the name and size of each parameter of the module being
    initialized. Modules of any other type are skipped.
    """
    if type(m) is not nn.Linear:
        return
    shapes = [(name, param.size()) for name, param in m.named_parameters()]
    print("Init", *shapes)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed.
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep


net.apply(my_init)  # prints one "Init" line per Linear layer
net[0].weight

Init ('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
Init ('weight', torch.Size([1, 8])) ('bias', torch.Size([1]))


Parameter containing:
tensor([[ 5.5724, -0.0000,  9.9922, -8.6631],
        [ 6.4397,  0.0000, -0.0000,  6.6075],
        [ 0.0000, -0.0000,  0.0000, -8.8608],
        [-5.9956, -0.0000,  6.7102, -5.7517],
        [ 6.1070,  0.0000,  5.0100, -0.0000],
        [ 7.0016, -6.0194, -8.1283,  5.4038],
        [-0.0000, -0.0000, -0.0000,  0.0000],
        [-8.4616,  0.0000,  0.0000,  7.0875]], requires_grad=True)

## 修改权重

In [296]:
net[0].weight.data

tensor([[ 5.5724, -0.0000,  9.9922, -8.6631],
        [ 6.4397,  0.0000, -0.0000,  6.6075],
        [ 0.0000, -0.0000,  0.0000, -8.8608],
        [-5.9956, -0.0000,  6.7102, -5.7517],
        [ 6.1070,  0.0000,  5.0100, -0.0000],
        [ 7.0016, -6.0194, -8.1283,  5.4038],
        [-0.0000, -0.0000, -0.0000,  0.0000],
        [-8.4616,  0.0000,  0.0000,  7.0875]])

In [297]:
# Parameters can be modified directly through .data:
net[0].weight.data += 1  # add 1 to every weight in place
net[0].weight.data[0, 0] = 42  # overwrite a single entry

In [298]:
net[0].weight.data

tensor([[42.0000,  1.0000, 10.9922, -7.6631],
        [ 7.4397,  1.0000,  1.0000,  7.6075],
        [ 1.0000,  1.0000,  1.0000, -7.8608],
        [-4.9956,  1.0000,  7.7102, -4.7517],
        [ 7.1070,  1.0000,  6.0100,  1.0000],
        [ 8.0016, -5.0194, -7.1283,  6.4038],
        [ 1.0000,  1.0000,  1.0000,  1.0000],
        [-7.4616,  1.0000,  1.0000,  8.0875]])

## 共享层的参数绑定

In [299]:
# The same `shared` module object is placed at positions 2 and 4,
# so both positions use one set of parameters.
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(), 
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1)
)
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=8, bias=True)
  (5): ReLU()
  (6): Linear(in_features=8, out_features=1, bias=True)
)

In [300]:
net(X)

tensor([[-0.0932],
        [-0.0880]], grad_fn=<AddmmBackward0>)

In [301]:
# Check that the two layers hold equal parameter values
print(net[2].weight.data == net[4].weight.data)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


In [302]:
# Make sure the shared layers are really the same object, not merely equal in value:
# a change made through net[2] must also be visible through net[4].
net[2].weight.data[0, 0] = 100
print(net[2].weight.data == net[4].weight.data)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


---
# 自定义层

## 不带参数的层

In [303]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all elements from its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return ``X`` shifted so that its overall mean is zero."""
        mean = X.mean()
        return X - mean


# Quick check on a small vector whose mean is 3.
layer = CenteredLayer()
layer(torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0]))

tensor([-2., -1.,  0.,  1.,  2.])

In [304]:
# Use the custom layer as a component of a larger model;
# the mean of the output should be zero up to floating-point error.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
Y.mean()

tensor(5.5879e-09, grad_fn=<MeanBackward0>)

## 带参数的层

In [305]:
# Custom version of a fully connected layer
class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU activation.

    Args:
        in_units: number of input features.
        unints: number of output features (name kept as-is for backward
            compatibility; presumably a typo for ``units``).
    """

    def __init__(self, in_units, unints):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, unints))
        self.bias = nn.Parameter(torch.zeros(unints))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the Parameters themselves, not ``.data``: going through ``.data``
        # detaches them from autograd, so this layer would never get gradients.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

linear = MyLinear(5, 3)
linear.weight.data, linear.bias.data  # inspect the layer's parameters

(tensor([[ 0.7028, -1.1039,  0.2312],
         [-0.0592,  0.8001,  0.5042],
         [ 1.5984, -0.3153, -0.8094],
         [-0.3796,  0.1617,  0.0610],
         [ 1.1833,  0.7279, -1.1483]]),
 tensor([0., 0., 0.]))

In [306]:
# The custom layer can be combined with built-in layers like any other module.
net = nn.Sequential(MyLinear(64, 8), nn.Linear(8, 1))
net(torch.rand(2, 64))

tensor([[0.7795],
        [1.8713]], grad_fn=<AddmmBackward0>)

In [307]:
net

Sequential(
  (0): MyLinear()
  (1): Linear(in_features=8, out_features=1, bias=True)
)

---
# 读写文件

In [308]:
from pathlib import Path
# NOTE(review): absolute, machine-specific path; the torch.save calls below
# presumably need this directory to exist already — confirm before running elsewhere.
dir_save = Path("/Users/rui/Code/Astronote/32_PyTorch/data/chapter5")

## 保存和加载张量

In [309]:
x = torch.arange(4)
torch.save(x, dir_save / 'x-file')  # write the tensor to a binary file

In [310]:
x2 = torch.load(dir_save / 'x-file')  # read the tensor back from the file
x2

tensor([0, 1, 2, 3])

In [311]:
# Save a list whose elements are tensors
y = torch.zeros(4)
torch.save([x, y], dir_save / 'xy-file')  # write the list to a binary file

In [312]:
x2, y2 = torch.load(dir_save / 'xy-file')  # read the list back and unpack it
x2, y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [313]:
# Save a dictionary of tensors
mydict = {'x': x, 'y': y}
torch.save(mydict, dir_save / 'mydict')  # write the dictionary to a binary file

In [314]:
torch.load(dir_save / 'mydict')  # read the dictionary back from the file

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

## 保存和加载模型参数
- 保存的是模型参数, 而不是模型

In [315]:
class MLP(nn.Module):
    """Two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # hidden layer
        self.out = nn.Linear(256, 10)  # output layer

    def forward(self, X):
        """Return the output-layer values computed from input ``X``."""
        activated = F.relu(self.hidden(X))
        return self.out(activated)


# Instantiate the model and record its output on a random input,
# so a reloaded copy can later be compared against it.
net = MLP()
X = torch.rand(2, 20)
Y = net(X)

In [316]:
# Save the model parameters (the state dict, not the model object itself)
torch.save(net.state_dict(), dir_save / 'mlp.params')  # write to a binary file

In [317]:
new_MLP = MLP()
# Load the saved parameters into a freshly constructed model.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict contains.
new_MLP.load_state_dict(torch.load(dir_save / 'mlp.params', weights_only=True))
new_MLP.eval()  # switch to evaluation mode

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=10, bias=True)
)

In [318]:
new_MLP(X) == Y  # identical parameters, so the outputs are identical

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

---
# GPU
- 默认, 张量在内存中创建, 在CPU中计算
- 计算必须在同一个计算设备上
- 使用GPU会提速, 但CPU和GPU之间的数据传输比计算慢得多
- 对GPU上的张量执行print或将其转换成numpy时, 数据会先被复制回主内存, 造成额外的数据传输开销

## 查询和指定计算设备

In [319]:
# How compute devices are represented in PyTorch
# (only the last expression is echoed as the cell output)
torch.device('cpu')  # CPU
torch.device('cuda')  # GPU
torch.device('cuda:0')  # first GPU
torch.device('cuda:1')  # second GPU
torch.device('mps')  # Apple Silicon GPU

device(type='mps')

In [320]:
# Query the number of available CUDA GPUs
torch.cuda.device_count()  # number of available CUDA devices

0

In [321]:
torch.mps.device_count()

1

In [322]:
# Prefer a GPU when one is available
def try_gpu(i=0):
    """Return ``cuda:i`` if that CUDA device exists, else ``mps`` if available, else ``cpu``."""
    has_cuda_i = torch.cuda.device_count() > i
    if has_cuda_i:
        return torch.device(f'cuda:{i}')
    # MPS availability is only queried when the requested CUDA device is absent.
    has_mps = torch.backends.mps.is_available()
    return torch.device('mps') if has_mps else torch.device('cpu')

try_gpu()  # returns the preferred available device

device(type='mps')

## 张量与GPU

In [323]:
# Specify the compute device when creating a tensor
X = torch.ones(2, 3, device='mps')
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='mps:0')

In [324]:
Y = torch.rand(2, 3, device='cpu')
Y

tensor([[0.4954, 0.8590, 0.6945],
        [0.6987, 0.3429, 0.7869]])

In [325]:
Y_mps = Y.to('mps')  # move Y to the GPU
Y_mps + X

tensor([[1.4954, 1.8590, 1.6945],
        [1.6987, 1.3429, 1.7869]], device='mps:0')

## 神经网络与GPU

In [326]:
%%time
# NOTE: the timing covers creating the input, building the model and moving it
# to the device, not only the forward pass.
X = torch.rand(10000, 30, device='mps')
net = nn.Sequential(nn.Linear(30, 256), nn.ReLU(), nn.Linear(256, 10))
net = net.to('mps')  # move the model to the GPU
net(X)  # run on the GPU

CPU times: user 1.7 ms, sys: 334 ms, total: 336 ms
Wall time: 337 ms


tensor([[ 0.0649, -0.0902,  0.0179,  ..., -0.0691, -0.2314, -0.0428],
        [ 0.0025, -0.0709,  0.1100,  ..., -0.0938, -0.2828, -0.1028],
        [-0.0916,  0.0171, -0.0261,  ..., -0.1459, -0.1277,  0.0034],
        ...,
        [ 0.0227, -0.0371, -0.0042,  ..., -0.0379, -0.0863,  0.0070],
        [ 0.1797, -0.0912, -0.0589,  ..., -0.1161, -0.2025, -0.0245],
        [ 0.1110, -0.0159,  0.0244,  ..., -0.1549, -0.2639,  0.0791]],
       device='mps:0', grad_fn=<LinearBackward0>)

In [327]:
net[0].weight.data.device  # device on which the model's parameters live

device(type='mps', index=0)

In [328]:
%%time
# CPU counterpart of the timing above: the model is rebuilt here and never
# moved with .to(), and the input is created on the CPU.
X = torch.rand(10000, 30, device='cpu')
net = nn.Sequential(nn.Linear(30, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

CPU times: user 12.6 ms, sys: 9.78 ms, total: 22.3 ms
Wall time: 5.75 ms


tensor([[ 0.0011, -0.2461,  0.0888,  ...,  0.0243,  0.0034, -0.1190],
        [ 0.0667, -0.1309,  0.1439,  ...,  0.0574, -0.0008, -0.0951],
        [ 0.0304, -0.1504,  0.1026,  ...,  0.1787, -0.0491, -0.0970],
        ...,
        [-0.0030, -0.0071,  0.0610,  ...,  0.0751, -0.0715, -0.1405],
        [ 0.0564, -0.1127,  0.0864,  ...,  0.1331, -0.0869, -0.0163],
        [ 0.1161, -0.2570,  0.1685,  ...,  0.0620, -0.0645, -0.1016]],
       grad_fn=<AddmmBackward0>)