In [75]:
# 回顾一下多层感知机
import torch
from torch import nn
from torch.nn import functional as F
net = nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))
X = torch.rand(2,20)
net(X)

tensor([[ 0.0183, -0.3242,  0.0764, -0.3096,  0.0431,  0.2293,  0.1005, -0.0343,
          0.0887, -0.1013],
        [ 0.1371, -0.1336, -0.1101, -0.2671,  0.1160,  0.1105,  0.1406, -0.0447,
         -0.0589,  0.0671]], grad_fn=<AddmmBackward0>)

In [76]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()  # 调用父类的__init__函数
        self.hidden = nn.Linear(20,256)
        self.out = nn.Linear(256,10)
        
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))
    
# 实例化多层感知机的层，然后在每次调用正向传播函数调用这些层
net = MLP()
X = torch.rand(2,20)
net(X)

tensor([[ 0.0240,  0.1362,  0.0160, -0.0853,  0.2861, -0.0027,  0.0185,  0.0841,
         -0.0452, -0.1150],
        [-0.1350,  0.1716, -0.0538,  0.0069,  0.1952,  0.1092,  0.0853,  0.1694,
          0.1089, -0.2168]], grad_fn=<AddmmBackward0>)

In [77]:
# 在正向传播函数中执行代码
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.rand_weight = torch.rand((20,20),requires_grad=False)
        self.linear = nn.Linear(20,20)
    
    def forward(self, X):
        print(X)
        X = self.linear(X)
        print(X)
        X = F.relu(torch.mm(X, self.rand_weight + 1))
        X = self.linear(X)
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()
    
net = FixedHiddenMLP()
X = torch.rand(2,20)
net(X)

tensor([[0.6199, 0.4562, 0.5270, 0.5133, 0.8526, 0.3512, 0.3671, 0.6402, 0.0031,
         0.8154, 0.1975, 0.6630, 0.8635, 0.5900, 0.8252, 0.2710, 0.1777, 0.8980,
         0.7194, 0.4881],
        [0.9631, 0.0761, 0.2152, 0.3091, 0.5773, 0.6338, 0.2827, 0.6378, 0.2267,
         0.1501, 0.1072, 0.1524, 0.7047, 0.5498, 0.1173, 0.0652, 0.8560, 0.7080,
         0.4978, 0.3766]])
tensor([[ 1.0508,  0.3765,  0.5069,  0.2577,  0.3989, -0.0728, -0.5046,  0.1675,
         -0.1832,  0.3378, -0.1332,  0.5261,  0.0921,  0.1132, -0.2756, -0.2559,
         -0.3223,  0.5766, -0.2128,  0.6017],
        [ 0.8833,  0.6983,  0.4416,  0.2572,  0.2335, -0.2249, -0.5592, -0.0786,
         -0.1174,  0.4442, -0.1818,  0.2891,  0.0508,  0.3221,  0.2196, -0.3228,
         -0.2398,  0.2732,  0.0820,  0.5574]], grad_fn=<AddmmBackward0>)


tensor(0.2110, grad_fn=<SumBackward0>)

In [78]:
# 混合代培各种组合块的方法
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20,64),nn.ReLU(),
                                nn.Linear(64,32),nn.ReLU())
        self.linear = nn.Linear(32,16)
        
    def forward(self, X):
        return self.linear(self.net(X))
    
chimear = nn.Sequential(NestMLP(),nn.Linear(16,20),FixedHiddenMLP())
X = torch.rand(2,20)
chimear(X)

tensor([[ 0.0509,  0.0343, -0.2323,  0.2354, -0.0577,  0.0122, -0.0307,  0.0611,
          0.2376, -0.0769, -0.2693, -0.0558, -0.1228, -0.0960,  0.0741, -0.1657,
          0.0214,  0.1699, -0.1447,  0.2083],
        [ 0.0612,  0.0089, -0.2578,  0.2262, -0.0653,  0.0327, -0.0245,  0.0885,
          0.2383, -0.0579, -0.2597, -0.0499, -0.1326, -0.1080,  0.0822, -0.1370,
          0.0446,  0.1525, -0.1215,  0.2356]], grad_fn=<AddmmBackward0>)
tensor([[ 0.4135, -0.2373, -0.0962,  0.0167,  0.1737,  0.2596, -0.1876,  0.0925,
          0.1489,  0.1725,  0.0541,  0.1648,  0.3032, -0.0051,  0.1163, -0.2955,
         -0.1123, -0.2610, -0.2879,  0.1156],
        [ 0.4079, -0.2470, -0.0980,  0.0221,  0.1780,  0.2425, -0.1818,  0.0977,
          0.1692,  0.1671,  0.0522,  0.1547,  0.2968, -0.0144,  0.1135, -0.3089,
         -0.1019, -0.2553, -0.2953,  0.1174]], grad_fn=<AddmmBackward0>)


tensor(0.1978, grad_fn=<SumBackward0>)

# 参数管理

In [79]:
# 首先关注具有单隐藏层的多层感知机
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

tensor([[-0.1850],
        [-0.0858]], grad_fn=<AddmmBackward0>)

In [80]:
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0870, -0.3534,  0.0892,  0.3253, -0.0684,  0.2017,  0.1776, -0.1787]])), ('bias', tensor([-0.2428]))])


In [81]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net[2].bias.grad)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2428], requires_grad=True)
tensor([-0.2428])
None


In [82]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])  # 一次性访问所有参数         
print(*[(name, param.shape) for name, param in net.named_parameters()])  # 0是第一层名字，1是ReLU，它没有参数
print(net.state_dict()['2.bias'].data) # 通过名字获取参数

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([-0.2428])


In [83]:
# 从嵌套块收集参数
def block1():
    return nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,4),nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block{i}',block1()) # f'block{i}' 可以传一个字符串名字过来，block2可以嵌套四个block1                                      
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4,1))
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [84]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01) # 下划线表示把m.weight的值替换掉   
        nn.init.zeros_(m.bias)
        
net.apply(init_normal) # 会递归调用 直到所有层都初始化
print(net[0].weight.data[0])
print(net[0].bias.data[0])

tensor([-0.0005,  0.0028,  0.0073, -0.0013])
tensor(0.)


In [85]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight,1)
        nn.init.zeros_(m.bias)
        
net.apply(init_constant)
print(net[0].weight.data[0]) 
print(net[0].bias.data[0])

tensor([1., 1., 1., 1.])
tensor(0.)


In [86]:
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)
        
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.3354,  0.5250,  0.6564,  0.5923])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [87]:
# 自定义初始化
def my_init(m):
    if type(m) == nn.Linear:
        print("Init",*[(name, param.shape) for name, param in m.named_parameters()][0])  # 打印名字是啥，形状是啥       
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >=  5 # 这里*=的代码相当于先计算一个布尔矩阵(先判断>=)，然后再用布尔矩阵的对应元素去乘以原始矩阵的每个元素。保留绝对值大于5的权重，不是的话就设为0

net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,1))
net.apply(my_init)
print(net[0].weight[:2])
net[0].weight.data[:] += 1 # 参数替换
net[0].weight.data[0,0] = 42
print(net[0].weight.data[0])

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[-0.0000, -6.8228, -9.7396,  7.8894],
        [-0.0000,  0.0000,  0.0000, -6.2765]], grad_fn=<SliceBackward0>)
tensor([42.0000, -5.8228, -8.7396,  8.8894])


In [88]:
# 参数绑定
shared = nn.Linear(8,8)
net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),shared,nn.ReLU(),shared,nn.ReLU(),nn.Linear(8,1))  # 第2个隐藏层和第3个隐藏层是share权重的，第一个和第四个是自己的  
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0,0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 自定义层

In [89]:
import torch
import torch.nn.functional as F
from torch import nn

class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, X):
        return X - X.mean()
    
layer = CenteredLayer()
print(layer(torch.FloatTensor([1,2,3,4,5])))

tensor([-2., -1.,  0.,  1.,  2.])


In [90]:
# 将层作为组件合并到构建更复杂的模型中
net = nn.Sequential(nn.Linear(8,128),CenteredLayer())
Y = net(torch.rand(4,8))
print(Y.mean())

tensor(-6.9849e-09, grad_fn=<MeanBackward0>)


In [91]:
# 带参数的图层
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units,units)) # nn.Parameter使得这些参数加上了梯度    
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
    
dense = MyLinear(5,3)
print(dense.weight)

Parameter containing:
tensor([[ 1.5336,  0.2514, -0.2967],
        [ 0.6468,  0.5171, -0.9578],
        [-0.8309, -0.5955, -0.2437],
        [ 1.0786,  0.7183, -0.2313],
        [-0.0348, -0.4845, -1.4426]], requires_grad=True)


In [92]:
dense(torch.rand(2,5))

tensor([[1.0831, 0.5695, 0.0000],
        [1.2941, 1.0011, 0.0000]])

In [93]:
net = nn.Sequential(MyLinear(64,8),MyLinear(8,1))
net(torch.rand(2,64))

tensor([[23.1863],
        [14.4605]])

# 读写文件

In [96]:
import torch
from torch import nn
from torch.nn import functional as F

save_path = 'temp/x-file'

x = torch.arange(4)
torch.save(x, save_path)
x2 = torch.load(save_path)
print(x2)

tensor([0, 1, 2, 3])


In [98]:
y = torch.zeros(4)
torch.save([x,y],save_path)
x2, y2 = torch.load(save_path)
print(x2)
print(y2)

tensor([0, 1, 2, 3])
tensor([0., 0., 0., 0.])


In [99]:
# 写入或读取从字符串映射到张量的字典
mydict = {'x':x,'y':y}
torch.save(mydict,'mydict')
mydict2 = torch.load('mydict')
print(mydict2)

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}


In [100]:
# 加载和保存模型参数
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20,256)
        self.output = nn.Linear(256,10)
    
    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))
    
net = MLP()
X = torch.randn(size=(2,20))
Y = net(X)


In [101]:
torch.save(net.state_dict(),'temp/mlp.params')

In [102]:
# 实例化了原始多层感知机模型的一个备份。直接读取文件中存储的参数
clone = MLP() # 必须要先声明一下，才能导入参数
clone.load_state_dict(torch.load('temp/mlp.params'))
print(clone.eval()) # eval()是进入测试模式

Y_clone = clone(X)
print(Y_clone == Y)

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)
tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])
