In [2]:
import torch
from torch import nn
from torch.nn import functional as F

# A two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs.
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Random mini-batch of 2 examples with 20 features each; the bare last
# expression is what the notebook displays.
X = torch.rand(size=(2, 20))
net(X)

tensor([[-0.0347,  0.2066,  0.3187,  0.1235, -0.0037, -0.0595,  0.0416,  0.0491,
         -0.0042,  0.1857],
        [ 0.1005,  0.1254,  0.3764,  0.1440,  0.0160, -0.2694,  0.1144, -0.0282,
         -0.0360,  0.1803]], grad_fn=<AddmmBackward0>)

In [3]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer, written as a custom block.

    Two fully connected layers are declared: 20 -> 256 (hidden) and
    256 -> 10 (output), with a ReLU between them.
    """

    def __init__(self):
        # Run nn.Module's own constructor first to perform the required
        # initialization before any layer is assigned.
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # hidden layer
        self.out = nn.Linear(256, 10)  # output layer

    def forward(self, X):
        """Forward propagation: compute the model output for input ``X``."""
        # ReLU is used in its functional form, defined in nn.functional.
        activations = F.relu(self.hidden(X))
        return self.out(activations)

In [4]:
# Instantiate the custom MLP block and apply it to the batch X; the bare last
# expression is what the notebook displays.
net = MLP()
net(X)

tensor([[ 0.0388, -0.3408,  0.2042,  0.0306, -0.2758, -0.0026,  0.1648, -0.1144,
          0.1311,  0.0648],
        [ 0.1089, -0.1202,  0.1529, -0.0839, -0.3292, -0.0576,  0.4710, -0.3658,
          0.2675,  0.1157]], grad_fn=<AddmmBackward0>)

In [7]:
class FixedHiddenMLP(nn.Module):
    """Block combining a trainable layer, a fixed random matrix, and control flow.

    The same ``nn.Linear(20, 20)`` layer is applied twice (shared parameters),
    with a constant random 20x20 matrix multiplication and a ReLU in between.
    ``forward`` returns a scalar tensor.
    """

    def __init__(self):
        super().__init__()
        # Constant random weights: no gradient is computed for them, so they
        # stay unchanged during training. Registering the tensor as a buffer
        # (rather than a plain attribute) makes .to()/.cuda()/.double() move
        # and cast it together with the rest of the module.
        # persistent=False keeps the keys of state_dict() the same as when the
        # tensor was a plain attribute.
        self.register_buffer('rand_weight', torch.rand((20, 20)),
                             persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar computed from the batch ``X`` of 20-feature rows."""
        X = self.linear(X)
        # Use the constant weights together with the relu and mm functions.
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the fully connected layer: equivalent to two fully connected
        # layers sharing their parameters.
        X = self.linear(X)
        # Control flow: halve the activations until their L1 norm is <= 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [8]:
# Instantiate the block with the fixed random weights and apply it to X;
# its forward pass returns a single scalar tensor.
net = FixedHiddenMLP()
net(X)

tensor(-0.0580, grad_fn=<SumBackward0>)

In [9]:
class NestMLP(nn.Module):
    """Block that nests an ``nn.Sequential`` and adds a final linear layer.

    The nested stack maps 20 -> 64 -> 32 features (ReLU after each layer);
    the final layer maps 32 -> 16.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order in which they are applied.
        feature_layers = [
            nn.Linear(20, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        self.net = nn.Sequential(*feature_layers)
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run ``X`` through the nested stack, then the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Blocks can be mixed freely: a custom nested block (20 -> 16), a plain
# linear layer (16 -> 20), and the fixed-weight block chained in a Sequential.
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(0.2628, grad_fn=<SumBackward0>)

In [12]:
def create_network(block, num_instances):
    """Build an ``nn.Sequential`` holding ``num_instances`` separate blocks.

    Args:
        block: Either an ``nn.Module`` used as a template, or a zero-argument
            callable (e.g. a module class) that returns a new module.
        num_instances: Number of blocks to create.

    Returns:
        An ``nn.Sequential`` whose children are named ``block_1`` ...
        ``block_<num_instances>``.

    Each slot receives its own module object. Adding the very same object
    several times would make every slot share one set of parameters instead
    of being a separate instance. A template module is deep-copied, so the
    copies start from identical weights; a factory is called once per slot,
    so each module gets its own initialization.
    """
    import copy  # local import keeps this cell runnable on its own

    # Store the instances under explicit names.
    network = nn.Sequential()
    for i in range(num_instances):
        if isinstance(block, nn.Module):
            instance = copy.deepcopy(block)
        else:
            instance = block()
        network.add_module(f'block_{i+1}', instance)
    return network

# Build a 5-block network from NestMLP and print its structure.
# NOTE(review): the network is only printed here, never applied to data.
# NestMLP maps 20 features to 16, so consecutive NestMLP blocks would not
# line up dimensionally in a forward pass.
network = create_network(NestMLP(), 5)
print(network)

Sequential(
  (block_1): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=32, bias=True)
      (3): ReLU()
    )
    (linear): Linear(in_features=32, out_features=16, bias=True)
  )
  (block_2): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=32, bias=True)
      (3): ReLU()
    )
    (linear): Linear(in_features=32, out_features=16, bias=True)
  )
  (block_3): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=32, bias=True)
      (3): ReLU()
    )
    (linear): Linear(in_features=32, out_features=16, bias=True)
  )
  (block_4): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=20, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_fe

In [17]:
import torch
from torch import nn

# Small MLP for the parameter-access examples: 4 inputs -> 8 hidden units
# (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Random batch of 2 examples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[0.0722],
        [0.0720]], grad_fn=<AddmmBackward0>)

In [20]:
# Inspect the parameters (weight and bias) of the layer at index 2 of net.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.2446,  0.1207,  0.2551, -0.0940, -0.2056,  0.0566, -0.1113,  0.0060]])), ('bias', tensor([0.0621]))])


In [21]:
# List (name, shape) pairs: first for the layer at index 0 only, then for the
# whole network, where names are prefixed with the layer's index.
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [None]:
def block1():
    """Return a new 4 -> 8 -> 4 MLP with a ReLU after each linear layer."""
    layers = (
        nn.Linear(4, 8), nn.ReLU(),
        nn.Linear(8, 4), nn.ReLU(),
    )
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential made of four separately created ``block1`` modules.

    The children are registered under the names ``'block 0'`` ... ``'block 3'``.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        # Nest a freshly built block1 inside the outer container.
        inner = block1()
        stacked.add_module(f'block {idx}', inner)
    return stacked

# Nest the stacked blocks and finish with a 4 -> 1 linear layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)
# Print the nested structure of the network.
print(rgnet)
# Nested indexing: first child of rgnet -> its second sub-block -> that
# sub-block's first layer, whose bias values are displayed.
rgnet[0][1][0].bias.data

In [30]:
def init_xavier(m):
    """Xavier-uniform initialize the weight of ``m`` if it is exactly ``nn.Linear``.

    Any other module type is left untouched; the bias is never modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Fill the weight of ``m`` with the constant 42 if it is exactly ``nn.Linear``.

    Any other module type is left untouched; the bias is never modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Different initializers can be applied to different layers: Xavier-uniform
# for the layer at index 0, the constant 42 for the layer at index 2.
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[-0.6787,  0.1801, -0.4717,  0.3162],
        [ 0.3659,  0.5076,  0.2007,  0.6033],
        [-0.2916,  0.4648,  0.7005,  0.6386],
        [ 0.6129, -0.1688,  0.1507,  0.0872],
        [-0.6241,  0.4174,  0.1407,  0.3497],
        [-0.0282, -0.1201,  0.2963,  0.1715],
        [-0.2630, -0.2499,  0.5229,  0.0835],
        [ 0.5809, -0.6738, -0.5921, -0.3496]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [33]:
def my_init(m):
    """Custom initializer for modules that are exactly ``nn.Linear``.

    Prints the name and shape of the module's first parameter, draws the
    weight uniformly from [-10, 10], then zeroes every entry whose absolute
    value is below 5. Other module types are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = list(m.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

# Apply the custom initializer to every submodule of net, then display the
# first 7 rows of the first layer's weight.
net.apply(my_init)
net[0].weight[:7]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000,  6.0503,  7.8247, -5.8689],
        [ 0.0000, -0.0000,  0.0000,  8.3344],
        [ 0.0000, -0.0000, -0.0000, -0.0000],
        [ 5.1191, -0.0000,  9.7923,  0.0000],
        [-5.8740,  0.0000, -0.0000, -0.0000],
        [-0.0000, -9.4741, -6.6798, -0.0000],
        [-7.1783, -0.0000, -7.1985,  0.0000]], grad_fn=<SliceBackward0>)

In [34]:
# Parameters can also be set directly through .data:
# add 1 to every weight of the first layer in place, ...
net[0].weight.data[:] += 1
# ... overwrite the single entry at row 0, column 0, ...
net[0].weight.data[0, 0] = 42
# ... and display the first row.
net[0].weight.data[0]

tensor([42.0000,  7.0503,  8.8247, -4.8689])

In [36]:
# The shared layer needs a name so that its parameters can be referenced.
shared = nn.Linear(8, 8)
# The same `shared` object is placed at index 2 and index 4 of the network.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same.
print(net[2].weight.data[0] == net[4].weight.data[0])
# Modify one entry through the layer at index 2 ...
net[2].weight.data[0, 0] = 100
# ... and print the weights seen through the layer at index 4.
print(net[4].weight.data)
# Make sure they are actually the same object rather than merely having the
# same values.
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([[ 1.0000e+02,  1.5408e-01, -1.1503e-01,  2.6290e-01,  1.3124e-01,
         -6.6678e-02,  7.7402e-02, -2.6386e-01],
        [-3.3140e-01, -2.4262e-02,  1.0038e-01, -1.1308e-01, -2.4650e-01,
         -2.7085e-01, -3.1760e-01,  1.0358e-01],
        [-2.4603e-01, -1.0315e-02,  1.9020e-01,  1.4071e-01, -1.3780e-01,
         -4.7220e-02,  1.6639e-01,  2.0375e-01],
        [-3.2473e-01, -9.5662e-02,  1.6638e-01, -1.0988e-01, -2.9605e-01,
         -2.7116e-01, -5.7782e-02,  1.1711e-01],
        [ 2.6642e-01, -1.8178e-01,  3.4457e-01, -7.2284e-02,  2.2812e-01,
         -2.1333e-01,  1.3605e-01, -9.4919e-02],
        [-3.3886e-01, -1.7913e-01,  1.6338e-01, -1.3621e-01, -1.7881e-01,
         -6.8638e-02,  1.1468e-01, -2.8445e-01],
        [ 1.1786e-01,  3.5942e-02,  7.7553e-02,  7.9444e-02,  3.4760e-01,
          1.1686e-01, -2.3694e-01, -1.1211e-01],
        [-2.7277e-01,  3.5126e-01, -2.6578e-01,  2.0601e-01, -3.0520e-01,
        

In [37]:
class FancyMLP(nn.Module):
    """Block with a shared linear layer, a constant random matrix, and control flow.

    The same ``nn.Linear(20, 20)`` layer is applied twice, with a constant
    random 20x20 matrix multiplication and a ReLU in between. ``forward``
    returns a scalar tensor.

    Args:
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Non-trainable (constant) weights. Registering the tensor as a buffer
        # instead of a plain attribute makes .to()/.cuda()/.double() move and
        # cast it together with the module. persistent=False keeps the keys of
        # state_dict() the same as when the tensor was a plain attribute.
        self.register_buffer('rand_weight', torch.rand((20, 20)),
                             persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar computed from the batch ``x`` of 20-feature rows."""
        x = self.linear(x)
        # Use the constant weights with relu (from nn.functional) and mm.
        # A buffer never requires grad, so no .data access is needed.
        x = nn.functional.relu(torch.mm(x, self.rand_weight) + 1)
        # Reuse the fully connected layer: equivalent to two fully connected
        # layers sharing their parameters.
        x = self.linear(x)
        # Control flow: item() turns the norm into a Python scalar for the
        # comparisons.
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

# Only the linear layer's weight and bias are listed as parameters; the
# constant random matrix does not appear among them.
net = FancyMLP()    
print(*[(name, param.shape) for name, param in net.named_parameters()])

('linear.weight', torch.Size([20, 20])) ('linear.bias', torch.Size([20]))


In [51]:
# Deferred (lazy) initialization: the last layer is an nn.LazyLinear that is
# given only its number of outputs (10).
net = nn.Sequential(nn.Linear(10,256), nn.ReLU(), nn.LazyLinear(10))
# NOTE(review): this bare expression is not the cell's last statement, so the
# notebook displays nothing for it.
net[0].weight
X = torch.rand(2, 10)
# Run one forward pass before the weight shapes are inspected below.
net(X)
print(X)
print(net[0].weight.shape)
# Shape of the lazy layer's weight after the forward pass.
net[2].weight.shape

tensor([[0.6289, 0.3533, 0.0021, 0.6800, 0.9287, 0.8381, 0.5899, 0.5930, 0.0925,
         0.4640],
        [0.6616, 0.8347, 0.8842, 0.6482, 0.0401, 0.8691, 0.8436, 0.0733, 0.0607,
         0.8395]])
torch.Size([256, 10])


torch.Size([10, 256])