In [1]:
import torch
from torch import nn

In [2]:
# Two-layer MLP: 8 hidden units with ReLU, then a single output unit.
# LazyLinear is given only its output size; its input size is determined
# from the first batch passed through the network.
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
# Random minibatch of 2 examples with 4 features each.
X = torch.rand(2, 4)
# The forward pass materializes the lazy parameters; display the output shape.
net(X).shape



torch.Size([2, 1])

In [3]:
# Inspect the parameters of the module at index 2 (the final LazyLinear(1) layer):
# state_dict() returns an ordered mapping holding its 'weight' and 'bias' tensors.
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.2287, -0.1146,  0.3449, -0.0366,  0.0884,  0.0805,  0.2402,  0.2717]])),
             ('bias', tensor([-0.2214]))])

In [4]:
## Parameter access

In [7]:
# Same inspection for the module at index 0 (the first LazyLinear(8) layer);
# with the 4-feature input used above its weight is 8x4 and its bias has 8 entries.
net[0].state_dict()

OrderedDict([('weight',
              tensor([[ 0.3454,  0.4216,  0.2427, -0.4341],
                      [ 0.1167,  0.1879, -0.3469,  0.4128],
                      [ 0.1604, -0.1559,  0.1518,  0.4788],
                      [-0.2936,  0.2885,  0.0866,  0.0140],
                      [ 0.3364,  0.3144,  0.0942, -0.4839],
                      [ 0.1579, -0.1807, -0.2815, -0.3517],
                      [ 0.0042,  0.0684, -0.2707,  0.1062],
                      [ 0.3793, -0.1321, -0.3124,  0.3092]])),
             ('bias',
              tensor([ 0.4356,  0.3989, -0.3122, -0.0047, -0.4070, -0.0933,  0.2532, -0.2828]))])

In [8]:
## Targeted parameters

In [9]:
# Show the Python type of the output layer's bias (an nn.Parameter) together
# with the tensor of values reached through its .data attribute.
type(net[2].bias),net[2].bias.data

(torch.nn.parameter.Parameter, tensor([-0.2214]))

In [11]:
# No backward pass has been run at this point in the notebook, so the weight
# has no stored gradient. Use an identity check for None: `== None` goes
# through an overloadable __eq__, whereas `is None` always yields a plain bool.
net[2].weight.grad is None

True

In [12]:
# Evaluate the gradient attribute directly. The recorded run shows no output
# for this cell, consistent with the None check in the preceding cell.
net[2].weight.grad

In [13]:
## All parameters at once

In [14]:
# Walk every parameter registered in the network and pair its qualified name
# (e.g. '0.weight') with the shape of its tensor.
[
    (qualified_name, tensor.shape)
    for qualified_name, tensor in net.named_parameters()
]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [15]:
## Tied Parameters

In [16]:
# Sharing parameters across multiple layers

In [17]:
# One LazyLinear instance that is placed at two positions of the network, so
# both positions refer to the same module object and hence the same parameters.
shared = nn.LazyLinear(8)
net = nn.Sequential(
    nn.LazyLinear(8), nn.ReLU(),  # input layer
    shared, nn.ReLU(),            # first use of the shared layer (index 2)
    shared, nn.ReLU(),            # second use of the same layer (index 4)
    nn.LazyLinear(1),             # output layer
)
# Forward pass on the earlier minibatch X; displays the network output.
net(X)

tensor([[-0.0300],
        [-0.0417]], grad_fn=<AddmmBackward0>)

In [18]:
## Check whether the parameters are the same

In [19]:
# Compare the first weight row seen through index 2 with the one seen through
# index 4; both positions hold the same `shared` layer.
print(net[2].weight.data[0] == net[4].weight.data[0])
# Overwrite a single weight entry through the index-2 handle only.
net[2].weight.data[0,0] = 100

# Compare again after the write: the recorded output is still all True, i.e.
# the change made via net[2] is also visible via net[4].
print(net[2].weight.data[0] == net[4].weight.data[0])


tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## Exercise

In [20]:
## Training and observing gradients