In [2]:
import torch
import numpy as np 
import torch.nn as nn

### Simple Perceptron
<img align="left" width="720px" src="https://pythonmachinelearning.pro/wp-content/uploads/2017/09/Single-Perceptron.png.webp" />

In [3]:
# A single linear layer (a row of perceptrons): 32 input features -> 10 outputs.
x = torch.arange(32, dtype=torch.float32)
net = torch.nn.Linear(in_features=32, out_features=10)
y = net(x)
print(y)

tensor([  8.1955,   1.0579,  15.0859,  -6.3511, -20.9711,   1.7192,   8.9273,
         14.1909,  -9.6321,  -1.2501], grad_fn=<AddBackward0>)


## High level.

In [4]:
# Build a small MLP by chaining existing layers (every layer is itself an `nn.Module`):
# 32 inputs -> 128 hidden units with a sigmoid activation -> 10 outputs.
hidden_layer = nn.Linear(32, 128)
output_layer = nn.Linear(128, 10)
net = nn.Sequential(hidden_layer, nn.Sigmoid(), output_layer)

## Equivalent: Customized Model

In [5]:
# A hand-written `nn.Module`; more customizable, but equivalent here to the sequential version.
class MyNetwork(nn.Module):
    """Two-layer MLP: linear -> sigmoid -> linear.

    The layer sizes are passed to the constructor, so the same class can be
    reused for different input, hidden and output widths.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Sub-modules stored as attributes are registered automatically,
        # together with their parameters.
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Sigmoid()
        self.layer3 = nn.Linear(hidden_size, output_size)

    def forward(self, input_val):
        """Apply the three layers in order and return the final activations."""
        return self.layer3(self.layer2(self.layer1(input_val)))


net = MyNetwork(32, 128, 10)

The network tracks parameters, and you can access them through the parameters() method, which returns a python generator.

In [10]:
# Walk over every learnable tensor of the network, showing its values and its shape.
separator = '=' * 30
for weight in net.parameters():
    print(weight)
    print(weight.shape, '\n\n', separator)

Parameter containing:
tensor([[-0.0445, -0.0993,  0.0723,  ..., -0.1723, -0.0934, -0.0818],
        [-0.1584,  0.0480, -0.0258,  ..., -0.1634, -0.0084,  0.1414],
        [ 0.0462,  0.1074,  0.0803,  ..., -0.0347,  0.0871,  0.0650],
        ...,
        [ 0.1212,  0.0222,  0.0637,  ...,  0.0033, -0.1068,  0.1179],
        [ 0.1629,  0.0222,  0.0655,  ...,  0.1437,  0.0393,  0.0208],
        [ 0.0641, -0.1387,  0.0744,  ...,  0.0038,  0.0276, -0.0968]],
       requires_grad=True)
torch.Size([128, 32]) 

Parameter containing:
tensor([-0.0248, -0.1726, -0.1525, -0.0972, -0.0193, -0.0769, -0.0084, -0.0597,
         0.1447,  0.0664, -0.0304,  0.0216, -0.1537,  0.0302, -0.1199,  0.1206,
        -0.0987, -0.0624, -0.0368,  0.0228, -0.1737, -0.1218,  0.1544,  0.0848,
        -0.1738,  0.0609, -0.0617, -0.1352,  0.1411,  0.1150,  0.0559,  0.1451,
        -0.0823, -0.0339,  0.0223,  0.1260, -0.1543, -0.1100,  0.0125, -0.0470,
         0.0596, -0.0356,  0.1399, -0.0292, -0.1046,  0.0335, -0.1106, 

Parameters are of type `Parameter`, which is basically a wrapper around a tensor. How does PyTorch retrieve your network's parameters? They are simply all the attributes of type `Parameter` in your network. Moreover, if an attribute is of type `nn.Module`, its own parameters are added to your network's parameters! This is why, when you define a network by combining basic components such as `nn.Linear`, you should never have to define parameters explicitly.

However, if no default PyTorch module does what you need, you can define parameters explicitly (this should be rare). For the record, let's build an MLP like the previous one (this time with a ReLU activation) from explicitly defined parameters.
<br>
<br>
Parameters are useful because they are meant to be all the network's weights that will be optimized during training. If you need a tensor in your computational graph that should remain constant, just define it as a regular tensor (or register it with `register_buffer` so that it follows the module when it is moved to another device).

## From Scratch: MLP Model

In [11]:
class MyNetworkWithParams(nn.Module):
    """Two-layer MLP built from raw `nn.Parameter` tensors instead of `nn.Linear`.

    Computes ``relu(x @ W1 + b1) @ W2 + b2``. All four tensors are wrapped in
    `nn.Parameter`, so they are registered as parameters of the module.

    Args:
        input_size: size of the last dimension of the input.
        hidden_size: number of hidden units.
        output_size: size of the last dimension of the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Weights are stored as (in_features, out_features) so that the
        # forward pass can compute `x @ W` directly.
        self.layer1_weights = nn.Parameter(torch.randn(input_size, hidden_size))
        self.layer1_bias = nn.Parameter(torch.randn(hidden_size))
        self.layer2_weights = nn.Parameter(torch.randn(hidden_size, output_size))
        self.layer2_bias = nn.Parameter(torch.randn(output_size))

    def forward(self, x):
        """Return the network output for `x`, whose last dimension is `input_size`."""
        h1 = torch.matmul(x, self.layer1_weights) + self.layer1_bias
        # ReLU. `torch.relu` keeps the device and dtype of `h1`; comparing against
        # a freshly allocated `torch.zeros(...)` tensor would always create that
        # tensor on the default device and fail once the module lives elsewhere.
        h1_act = torch.relu(h1)
        output = torch.matmul(h1_act, self.layer2_weights) + self.layer2_bias
        return output


net = MyNetworkWithParams(32, 128, 10)

# **Training the Model**

In [13]:
# Fresh MLP for the training walkthrough: 32 -> 128 (sigmoid) -> 10.
layers = [
    nn.Linear(32, 128),
    nn.Sigmoid(),
    nn.Linear(128, 10),
]
net = nn.Sequential(*layers)

In [15]:
# Toy batch: three 32-dimensional samples with class labels 0, 3 and 9.
# The rows are stacked into a single ndarray first: building a tensor from a
# Python list of ndarrays is slow and emits a UserWarning in recent PyTorch.
x = torch.from_numpy(np.stack([np.arange(32), np.zeros(32), np.ones(32)])).float()
y = torch.tensor([0, 3, 9])
criterion = nn.CrossEntropyLoss()

# Forward pass: one row of 10 class scores per sample, then the scalar loss.
output = net(x)
print(output.shape)
loss = criterion(output, y)
print(loss)

torch.Size([3, 10])
tensor(2.3651, grad_fn=<NllLossBackward>)


`nn.CrossEntropyLoss` performs both the softmax and the actual cross-entropy: given $output$ of size $(n,d)$ and $y$ of size $n$ with values in $0,1,...,d-1$, it computes (with the default `reduction='mean'`) $-\frac{1}{n}\sum_{i=0}^{n-1}\log(s[i,y[i]])$ where $s[i,j] = \frac{e^{output[i,j]}}{\sum_{j'=0}^{d-1}e^{output[i,j']}}$

You can also compose nn.LogSoftmax and nn.NLLLoss to get the same result. Note that all these use the log-softmax rather than the softmax, for stability in the computations.

## LogSum trick for Softmax
\begin{aligned}
\log \left(\frac{e^{x_{j}}}{\sum_{i=1}^{n} e^{x_{i}}}\right) &=\log \left(e^{x_{j}}\right)-\log \left(\sum_{i=1}^{n} e^{x_{i}}\right) \\
&=x_{j}-\log \left(\sum_{i=1}^{n} e^{x_{i}}\right)
\end{aligned}
---
\begin{aligned}
\log \operatorname{Sum} \operatorname{Exp}\left(x_{1} \ldots x_{n}\right) &=\log \left(\sum_{i=1}^{n} e^{x_{i}}\right) \\
&=\log \left(\sum_{i=1}^{n} e^{x_{i}-c} e^{c}\right) \\
&=\log \left(e^{c} \sum_{i=1}^{n} e^{x_{i}-c}\right) \\
&=\log \left(\sum_{i=1}^{n} e^{x_{i}-c}\right)+\log \left(e^{c}\right) \\
&=\log \left(\sum_{i=1}^{n} e^{x_{i}-c}\right)+c
\end{aligned}
---
\begin{aligned}
\log \left(\operatorname{Softmax}\left(x_{j}, x_{1} \ldots x_{n}\right)\right) &=x_{j}-\log \operatorname{Sum} \operatorname{Exp}\left(x_{1} \ldots x_{n}\right) \\
&=x_{j}-\log \left(\sum_{i=1}^{n} e^{x_{i}-c}\right)-c
\end{aligned}
---
Choice of $c$: $c = \max(x_{1}, \ldots, x_{n})$, so that every exponent $x_{i}-c$ is at most $0$ and the exponentials cannot overflow.

In [16]:
# Equivalent formulation: an explicit log-softmax followed by the negative
# log-likelihood loss.
criterion2 = nn.NLLLoss()
# Normalise over the class dimension; leaving `dim` implicit is deprecated.
sf = nn.LogSoftmax(dim=1)
output = net(x)
# NLLLoss expects log-probabilities. Using CrossEntropyLoss here would apply
# log-softmax a second time instead of demonstrating the composition.
loss = criterion2(sf(output), y)
loss

  """


tensor(2.3651, grad_fn=<NllLossBackward>)

Now, to perform the backward pass, just call `loss.backward()`! It computes the gradient of the loss with respect to every differentiable tensor in the graph and stores it in that tensor's `.grad` attribute; in particular, this includes all the network parameters.

In [17]:
# Backpropagate the loss through the graph.
loss.backward()

# Every parameter should now carry a gradient in its `.grad` attribute.
for weight in net.parameters():
    print(weight.grad)

tensor([[-0.0015, -0.0015, -0.0015,  ..., -0.0014, -0.0014, -0.0014],
        [ 0.0038,  0.0038,  0.0038,  ...,  0.0038,  0.0038,  0.0038],
        [ 0.0003, -0.0033, -0.0069,  ..., -0.1045, -0.1081, -0.1117],
        ...,
        [ 0.0052,  0.0052,  0.0052,  ...,  0.0052,  0.0052,  0.0052],
        [-0.0033, -0.0033, -0.0033,  ..., -0.0033, -0.0033, -0.0033],
        [ 0.0028,  0.0031,  0.0035,  ...,  0.0127,  0.0131,  0.0134]])
tensor([-4.5962e-03,  6.6439e-03, -2.7030e-03, -1.2382e-02,  8.0240e-04,
         6.4603e-03,  8.7266e-03,  3.0661e-03, -2.4326e-03, -1.1182e-04,
         4.1626e-03, -3.0078e-05, -3.1039e-03, -9.1784e-03, -7.4236e-03,
         5.0126e-03, -3.1953e-03, -1.2337e-03, -8.7900e-03, -3.3571e-03,
         5.6152e-03,  4.8836e-03,  6.8169e-03,  6.4579e-03,  5.8289e-03,
        -5.0017e-03,  1.5008e-03, -3.3783e-03,  4.6842e-03, -7.7432e-04,
        -4.2872e-03,  1.6882e-04,  8.5564e-04, -9.7265e-03, -9.2429e-03,
        -6.4451e-03,  4.6472e-03, -5.6689e-03,  8.3494e

In [19]:
# A second forward/backward pass without resetting anything: the new gradients
# are added to the ones already stored in `.grad`.
output = net(x)
loss = criterion(output, y)
loss.backward()

for weight in net.parameters():
    print(weight.grad)


tensor([[-0.0029, -0.0029, -0.0029,  ..., -0.0028, -0.0028, -0.0028],
        [ 0.0076,  0.0076,  0.0076,  ...,  0.0076,  0.0076,  0.0076],
        [ 0.0007, -0.0066, -0.0138,  ..., -0.2090, -0.2162, -0.2235],
        ...,
        [ 0.0105,  0.0105,  0.0105,  ...,  0.0105,  0.0105,  0.0105],
        [-0.0067, -0.0067, -0.0067,  ..., -0.0067, -0.0067, -0.0067],
        [ 0.0056,  0.0063,  0.0070,  ...,  0.0255,  0.0262,  0.0269]])
tensor([-9.1924e-03,  1.3288e-02, -5.4060e-03, -2.4764e-02,  1.6048e-03,
         1.2921e-02,  1.7453e-02,  6.1321e-03, -4.8652e-03, -2.2364e-04,
         8.3251e-03, -6.0157e-05, -6.2078e-03, -1.8357e-02, -1.4847e-02,
         1.0025e-02, -6.3905e-03, -2.4674e-03, -1.7580e-02, -6.7142e-03,
         1.1230e-02,  9.7673e-03,  1.3634e-02,  1.2916e-02,  1.1658e-02,
        -1.0003e-02,  3.0016e-03, -6.7565e-03,  9.3684e-03, -1.5486e-03,
        -8.5744e-03,  3.3764e-04,  1.7113e-03, -1.9453e-02, -1.8486e-02,
        -1.2890e-02,  9.2944e-03, -1.1338e-02,  1.6699e

In [22]:

# Resetting the stored gradients first avoids the accumulation:
# after this pass `.grad` only holds the gradients of the latest loss.
net.zero_grad()

output = net(x)
loss = criterion(output, y)
loss.backward()

for weight in net.parameters():
    print(weight.grad)

tensor([[-0.0015, -0.0015, -0.0015,  ..., -0.0014, -0.0014, -0.0014],
        [ 0.0037,  0.0037,  0.0037,  ...,  0.0037,  0.0037,  0.0037],
        [ 0.0003, -0.0026, -0.0055,  ..., -0.0840, -0.0870, -0.0899],
        ...,
        [ 0.0050,  0.0050,  0.0050,  ...,  0.0050,  0.0050,  0.0050],
        [-0.0034, -0.0034, -0.0034,  ..., -0.0034, -0.0034, -0.0034],
        [ 0.0027,  0.0030,  0.0033,  ...,  0.0123,  0.0126,  0.0130]])
tensor([-4.6727e-03,  6.3220e-03, -2.0580e-03, -1.2365e-02,  6.6032e-04,
         6.1108e-03,  8.4204e-03,  2.8265e-03, -2.5810e-03, -2.3217e-04,
         3.7667e-03, -1.6370e-04, -3.2447e-03, -9.1137e-03, -7.3391e-03,
         4.9606e-03, -3.3576e-03, -1.4155e-03, -8.7772e-03, -3.4888e-03,
         5.3576e-03,  4.5596e-03,  6.4182e-03,  5.8286e-03,  5.4092e-03,
        -4.8970e-03,  1.4591e-03, -3.5905e-03,  4.5060e-03, -1.0049e-03,
        -4.2782e-03,  4.3726e-07,  4.8739e-04, -9.5770e-03, -9.1103e-03,
        -6.4961e-03,  4.3143e-03, -5.8625e-03,  7.4166e

We did backpropagation, but still didn't perform gradient descent. Let's define an optimizer on the network parameters.

In [21]:
# Plain stochastic gradient descent over all of the network's parameters.
learnable_tensors = net.parameters()
optimizer = torch.optim.SGD(learnable_tensors, lr=0.01)

print("Parameters before gradient descent :")
for weight in net.parameters():
    print(weight)



Parameters before gradient descent :
Parameter containing:
tensor([[-0.0665,  0.0787,  0.0762,  ...,  0.0451, -0.0749,  0.0715],
        [-0.0086,  0.0721, -0.1677,  ...,  0.0910,  0.1097,  0.1713],
        [-0.0464,  0.1281,  0.0059,  ..., -0.0247, -0.0458,  0.1560],
        ...,
        [ 0.1053, -0.0461, -0.1470,  ...,  0.1520, -0.1692,  0.0592],
        [-0.0015, -0.0106,  0.0541,  ...,  0.0139, -0.1742,  0.0896],
        [-0.1763,  0.1305, -0.0865,  ...,  0.0803,  0.1632, -0.1059]],
       requires_grad=True)
Parameter containing:
tensor([-0.0617, -0.0160, -0.1068,  0.0258, -0.0715, -0.0946, -0.1729,  0.1606,
         0.1488,  0.1733,  0.0291,  0.0723,  0.0955,  0.0350,  0.0553, -0.0803,
        -0.1007, -0.0082, -0.0058, -0.0245, -0.0759, -0.1736, -0.1110,  0.0062,
        -0.0482,  0.0586, -0.0710,  0.0856, -0.0387, -0.1235,  0.1474,  0.0184,
        -0.0450, -0.0980, -0.1758,  0.1344,  0.1751, -0.0039, -0.1666,  0.0593,
         0.1385, -0.0986,  0.0507,  0.0365,  0.1493, -0.05

In [23]:
# One gradient-descent update, using the gradients currently stored in `.grad`.
optimizer.step()

print("Parameters after gradient descent :")
for weight in net.parameters():
    print(weight)

Parameters after gradient descent :
Parameter containing:
tensor([[-0.0664,  0.0787,  0.0762,  ...,  0.0451, -0.0748,  0.0716],
        [-0.0087,  0.0720, -0.1678,  ...,  0.0909,  0.1096,  0.1712],
        [-0.0465,  0.1281,  0.0060,  ..., -0.0228, -0.0439,  0.1580],
        ...,
        [ 0.1052, -0.0462, -0.1471,  ...,  0.1519, -0.1693,  0.0591],
        [-0.0014, -0.0105,  0.0542,  ...,  0.0140, -0.1742,  0.0897],
        [-0.1764,  0.1305, -0.0866,  ...,  0.0800,  0.1630, -0.1061]],
       requires_grad=True)
Parameter containing:
tensor([-0.0616, -0.0161, -0.1067,  0.0261, -0.0715, -0.0948, -0.1730,  0.1606,
         0.1488,  0.1733,  0.0290,  0.0723,  0.0955,  0.0352,  0.0555, -0.0804,
        -0.1006, -0.0081, -0.0056, -0.0244, -0.0761, -0.1737, -0.1112,  0.0061,
        -0.0483,  0.0587, -0.0710,  0.0857, -0.0388, -0.1235,  0.1475,  0.0184,
        -0.0450, -0.0978, -0.1757,  0.1346,  0.1750, -0.0038, -0.1666,  0.0592,
         0.1385, -0.0989,  0.0507,  0.0365,  0.1494, -0.052

In [24]:
# A real training run repeats the same four steps many times:
# reset gradients, forward pass, backward pass, parameter update.
n_iter = 1000
for step in range(n_iter):
    optimizer.zero_grad()  # the optimizer holds all of net's parameters, so this matches net.zero_grad()

    output = net(x)
    loss = criterion(output, y)

    loss.backward()
    optimizer.step()

    print(loss)

tensor(2.0970, grad_fn=<NllLossBackward>)
tensor(1.9778, grad_fn=<NllLossBackward>)
tensor(1.8709, grad_fn=<NllLossBackward>)
tensor(1.7751, grad_fn=<NllLossBackward>)
tensor(1.6898, grad_fn=<NllLossBackward>)
tensor(1.6134, grad_fn=<NllLossBackward>)
tensor(1.5446, grad_fn=<NllLossBackward>)
tensor(1.4822, grad_fn=<NllLossBackward>)
tensor(1.4254, grad_fn=<NllLossBackward>)
tensor(1.3737, grad_fn=<NllLossBackward>)
tensor(1.3265, grad_fn=<NllLossBackward>)
tensor(1.2836, grad_fn=<NllLossBackward>)
tensor(1.2445, grad_fn=<NllLossBackward>)
tensor(1.2089, grad_fn=<NllLossBackward>)
tensor(1.1762, grad_fn=<NllLossBackward>)
tensor(1.1462, grad_fn=<NllLossBackward>)
tensor(1.1184, grad_fn=<NllLossBackward>)
tensor(1.0926, grad_fn=<NllLossBackward>)
tensor(1.0685, grad_fn=<NllLossBackward>)
tensor(1.0460, grad_fn=<NllLossBackward>)
tensor(1.0248, grad_fn=<NllLossBackward>)
tensor(1.0050, grad_fn=<NllLossBackward>)
tensor(0.9862, grad_fn=<NllLossBackward>)
tensor(0.9685, grad_fn=<NllLossBac

In [30]:
# Compare the trained network's predictions with the labels.
output = net(x)
print(output)

# The predicted class of each sample is the index of its largest score.
predicted = output.argmax(dim=1)
print(f'Pred: {predicted}')
print(f'Actual: {y}')

tensor([[ 7.8674, -1.5083, -1.4658, -0.2442, -1.9451, -1.4340, -1.7717, -1.9070,
         -1.4887,  4.0256],
        [ 0.1304, -1.3213, -1.4189,  6.0413, -1.3553, -1.2953, -1.1516, -1.2912,
         -1.4824,  3.4400],
        [ 1.9529, -1.3238, -1.4886,  3.1046, -1.5425, -1.5455, -1.3501, -1.6006,
         -1.5675,  5.7876]], grad_fn=<AddmmBackward>)
Pred: tensor([0, 3, 9])
Actual: tensor([0, 3, 9])


## Saving And Loading Models

In [31]:
# `state_dict()` returns a dictionary mapping each parameter's name to its tensor.
net = torch.nn.Sequential(*[
    torch.nn.Linear(28 * 28, 256),
    torch.nn.Sigmoid(),
    torch.nn.Linear(256, 10),
])
weights_by_name = net.state_dict()
print(weights_by_name.keys())

odict_keys(['0.weight', '0.bias', '2.weight', '2.bias'])


In [32]:
# Write the weights to disk, then read them back into the same network.
checkpoint_path = 'test.t7'
torch.save(net.state_dict(), checkpoint_path)
restored_weights = torch.load(checkpoint_path)
net.load_state_dict(restored_weights)

<All keys matched successfully>

In [None]:
class MyNet(nn.Module):
    """MLP with a configurable number of hidden layers, each followed by a ReLU.

    Args:
        n_hidden_layers: number of ``hidden_size -> hidden_size`` linear layers.
        hidden_size: width of the input and of every hidden layer (default 128).
        output_size: number of output features (default 10).
    """

    def __init__(self, n_hidden_layers, hidden_size=128, output_size=10):
        super().__init__()
        self.n_hidden_layers = n_hidden_layers
        self.final_layer = nn.Linear(hidden_size, output_size)
        self.act = nn.ReLU()
        # The layers must be held in an `nn.ModuleList`, not a plain Python list:
        # modules inside a plain list are not registered as sub-modules, so their
        # parameters would be missing from `parameters()` and an optimizer built
        # from it would never update them.
        self.hidden = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(n_hidden_layers)]
        )

    def forward(self, x):
        """Run `x` through every hidden layer (with ReLU), then the final layer."""
        h = x
        for layer in self.hidden:
            h = self.act(layer(h))
        return self.final_layer(h)