In [1]:
# Neural networks can be constructed using the torch.nn package
# nn depends on autograd to define models and differentiate them.
# nn.Module contains layers, and a method forward(input) that returns output.

In [2]:
# Convnet
# It is a simple feed-forward network. It takes the input, feeds it through several layers one after the other, and finally gives the output.

In [3]:
# A typical training procedure for a neural network is as follows:
    # define the neural network that has some learnable parameters (or weights)
    # iterate over a dataset of inputs
    # process input through the network
    # compute the loss
    # propagate gradients back into the network's parameters
    # update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

In [4]:
# Define the network:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LeNet-style convolutional network: two conv/pool stages, then three linear layers.

    With the default arguments the layers are exactly those of the original
    hard-coded network (1 input channel, 10 outputs).  ``fc1`` expects
    16 feature maps of 5 x 5, which is what two 5 x 5 convolutions and two
    2 x 2 poolings produce from a 32 x 32 input (32 -> 28 -> 14 -> 10 -> 5).

    Args:
        in_channels: number of channels in the input images (default 1).
        num_classes: number of values produced per sample (default 10).
    """

    def __init__(self, in_channels=1, num_classes=10):
        super(Net, self).__init__()
        # in_channels -> 6 feature maps, 5 x 5 convolution kernel
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        # 6 -> 16 feature maps, 5 x 5 convolution kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        # affine operations: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Map a (batch, in_channels, 32, 32) tensor to (batch, num_classes) values."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A square window can also be given as a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension for the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: the raw values are returned
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all dimensions but the first."""
        num_features = 1
        for dim in x.size()[1:]:   # all dimensions except the batch dimension
            num_features *= dim
        return num_features
# Build one instance of the network and print the layers registered on it.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [5]:
# We only have to define the forward function; the backward function (where gradients are computed) is automatically defined for us by autograd.
# We can use any of the Tensor operations in the forward function.
# The learnable parameters of a model are returned by net.parameters()

# Collect the parameters into a list so they can be counted and indexed.
params = [tensor for tensor in net.parameters()]
print(len(params))
print(params[0].shape)    # conv1's weight

10
torch.Size([6, 1, 5, 5])


In [6]:
# Expected input size of this net is 32 x 32. To use this on MNIST dataset we need to resize the images to 32 x 32.
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept because later cells reference it.
input = torch.randn(1, 1, 32, 32)   # one random sample: nSamples x nChannels x Height x Width
out = net(input)   # forward pass; the recorded output holds 1 x 10 values
print(out)

tensor([[ 0.0619, -0.0816,  0.0347, -0.0521, -0.1245, -0.1010,  0.1808, -0.0026,
         -0.1364, -0.1164]], grad_fn=<ThAddmmBackward>)


In [7]:
# zero the gradient buffers of all parameters and backprop with random gradients
net.zero_grad()   # clear whatever gradients are currently stored on the parameters
# `out` is 1 x 10 rather than a scalar, so backward() is given an explicit gradient of the same shape
out.backward(torch.randn(1, 10))

In [8]:
# torch.nn only supports mini-batches. The entire torch.nn package only supports inputs which are a mini-batch of samples and not a single sample.
# For example, nn.Conv2d will take in a 4D tensor of nSamples x nChannels x Height x Width.
# If we have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.

In [9]:
# Recap:
    # torch.Tensor: A multi-dimensional array with support for autograd operations like backward(). Also holds gradients w.r.t. the tensor.
    # nn.Module: Neural network module. Convenient way of encapsulating parameters, with helpers for moving them to GPU, exporting, loading, etc.
    # nn.Parameter: A kind of Tensor that is automatically registered as a parameter when assigned as an attribute to a Module.
    # autograd.Function: Implements forward and backward definitions of an autograd operation. Every Tensor operation creates at least a single Function node that connects to the functions that created a Tensor and encodes its history.

In [10]:
# we covered:
    # Defining a neural network.
    # Processing inputs and calling backward
# still left:
    # Computing the Loss
    # updating the weights of the network

In [11]:
# Loss Function
# A loss function takes the (output, target) pair of inputs, and estimates how far away the output is from the target.
# There are several loss functions in the nn package.
# A simple loss is nn.MSELoss which computes the mean-squared error between the input and the target.

print(input)
output = net(input)   # fresh forward pass, so the loss below gets its own autograd graph
print(output)
target = torch.randn(10)   # dummy target
print(target)
target = target.view(1, -1)   # add a leading batch dimension so it has the same shape as output
print(target)
criterion = nn.MSELoss()

loss = criterion(output, target)   # mean-squared error between output and target
print(loss)

tensor([[[[ 0.5620, -0.6591,  1.3516,  ..., -0.1924,  1.0999,  1.1851],
          [-0.8484,  1.0788, -0.1082,  ...,  1.2050, -1.1887,  0.1815],
          [ 1.1773, -0.8018,  0.5604,  ..., -0.6374, -1.3224,  0.0938],
          ...,
          [-2.7133, -0.3951,  0.4170,  ...,  1.7880, -0.8295, -0.1805],
          [-0.2353, -0.9250,  1.4448,  ..., -0.6633,  0.7067,  0.3530],
          [-1.4725,  0.5542, -0.5509,  ..., -1.5493, -1.8213,  1.0755]]]])
tensor([[ 0.0619, -0.0816,  0.0347, -0.0521, -0.1245, -0.1010,  0.1808, -0.0026,
         -0.1364, -0.1164]], grad_fn=<ThAddmmBackward>)
tensor([ 0.3116, -1.9741,  0.7914,  0.1806,  0.5653, -0.5590, -0.6772, -0.4924,
         1.3818, -0.1223])
tensor([[ 0.3116, -1.9741,  0.7914,  0.1806,  0.5653, -0.5590, -0.6772, -0.4924,
          1.3818, -0.1223]])
tensor(0.8237, grad_fn=<MseLossBackward>)


In [12]:
# now if we follow loss in backward direction, using its .grad_fn attribute, we will see a graph of computations that looks like this:
# input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
#       -> view -> linear -> relu -> linear -> relu -> linear
#       -> MSELoss
#       -> loss

In [13]:
# So when we call loss.backward(), the whole graph is differentiated w.r.t. the loss, and all Tensors in the graph that have requires_grad=True will have their .grad Tensor accumulated with the gradient.

# Walk backwards from the loss, following the first input of each graph node.
print(loss.grad_fn)   # MSELoss
print(loss.grad_fn.next_functions[0][0])   # Linear (shown as ThAddmmBackward in the recorded output)
# NOTE(review): the recorded output for the next line is ExpandBackward, not ReLU;
# which node sits at this position presumably depends on the PyTorch version — confirm.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<MseLossBackward object at 0x000001CBC13CCFD0>
<ThAddmmBackward object at 0x000001CBC13CCE80>
<ExpandBackward object at 0x000001CBC13CCFD0>


In [14]:
# Backprop
# To backpropagate the error, all we have to do is call loss.backward().
# You need to clear the existing gradients though, else gradients will be accumulated to existing gradients.
net.zero_grad()   # reset the gradients left over from the earlier backward call

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)   # all zeros in the recorded run

loss.backward()   # accumulates the gradient of the loss into each parameter's .grad

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0123, -0.0140, -0.0122, -0.0031, -0.0040,  0.0202])


In [15]:
# Update the weights
# The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):
    # weight = weight - learning_rate * gradient
# We can implement this using simple python code:
learning_rate = 0.01
# Apply the step under no_grad() so the in-place update is not recorded by
# autograd; this replaces direct access through the discouraged `.data` attribute.
with torch.no_grad():
    for param in net.parameters():
        if param.grad is None:
            # No gradient has been computed for this parameter, so there is nothing to apply.
            continue
        param.sub_(param.grad * learning_rate)

In [16]:
# However, as we are using neural networks, we want to use various different update rules such as SGD, Nesterov-SGD, Adam, RMSProp etc.
# To enable this, a small package torch.optim implements all these methods.

In [17]:
import torch.optim as optim

# create the optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)   # same learning rate as the manual update earlier

# one iteration of a training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)   # forward pass with the current weights
loss = criterion(output, target)
loss.backward()   # fill the .grad of every parameter
optimizer.step()   # Does the update

In [18]:
# Observe how gradient buffers had to be manually set to zero using optimizer.zero_grad().
# This is because gradients are accumulated as explained in Backprop section.