In [2]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

We use a module to define layers:

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        
    def forward(self,x):
        '''Used to define the output of the network'''
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        
        # .view simply performs a reshape
        x = x.view(-1, self.num_flat_features(x))
        
        # Apply our non-linear transformations. Note, no normalization in output layer
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
    
net = Net()
print(net)

Net (
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear (400 -> 120)
  (fc2): Linear (120 -> 84)
  (fc3): Linear (84 -> 10)
)


Having defined the forward operation, `backwards` is automatically defined using autograd.

In [4]:
params = list(net.parameters())
print(len(params)) # This is weights x 5 + biases x 5?
print(params[0].size())
print(params[1].size())

10
torch.Size([6, 1, 5, 5])
torch.Size([6])


In [5]:
input = Variable(torch.randn(1, 1, 32, 32)) # Inputs are Variables
out = net(input)
print(out)

Variable containing:
 0.0249  0.0601  0.1530 -0.0804 -0.0951  0.1624 -0.0149 -0.1549  0.0246  0.0109
[torch.FloatTensor of size 1x10]



There is no concept of a single example input: all inputs have a batch dimension

In [6]:
input = Variable(torch.randn(10, 1, 32, 32)) # Inputs are Variables
out = net(input)
print(out)

Variable containing:
 0.0019  0.0837  0.1345 -0.0872 -0.1118  0.1922  0.0327 -0.1203  0.0101  0.0264
 0.0026  0.0672  0.1460 -0.0898 -0.0836  0.1745  0.0239 -0.1283  0.0158  0.0238
 0.0093  0.0786  0.1222 -0.0836 -0.1114  0.1349  0.0013 -0.1126  0.0274  0.0136
 0.0129  0.0640  0.1037 -0.0885 -0.1120  0.1566  0.0100 -0.1213  0.0460  0.0064
 0.0075  0.0696  0.1498 -0.0649 -0.1061  0.1430  0.0279 -0.1167  0.0275  0.0397
-0.0169  0.0872  0.1331 -0.0714 -0.1036  0.1531 -0.0005 -0.1374  0.0372  0.0090
 0.0266  0.0553  0.1458 -0.0767 -0.0857  0.1571  0.0084 -0.1206  0.0267  0.0239
 0.0150  0.1086  0.1478 -0.0777 -0.1126  0.1626 -0.0151 -0.1264  0.0134  0.0314
 0.0062  0.0858  0.1271 -0.0827 -0.1105  0.1486  0.0137 -0.1179  0.0230  0.0064
 0.0159  0.0740  0.1464 -0.0614 -0.1249  0.1153 -0.0037 -0.1509  0.0485  0.0021
[torch.FloatTensor of size 10x10]



Zero the gradients and then initialize with random example.

In [8]:
net.zero_grad()
out.backward(torch.randn(10, 10))

## Defining loss

Loss is defined using the `nn` module

In [20]:
input = Variable(torch.randn(1, 1, 32, 32)) 
output = net(input)
target = Variable(torch.arange(1, 11))  # a dummy target, for example
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss) # Note that this automatically sums across examples

Variable containing:
 38.5496
[torch.FloatTensor of size 1]



In [21]:
loss.grad_fn

<torch.autograd.function.MSELossBackward at 0x1117fc228>

In [22]:
loss.backward() # The whole graph is differentiated
# All the graph variables now have their .grad updated:

In [27]:
net.conv1.weight.grad

Variable containing:
(0 ,0 ,.,.) = 
 -0.1817  0.0387  0.2465  0.0134  0.0687
 -0.0709  0.1390  0.0607 -0.0733  0.0229
 -0.0333 -0.3088  0.1943  0.0076 -0.3491
  0.1605  0.0167  0.2333  0.0434  0.0548
 -0.1442 -0.0616 -0.0858  0.0664 -0.0981

(1 ,0 ,.,.) = 
  0.1851  0.2014  0.2626 -0.1695  0.1036
  0.1662  0.2452  0.0833 -0.2856  0.2386
 -0.1514 -0.1682  0.0652 -0.0119  0.1201
  0.1846  0.2221 -0.5222  0.0968  0.2813
 -0.3031 -0.1098  0.0267  0.1611 -0.0775

(2 ,0 ,.,.) = 
 -0.2471 -0.1852  0.1820  0.0186 -0.1737
 -0.2447  0.1666  0.2954  0.0391  0.2407
  0.2722  0.2943  0.0079  0.3076 -0.2122
 -0.4312 -0.1070  0.0499 -0.0747 -0.1921
 -0.1333  0.2013 -0.1804  0.2001 -0.1866

(3 ,0 ,.,.) = 
 -0.1762 -0.0049  0.1020 -0.1775 -0.2316
  0.0612  0.0491 -0.5485 -0.1238 -0.0899
  0.1017 -0.0156 -0.0949 -0.1207 -0.0123
 -0.2272 -0.1303  0.0264  0.0550 -0.2789
 -0.1655  0.1447  0.3437  0.0388  0.2036

(4 ,0 ,.,.) = 
  0.4420  0.1717 -0.1792 -0.1662 -0.1717
  0.2337 -0.1935 -0.1935  0.0074 -0.384

In [26]:
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

<torch.autograd.function.MSELossBackward object at 0x1117fc228>
<torch.autograd.function.AddmmBackward object at 0x1117fc138>
<AccumulateGrad object at 0x1117422b0>


## Updating weights

In [28]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

This would be placed in a loop for training:

In [29]:
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update