In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3, padding=2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16*28*28, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.relu(self.conv1(x))
        # If the size is a square you can only specify a single number
        x = F.relu(self.conv2(x))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=12544, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [50]:
input = torch.randn(1, 1, 28, 28)
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

torch.Size([1, 16, 28, 28])
tensor(0.9511, grad_fn=<MseLossBackward>)


In [56]:
net.zero_grad()
loss.backward()
print(net.conv1.bias.grad)

tensor([-0.0011,  0.0100,  0.0023,  0.0042, -0.0052,  0.0050])


In [59]:
import torch.optim as optim

In [60]:
# new optimizer, first arg is all weights
optimizer = optim.SGD(net.parameters(), lr=0.1)

In [67]:
examples = [(torch.randn(1, 1, 28, 28), torch.randn(1)) for _ in range(5)]

for input, target in examples:
    optimizer.zero_grad() # reset gradient buffers
    output = net(input) # run net on input, get output
    loss = criterion(output, target) # compute loss
    loss.backward() # compute gradient (dweights/dloss)
    optimizer.step() # update weights (using SGD for example)


torch.Size([1, 16, 28, 28])
torch.Size([1, 16, 28, 28])
torch.Size([1, 16, 28, 28])
torch.Size([1, 16, 28, 28])
torch.Size([1, 16, 28, 28])


  return F.mse_loss(input, target, reduction=self.reduction)
