In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [15]:
class Net(nn.Module):
    """Small convolutional network: two conv/ReLU/max-pool stages, then three linear layers.

    The first linear layer expects 16 feature maps of 5x5 per sample. A
    1-channel 32x32 input produces exactly that:
    32 -> 28 (5x5 conv) -> 14 (2x2 pool) -> 10 (5x5 conv) -> 5 (2x2 pool).
    """

    def __init__(self):
        super().__init__()

        # Convolutional stages: 5x5 kernels, 1 -> 6 -> 16 channels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)

        # Fully connected head: 16 * 5 * 5 = 400 -> 120 -> 84 -> 10 features.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch ``x`` through the network and return 10 raw outputs per sample."""
        # Stage 1: convolution, ReLU, then max pooling over a (2, 2) window.
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))

        # Stage 2: same pattern; a single int is enough for a square window.
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)

        # Collapse every non-batch dimension into one feature axis.
        flat = pooled.view(-1, self.num_flat_features(pooled))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the last layer: the raw outputs are returned.
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        # Dropping the leading entry leaves the per-sample shape; numel() multiplies it out.
        return x.size()[1:].numel()
    
    
# Instantiate the network (later cells reuse `net`) and print its layer summary.
net = Net()
print(net)
        
        

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [16]:
# net.parameters() is lazy: it hands back a generator rather than a list.
print(type(net.parameters()))

# Materialize it once so the learnable tensors can be counted and inspected.
params = list(net.parameters())
print(len(params))

# Order is weight then bias for each layer: conv1, conv2, fc1, fc2, fc3.
for param in params:
    print(param.shape)

<class 'generator'>
10
torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


In [28]:
# Smoke-test the network with a random 32x32 input.
# NOTE(review): `input` shadows the Python builtin of the same name; later
# cells reuse this variable, so it is left as-is here.
input = torch.randn(1, 1, 32, 32) # (batch, channels, height, width)
out = net(input)
print(out)


tensor([[ 0.0545,  0.0282, -0.0131, -0.0900,  0.0643, -0.0489, -0.1211, -0.0220,
          0.0320, -0.0396]], grad_fn=<AddmmBackward>)


In [29]:
# Zero the gradient buffers of all parameters, then backpropagate a random
# upstream gradient of shape (1, 10), the same shape as `out`.
net.zero_grad()
out.backward(torch.randn(1, 10))


In [39]:
'''
torch.nn은 mini-batch만 취급함, single input은 취급 안함 => 1개짜리의 경우 1개 짜리 mini-batch의 shape를 만들어줘야함
ex) conv2d will take 4D tensor (batch number, # input channels, height, width)
If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.
'''


'\ntorch.nn은 mini-batch만 취급함, single input은 취급 안함 => 1개짜리의 경우 1개 짜리 mini-batch의 shape를 만들어줘야함\nex) conv2d will take 4D tensor (batch number, # input channels, height, width)\nIf you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.\n'

In [59]:
# Compute a loss between the network output and a dummy target.
output = net(input)

# Random stand-in target, reshaped from (10,) to (1, 10) so it has the same shape as `output`.
target = torch.randn(10).view(1, -1)

criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)

tensor(0.6187, grad_fn=<MseLossBackward>)


In [60]:
'''
So, when we call loss.backward(), the whole graph is differentiated w.r.t. the loss, 
and all Tensors in the graph that has requires_grad=True will have their .grad Tensor 
accumulated with the gradient.
'''

'\nSo, when we call loss.backward(), the whole graph is differentiated w.r.t. the loss, \nand all Tensors in the graph that has requires_grad=True will have their .grad Tensor \naccumulated with the gradient.\n'

In [61]:
# Walk a few steps backward through the autograd graph, starting at the loss.
# NOTE(review): the third line was originally labelled "ReLU", but the recorded
# output is an AccumulateGrad node, i.e. a leaf tensor's gradient accumulator
# (presumably a parameter of the last linear layer -- confirm). Entry [0] of
# the Addmm node's next_functions is therefore not the ReLU branch.
print(loss.grad_fn) # MSE Loss
print(loss.grad_fn.next_functions[0][0]) # Linear (reported as Addmm)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # AccumulateGrad (not ReLU)

<MseLossBackward object at 0x113181470>
<AddmmBackward object at 0x1131814e0>
<AccumulateGrad object at 0x113181470>


In [62]:
'''
backprop

To backpropagate the error, all we have to do is call loss.backward().
You need to clear the existing gradients first, though; otherwise the new gradients are accumulated onto the existing ones.
'''

# Reset the gradient buffers of every parameter so the backward pass below
# does not add onto gradients left over from earlier cells.
net.zero_grad()
print('conv1.bias.grad before backward', net.conv1.bias.grad, sep='\n')

# Differentiate the loss; this fills .grad on the network's parameters.
loss.backward()

print('conv1.bias.grad after backward', net.conv1.bias.grad, sep='\n')

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 1.0062e-02, -9.0888e-03, -2.1298e-03,  6.0976e-03,  6.6808e-05,
        -2.5223e-03])


In [64]:
'''
The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):

weight = weight - learning_rate * gradient
However, as you use neural networks, you want to use various different update rules such as SGD, 
Nesterov-SGD, Adam, RMSProp, etc. To enable this, we built a small package: 
torch.optim that implements all these methods. Using it is very simple:
'''

# Build a plain SGD optimizer over all of the network's parameters.
# (`optim` is imported in the notebook's first cell with the other torch modules,
# so a fresh Restart & Run All shows every dependency up front.)
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One training step (not a full loop): clear old gradients, run the forward
# pass, compute the loss, backpropagate, then let the optimizer update.
optimizer.zero_grad()  # gradients accumulate, so start from a clean buffer
output = net(input)
loss = criterion(output, target)

print('before update', net.conv1.bias)

loss.backward()
optimizer.step()  # SGD update: weight = weight - learning_rate * gradient

print('after update', net.conv1.bias)

before update Parameter containing:
tensor([ 0.1585, -0.0590,  0.0572,  0.0071, -0.1634, -0.1086],
       requires_grad=True)
after update Parameter containing:
tensor([ 0.1584, -0.0589,  0.0572,  0.0071, -0.1634, -0.1086],
       requires_grad=True)
