In [21]:
# Import PyTorch under the short alias `t` and display the installed version.
import torch as t
t.__version__

'1.5.0'

In [22]:
# Allocate a 5x3 tensor without initialising its contents. `t.empty` states
# this intent explicitly; the legacy `t.Tensor(5, 3)` constructor treats
# integer arguments as sizes, which is easy to confuse with passing data.
x = t.empty(5, 3)
# Build a tensor from nested Python lists. `t.tensor` infers the dtype from
# the data, so float32 is pinned to keep the same floating-point result the
# legacy constructor produced.
x = t.tensor([[1, 2], [3, 4]], dtype=t.float32)
x

tensor([[1., 2.],
        [3., 4.]])

In [23]:
# Draw a 5x3 tensor of uniform random samples from [0, 1), then show the
# values followed by the shape.
x = t.rand((5, 3))
print(x, x.shape, sep="\n")

tensor([[0.7537, 0.2159, 0.5056],
        [0.4562, 0.9566, 0.8105],
        [0.8472, 0.6623, 0.3249],
        [0.0277, 0.7713, 0.9531],
        [0.2288, 0.3497, 0.0284]])
torch.Size([5, 3])


In [24]:
# requires_grad=True asks autograd to track operations on this tensor.
x = t.ones((2, 2), requires_grad=True)
# Show the tensor and its gradient; the recorded output shows None because
# no backward pass has run yet.
print(x, x.grad, sep="\n")

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
None


In [25]:
# Reduce x to a scalar; the result remembers the operation that produced it
# in its grad_fn attribute.
y = t.sum(x)
print(y, y.grad_fn, sep="\n")

tensor(4., grad_fn=<SumBackward0>)
<SumBackward0 object at 0x000002ADA2F23E08>


In [26]:
# Backpropagate from the scalar y; afterwards x.grad holds dy/dx.
y.backward()
print(y, x.grad, sep="\n")

tensor(4., grad_fn=<SumBackward0>)
tensor([[1., 1.],
        [1., 1.]])


### LeNet

In [90]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LeNet-style convolutional network.

    Two (convolution -> ReLU -> 2x2 max-pool) stages feed three fully
    connected layers. With the default arguments the layers are identical to
    the original hard-coded network: one input channel and ten outputs.

    Args:
        in_channels: Number of channels in the input images (default 1,
            i.e. single-channel images).
        num_classes: Length of the output vector per sample (default 10).

    Note:
        ``fc1`` expects 16 * 5 * 5 = 400 features, which is what the two
        conv/pool stages yield for 32x32 inputs. Inputs whose pooled feature
        map is not 5x5 raise a shape error at ``fc1``.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()

        # Convolutional layers: Conv2d(in_channels, out_channels, kernel_size),
        # so each uses a 5x5 kernel.
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Affine (fully connected) layers.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Run a batch through the network.

        Args:
            x: 4-D tensor (nSamples, in_channels, Height, Width).

        Returns:
            Tensor of shape (nSamples, num_classes) with the raw outputs of
            the last linear layer (no activation applied).
        """
        # Convolution -> activation -> pooling, twice.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Keep the batch dimension and flatten everything else.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
# Instantiate the network and print its layer summary.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


#### 只要在nn.Module的子類中定義了forward函數，backward函數就會自動被實現(利用`autograd`)。在`forward` 函數中可使用任何tensor支持的函數，還可以使用if、for循環、print、log等Python語法，寫法和標準的Python寫法一致。

#### 網絡的可學習參數通過`net.parameters()`返回，`net.named_parameters()`可同時返回可學習的參數及名稱。

In [91]:
# Gather every learnable tensor of the network into a list and count them.
params = [p for p in net.parameters()]
print(len(params))

10


In [92]:
# List each learnable parameter by name together with its shape.
for name, parameters in net.named_parameters():
    print(name, ':', parameters.shape)

conv1.weight : torch.Size([6, 1, 5, 5])
conv1.bias : torch.Size([6])
conv2.weight : torch.Size([16, 6, 5, 5])
conv2.bias : torch.Size([16])
fc1.weight : torch.Size([120, 400])
fc1.bias : torch.Size([120])
fc2.weight : torch.Size([84, 120])
fc2.bias : torch.Size([84])
fc3.weight : torch.Size([10, 84])
fc3.bias : torch.Size([10])


##### 需要注意的是，torch.nn只支持mini-batches，不支持一次只输入一个样本，即一次必须是一个batch。但如果只想输入一个样本，则用 `input.unsqueeze(0)`将batch_size设为1。例如 `nn.Conv2d` 输入必须是4维的，形如$nSamples \times nChannels \times Height \times Width$。可将nSamples设为1，即$1 \times nChannels \times Height \times Width$。

In [93]:
# One random sample shaped (nSamples=1, nChannels=1, Height=32, Width=32),
# drawn from a standard normal distribution.
Input = t.randn((1, 1, 32, 32))
print(Input.shape, Input, sep="\n")

torch.Size([1, 1, 32, 32])
tensor([[[[ 0.3459, -2.0160,  0.1444,  ...,  0.2156, -0.6544, -1.8561],
          [-0.1484,  0.4291,  0.9794,  ..., -0.4742, -0.1758, -2.0740],
          [-1.2784,  1.5764,  0.2369,  ..., -0.4243, -1.1019, -0.8197],
          ...,
          [-0.2057,  1.0419, -1.4551,  ...,  0.4286, -0.9357, -0.4304],
          [ 0.6962, -1.0762,  1.3391,  ...,  1.3006, -1.3452,  1.9534],
          [-0.8572, -0.8661,  1.8086,  ..., -0.9926, -1.0486, -0.8383]]]])


In [94]:
# Forward pass: calling the module runs its forward() on the input batch.
Out = net(Input)
print(Out.shape, Out, sep="\n")

torch.Size([1, 10])
tensor([[ 0.0890,  0.0012, -0.0146,  0.0189,  0.0161, -0.1236, -0.0203,  0.0569,
         -0.0710, -0.0031]], grad_fn=<AddmmBackward>)


#### Loss Function

nn實現了神經網絡中大多數的損失函數，例如nn.MSELoss用來計算均方誤差，nn.CrossEntropyLoss用來計算交叉熵損失。

In [95]:
# Forward pass on the sample input.
Output = net(Input)
# Dummy regression target 0..9, reshaped to (1, 10) and stored as float32.
target = t.arange(10, dtype=t.float32).view(1, 10)
# Mean squared error between the prediction and the target.
criterion = nn.MSELoss()
loss = criterion(Output, target)
loss

tensor(28.6721, grad_fn=<MseLossBackward>)

如果對loss進行反向傳播溯源(使用`grad_fn`屬性)，可看到它的計算圖如下：
```
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d  
      -> view -> linear -> relu -> linear -> relu -> linear 
      -> MSELoss
      -> loss
```
當調用`loss.backward()`時，該圖會動態生成並自動微分，也即會自動計算圖中參數(Parameter)的導數。

In [96]:
# Backpropagation
net.zero_grad() # 把net中所有可學習參數的梯度清0
print('反向傳播之前 conv1.bias的梯度')
print(net.conv1.bias.grad)
loss.backward()
print('反向傳播之後 conv1.bias的梯度')
print(net.conv1.bias.grad)

反向傳播之前 conv1.bias的梯度
None
反向傳播之後 conv1.bias的梯度
tensor([-0.0300,  0.1192,  0.0406, -0.1068, -0.0778,  0.0823])


In [97]:
# Show the gradient accumulated on the first convolution's weights.
print(net.conv1.weight.grad)

tensor([[[[-0.0958, -0.0821, -0.1688, -0.2241, -0.0565],
          [ 0.0513, -0.1301, -0.0328,  0.0529, -0.1633],
          [ 0.0319, -0.0489,  0.0105, -0.1292, -0.0387],
          [ 0.0142,  0.0369,  0.0479, -0.1536, -0.0206],
          [ 0.0713, -0.0626, -0.1043, -0.1084, -0.0190]]],


        [[[-0.0252,  0.1546, -0.0709,  0.0597,  0.1519],
          [-0.0699,  0.0589,  0.0693, -0.0234,  0.1426],
          [ 0.0510,  0.0026, -0.1293, -0.0176,  0.0785],
          [-0.0019,  0.1655,  0.0004, -0.1150,  0.0089],
          [ 0.0150,  0.0066,  0.0283,  0.0059, -0.0356]]],


        [[[-0.0178, -0.0336, -0.0654,  0.0027, -0.0717],
          [-0.0642,  0.1092, -0.1425, -0.1365, -0.0334],
          [-0.0085,  0.0558,  0.0266, -0.1151, -0.0742],
          [ 0.0342, -0.1131, -0.0182, -0.0647, -0.0768],
          [-0.0952, -0.0263, -0.1068,  0.0266, -0.0138]]],


        [[[ 0.1121,  0.0271, -0.0050,  0.0367,  0.1382],
          [ 0.0828,  0.0312, -0.0333,  0.0853,  0.0702],
          [-0.0440,

#### Optimizer

在反向傳播計算完所有參數的梯度後，還需要使用優化方法來更新網絡的權重和參數，例如隨機梯度下降法(SGD)的更新策略如下：
```
weight = weight - learning_rate * gradient
```

手動實現如下：

```python
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)# inplace 減法
```

`torch.optim`中實現了深度學習中絕大多數的優化方法，例如RMSProp、Adam、SGD等，更便於使用，因此大多數時候並不需要手動寫上述代碼。

In [98]:
import torch.optim as optim
# Create an optimizer, specifying the parameters to update and the learning rate
optimizer = optim.SGD(net.parameters(), lr = 0.01)


# During training:
# first zero the gradients (same effect as net.zero_grad())
optimizer.zero_grad()

# Compute the loss
Output = net(Input)
loss = criterion(Output, target)

# Backpropagate
loss.backward()

# Update the parameters
optimizer.step()