In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected building block: an ``nn.Linear`` plus a role-dependent tail.

    The layers are kept in an ``nn.Sequential`` stored as ``self.linear``:

    * hidden block (``is_output=False``):
      Linear -> Dropout -> BatchNorm1d -> LeakyReLU (in place);
    * output block with ``out_channels == 1``: Linear -> Sigmoid;
    * output block with any other width: Linear -> Softmax over dim 1.

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer carries a bias term.
        dropout: Drop probability; only used by hidden blocks.
        is_output: Build an output head instead of a hidden block.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super(LinearBNAC, self).__init__()
        # Every variant starts with the same affine projection.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            # Hidden block: regularise, normalise, then apply the non-linearity.
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            # Single-unit head: squash to (0, 1).
            stages.append(nn.Sigmoid())
        else:
            # Multi-class head: probabilities along the feature axis.
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Small MLP: three hidden ``LinearBNAC`` blocks followed by an output head.

    Feature widths go ``input_dimention -> 128 -> 64 -> 32 -> output_classes``.
    The head is built with ``is_output=True``, so the network returns the
    activated output of that head rather than raw logits.

    Args:
        input_dimention: Number of features per input sample.
        output_classes: Width of the output head (defaults to 1).
    """

    def __init__(self, input_dimention, output_classes=1):
        super(Model, self).__init__()
        # Hidden stack; each block narrows the representation.
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        # Output head.
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the hidden blocks and the head, in order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Ten-class classifier over 256-dimensional inputs, trained with Adam
# (learning rate 1e-3, L2 weight decay 1e-3).
model = Model(input_dimention=256, output_classes=10)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [5]:
# Toy batch: `batch_size` samples, each with `input_features` standard-normal values.
batch_size = 4
input_features = 256
dummy_input = torch.randn((batch_size, input_features))

# One class index per sample, stored as int64 (torch.long).
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass on the dummy batch. The output head ends in Softmax(dim=1), so
# each row of `output` holds the probabilities for the 10 classes.
# The model has not been switched to eval(), so dropout is active here.
output = model(dummy_input)
print(output)

tensor([[0.0874, 0.1842, 0.0491, 0.0396, 0.1998, 0.0708, 0.1065, 0.0631, 0.0701,
         0.1293],
        [0.0960, 0.1081, 0.0939, 0.0393, 0.1207, 0.1086, 0.0837, 0.1002, 0.0565,
         0.1930],
        [0.1393, 0.0885, 0.1273, 0.1085, 0.0754, 0.0890, 0.1079, 0.0776, 0.1087,
         0.0778],
        [0.0824, 0.2417, 0.0507, 0.1204, 0.0979, 0.0742, 0.0661, 0.0408, 0.0799,
         0.1459]], grad_fn=<SoftmaxBackward>)


In [7]:
# Sanity check: the softmax probabilities of every sample should sum to 1.
output.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

### 計算 CrossEntropy Loss
* 請注意哪一個 Loss 最適合：模型的輸出層已經套用 softmax，因此這裡先對輸出取 log，再搭配 NLLLoss；若直接使用 CrossEntropyLoss，會重複套用一次 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [8]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [9]:
# NLLLoss takes log-probabilities as input. It is used instead of
# CrossEntropyLoss because the model's output layer already applies softmax.
criterion = NLLLoss()

In [10]:
# The model emits softmax probabilities while NLLLoss consumes log-probabilities,
# so take the log first. Probabilities are clamped to the smallest positive
# normal value of their dtype so that a probability that underflowed to 0 cannot
# produce log(0) = -inf (and NaN gradients); ordinary values are left unchanged.
log_probs = torch.log(torch.clamp(output, min=torch.finfo(output.dtype).tiny))
loss = criterion(log_probs, target)

### 完成back propagation並更新梯度

In [11]:
# Backpropagate from the scalar loss; this fills the `.grad` attribute of the
# parameters that contributed to it.
loss.backward()

In [12]:
# Inspect the first Linear layer inside layer1 after backward(): its weight
# matrix and the gradient now stored on it.
print('weight : {}'.format(model.layer1.linear[0].weight))
print('\n')
print('grad : {}'.format(model.layer1.linear[0].weight.grad))

weight : Parameter containing:
tensor([[ 0.0553, -0.0504,  0.0609,  ..., -0.0287, -0.0494,  0.0568],
        [-0.0212, -0.0117, -0.0313,  ..., -0.0621, -0.0211,  0.0202],
        [-0.0071,  0.0176,  0.0494,  ..., -0.0320,  0.0170, -0.0229],
        ...,
        [-0.0461,  0.0043,  0.0245,  ...,  0.0537, -0.0572,  0.0399],
        [-0.0402, -0.0555, -0.0086,  ..., -0.0239,  0.0106,  0.0425],
        [ 0.0436,  0.0020,  0.0081,  ...,  0.0619,  0.0623,  0.0491]],
       requires_grad=True)


grad : tensor([[-4.3168e-03, -8.5267e-04,  2.0987e-03,  ...,  4.6138e-03,
          1.7091e-03, -3.0851e-03],
        [ 1.3485e-02,  1.6925e-02,  1.4458e-02,  ...,  4.4687e-03,
          3.4342e-03, -1.5725e-02],
        [-4.3505e-02, -1.7017e-02,  6.9451e-03,  ...,  3.2420e-02,
          4.4675e-02, -1.2619e-02],
        ...,
        [-5.2509e-07, -1.7769e-07,  1.7438e-07,  ...,  5.1723e-07,
          7.0840e-07, -2.1821e-07],
        [-8.3354e-03, -3.4571e-03, -1.1615e-03,  ...,  2.0523e-03,
       

In [13]:
# Apply one optimizer update to the parameters using the gradients currently
# stored in their `.grad` attributes.
optimizer.step()

In [14]:
# Same inspection after optimizer.step(): the weights have been updated, while
# the stored gradient is unchanged because step() does not clear it.
print('weight : {}'.format(model.layer1.linear[0].weight))
print('\n')
print('grad : {}'.format(model.layer1.linear[0].weight.grad))

weight : Parameter containing:
tensor([[ 0.0563, -0.0494,  0.0599,  ..., -0.0297, -0.0504,  0.0578],
        [-0.0222, -0.0127, -0.0323,  ..., -0.0631, -0.0221,  0.0212],
        [-0.0061,  0.0186,  0.0484,  ..., -0.0330,  0.0160, -0.0219],
        ...,
        [-0.0451,  0.0033,  0.0235,  ...,  0.0527, -0.0562,  0.0389],
        [-0.0392, -0.0545, -0.0076,  ..., -0.0249,  0.0096,  0.0435],
        [ 0.0446,  0.0010,  0.0091,  ...,  0.0629,  0.0613,  0.0501]],
       requires_grad=True)


grad : tensor([[-4.3168e-03, -8.5267e-04,  2.0987e-03,  ...,  4.6138e-03,
          1.7091e-03, -3.0851e-03],
        [ 1.3485e-02,  1.6925e-02,  1.4458e-02,  ...,  4.4687e-03,
          3.4342e-03, -1.5725e-02],
        [-4.3505e-02, -1.7017e-02,  6.9451e-03,  ...,  3.2420e-02,
          4.4675e-02, -1.2619e-02],
        ...,
        [-5.2509e-07, -1.7769e-07,  1.7438e-07,  ...,  5.1723e-07,
          7.0840e-07, -2.1821e-07],
        [-8.3354e-03, -3.4571e-03, -1.1615e-03,  ...,  2.0523e-03,
       

### 清空 gradient

In [15]:
# Reset the accumulated gradients so the next backward() starts from scratch.
# NOTE(review): the recorded output shows zero-filled tensors; newer PyTorch
# releases may set `.grad` to None by default instead - confirm for your version.
optimizer.zero_grad()

In [16]:
# Same inspection after zero_grad(): the weights are untouched, only the stored
# gradient has been cleared.
print('weight : {}'.format(model.layer1.linear[0].weight))
print('\n')
print('grad : {}'.format(model.layer1.linear[0].weight.grad))

weight : Parameter containing:
tensor([[ 0.0563, -0.0494,  0.0599,  ..., -0.0297, -0.0504,  0.0578],
        [-0.0222, -0.0127, -0.0323,  ..., -0.0631, -0.0221,  0.0212],
        [-0.0061,  0.0186,  0.0484,  ..., -0.0330,  0.0160, -0.0219],
        ...,
        [-0.0451,  0.0033,  0.0235,  ...,  0.0527, -0.0562,  0.0389],
        [-0.0392, -0.0545, -0.0076,  ..., -0.0249,  0.0096,  0.0435],
        [ 0.0446,  0.0010,  0.0091,  ...,  0.0629,  0.0613,  0.0501]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
