# 批归一化（LeNet）

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import utils as d2l

## dataset

In [2]:
def load_dataset(batch_size, size=None, num_workers=4, root='../Datasets/FashionMNIST'):
    """Create the Fashion-MNIST training and test data loaders.

    The dataset is downloaded into ``root`` when it is not already there.

    Args:
        batch_size: Number of samples per batch for both loaders.
        size: Optional target size handed to ``torchvision.transforms.Resize``.
            When falsy (the default) the images are not resized.
        num_workers: Number of worker processes used by each ``DataLoader``.
        root: Directory where the dataset is stored / downloaded to.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles
        its samples; the test loader keeps the dataset order.
    """
    # Pre-processing pipeline: optional resize, then conversion to a tensor.
    steps = []
    if size:
        steps.append(torchvision.transforms.Resize(size=size))
    steps.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(steps)

    def _split(train):
        # Both splits share the same root directory and transform pipeline.
        return torchvision.datasets.FashionMNIST(root=root, train=train, download=True,
                                                 transform=transform)

    mnist_train = _split(True)
    mnist_test = _split(False)

    train_generator = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True,
                                      num_workers=num_workers)
    test_generator = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False,
                                     num_workers=num_workers)

    return train_generator, test_generator

In [3]:
# Resize the images to 32 * 32 and print the shapes of the first training batch.
train_generator, test_generator = load_dataset(batch_size=256, size=(32, 32))
first_batch = next(iter(train_generator), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape)

torch.Size([256, 1, 32, 32]) torch.Size([256])


## model 

In [4]:
class LeNet(nn.Module):
    """LeNet-style CNN for single-channel images, producing 10 class logits.

    The network is two (5x5 convolution -> sigmoid -> 2x2 max-pool) stages
    followed by three fully connected layers. The shape comments below are
    for the default 28x28 input.

    Args:
        input_size: Spatial size of the input images, either an int (square
            images) or a ``(height, width)`` pair. It determines the number of
            input features of ``fc1``. The default of 28 gives 16*4*4 = 256
            features; 32 gives 16*5*5 = 400.

    Raises:
        ValueError: If ``input_size`` is too small to survive both
            convolution / pooling stages.
    """

    def __init__(self, input_size=28):
        super().__init__()
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size
        feat_h = self._feature_extent(height)
        feat_w = self._feature_extent(width)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                'input_size {} is too small for two 5x5 conv + 2x2 pool stages'.format(input_size))

        # Conv2d: in_channels, out_channels, kernel_size, stride=1, padding=0
        # input: 1,28,28
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)  # 6,24,24
        self.sigmoid1 = nn.Sigmoid()
        self.maxpool1 = nn.MaxPool2d(2, 2)  # 6,12,12

        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)  # 16,8,8
        self.sigmoid2 = nn.Sigmoid()
        self.maxpool2 = nn.MaxPool2d(2, 2)  # 16,4,4

        # Linear: in_features, out_features, bias=True
        # fc1 consumes the flattened feature map (16*4*4 for a 28x28 input).
        self.fc1 = nn.Linear(16 * feat_h * feat_w, 120)
        self.sigmoid3 = nn.Sigmoid()

        # fc2
        self.fc2 = nn.Linear(120, 84)
        self.sigmoid4 = nn.Sigmoid()

        # fc3 outputs the raw class scores (no softmax applied here).
        self.fc3 = nn.Linear(84, 10)

    @staticmethod
    def _feature_extent(extent):
        """Return one spatial extent after both conv(5) + pool(2) stages."""
        # Each unpadded 5x5 convolution removes 4 pixels; each pool halves (floor).
        return ((extent - 4) // 2 - 4) // 2

    def forward(self, x):
        """Map a batch of images ``x`` (N, 1, H, W) to logits of shape (N, 10)."""
        x = self.maxpool1(self.sigmoid1(self.conv1(x)))
        x = self.maxpool2(self.sigmoid2(self.conv2(x)))

        # Keep the batch dimension and flatten everything else.
        x = torch.flatten(x, 1)

        x = self.sigmoid3(self.fc1(x))
        x = self.sigmoid4(self.fc2(x))
        return self.fc3(x)
        
        

In [5]:
# Instantiate the network and print its layer structure.
net = LeNet()
print(net)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid1): Sigmoid()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid2): Sigmoid()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (sigmoid3): Sigmoid()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (sigmoid4): Sigmoid()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## test

In [6]:
def test(model, test_loader, epoch, device=None):
    """

    """
    model.eval()  # convert to eval(model)

    if device is None and isinstance(model, torch.nn.Module):
        # if device is None, use the net device
        device = list(model.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)  # load data to device
            acc_sum += (model(x).argmax(dim=1) == y).float().sum().cpu().item()
            n += x.shape[0]

    print('Eval epoch {} => acc {:.4f}'.format(epoch, acc_sum / n))

## train 

In [7]:
def train(model, train_loader, loss, optimizer, epoch, device=None):
    """Train ``model`` for one pass over ``train_loader`` and print the metrics.

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        loss: Callable ``loss(predictions, labels)`` returning a scalar tensor.
        optimizer: Optimizer that updates the model's parameters.
        epoch: Epoch number, used only in the printed message.
        device: Device the batches are moved to. When ``None``, the device of
            the model's first parameter is used (if the model has parameters),
            matching what ``test`` does.

    Returns:
        A ``(mean_batch_loss, accuracy)`` tuple. Both values are NaN when
        ``train_loader`` yields no batches.
    """
    model.train()  # switch to training mode

    if device is None and isinstance(model, torch.nn.Module):
        # No device given: fall back to wherever the model's parameters live.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    loss_sum, correct = 0.0, 0.0
    num_samples, num_batch = 0, 0

    for x, y in train_loader:
        if device is not None:
            x, y = x.to(device), y.to(device)
        pred_y = model(x)
        batch_loss = loss(pred_y, y)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        batch_loss.backward()  # compute gradients
        optimizer.step()       # update parameters

        loss_sum += batch_loss.item()
        correct += (pred_y.argmax(dim=1) == y).float().sum().item()

        num_samples += x.shape[0]
        num_batch += 1

    # The loss is averaged per batch, the accuracy per sample. Without any
    # data both are undefined, so report NaN instead of dividing by zero.
    mean_loss = loss_sum / num_batch if num_batch else float('nan')
    accuracy = correct / num_samples if num_samples else float('nan')
    print('Train epoch {} => loss {:.4f}, acc {:.4f}'.
          format(epoch, mean_loss, accuracy))
    return mean_loss, accuracy
        

### SGDM  优化器

In [8]:
# SGDM: train LeNet with SGD + momentum and a step-wise learning-rate decay.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.5, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)  # SGDM

# Multiply the learning rate by gamma after every 2 scheduler steps (epochs).
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Step once per epoch without the deprecated epoch argument: passing the
    # 0-based epoch index delayed the first decay by one epoch.
    scheduler.step()
    # To inspect the current learning rate: optimizer.param_groups[0]['lr']

cuda
Train epoch 1 => loss 2.3121, acc 0.1003
Eval epoch 1 => acc 0.1000
Train epoch 2 => loss 2.3045, acc 0.1053
Eval epoch 2 => acc 0.1807
Train epoch 3 => loss 1.2315, acc 0.4931
Eval epoch 3 => acc 0.7137
Train epoch 4 => loss 0.6405, acc 0.7522
Eval epoch 4 => acc 0.7588
Train epoch 5 => loss 0.5137, acc 0.8063
Eval epoch 5 => acc 0.8139
Train epoch 6 => loss 0.4506, acc 0.8298
Eval epoch 6 => acc 0.8268
Train epoch 7 => loss 0.4103, acc 0.8480
Eval epoch 7 => acc 0.8363
Train epoch 8 => loss 0.3857, acc 0.8566
Eval epoch 8 => acc 0.8463
Train epoch 9 => loss 0.3660, acc 0.8629
Eval epoch 9 => acc 0.8580
Train epoch 10 => loss 0.3443, acc 0.8721
Eval epoch 10 => acc 0.8570
Train epoch 11 => loss 0.3393, acc 0.8725
Eval epoch 11 => acc 0.8700
Train epoch 12 => loss 0.3289, acc 0.8763
Eval epoch 12 => acc 0.8734
Train epoch 13 => loss 0.3247, acc 0.8798
Eval epoch 13 => acc 0.8731
Train epoch 14 => loss 0.3108, acc 0.8829
Eval epoch 14 => acc 0.8728
Train epoch 15 => loss 0.3064, ac

### Adam 优化器

In [9]:
# Adam: train LeNet with the Adam optimizer and a step-wise learning-rate decay.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.01, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)  # Adam

# Multiply the learning rate by gamma after every 2 scheduler steps (epochs).
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Step once per epoch without the deprecated epoch argument: passing the
    # 0-based epoch index delayed the first decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 1.1344, acc 0.5650
Eval epoch 1 => acc 0.7596
Train epoch 2 => loss 0.5185, acc 0.7993
Eval epoch 2 => acc 0.8056
Train epoch 3 => loss 0.4230, acc 0.8414
Eval epoch 3 => acc 0.8474
Train epoch 4 => loss 0.3606, acc 0.8671
Eval epoch 4 => acc 0.8565
Train epoch 5 => loss 0.3313, acc 0.8763
Eval epoch 5 => acc 0.8635
Train epoch 6 => loss 0.3053, acc 0.8860
Eval epoch 6 => acc 0.8747
Train epoch 7 => loss 0.2922, acc 0.8915
Eval epoch 7 => acc 0.8807
Train epoch 8 => loss 0.2803, acc 0.8956
Eval epoch 8 => acc 0.8800
Train epoch 9 => loss 0.2669, acc 0.8996
Eval epoch 9 => acc 0.8819
Train epoch 10 => loss 0.2548, acc 0.9040
Eval epoch 10 => acc 0.8916
Train epoch 11 => loss 0.2509, acc 0.9047
Eval epoch 11 => acc 0.8795
Train epoch 12 => loss 0.2399, acc 0.9097
Eval epoch 12 => acc 0.8936
Train epoch 13 => loss 0.2337, acc 0.9113
Eval epoch 13 => acc 0.8929
Train epoch 14 => loss 0.2253, acc 0.9147
Eval epoch 14 => acc 0.8922
Train epoch 15 => loss 0.2197, ac

SGDM 与 Adam 训练过程的区别：
* SGDM: 训练初期，模型性能较差，误差较大，准确率很低; 训练后期，模型性能快速提升，最终达到理想的效果。
* Adam: 训练初期，模型很快收敛，误差较小，准确率较高; 训练后期，模型性能提升较慢，最终达到理想的效果。

## 批归一化