# 批归一化（LeNet）

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import utils as d2l

## dataset

In [2]:
def load_dataset(batch_size, size=None, num_workers=4):
    """Build FashionMNIST train and test data loaders.

    Args:
        batch_size: number of samples per batch for both loaders.
        size: optional target size handed to ``Resize``; when falsy the
            images are only converted to tensors.
        num_workers: worker count forwarded to each ``DataLoader``.

    Returns:
        A ``(train_generator, test_generator)`` tuple. Only the training
        loader shuffles its samples.
    """
    root = '../Datasets/FashionMNIST'

    # preprocessing: optional resize first, tensor conversion always last
    steps = [torchvision.transforms.ToTensor()]
    if size:
        steps.insert(0, torchvision.transforms.Resize(size=size))
    pipeline = torchvision.transforms.Compose(steps)

    # one loader per split; the train flag doubles as the shuffle flag
    loaders = []
    for is_train in (True, False):
        split = torchvision.datasets.FashionMNIST(root=root, train=is_train, download=True,
                                                  transform=pipeline)
        loaders.append(data.DataLoader(split, batch_size=batch_size, shuffle=is_train,
                                       num_workers=num_workers))

    train_generator, test_generator = loaders
    return train_generator, test_generator

In [3]:
# resize to 32 * 32 and print the shapes of the first training batch
train_generator, test_generator = load_dataset(batch_size=256, size=(32, 32))
for x, y in train_generator:
    print(x.shape, y.shape)
    break

torch.Size([256, 1, 32, 32]) torch.Size([256])


## model 

![LeNet](../Docs/lenet.png)

In [4]:
class LeNet(nn.Module):
    """LeNet-style classifier: two conv/sigmoid/max-pool stages, then three linear layers.

    The shape comments below assume a 1x32x32 input image; the network
    produces 10 output scores per sample.
    """

    def __init__(self):
        super().__init__()
        # stage 1: 1x32x32 -> conv 5x5 -> 6x28x28 -> pool 2x2 -> 6x14x14
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.sigmoid1 = nn.Sigmoid()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # stage 2: 6x14x14 -> conv 5x5 -> 16x10x10 -> pool 2x2 -> 16x5x5
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.sigmoid2 = nn.Sigmoid()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # classifier head on the flattened 16*5*5 feature vector
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.sigmoid3 = nn.Sigmoid()

        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.sigmoid4 = nn.Sigmoid()

        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (no softmax) for a batch of images."""
        features = self.maxpool1(self.sigmoid1(self.conv1(x)))
        features = self.maxpool2(self.sigmoid2(self.conv2(features)))

        # keep the batch dimension, merge channel and spatial dimensions
        flat = features.flatten(start_dim=1)

        hidden = self.sigmoid3(self.fc1(flat))
        hidden = self.sigmoid4(self.fc2(hidden))
        return self.fc3(hidden)
        
        

In [5]:
# instantiate the network and print its layer structure
net = LeNet()
print(net)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid1): Sigmoid()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid2): Sigmoid()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (sigmoid3): Sigmoid()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (sigmoid4): Sigmoid()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## test

In [6]:
def test(model, test_loader, epoch, device=None):
    """Evaluate the classification accuracy of ``model`` on ``test_loader``.

    The model is switched to eval mode and left in that mode; gradients are
    disabled while the loader is consumed.

    Args:
        model: callable mapping a batch ``x`` to per-class scores.
        test_loader: iterable yielding ``(x, y)`` batches of inputs and labels.
        epoch: value printed in the report line (not used otherwise).
        device: device the batches are moved to. When ``None`` and ``model``
            is an ``nn.Module`` with parameters, the device of its first
            parameter is used.

    Returns:
        The fraction of correctly classified samples as a float, or ``nan``
        when the loader yields no samples.
    """
    model.eval()  # convert to eval(model)

    if device is None and isinstance(model, torch.nn.Module):
        # if device is None, use the net device (if the net has any parameter)
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    correct, n = 0.0, 0
    with torch.no_grad():
        for x, y in test_loader:
            if device is not None:
                x, y = x.to(device), y.to(device)  # load data to device
            correct += (model(x).argmax(dim=1) == y).float().sum().item()
            n += x.shape[0]

    # an empty loader has no defined accuracy; report nan instead of dividing by zero
    acc = correct / n if n else float('nan')
    print('Eval epoch {} => acc {:.4f}'.format(epoch, acc))
    return acc


# The name starts with "test", so pytest would try to collect this helper as a
# test case if the code is ever imported into a test module; opt out explicitly.
test.__test__ = False

## train 

In [7]:
def train(model, train_loader, loss, optimizer, epoch, device=None):
    """Train ``model`` for one pass over ``train_loader`` and report the metrics.

    The model is switched to train mode. For every batch the loss is
    computed, gradients are cleared and back-propagated, and the optimizer
    takes one step.

    Args:
        model: callable mapping a batch ``x`` to per-class scores.
        train_loader: iterable yielding ``(x, y)`` batches of inputs and labels.
        loss: callable ``loss(pred, y)`` returning a scalar tensor.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.
        epoch: value printed in the report line (not used otherwise).
        device: device the batches are moved to. When ``None`` and ``model``
            is an ``nn.Module`` with parameters, the device of its first
            parameter is used, mirroring ``test``.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the
        fraction of correctly classified samples. Either value is ``nan``
        when the loader yields nothing to average over.
    """
    model.train()

    if device is None and isinstance(model, torch.nn.Module):
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    correct, loss_total = 0.0, 0.0
    num_samples, num_batch = 0, 0

    for x, y in train_loader:
        if device is not None:
            x, y = x.to(device), y.to(device)
        pred_y = model(x)
        batch_loss = loss(pred_y, y)
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # compute gradients
        batch_loss.backward()
        # update parameters
        optimizer.step()

        loss_total += batch_loss.item()
        correct += (pred_y.argmax(dim=1) == y).float().sum().item()

        num_samples += x.shape[0]
        num_batch += 1

    # guard the averages so an empty loader reports nan instead of raising
    mean_loss = loss_total / num_batch if num_batch else float('nan')
    acc = correct / num_samples if num_samples else float('nan')
    print('Train epoch {} => loss {:.4f}, acc {:.4f}'.
          format(epoch, mean_loss, acc))
    return mean_loss, acc
        

### SGDM  优化器

In [8]:
# SGDM: train the plain LeNet with SGD + momentum
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.5, 0.9  # initial learning rate and StepLR decay factor
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)  # SGDM

# multiply the learning rate by gamma after every 2 scheduler steps
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size, size=(32, 32))

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Advance the schedule once per epoch. Passing the epoch explicitly is
    # deprecated, and passing the 0-based index delayed each decay by one epoch.
    scheduler.step()
    # scheduler.get_last_lr()[0] holds the learning rate for the next epoch

cuda
Train epoch 1 => loss 2.3122, acc 0.0999
Eval epoch 1 => acc 0.1000
Train epoch 2 => loss 2.3074, acc 0.1006
Eval epoch 2 => acc 0.1000
Train epoch 3 => loss 2.3065, acc 0.1003
Eval epoch 3 => acc 0.1000
Train epoch 4 => loss 1.6388, acc 0.3524
Eval epoch 4 => acc 0.6640
Train epoch 5 => loss 0.6833, acc 0.7350
Eval epoch 5 => acc 0.7603
Train epoch 6 => loss 0.5143, acc 0.8003
Eval epoch 6 => acc 0.8110
Train epoch 7 => loss 0.4454, acc 0.8349
Eval epoch 7 => acc 0.8364
Train epoch 8 => loss 0.3929, acc 0.8570
Eval epoch 8 => acc 0.8533
Train epoch 9 => loss 0.3561, acc 0.8706
Eval epoch 9 => acc 0.8696
Train epoch 10 => loss 0.3324, acc 0.8785
Eval epoch 10 => acc 0.8677
Train epoch 11 => loss 0.3167, acc 0.8843
Eval epoch 11 => acc 0.8754
Train epoch 12 => loss 0.3000, acc 0.8897
Eval epoch 12 => acc 0.8767
Train epoch 13 => loss 0.2869, acc 0.8948
Eval epoch 13 => acc 0.8830
Train epoch 14 => loss 0.2757, acc 0.8990
Eval epoch 14 => acc 0.8870
Train epoch 15 => loss 0.2678, ac

### Adam 优化器

In [9]:
# Adam: train the plain LeNet with the Adam optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.01, 0.9  # initial learning rate and StepLR decay factor
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr) # Adam
# multiply the learning rate by gamma after every 2 scheduler steps
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size, size=(32, 32))

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Advance the schedule once per epoch. Passing the epoch explicitly is
    # deprecated, and passing the 0-based index delayed each decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 1.2432, acc 0.5060
Eval epoch 1 => acc 0.7483
Train epoch 2 => loss 0.5392, acc 0.7904
Eval epoch 2 => acc 0.8075
Train epoch 3 => loss 0.4460, acc 0.8317
Eval epoch 3 => acc 0.8367
Train epoch 4 => loss 0.3947, acc 0.8513
Eval epoch 4 => acc 0.8566
Train epoch 5 => loss 0.3686, acc 0.8612
Eval epoch 5 => acc 0.8600
Train epoch 6 => loss 0.3367, acc 0.8727
Eval epoch 6 => acc 0.8561
Train epoch 7 => loss 0.3246, acc 0.8770
Eval epoch 7 => acc 0.8714
Train epoch 8 => loss 0.3061, acc 0.8836
Eval epoch 8 => acc 0.8648
Train epoch 9 => loss 0.2992, acc 0.8871
Eval epoch 9 => acc 0.8781
Train epoch 10 => loss 0.2869, acc 0.8921
Eval epoch 10 => acc 0.8780
Train epoch 11 => loss 0.2813, acc 0.8928
Eval epoch 11 => acc 0.8795
Train epoch 12 => loss 0.2659, acc 0.8991
Eval epoch 12 => acc 0.8802
Train epoch 13 => loss 0.2625, acc 0.9012
Eval epoch 13 => acc 0.8856
Train epoch 14 => loss 0.2524, acc 0.9030
Eval epoch 14 => acc 0.8859
Train epoch 15 => loss 0.2454, ac

SGDM 与 Adam 训练过程的区别：
* SGDM: 训练初期，模型性能较差，误差较大，准确率很低; 训练后期，模型性能快速提升，最终达到理想的效果。
* Adam: 训练初期，模型很快收敛，误差较小，准确率较高; 训练后期，模型性能提升较慢，最终达到理想的效果。

## 批归一化

批归一化（batch normalization）是为了解决深度模型训练过程中内部协变量偏移（internal covariate shift）的问题。

在训练过程中利用小批量的样本的均值和标准差执行批归一化，不断调整神经网络的中间输出，使得每一个神经层的输入分布在训练过程中保持一致，从而使整个神经网络在各层的中间输出的数值更稳定，较深的神经网络的训练变得容易。

### 全连接层

对于全连接层，批量归一化层通常置于全连接层中的仿射变换和激活函数之间，**使用整个仿射变换的输出做批归一化**

### 卷积层

对于卷积层，批量归一化发生在卷积计算之后，应用于激活函数之前。如果卷积计算输出多个通道，需要对这些通道的输出分别做批归一化，且每个通道都拥有独立的拉伸和偏移参数。

### 预测时的批归一化

通过移动平均估算整个训练数据集的样本均值和方差，并在预测时使用它们对隐藏单元z值进行调整。


In [10]:
# batch normalization
def batch_norm(x, gamma, beta, moving_mean, moving_var, eps, momentum, training=False):
    """Batch-normalize ``x`` and return the updated moving statistics.

    In inference mode (``training=False``) ``x`` is normalized with the given
    moving statistics, which are returned unchanged. In training mode ``x``
    is normalized with the statistics of the current batch and the moving
    statistics are updated as
    ``momentum * moving + (1 - momentum) * batch`` -- i.e. ``momentum`` is
    the weight of the OLD moving value.

    Args:
        x: 2-D input (statistics taken over dim 0) or 4-D input (statistics
            taken over dims 0, 2 and 3, one value per dim-1 channel). The
            rank is only checked in training mode.
        gamma: scale applied to the normalized input.
        beta: shift added after scaling.
        moving_mean: moving mean used for inference.
        moving_var: moving variance used for inference.
        eps: small constant added to the variance before the square root.
        momentum: weight of the previous moving statistics in the update.
        training: whether to use batch statistics and update the moving ones.

    Returns:
        ``(y, moving_mean, moving_var)``. The returned moving statistics are
        detached from the autograd graph.
    """
    if not training:
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        # only support full-connect or convolution inputs
        assert len(x.shape) in [2, 4], 'batch_norm expects a 2-D or 4-D input'
        if len(x.shape) == 2:
            # full-connect layer: mean and variance over the batch dimension
            mean = x.mean(dim=0)
            var = ((x - mean) ** 2).mean(dim=0)
        else:
            # convolution layer: one mean and variance per channel, kept
            # broadcastable against x
            mean = x.mean(dim=(0, 2, 3), keepdim=True)
            var = ((x - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # training mode normalizes with the statistics of the current batch
        x_hat = (x - mean) / torch.sqrt(var + eps)

        # Update the moving statistics outside autograd. Otherwise every
        # update keeps a reference to the previous step's graph and memory
        # grows over training.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1 - momentum) * mean
            moving_var = momentum * moving_var + (1 - momentum) * var

    # scale and shift
    y = gamma * x_hat + beta

    return y, moving_mean, moving_var

In [11]:
## BatchNorm layer
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    Args:
        num_feature: number of features (2-D input) or channels (4-D input).
        num_dim: rank of the expected input, 2 (fully connected) or 4
            (convolution).
        eps: small constant added to the variance for numerical stability.
        momentum: passed straight to ``batch_norm``, where it is the weight
            of the OLD moving statistics.
            NOTE(review): with the default 0.1 the moving statistics are
            90% the latest batch, the opposite of ``torch.nn.BatchNorm2d``'s
            momentum convention -- confirm whether 0.9 was intended.

    ``gamma`` and ``beta`` are learnable parameters. ``moving_mean`` and
    ``moving_var`` are registered buffers, so they follow ``.to(device)``
    and are stored in ``state_dict()`` without receiving gradients.
    """

    def __init__(self, num_feature, num_dim, eps=1e-5, momentum=0.1):
        super(BatchNorm, self).__init__()
        assert num_dim in [2, 4]
        if num_dim == 2:
            shape = [1, num_feature]
        else:
            shape = [1, num_feature, 1, 1]

        # learnable scale and shift, updated by the optimizer
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))

        # Running statistics: state, not parameters. The variance starts at
        # one so inference before any training step does not divide by
        # sqrt(eps).
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))
        self.eps = eps
        self.momentum = momentum

    def forward(self, x):
        """Normalize ``x``; in training mode also refresh the moving statistics."""
        # note: self.training is True after model.train() and False after model.eval()
        y, new_mean, new_var = batch_norm(x, self.gamma, self.beta, self.moving_mean,
                                          self.moving_var, self.eps, self.momentum,
                                          training=self.training)
        if self.training:
            # Store the new statistics in place and outside autograd so the
            # buffers never hold on to the graph of a past forward pass.
            with torch.no_grad():
                self.moving_mean.copy_(new_mean)
                self.moving_var.copy_(new_var)

        return y
                

## Model with batch-normalization

In [12]:
class LeNet(nn.Module):
    """LeNet-style classifier with batch normalization before every sigmoid.

    Each conv/linear layer except the final one is followed by a
    ``BatchNorm`` layer, then the sigmoid activation. The shape comments
    assume a 1x32x32 input image; the network produces 10 output scores
    per sample.
    """

    def __init__(self):
        super().__init__()
        # stage 1: 1x32x32 -> conv 5x5 -> 6x28x28 -> pool 2x2 -> 6x14x14
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.batch_norm1 = BatchNorm(num_feature=6, num_dim=4)
        self.sigmoid1 = nn.Sigmoid()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # stage 2: 6x14x14 -> conv 5x5 -> 16x10x10 -> pool 2x2 -> 16x5x5
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.batch_norm2 = BatchNorm(num_feature=16, num_dim=4)
        self.sigmoid2 = nn.Sigmoid()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # classifier head on the flattened 16*5*5 feature vector
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.batch_norm3 = BatchNorm(num_feature=120, num_dim=2)
        self.sigmoid3 = nn.Sigmoid()

        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.batch_norm4 = BatchNorm(num_feature=84, num_dim=2)
        self.sigmoid4 = nn.Sigmoid()

        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (no softmax) for a batch of images."""
        features = self.sigmoid1(self.batch_norm1(self.conv1(x)))
        features = self.maxpool1(features)

        features = self.sigmoid2(self.batch_norm2(self.conv2(features)))
        features = self.maxpool2(features)

        # keep the batch dimension, merge channel and spatial dimensions
        flat = features.flatten(start_dim=1)

        hidden = self.sigmoid3(self.batch_norm3(self.fc1(flat)))
        hidden = self.sigmoid4(self.batch_norm4(self.fc2(hidden)))
        return self.fc3(hidden)

In [13]:
# instantiate the batch-normalized LeNet and print its layer structure
model = LeNet()
print(model)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (batch_norm1): BatchNorm()
  (sigmoid1): Sigmoid()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (batch_norm2): BatchNorm()
  (sigmoid2): Sigmoid()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (batch_norm3): BatchNorm()
  (sigmoid3): Sigmoid()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (batch_norm4): BatchNorm()
  (sigmoid4): Sigmoid()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [14]:
# show parameters: print the name and shape of every learnable tensor
for name, param in model.named_parameters():
    print(name, param.size())

conv1.weight torch.Size([6, 1, 5, 5])
conv1.bias torch.Size([6])
batch_norm1.gamma torch.Size([1, 6, 1, 1])
batch_norm1.beta torch.Size([1, 6, 1, 1])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
batch_norm2.gamma torch.Size([1, 16, 1, 1])
batch_norm2.beta torch.Size([1, 16, 1, 1])
fc1.weight torch.Size([120, 400])
fc1.bias torch.Size([120])
batch_norm3.gamma torch.Size([1, 120])
batch_norm3.beta torch.Size([1, 120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
batch_norm4.gamma torch.Size([1, 84])
batch_norm4.beta torch.Size([1, 84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])


In [15]:
# SGDM: train the batch-normalized LeNet with SGD + momentum
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
num_epochs = 20
batch_size = 256
lr, gamma = 0.5, 0.9  # initial learning rate and StepLR decay factor
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)  # SGDM

# multiply the learning rate by gamma after every 2 scheduler steps
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size, size=(32, 32))

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Advance the schedule once per epoch. Passing the epoch explicitly is
    # deprecated, and passing the 0-based index delayed each decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 0.5412, acc 0.8043
Eval epoch 1 => acc 0.8558
Train epoch 2 => loss 0.3550, acc 0.8709
Eval epoch 2 => acc 0.8721
Train epoch 3 => loss 0.3102, acc 0.8871
Eval epoch 3 => acc 0.8627
Train epoch 4 => loss 0.2807, acc 0.8974
Eval epoch 4 => acc 0.8851
Train epoch 5 => loss 0.2622, acc 0.9042
Eval epoch 5 => acc 0.8929
Train epoch 6 => loss 0.2394, acc 0.9124
Eval epoch 6 => acc 0.8932
Train epoch 7 => loss 0.2261, acc 0.9170
Eval epoch 7 => acc 0.8996
Train epoch 8 => loss 0.2138, acc 0.9202
Eval epoch 8 => acc 0.8920
Train epoch 9 => loss 0.2058, acc 0.9232
Eval epoch 9 => acc 0.8937
Train epoch 10 => loss 0.1942, acc 0.9283
Eval epoch 10 => acc 0.9002
Train epoch 11 => loss 0.1864, acc 0.9309
Eval epoch 11 => acc 0.9010
Train epoch 12 => loss 0.1781, acc 0.9344
Eval epoch 12 => acc 0.9050
Train epoch 13 => loss 0.1716, acc 0.9356
Eval epoch 13 => acc 0.9017
Train epoch 14 => loss 0.1618, acc 0.9392
Eval epoch 14 => acc 0.9041
Train epoch 15 => loss 0.1553, ac

In [16]:
# Adam: train the batch-normalized LeNet with the Adam optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.01, 0.9  # initial learning rate and StepLR decay factor
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr) # Adam
# multiply the learning rate by gamma after every 2 scheduler steps
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size, size=(32, 32))

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # Advance the schedule once per epoch. Passing the epoch explicitly is
    # deprecated, and passing the 0-based index delayed each decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 0.5292, acc 0.8264
Eval epoch 1 => acc 0.8571
Train epoch 2 => loss 0.3369, acc 0.8781
Eval epoch 2 => acc 0.8695
Train epoch 3 => loss 0.2975, acc 0.8920
Eval epoch 3 => acc 0.8883
Train epoch 4 => loss 0.2632, acc 0.9047
Eval epoch 4 => acc 0.8884
Train epoch 5 => loss 0.2484, acc 0.9092
Eval epoch 5 => acc 0.8869
Train epoch 6 => loss 0.2269, acc 0.9163
Eval epoch 6 => acc 0.8944
Train epoch 7 => loss 0.2177, acc 0.9198
Eval epoch 7 => acc 0.9023
Train epoch 8 => loss 0.2030, acc 0.9248
Eval epoch 8 => acc 0.8947
Train epoch 9 => loss 0.1892, acc 0.9296
Eval epoch 9 => acc 0.9028
Train epoch 10 => loss 0.1790, acc 0.9347
Eval epoch 10 => acc 0.9056
Train epoch 11 => loss 0.1690, acc 0.9374
Eval epoch 11 => acc 0.9004
Train epoch 12 => loss 0.1532, acc 0.9436
Eval epoch 12 => acc 0.9017
Train epoch 13 => loss 0.1469, acc 0.9460
Eval epoch 13 => acc 0.9002
Train epoch 14 => loss 0.1323, acc 0.9513
Eval epoch 14 => acc 0.8995
Train epoch 15 => loss 0.1274, ac

**使用批归一化使得模型能够在前期快速收敛，并使得训练过程更稳定。**