# 批归一化（LeNet）

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import utils as d2l

## dataset

In [2]:
def load_dataset(batch_size, size=None, num_workers=4, root='../Datasets/FashionMNIST'):
    """Build the FashionMNIST train and test ``DataLoader`` objects.

    Args:
        batch_size: number of samples per batch, used for both loaders.
        size: optional target size forwarded to ``torchvision.transforms.Resize``;
            when falsy no resize step is added.
        num_workers: worker count forwarded to both ``DataLoader`` objects.
        root: dataset directory passed to ``torchvision.datasets.FashionMNIST``
            (``download=True`` is always passed alongside it).

    Returns:
        ``(train_generator, test_generator)``; only the training loader shuffles.
    """
    # Optional resize first, then conversion to a tensor.
    steps = []
    if size:
        steps.append(torchvision.transforms.Resize(size=size))
    steps.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(steps)

    # Both splits share the same root directory and transform pipeline.
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True,
                                                    transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True,
                                                   transform=transform)

    train_generator = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True,
                                      num_workers=num_workers)
    test_generator = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False,
                                     num_workers=num_workers)

    return train_generator, test_generator

In [3]:
# Resize to 32 x 32 and print the tensor shapes of the first training batch.
train_generator, test_generator = load_dataset(batch_size=256, size=(32, 32))
first_batch = next(iter(train_generator), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape)

torch.Size([256, 1, 32, 32]) torch.Size([256])


## model 

In [4]:
class LeNet(nn.Module):
    """LeNet-style CNN with sigmoid activations.

    Two conv -> sigmoid -> max-pool stages are followed by three linear layers
    that end in 10 outputs (no softmax is applied here).

    ``fc1`` takes 16*4*4 features, which corresponds to a 1x28x28 input; the
    shape comments below assume that size. A 1x32x32 input would flatten to
    16*5*5 features and would not fit ``fc1``.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: (1, 28, 28) -> conv 5x5 -> (6, 24, 24) -> pool 2x2 -> (6, 12, 12)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.sigmoid1 = nn.Sigmoid()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: (6, 12, 12) -> conv 5x5 -> (16, 8, 8) -> pool 2x2 -> (16, 4, 4)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.sigmoid2 = nn.Sigmoid()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head on the flattened 16*4*4 feature vector.
        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
        self.sigmoid3 = nn.Sigmoid()

        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.sigmoid4 = nn.Sigmoid()

        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw 10-way scores for a batch of images ``x``."""
        out = self.maxpool1(self.sigmoid1(self.conv1(x)))
        out = self.maxpool2(self.sigmoid2(self.conv2(out)))

        # Keep the batch dimension, flatten everything else.
        out = torch.flatten(out, 1)

        out = self.sigmoid3(self.fc1(out))
        out = self.sigmoid4(self.fc2(out))
        return self.fc3(out)
        
        

In [5]:
# Instantiate the network and print its layer structure.
net = LeNet()
print(net)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid1): Sigmoid()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (sigmoid2): Sigmoid()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (sigmoid3): Sigmoid()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (sigmoid4): Sigmoid()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## test

In [6]:
def test(model, test_loader, epoch, device=None):
    """Evaluate classification accuracy over ``test_loader`` and print it.

    The model is switched to eval mode and left in that mode. Predictions are
    the arg-max over dim 1 of the model output, compared against ``y``.

    Args:
        model: network to evaluate.
        test_loader: iterable yielding ``(x, y)`` batches.
        epoch: value interpolated into the printed message.
        device: device each batch is moved to. When ``None`` and ``model`` is a
            ``torch.nn.Module`` with parameters, the device of its first
            parameter is used.
    """
    model.eval()

    if device is None and isinstance(model, torch.nn.Module):
        # Fall back to wherever the model lives; a parameter-free model keeps
        # device=None and the batches are used as they come.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for x, y in test_loader:
            if device is not None:
                x, y = x.to(device), y.to(device)
            acc_sum += (model(x).argmax(dim=1) == y).float().sum().item()
            n += x.shape[0]

    # Accuracy is undefined for an empty loader; report NaN instead of raising.
    accuracy = acc_sum / n if n else float('nan')
    print('Eval epoch {} => acc {:.4f}'.format(epoch, accuracy))

## train 

In [7]:
def train(model, train_loader, loss, optimizer, epoch, device=None):
    """Run one optimisation pass over ``train_loader`` and print its statistics.

    The model is switched to train mode. For every batch the loss is computed,
    gradients are reset and back-propagated, and the optimizer takes one step.
    The printed loss is the mean of the per-batch loss values; the printed
    accuracy is the fraction of samples whose arg-max prediction equals ``y``
    (measured on the forward pass made before each update).

    Args:
        model: network to train.
        train_loader: iterable yielding ``(x, y)`` batches.
        loss: callable ``loss(pred_y, y)`` returning a scalar tensor.
        optimizer: optimizer bound to the model parameters.
        epoch: value interpolated into the printed message.
        device: device each batch is moved to. When ``None`` and ``model`` is a
            ``torch.nn.Module`` with parameters, the device of its first
            parameter is used, so data and model always end up together.
    """
    model.train()

    if device is None and isinstance(model, torch.nn.Module):
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    train_acc, train_loss, num_samples = 0.0, 0.0, 0
    num_batch = 0

    for x, y in train_loader:
        if device is not None:
            x, y = x.to(device), y.to(device)
        pred_y = model(x)
        l = loss(pred_y, y)
        # Clear stale gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        l.backward()
        optimizer.step()

        train_loss += l.item()
        train_acc += (pred_y.argmax(dim=1) == y).float().sum().item()

        num_samples += x.shape[0]
        num_batch += 1

    # Both averages are undefined for an empty loader; report NaN instead of raising.
    mean_loss = train_loss / num_batch if num_batch else float('nan')
    accuracy = train_acc / num_samples if num_samples else float('nan')
    print('Train epoch {} => loss {:.4f}, acc {:.4f}'.
          format(epoch, mean_loss, accuracy))
        

### SGDM  优化器

In [8]:
# SGD with momentum (SGDM)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.5, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)
# Multiply the learning rate by `gamma` after every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # One argument-free step per finished epoch. Passing the 0-based loop index
    # is deprecated and would postpone the first decay by one epoch.
    scheduler.step()
    # The current learning rate is available as optimizer.param_groups[0]['lr'].

cuda
Train epoch 1 => loss 2.3129, acc 0.0995
Eval epoch 1 => acc 0.1000
Train epoch 2 => loss 2.3070, acc 0.1006
Eval epoch 2 => acc 0.1000
Train epoch 3 => loss 1.9592, acc 0.2214
Eval epoch 3 => acc 0.5751
Train epoch 4 => loss 0.7987, acc 0.6879
Eval epoch 4 => acc 0.7504
Train epoch 5 => loss 0.5677, acc 0.7793
Eval epoch 5 => acc 0.7904
Train epoch 6 => loss 0.4811, acc 0.8161
Eval epoch 6 => acc 0.8082
Train epoch 7 => loss 0.4345, acc 0.8392
Eval epoch 7 => acc 0.8357
Train epoch 8 => loss 0.3834, acc 0.8601
Eval epoch 8 => acc 0.8508
Train epoch 9 => loss 0.3490, acc 0.8722
Eval epoch 9 => acc 0.8614
Train epoch 10 => loss 0.3303, acc 0.8795
Eval epoch 10 => acc 0.8751
Train epoch 11 => loss 0.3172, acc 0.8844
Eval epoch 11 => acc 0.8740
Train epoch 12 => loss 0.2973, acc 0.8907
Eval epoch 12 => acc 0.8739
Train epoch 13 => loss 0.2856, acc 0.8952
Eval epoch 13 => acc 0.8904
Train epoch 14 => loss 0.2717, acc 0.9000
Eval epoch 14 => acc 0.8762
Train epoch 15 => loss 0.2708, ac

### Adam 优化器

In [9]:
# Adam
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.01, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)
# Multiply the learning rate by `gamma` after every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # One argument-free step per finished epoch. Passing the 0-based loop index
    # is deprecated and would postpone the first decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 1.3642, acc 0.4644
Eval epoch 1 => acc 0.7413
Train epoch 2 => loss 0.5565, acc 0.7845
Eval epoch 2 => acc 0.8051
Train epoch 3 => loss 0.4548, acc 0.8258
Eval epoch 3 => acc 0.8195
Train epoch 4 => loss 0.4007, acc 0.8482
Eval epoch 4 => acc 0.8399
Train epoch 5 => loss 0.3698, acc 0.8603
Eval epoch 5 => acc 0.8590
Train epoch 6 => loss 0.3441, acc 0.8699
Eval epoch 6 => acc 0.8666
Train epoch 7 => loss 0.3269, acc 0.8769
Eval epoch 7 => acc 0.8681
Train epoch 8 => loss 0.3109, acc 0.8829
Eval epoch 8 => acc 0.8703
Train epoch 9 => loss 0.3055, acc 0.8832
Eval epoch 9 => acc 0.8668
Train epoch 10 => loss 0.2949, acc 0.8880
Eval epoch 10 => acc 0.8791
Train epoch 11 => loss 0.2852, acc 0.8912
Eval epoch 11 => acc 0.8792
Train epoch 12 => loss 0.2734, acc 0.8962
Eval epoch 12 => acc 0.8847
Train epoch 13 => loss 0.2692, acc 0.8983
Eval epoch 13 => acc 0.8787
Train epoch 14 => loss 0.2597, acc 0.9012
Eval epoch 14 => acc 0.8844
Train epoch 15 => loss 0.2556, ac

SGDM 与 Adam 训练过程的区别：
* SGDM: 训练初期，模型性能较差，误差较大，准确率很低；训练后期，模型性能快速提升，最终达到理想的效果。
* Adam: 训练初期，模型很快收敛，误差较小，准确率较高; 训练后期，模型性能提升较慢，最终达到理想的效果。

## 批归一化

批归一化（batch normalization）是为了解决深度模型训练过程中内部协变量偏移（internal covariate shift）的问题。

在训练过程中利用小批量的样本的均值和标准差执行批归一化，不断调整神经网络的中间输出，使得每一个神经层的输入分布在训练过程中保持一致，从而使整个神经网络在各层的中间输出的数值更稳定，较深的神经网络的训练变得容易。

### 全连接层

对于全连接层，批量归一化层通常置于全连接层中的仿射变换和激活函数之间，**使用整个仿射变换的输出做批归一化**

### 卷积层

对于卷积层，批量归一化发生在卷积计算之后、应用激活函数之前。如果卷积计算输出多个通道，需要对这些通道的输出分别做批归一化，且每个通道都拥有独立的拉伸和偏移参数。

### 预测时的批归一化

通过移动平均估算整个训练数据集的样本均值和方差，并在预测时使用它们对隐藏单元z值进行调整。


In [10]:
# batch normalization
def batch_norm(x, gamma, beta, moving_mean, moving_var, eps, momentum, training=False):
    """Apply batch normalization to ``x`` and return the updated moving statistics.

    With ``training=False`` the input is normalized with the supplied moving
    statistics, which are returned unchanged. With ``training=True`` the
    statistics of the current batch (biased variance) are used instead, and the
    moving statistics are updated as ``momentum * old + (1 - momentum) * batch``,
    so ``momentum`` is the weight kept on the OLD value.

    Args:
        x: 2-D input (statistics over dim 0) or 4-D input (statistics over
            dims 0, 2 and 3, i.e. one value per index of dim 1).
        gamma: scale applied to the normalized input.
        beta: shift added after scaling.
        moving_mean: running estimate of the mean.
        moving_var: running estimate of the variance.
        eps: constant added to the variance before the square root.
        momentum: weight of the previous moving statistic in the update.
        training: selects batch statistics (True) or moving statistics (False).

    Returns:
        ``(y, moving_mean, moving_var)``.
    """
    if not training:
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(x.shape) in [2, 4]  # only fully-connected or convolutional inputs
        if len(x.shape) == 2:
            # Fully-connected input: reduce over the batch dimension.
            mean = x.mean(dim=0)
            var = ((x - mean) ** 2).mean(dim=0)
        else:
            # Convolutional input: reduce over every dimension except dim 1,
            # keeping the reduced dims so the result broadcasts against x.
            mean = x.mean(dim=(0, 2, 3), keepdim=True)
            var = ((x - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # Training mode normalizes with the statistics of the current batch.
        x_hat = (x - mean) / torch.sqrt(var + eps)

        # The moving statistics are bookkeeping only. Updating them outside
        # autograd keeps them from dragging an ever-growing graph history along.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1 - momentum) * mean
            moving_var = momentum * moving_var + (1 - momentum) * var

    # Learnable scale and shift.
    y = gamma * x_hat + beta

    return y, moving_mean, moving_var

In [11]:
## BatchNorm layer
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    Args:
        num_feature: number of features (2-D input) or channels (4-D input).
        num_dim: rank of the expected input, 2 or 4.
        eps: constant added to the variance before the square root.
        momentum: forwarded unchanged to ``batch_norm``.
            NOTE(review): ``batch_norm`` uses this value as the weight of the
            OLD moving statistic, so the default 0.1 lets each new batch
            contribute 90% -- confirm that this is the intended convention.
    """

    def __init__(self, num_feature, num_dim, eps=1e-5, momentum=0.1):
        super().__init__()
        assert num_dim in [2, 4]
        if num_dim == 2:
            shape = [1, num_feature]
        else:
            shape = [1, num_feature, 1, 1]

        # Learnable scale and shift, updated by the optimizer.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))

        # Moving statistics are not trained. Registering them as buffers makes
        # them follow the module on .to()/.cuda() and appear in state_dict().
        # The variance starts at one so that evaluating before any training
        # step does not divide by sqrt(eps).
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))
        self.eps = eps
        self.momentum = momentum

    def forward(self, x):
        """Normalize ``x`` and, in training mode, refresh the moving statistics."""
        # Covers inputs that arrive on a device the module was not moved to.
        if self.moving_mean.device != x.device:
            self.moving_mean = self.moving_mean.to(x.device)
            self.moving_var = self.moving_var.to(x.device)
        # self.training is True after model.train() and False after model.eval().
        y, new_mean, new_var = batch_norm(x, self.gamma, self.beta, self.moving_mean,
                                          self.moving_var, self.eps, self.momentum,
                                          training=self.training)
        # Store the statistics detached so no autograd history is kept on them.
        self.moving_mean = new_mean.detach()
        self.moving_var = new_var.detach()

        return y
                

## Model with batch-normalization

In [12]:
class LeNet(nn.Module):
    """LeNet-style CNN with a ``BatchNorm`` layer in front of every sigmoid.

    Each convolution and each of the first two linear layers is followed by
    batch normalization and then the activation; the last linear layer emits
    10 raw scores.

    ``fc1`` takes 16*4*4 features, which corresponds to a 1x28x28 input; the
    shape comments below assume that size.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: (1, 28, 28) -> conv 5x5 -> (6, 24, 24) -> pool 2x2 -> (6, 12, 12)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.batch_norm1 = BatchNorm(num_feature=6, num_dim=4)
        self.sigmoid1 = nn.Sigmoid()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: (6, 12, 12) -> conv 5x5 -> (16, 8, 8) -> pool 2x2 -> (16, 4, 4)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.batch_norm2 = BatchNorm(num_feature=16, num_dim=4)
        self.sigmoid2 = nn.Sigmoid()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head on the flattened 16*4*4 feature vector.
        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
        self.batch_norm3 = BatchNorm(num_feature=120, num_dim=2)
        self.sigmoid3 = nn.Sigmoid()

        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.batch_norm4 = BatchNorm(num_feature=84, num_dim=2)
        self.sigmoid4 = nn.Sigmoid()

        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw 10-way scores for a batch of images ``x``."""
        out = self.sigmoid1(self.batch_norm1(self.conv1(x)))
        out = self.maxpool1(out)

        out = self.sigmoid2(self.batch_norm2(self.conv2(out)))
        out = self.maxpool2(out)

        # Keep the batch dimension, flatten everything else.
        out = torch.flatten(out, 1)

        out = self.sigmoid3(self.batch_norm3(self.fc1(out)))
        out = self.sigmoid4(self.batch_norm4(self.fc2(out)))
        return self.fc3(out)

In [13]:
# Instantiate the batch-normalized network and print its layer structure.
model = LeNet()
print(model)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (batch_norm1): BatchNorm()
  (sigmoid1): Sigmoid()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (batch_norm2): BatchNorm()
  (sigmoid2): Sigmoid()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (batch_norm3): BatchNorm()
  (sigmoid3): Sigmoid()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (batch_norm4): BatchNorm()
  (sigmoid4): Sigmoid()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [14]:
# Print the name and tensor size of every entry in model.named_parameters().
for name, param in model.named_parameters():
    print(name, param.size())

conv1.weight torch.Size([6, 1, 5, 5])
conv1.bias torch.Size([6])
batch_norm1.gamma torch.Size([1, 6, 1, 1])
batch_norm1.beta torch.Size([1, 6, 1, 1])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
batch_norm2.gamma torch.Size([1, 16, 1, 1])
batch_norm2.beta torch.Size([1, 16, 1, 1])
fc1.weight torch.Size([120, 256])
fc1.bias torch.Size([120])
batch_norm3.gamma torch.Size([1, 120])
batch_norm3.beta torch.Size([1, 120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
batch_norm4.gamma torch.Size([1, 84])
batch_norm4.beta torch.Size([1, 84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])


In [15]:
# SGD with momentum (SGDM)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
num_epochs = 20
batch_size = 256
lr, gamma = 0.5, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)
# Multiply the learning rate by `gamma` after every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # One argument-free step per finished epoch. Passing the 0-based loop index
    # is deprecated and would postpone the first decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 0.5923, acc 0.7830
Eval epoch 1 => acc 0.8458
Train epoch 2 => loss 0.3863, acc 0.8597
Eval epoch 2 => acc 0.8408
Train epoch 3 => loss 0.3360, acc 0.8766
Eval epoch 3 => acc 0.8784
Train epoch 4 => loss 0.3063, acc 0.8865
Eval epoch 4 => acc 0.8807
Train epoch 5 => loss 0.2887, acc 0.8940
Eval epoch 5 => acc 0.8785
Train epoch 6 => loss 0.2721, acc 0.8978
Eval epoch 6 => acc 0.8850
Train epoch 7 => loss 0.2605, acc 0.9042
Eval epoch 7 => acc 0.8891
Train epoch 8 => loss 0.2484, acc 0.9081
Eval epoch 8 => acc 0.8908
Train epoch 9 => loss 0.2395, acc 0.9113
Eval epoch 9 => acc 0.8910
Train epoch 10 => loss 0.2267, acc 0.9154
Eval epoch 10 => acc 0.8911
Train epoch 11 => loss 0.2198, acc 0.9179
Eval epoch 11 => acc 0.8899
Train epoch 12 => loss 0.2104, acc 0.9218
Eval epoch 12 => acc 0.8968
Train epoch 13 => loss 0.2093, acc 0.9221
Eval epoch 13 => acc 0.8975
Train epoch 14 => loss 0.1995, acc 0.9262
Eval epoch 14 => acc 0.8966
Train epoch 15 => loss 0.1920, ac

In [16]:
# Adam
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

num_epochs = 20
batch_size = 256
lr, gamma = 0.01, 0.9
model = LeNet().to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)
# Multiply the learning rate by `gamma` after every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=gamma)

train_loader, test_loader = load_dataset(batch_size)

for epoch in range(num_epochs):
    train(model, train_loader, loss, optimizer, epoch+1, device)
    test(model, test_loader, epoch+1, device=device)
    # One argument-free step per finished epoch. Passing the 0-based loop index
    # is deprecated and would postpone the first decay by one epoch.
    scheduler.step()

cuda
Train epoch 1 => loss 0.5298, acc 0.8241
Eval epoch 1 => acc 0.8626
Train epoch 2 => loss 0.3451, acc 0.8748
Eval epoch 2 => acc 0.8650
Train epoch 3 => loss 0.3081, acc 0.8882
Eval epoch 3 => acc 0.8774
Train epoch 4 => loss 0.2779, acc 0.8985
Eval epoch 4 => acc 0.8720
Train epoch 5 => loss 0.2592, acc 0.9048
Eval epoch 5 => acc 0.8777
Train epoch 6 => loss 0.2407, acc 0.9105
Eval epoch 6 => acc 0.8941
Train epoch 7 => loss 0.2302, acc 0.9149
Eval epoch 7 => acc 0.8954
Train epoch 8 => loss 0.2139, acc 0.9215
Eval epoch 8 => acc 0.8955
Train epoch 9 => loss 0.2040, acc 0.9231
Eval epoch 9 => acc 0.8932
Train epoch 10 => loss 0.1945, acc 0.9273
Eval epoch 10 => acc 0.8961
Train epoch 11 => loss 0.1824, acc 0.9319
Eval epoch 11 => acc 0.8954
Train epoch 12 => loss 0.1723, acc 0.9355
Eval epoch 12 => acc 0.8976
Train epoch 13 => loss 0.1678, acc 0.9380
Eval epoch 13 => acc 0.8959
Train epoch 14 => loss 0.1507, acc 0.9439
Eval epoch 14 => acc 0.9026
Train epoch 15 => loss 0.1456, ac

**使用批归一化使得模型能够在前期快速收敛，并使得训练过程更稳定。**