In [1]:
import time
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import d2lzh as d2l 

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def batch_norm(is_training,x,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``x`` and return the output plus updated moving statistics.

    Args:
        is_training: If true, normalize with the statistics of the current
            batch and update the moving averages. If false, normalize with the
            ``moving_mean`` / ``moving_var`` that were passed in.
        x: Input tensor. In training mode it must have rank 2 (statistics are
            taken over dim 0, the fully connected case) or rank 4 (statistics
            are taken over dims 0, 2 and 3, the 2-D convolution case).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Small constant added to the variance before the square root.
        momentum: Weight of the previous moving statistic in the update
            ``momentum * old + (1 - momentum) * batch``.

    Returns:
        Tuple ``(y, moving_mean, moving_var)``. In prediction mode the moving
        statistics are returned unchanged; in training mode they are new
        tensors that are not attached to the autograd graph.

    Raises:
        AssertionError: In training mode, if ``x`` is not rank 2 or rank 4.
    """
    if not is_training:
        # Prediction mode: use the moving mean and variance passed in directly.
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(x.shape) in (2, 4)
        if len(x.shape) == 2:
            # Fully connected layer: mean and variance per feature.
            mean = x.mean(dim=0)
            var = ((x - mean) ** 2).mean(dim=0)
        else:
            # 2-D convolution: mean and variance per channel, keeping the
            # reduced dims so the result broadcasts against x.
            mean = x.mean(dim=(0, 2, 3), keepdim=True)
            var = ((x - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        x_hat = (x - mean) / torch.sqrt(var + eps)

        # The running statistics are bookkeeping, not part of the loss. Updating
        # them outside autograd keeps each iteration's graph from being chained
        # onto (and kept alive by) the moving averages.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var

    y = gamma * x_hat + beta
    return y, moving_mean, moving_var

class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    ``gamma`` (scale, initialised to ones) and ``beta`` (shift, initialised to
    zeros) are learnable parameters. ``moving_mean`` and ``moving_var`` are
    registered as buffers, so they follow the module through
    ``.to(...)`` / ``.cuda()`` and are included in ``state_dict()``.
    Checkpoints that lack these two keys need ``load_state_dict(..., strict=False)``.

    Args:
        num_features: Number of features (``num_dims == 2``) or channels
            (any other ``num_dims``) to normalize.
        num_dims: ``2`` gives statistics of shape ``(1, num_features)``;
            any other value gives ``(1, num_features, 1, 1)``.
    """

    def __init__(self,num_features,num_dims):
        super(BatchNorm,self).__init__()

        if num_dims == 2:
            shape = (1,num_features)
        else:
            shape = (1,num_features,1,1)

        # Learnable scale and shift.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta  = nn.Parameter(torch.zeros(shape))

        # Running statistics are state but not parameters, hence buffers.
        # The variance starts at one so that prediction before any training
        # step divides by ~1 rather than by sqrt(eps).
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self,x):
        """Normalize ``x`` and, in training mode, refresh the moving statistics.

        Uses ``self.training`` to choose between batch statistics and the
        stored moving statistics, with ``eps=1e-5`` and ``momentum=0.9``.
        """
        y,new_mean,new_var = batch_norm(self.training,x,self.gamma,self.beta,self.moving_mean,self.moving_var,eps=1e-5,momentum=0.9)

        # Store the statistics detached so the buffers never hold on to the
        # autograd graph of the batch they were computed from.
        self.moving_mean = new_mean.detach()
        self.moving_var = new_var.detach()

        return y

In [2]:
# LeNet-style classifier whose normalization layers are the hand-written
# BatchNorm modules: num_dims=4 after convolutions, num_dims=2 after linear layers.
scratch_bn_layers = []

# Convolution stage 1: conv -> batch norm -> sigmoid -> 2x2 max pooling.
scratch_bn_layers.append(nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5))
scratch_bn_layers.append(BatchNorm(6, num_dims=4))
scratch_bn_layers.append(nn.Sigmoid())
scratch_bn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

# Convolution stage 2: same pattern with 16 output channels.
scratch_bn_layers.append(nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5))
scratch_bn_layers.append(BatchNorm(16, num_dims=4))
scratch_bn_layers.append(nn.Sigmoid())
scratch_bn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

# Flatten, then the fully connected head.
# NOTE(review): 16 * 4 * 4 presumably corresponds to 16 channels of 4x4
# feature maps for 28x28 inputs -- confirm against the data loader.
scratch_bn_layers.append(d2l.FlattenLayer())
scratch_bn_layers.append(nn.Linear(in_features=16 * 4 * 4, out_features=120))
scratch_bn_layers.append(BatchNorm(120, num_dims=2))
scratch_bn_layers.append(nn.Sigmoid())
scratch_bn_layers.append(nn.Linear(in_features=120, out_features=84))
scratch_bn_layers.append(BatchNorm(84, num_dims=2))
scratch_bn_layers.append(nn.Sigmoid())
scratch_bn_layers.append(nn.Linear(in_features=84, out_features=10))

net = nn.Sequential(*scratch_bn_layers)

# Data iterators for Fashion-MNIST.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Optimisation settings and the training run.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9969, train acc 0.789,test acc 0.821,time 3.8
epoch 2, loss 0.2267, train acc 0.865,test acc 0.837,time 2.6
epoch 3, loss 0.1224, train acc 0.878,test acc 0.842,time 2.6
epoch 4, loss 0.0833, train acc 0.885,test acc 0.876,time 2.7
epoch 5, loss 0.0625, train acc 0.891,test acc 0.833,time 2.6


In [5]:
# Same LeNet-style classifier, this time with PyTorch's built-in layers:
# nn.BatchNorm2d after convolutions and nn.BatchNorm1d after linear layers.
builtin_bn_layers = []

# Convolution stage 1: conv -> batch norm -> sigmoid -> 2x2 max pooling.
builtin_bn_layers.append(nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5))
builtin_bn_layers.append(nn.BatchNorm2d(6))
builtin_bn_layers.append(nn.Sigmoid())
builtin_bn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

# Convolution stage 2: same pattern with 16 output channels.
builtin_bn_layers.append(nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5))
builtin_bn_layers.append(nn.BatchNorm2d(16))
builtin_bn_layers.append(nn.Sigmoid())
builtin_bn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

# Flatten, then the fully connected head.
# NOTE(review): 16 * 4 * 4 presumably corresponds to 16 channels of 4x4
# feature maps for 28x28 inputs -- confirm against the data loader.
builtin_bn_layers.append(d2l.FlattenLayer())
builtin_bn_layers.append(nn.Linear(in_features=16 * 4 * 4, out_features=120))
builtin_bn_layers.append(nn.BatchNorm1d(120))
builtin_bn_layers.append(nn.Sigmoid())
builtin_bn_layers.append(nn.Linear(in_features=120, out_features=84))
builtin_bn_layers.append(nn.BatchNorm1d(84))
builtin_bn_layers.append(nn.Sigmoid())
builtin_bn_layers.append(nn.Linear(in_features=84, out_features=10))

net = nn.Sequential(*builtin_bn_layers)

# Data iterators for Fashion-MNIST.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Optimisation settings and the training run.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.3784, train acc 0.764,test acc 0.805,time 2.0
epoch 2, loss 0.3021, train acc 0.858,test acc 0.818,time 1.9
epoch 3, loss 0.1377, train acc 0.877,test acc 0.686,time 1.9
epoch 4, loss 0.0878, train acc 0.886,test acc 0.850,time 1.9
epoch 5, loss 0.0647, train acc 0.891,test acc 0.786,time 1.9
