In [1]:
# Standard-library and PyTorch imports used by this notebook.
import time
import torch
from torch import nn,optim
import torch.nn.functional as F
import sys
# Extend the module search path so the local helper package below can be imported.
sys.path.append('../code/')
import d2lzh_pytorch as d2l
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def batch_norm(is_traing,X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``X`` and return the updated running statistics.

    Args:
        is_traing: Mode flag (spelling kept so existing keyword callers keep
            working). When true, ``X`` is normalized with the statistics of
            the current batch and the running statistics are updated. When
            false, ``X`` is normalized with ``moving_mean`` / ``moving_var``
            and those are returned unchanged.
        X: Input tensor. In training mode it must be 2-D (statistics are
            reduced over dim 0) or 4-D (statistics are reduced over dims
            0, 2 and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized input; must broadcast with ``X``.
        beta: Shift added after scaling; must broadcast with ``X``.
        moving_mean: Running estimate of the mean.
        moving_var: Running estimate of the (biased) variance.
        eps: Small constant added to the variance before the square root.
        momentum: Weight given to the *previous* running value; the current
            batch statistic gets ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y = gamma * X_hat + beta``.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_traing:
        # Inference: rely solely on the accumulated running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4), 'batch_norm expects a 2-D or 4-D input'
        if len(X.shape) == 2:
            # Fully connected case: one mean/variance per feature column.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Convolutional case: reduce over every axis except dim 1 and keep
            # the reduced axes so the result broadcasts against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping, not part of the loss. Updating
        # them outside autograd keeps them from holding on to the graph of every
        # previous iteration.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [3]:
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    ``gamma`` (scale, initialised to ones) and ``beta`` (shift, initialised to
    zeros) are learnable parameters. ``moving_mean`` (zeros) and ``moving_var``
    (ones) are registered as buffers, so they are saved in ``state_dict()`` and
    follow the module through ``.to()`` / ``.cuda()`` / ``.double()``.

    Args:
        num_features: Size of dim 1 of the expected input.
        num_dims: ``2`` for inputs shaped ``(N, num_features)``; any other
            value selects the 4-D layout ``(N, num_features, H, W)``.
    """
    def __init__(self,num_features,num_dims):
        super(BatchNorm,self).__init__()
        if num_dims==2:
            shape=(1,num_features)
        else:
            shape=(1,num_features,1,1)
        self.gamma=nn.Parameter(torch.ones(shape))
        self.beta=nn.Parameter(torch.zeros(shape))
        # Running statistics are state but not trainable, hence buffers.
        # The variance starts at one so that evaluating an untrained layer
        # does not divide by sqrt(eps).
        self.register_buffer('moving_mean',torch.zeros(shape))
        self.register_buffer('moving_var',torch.ones(shape))

    def forward(self,X):
        """Normalize ``X`` and, in training mode, refresh the running statistics.

        ``self.training`` (toggled by ``train()`` / ``eval()``) selects between
        batch statistics and the stored running statistics.
        """
        Y,new_mean,new_var=batch_norm(self.training,X,self.gamma,self.beta,
                                      self.moving_mean,self.moving_var,
                                      eps=1e-5,momentum=0.9)
        # Store the statistics detached from autograd so the buffers never keep
        # a computation graph alive between iterations.
        self.moving_mean=new_mean.detach()
        self.moving_var=new_var.detach()
        return Y

### 使用批量归一化层的LeNet

In [5]:
# LeNet variant that inserts the hand-written BatchNorm layer between every
# convolution / linear layer and its sigmoid activation.
net = nn.Sequential(
    # Convolutional stage 1.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2.
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten the feature maps before the fully connected head.
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(num_features=120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),
    # Output layer: 10 logits.
    nn.Linear(in_features=84, out_features=10),
)

In [6]:
# Mini-batch size shared by the data loaders and the training calls.
batch_size = 256
# NOTE(review): the dataset root is an absolute, machine-specific path; confirm
# it exists on the target machine or make it configurable before sharing.
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size,
    root='/workspace/mycode/Dive-into-DL-PyTorch/data',
)

In [11]:
# Peek at the first mini-batch only: print the shape of its input tensor and
# stop iterating immediately.
for x,_ in train_iter:
    print(x.shape)
    break

torch.Size([256, 1, 28, 28])


In [12]:
# Hyper-parameters for training the network currently bound to `net`.
lr = 0.001
num_epochs = 5
# Adam over every parameter of `net`.
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)
d2l.train_ch5(
    net, train_iter, test_iter, batch_size, optimizer, device, num_epochs
)

training on  cuda
epoch 1, loss 0.9897, train acc 0.786, test acc 0.833, time 8.2 sec
epoch 2, loss 0.2261, train acc 0.866, test acc 0.825, time 8.0 sec
epoch 3, loss 0.1208, train acc 0.881, test acc 0.864, time 8.0 sec
epoch 4, loss 0.0825, train acc 0.887, test acc 0.871, time 8.0 sec
epoch 5, loss 0.0610, train acc 0.895, test acc 0.871, time 8.0 sec


In [13]:
# Same LeNet layout, this time using PyTorch's built-in batch-normalization
# layers. NOTE: this rebinds `net`, replacing the model defined earlier.
net = nn.Sequential(
    # Convolutional stage 1.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(num_features=6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2.
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(num_features=16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten the feature maps before the fully connected head.
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    # Output layer: 10 logits.
    nn.Linear(in_features=84, out_features=10),
)

In [14]:
# Same hyper-parameters as the previous run, applied to the rebuilt `net`.
lr = 0.001
num_epochs = 5
# A fresh Adam optimizer is required because `net` now holds new parameters.
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)
d2l.train_ch5(
    net, train_iter, test_iter, batch_size, optimizer, device, num_epochs
)

training on  cuda
epoch 1, loss 0.9959, train acc 0.783, test acc 0.836, time 6.9 sec
epoch 2, loss 0.2290, train acc 0.863, test acc 0.845, time 7.0 sec
epoch 3, loss 0.1216, train acc 0.879, test acc 0.863, time 7.0 sec
epoch 4, loss 0.0817, train acc 0.888, test acc 0.859, time 6.9 sec
epoch 5, loss 0.0607, train acc 0.893, test acc 0.843, time 7.0 sec
