In [1]:
import torch
import torch.nn as nn
from torch.utils import data as Data
from torchvision import transforms, datasets

In [3]:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the result with updated running stats.

    The mode is inferred from the autograd state: when gradients are disabled
    (e.g. inside ``torch.no_grad()``) the running statistics are used, since
    inference may see only a single sample; otherwise the statistics of the
    current mini-batch are used and the running statistics are refreshed.

    Args:
        X: Input tensor. In training mode it must be 2-D (fully connected
            layer) or 4-D (convolutional layer).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean carried between calls.
        moving_var: Running variance carried between calls.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the *old* running statistic, i.e.
            ``new = momentum * old + (1 - momentum) * batch``. Note this is the
            opposite of the ``momentum`` convention documented for
            ``torch.nn.BatchNorm*``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)``; the two running statistics are
        detached from the autograd graph.
    """
    if not torch.is_grad_enabled():
        # Inference: normalize with the accumulated running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)  # 2-D: fully connected, 4-D: convolution
        if len(X.shape) == 2:
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Reduce over batch and spatial axes: one statistic per channel.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping only, so keep them out of the
        # autograd graph instead of building it and stripping it afterwards.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1 - momentum) * mean
            moving_var = momentum * moving_var + (1 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift with the learnable parameters
    return Y, moving_mean.detach(), moving_var.detach()

In [4]:
class BatchNorm(nn.Module):
    """Batch-normalization layer backed by the notebook's ``batch_norm`` function.

    Args:
        num_features: Number of outputs of the preceding fully connected layer,
            or number of output channels of the preceding convolution.
        num_dims: 2 for fully connected inputs; any other value selects the
            4-D (convolutional) parameter shape.

    Note:
        ``batch_norm`` chooses between batch and running statistics from
        ``torch.is_grad_enabled()``, so calling ``eval()`` alone does not
        switch this layer to inference behaviour; wrap inference in
        ``torch.no_grad()``.
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Learnable scale and shift, initialised to the identity transform.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are registered as buffers so that they follow
        # ``.to(device)`` and are saved/restored through ``state_dict()``.
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and store the refreshed running statistics."""
        # Assigning a tensor to a registered buffer name replaces the buffer.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9)
        return Y

In [5]:
# Convolutional classifier using the hand-written BatchNorm layer after each
# convolution and each hidden linear layer; this variant uses max pooling.
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 4 * 4, 120),  # presumably 16 channels of 4x4 for 28x28 inputs
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

# Same layer stack, but down-sampling with average pooling instead.
net2 = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 4 * 4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

In [6]:
# Training hyperparameters.
lr = 1.0
num_epochs = 10
batch_size = 256

# Only tensor conversion is applied; a Resize((224, 224)) step was left disabled.
transform = transforms.Compose([transforms.ToTensor()])

# Fashion-MNIST train split first, then test split (downloaded on first use).
mnist_train, mnist_test = (
    datasets.FashionMNIST(root='dataset/Fashion_Minist', train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Shuffle only the training batches; load in the main process (num_workers=0).
train_iter = Data.DataLoader(dataset=mnist_train, batch_size=batch_size, shuffle=True, num_workers=0)
test_iter = Data.DataLoader(dataset=mnist_test, batch_size=batch_size, shuffle=False, num_workers=0)


def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Return the classification accuracy of ``net`` over all of ``data_iter``.

    Args:
        net: Callable mapping a batch to per-class scores. If it is a
            ``torch.nn.Module`` it is switched to eval mode (and left there).
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a tensor or a
            list of tensors.
        device: Device to move each batch to. When ``None`` and ``net`` is a
            module with parameters, the device of its first parameter is used;
            if no device can be determined the batches are used where they are.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` if ``data_iter``
        yields no samples.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()
        if device is None:
            # A module without parameters simply leaves ``device`` unset.
            first_param = next(net.parameters(), None)
            if first_param is not None:
                device = first_param.device
    num_correct, num_samples = 0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if device is not None:
                if isinstance(X, list):
                    X = [x.to(device) for x in X]
                else:
                    X = X.to(device)
                y = y.to(device)
            num_correct += torch.sum(torch.argmax(net(X), dim=1) == y).item()
            num_samples += y.shape[0]
    # Return only after every batch has been counted; guard the empty case.
    if num_samples == 0:
        return 0.0
    return num_correct / num_samples


def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD and cross-entropy loss, then report test accuracy.

    Linear and Conv2d weights are re-initialised with Xavier-uniform before
    training. Running loss and accuracy for the current epoch are printed
    roughly five times per epoch and after its last batch; test accuracy is
    evaluated once, after the final epoch.

    Args:
        net: Model to train; moved to ``device`` in place.
        train_iter: Sized iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches used for the final evaluation.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD.
        device: Device on which to train.
    """
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    num_batches = len(train_iter)
    # Never let the logging interval reach zero when there are < 5 batches.
    log_every = max(1, num_batches // 5)
    # Defined up front so the final report works even if no batch was run.
    train_l = train_acc = float('nan')
    for epoch in range(num_epochs):
        # [sum of per-sample loss, number of correct predictions, number of samples]
        metric = [0.0, 0, 0]
        net.train()
        for i, (X, y) in enumerate(train_iter):
            optimizer.zero_grad()
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            # Accumulate a plain float so the statistics do not hold on to
            # autograd history from every step of the epoch.
            metric[0] += l.item() * X.shape[0]
            metric[1] += torch.sum(torch.argmax(y_hat, dim=1) == y).item()
            metric[2] += y.shape[0]
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % log_every == 0 or i == num_batches - 1:
                print('epoch %d, batch %d, loss %.4f, train acc %.3f' % (epoch + 1, i + 1, train_l, train_acc))
    test_acc = evaluate_accuracy_gpu(net, test_iter)
    print(f'loss {train_l:.3f},train acc {train_acc:.3f},test acc {test_acc:.3f}')

In [7]:
# Train the average-pooling variant; fall back to the CPU when no GPU is
# available so the notebook still runs end to end.
train_ch6(net2, train_iter, test_iter, num_epochs, lr, 'cuda' if torch.cuda.is_available() else 'cpu')

training on cuda
epoch 1, batch 47, loss 1.1564, train acc 0.605
epoch 1, batch 94, loss 0.9509, train acc 0.668
epoch 1, batch 141, loss 0.8486, train acc 0.700
epoch 1, batch 188, loss 0.7799, train acc 0.722
epoch 1, batch 235, loss 0.7310, train acc 0.738
epoch 2, batch 47, loss 0.5130, train acc 0.810
epoch 2, batch 94, loss 0.5019, train acc 0.814
epoch 2, batch 141, loss 0.4826, train acc 0.823
epoch 2, batch 188, loss 0.4740, train acc 0.826
epoch 2, batch 235, loss 0.4673, train acc 0.829
epoch 3, batch 47, loss 0.4139, train acc 0.850
epoch 3, batch 94, loss 0.4148, train acc 0.848
epoch 3, batch 141, loss 0.4019, train acc 0.854
epoch 3, batch 188, loss 0.3977, train acc 0.855
epoch 3, batch 235, loss 0.3928, train acc 0.857
epoch 4, batch 47, loss 0.3545, train acc 0.867
epoch 4, batch 94, loss 0.3526, train acc 0.871
epoch 4, batch 141, loss 0.3537, train acc 0.871
epoch 4, batch 188, loss 0.3533, train acc 0.871
epoch 4, batch 235, loss 0.3517, train acc 0.872
epoch 5, ba

In [9]:
# Flattened scale (gamma) and shift (beta) of the first batch-norm layer of net2.
tuple(param.reshape((-1,)) for param in (net2[1].gamma, net2[1].beta))

(tensor([3.4200, 2.5762, 2.6140, 2.8960, 2.0903, 3.6441], device='cuda:0',
        grad_fn=<ReshapeAliasBackward0>),
 tensor([-1.9752,  2.7767,  2.8723,  0.6223, -0.4964, -2.4448], device='cuda:0',
        grad_fn=<ReshapeAliasBackward0>))

In [12]:
# Library implementation: the same layer stack built with PyTorch's own
# BatchNorm2d / BatchNorm1d layers.
net3 = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(256, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

In [13]:
# Train the built-in batch-norm variant; fall back to the CPU when no GPU is
# available so the notebook still runs end to end.
train_ch6(net3, train_iter, test_iter, num_epochs, lr, 'cuda' if torch.cuda.is_available() else 'cpu')

training on cuda
epoch 1, batch 47, loss 1.0964, train acc 0.621
epoch 1, batch 94, loss 0.8633, train acc 0.697
epoch 1, batch 141, loss 0.7476, train acc 0.736
epoch 1, batch 188, loss 0.6780, train acc 0.760
epoch 1, batch 235, loss 0.6325, train acc 0.776
epoch 2, batch 47, loss 0.4119, train acc 0.852
epoch 2, batch 94, loss 0.4054, train acc 0.854
epoch 2, batch 141, loss 0.3952, train acc 0.858
epoch 2, batch 188, loss 0.3943, train acc 0.858
epoch 2, batch 235, loss 0.3909, train acc 0.859
epoch 3, batch 47, loss 0.3439, train acc 0.876
epoch 3, batch 94, loss 0.3557, train acc 0.872
epoch 3, batch 141, loss 0.3483, train acc 0.874
epoch 3, batch 188, loss 0.3439, train acc 0.876
epoch 3, batch 235, loss 0.3426, train acc 0.876
epoch 4, batch 47, loss 0.3398, train acc 0.877
epoch 4, batch 94, loss 0.3355, train acc 0.879
epoch 4, batch 141, loss 0.3285, train acc 0.882
epoch 4, batch 188, loss 0.3272, train acc 0.882
epoch 4, batch 235, loss 0.3229, train acc 0.884
epoch 5, ba