In [100]:
import torch
import torch.nn as nn

In [101]:
# http://d2l.ai/chapter_convolutional-modern/batch-norm.html?highlight=batchnorm2d

In [102]:
class Net(nn.Module):
    """Minimal module wrapping a single parameter-free ``BatchNorm2d`` layer.

    The layer is built with ``affine=False`` (no learnable scale/shift) and
    ``track_running_stats=False`` (no running mean/variance buffers), so the
    module owns no parameters and always normalizes with the statistics of
    the batch it is given.

    Args:
        num_features: Number of channels ``C`` of the ``(N, C, H, W)`` input.
            Defaults to 3, the value that used to be hard-coded.
    """

    def __init__(self, num_features=3):
        super().__init__()
        # `momentum` only drives the running-statistics update, which is
        # disabled here. Pass the documented `None` instead of the bool
        # `False`, which is not a valid momentum type.
        self.bn = nn.BatchNorm2d(
            num_features, affine=False, momentum=None,
            track_running_stats=False)

    def forward(self, x):
        """Return ``x`` normalized per channel by the wrapped layer."""
        return self.bn(x)

In [109]:
# Two layers are created in this cell: `m` is a standalone BatchNorm2d that is
# only inspected later (weight / bias / repr), while `model` (Net) is the
# module that actually transforms `input`.
# Equivalent construction that relies on the default affine=True:
#m = nn.BatchNorm2d(3, momentum=False, track_running_stats=False)
# affine=True => WITH learnable parameters (per-channel weight and bias)
m = nn.BatchNorm2d(3, affine=True, momentum=False, track_running_stats=False)
# Random batch of shape (2, 3, 2, 2).
# NOTE(review): `input` shadows the Python builtin of the same name, and no
# seed is set, so the printed values change on every run.
input = torch.randn(2, 3, 2, 2)
model = Net()
model.train()
output = model(input)
# Show only the first sample of the batch
print(input[0])

tensor([[[-0.2002,  1.8858],
         [ 0.6419, -0.7827]],

        [[-1.7291, -0.5246],
         [ 0.9352, -0.3953]],

        [[-0.2683,  0.1427],
         [-0.0577,  0.6652]]])


In [112]:
# Per-channel mean, reducing over the batch axis and both spatial axes
avg = torch.mean(input, dim=[0,2,3])
print(avg)
# avg[None,:,None,None] lifts the per-channel vector to a 4-D view so it
# broadcasts against `input`.
# NOTE(review): despite its name, `sqr` holds the ABSOLUTE deviation from the
# channel mean (torch.abs), not the squared deviation.
sqr = torch.abs(input - avg[None,:,None,None])
sqr

tensor([-0.0844,  0.0880, -0.0050])


tensor([[[[0.1158, 1.9702],
          [0.7263, 0.6983]],

         [[1.8170, 0.6125],
          [0.8473, 0.4832]],

         [[0.2633, 0.1477],
          [0.0527, 0.6703]]],


        [[[0.0111, 0.3808],
          [0.0566, 2.3086]],

         [[1.6653, 1.4483],
          [2.1870, 0.3385]],

         [[0.6484, 0.8170],
          [0.0263, 0.3597]]]])

In [98]:
# Per-channel mean over the batch and spatial axes
avg = torch.mean(input, dim=[0,2,3])
# NOTE: `var` is a misnomer. It stores sqrt(biased variance + 1e-5), i.e. the
# batch-norm denominator (a standard deviation), not the variance itself.
var = torch.sqrt(torch.var(input, dim=[0,2,3], unbiased=False) + 1e-5)
print(avg)
print(var)

tensor([-0.0109, -0.0147,  0.0013])
tensor([1.0322, 1.0050, 0.9993])


In [99]:
# Standard-normal noise with the shape of `input`, divided by the per-channel
# `var` tensor and then added to `input`.
# NOTE(review): the recorded output is much larger than a (2, 3, 2, 2) tensor,
# so this cell appears to have been executed with a different `input` - confirm.
a = (torch.randn(input.shape) / var[None,:,None,None]) + input
a

tensor([[[[ 0.6877,  0.1461,  1.6874,  ...,  0.4209, -0.3794,  0.4051],
          [-1.0926, -2.4687, -0.7446,  ..., -0.9367,  2.4565,  2.1334],
          [ 2.1169,  0.2763, -1.6185,  ...,  2.5010,  1.3717,  1.0402],
          ...,
          [-0.1888,  1.5552,  0.0081,  ..., -0.4778,  0.3557,  2.3575],
          [-0.1928, -0.8449, -0.7285,  ...,  1.0142,  0.0646,  1.7503],
          [-1.2573,  0.8269,  0.8398,  ...,  1.3521,  0.6122, -0.5174]],

         [[-0.3816, -1.7383,  0.0870,  ...,  2.9246,  2.0962,  0.9667],
          [ 0.1768, -0.8558, -2.2780,  ...,  0.3553,  2.2534, -1.8988],
          [ 0.3465, -0.8680,  2.8089,  ...,  0.9121, -0.3247,  0.7499],
          ...,
          [-3.1822,  1.6501, -0.7749,  ...,  1.6824, -1.0801,  0.3653],
          [ 1.2343, -0.7169, -0.2718,  ...,  0.3745, -0.1327,  0.2272],
          [-2.6126, -1.6869, -1.7422,  ..., -1.0244,  0.6757,  0.4838]],

         [[-2.0979,  0.5308, -1.0360,  ..., -0.9062,  2.8048,  1.1074],
          [-1.3350,  0.0250, -

In [61]:
#avg[None,:,None, None]

In [80]:
# Manual batch normalization: subtract the per-channel mean and divide by
# `var`, which (despite the name) holds sqrt(variance + eps).
(input - avg[None,:,None, None])/(var[None,:,None,None])

tensor([[[[ 0.2129, -1.4905],
          [-1.1706,  1.3179]],

         [[ 2.2721,  0.4208],
          [-0.9273,  0.1711]],

         [[-0.6782, -1.4200],
          [ 0.3269,  1.5030]]],


        [[[ 0.3856, -0.7068],
          [ 0.0461,  1.4054]],

         [[-0.5705, -1.0837],
          [-0.4813,  0.1990]],

         [[-0.1303, -0.7361],
          [-0.4213,  1.5560]]]])

In [72]:
# First sample of the layer output; the recorded values equal the first sample
# of the manual normalization, confirming the two computations agree.
output[0]

tensor([[[ 0.2129, -1.4905],
         [-1.1706,  1.3179]],

        [[ 2.2721,  0.4208],
         [-0.9273,  0.1711]],

        [[-0.6782, -1.4200],
         [ 0.3269,  1.5030]]])

In [64]:
# Learnable per-channel scale of `m` (exists because affine=True); the
# recorded output shows it initialized to ones.
m.weight

Parameter containing:
tensor([1., 1., 1.], requires_grad=True)

In [65]:
# Learnable per-channel shift of `m` (exists because affine=True); the
# recorded output shows it initialized to zeros.
m.bias

Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

In [49]:
# Display the layer's repr to check its configuration (eps, momentum, affine,
# track_running_stats).
m

BatchNorm2d(3, eps=1e-05, momentum=False, affine=True, track_running_stats=False)

In [50]:
# Print every learnable parameter of `model`.
# NOTE(review): no output is recorded for this cell, which is consistent with
# Net building its BatchNorm2d with affine=False (nothing to learn).
for i in model.parameters() :
    print(i)

In [33]:
# Duplicate of the parameter-printing loop, left over from an earlier
# execution (In [33]).
# NOTE(review): the recorded output lists a weight and a bias, which a Net
# built with affine=False would not expose - presumably captured while the
# layer was affine. Confirm, then keep only one of the two cells.
for p in model.parameters() :
    print(p)

Parameter containing:
tensor([1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0.], requires_grad=True)


In [76]:
import torch
from torch import nn
# from d2l import torch as d2l

def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum,
               training=None):
    """Apply batch normalization and return the updated moving statistics.

    Args:
        X: Input tensor, either 2-D ``(batch, features)`` or 4-D
            ``(batch, channels, height, width)``.
        gamma: Scale applied to the normalized values (broadcast against X).
        beta: Shift applied to the normalized values (broadcast against X).
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Small constant added to the variance before the square root.
        momentum: Weight given to the OLD moving statistic in the update
            ``new = momentum * old + (1 - momentum) * batch``. Note that this
            is the opposite of the ``momentum`` convention of
            ``torch.nn.BatchNorm2d``, where it weights the new observation.
        training: ``True`` to normalize with batch statistics and update the
            moving averages, ``False`` to normalize with the moving averages.
            ``None`` (default) keeps the historical behaviour of inferring
            the mode from ``torch.is_grad_enabled()``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)``; the two statistics are
        detached from the autograd graph.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if training is None:
        # Backward-compatible default: gradient tracking implies training
        training = torch.is_grad_enabled()
    if not training:
        # Prediction mode: normalize with the accumulated moving statistics
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4), (
            f"batch_norm expects a 2-D or 4-D input, got {len(X.shape)}-D")
        if len(X.shape) == 2:
            # Fully-connected layer: statistics per feature, over the batch
            mean = X.mean(dim=0)
            var = ((X - mean)**2).mean(dim=0)
        else:
            # Convolutional layer: statistics per channel (axis=1). keepdim
            # preserves a (1, C, 1, 1) shape so broadcasting works below
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean)**2).mean(dim=(0, 2, 3), keepdim=True)
        # Training mode standardizes with the current batch statistics
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Exponential moving average of the batch statistics
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # Scale and shift
    # `.detach()` is the supported replacement for the legacy `.data`: same
    # values, cut from the graph, but still visible to autograd's checks
    return Y, moving_mean.detach(), moving_var.detach()

In [77]:
class BatchNorm(nn.Module):
    """Hand-written batch normalization layer built on ``batch_norm``.

    Args:
        num_features: Number of outputs of a fully-connected layer, or number
            of output channels of a convolutional layer.
        num_dims: 2 for a fully-connected layer; any other value is treated
            as the 4-D convolutional case.
        eps: Constant added to the variance for numerical stability.
            Defaults to the previously hard-coded ``1e-5``.
        momentum: Weight of the old moving statistic in the running-average
            update. Defaults to the previously hard-coded ``0.9``.
    """

    def __init__(self, num_features, num_dims, eps=1e-5, momentum=0.9):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        self.eps = eps
        self.momentum = momentum
        # The scale parameter and the shift parameter (model parameters) are
        # initialized to 1 and 0, respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are state, not parameters. Registering them as
        # buffers makes them part of `state_dict()` and lets `.to()`,
        # `.cuda()` and dtype casts move them together with the module;
        # plain tensor attributes are silently skipped by all of those.
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and store the updated moving statistics."""
        # Safety net kept from the original: if the input lives on another
        # device than the buffers, bring the buffers over
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Assigning a tensor to a registered buffer name updates the buffer,
        # so the new statistics stay registered on the module
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=self.eps, momentum=self.momentum)
        return Y

In [78]:
# Instantiate the hand-written layer for 3 channels and 4-D (convolutional)
# input.
custom_m = BatchNorm(num_features=3, num_dims=4)

In [79]:
# Run the hand-written layer on the same `input`; the recorded values match
# the manual normalization shown earlier in the notebook.
# NOTE: this rebinds `output`, which previously held the result of
# `model(input)`.
output = custom_m(input)
output

tensor([[[[ 0.2129, -1.4905],
          [-1.1706,  1.3179]],

         [[ 2.2721,  0.4208],
          [-0.9273,  0.1711]],

         [[-0.6782, -1.4200],
          [ 0.3269,  1.5030]]],


        [[[ 0.3856, -0.7068],
          [ 0.0461,  1.4054]],

         [[-0.5705, -1.0837],
          [-0.4813,  0.1990]],

         [[-0.1303, -0.7361],
          [-0.4213,  1.5560]]]], grad_fn=<AddBackward0>)

In [117]:
# Draw a single uniform random number and test whether it exceeds 0.5
# (unseeded, so the result varies between runs).
torch.rand(1).item() > 0.5

False