In [2]:
import torch
from torch import nn

# Toy batch laid out as (batch, sequence, embedding).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Swap the batch and sequence axes to get (sequence, batch, embedding).
# transpose() moves the axes; reshape() would only reinterpret the flat
# memory order and matches a true axis swap only when B == 1.
inputs = inputs.transpose(0, 1)
inputs.size()


torch.Size([2, 1, 3])

In [3]:


class LayerNormalization():
    """Layer normalization over the trailing dimensions of a tensor.

    The last ``len(parameters_shape)`` dimensions of the input are normalized
    to zero mean and unit (biased) variance, then scaled by ``gamma`` and
    shifted by ``beta``. Intermediate values are printed for inspection.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize
            (also the shape of ``gamma`` and ``beta``). A single int is
            treated as a one-element shape.
        eps: Small constant added to the variance before the square root
            to avoid division by zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # A bare int means "normalize the last dimension of this size";
        # wrap it so len() in forward() works.
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Learnable scale (initialized to 1) and shift (initialized to 0).
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and apply the learnable scale and shift.

        Args:
            input: Tensor whose trailing dimensions match ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``input``: ``gamma * y + beta`` where
            ``y = (input - mean) / sqrt(var + eps)``.
        """
        # Negative indices of the trailing dimensions, e.g. [-1] or [-1, -2].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the `input` argument, not a notebook-level global.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        # Biased variance: mean of squared deviations.
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [4]:

# Random activations laid out as (sentence_length, batch_size, embedding_dim).
batch_size, sentence_length, embedding_dim = 3, 5, 8
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 0.1807,  0.5339,  1.3347,  0.5311,  0.5614, -0.0292, -0.7823,
           0.3445],
         [ 0.2114,  0.5260,  1.4093, -0.6194,  1.6101,  0.8548, -0.3781,
          -0.6180],
         [ 0.1744, -0.9059, -0.4876,  2.4480,  2.0974,  0.2210,  2.4788,
          -0.4545]],

        [[-0.8417,  2.0853,  0.3793,  0.5303,  0.5920, -0.3745,  0.5126,
          -0.2652],
         [ 0.9464,  2.2875,  0.7504, -0.8314, -0.7316,  0.0767,  0.1700,
           0.3425],
         [-1.2084,  0.2109, -0.7768,  0.4667,  0.4166, -0.1492, -0.9095,
           0.0707]],

        [[ 0.0082,  0.2929, -0.3586,  1.1938, -2.0075,  0.2972,  0.0097,
           0.1387],
         [ 0.5569, -0.2345, -0.3695,  1.3147, -0.2123, -0.5639, -0.9903,
          -0.3575],
         [ 0.3999, -0.7194, -0.3010,  1.6036, -0.1382,  0.5192,  0.4772,
          -2.8677]],

        [[ 0.4629,  0.9144,  0.6903, -0.1967, -0.0791,  0.6542, -0.5260,
           0.8527],
         [ 1.0473,  0.5346, 

In [5]:
# Normalize over the last dimension only: gamma/beta take the shape of
# the final axis of `inputs`.
layer_norm = LayerNormalization(inputs.shape[-1:])

In [6]:
# Run the forward pass; forward() prints the mean, standard deviation,
# normalized values and final output before returning the result.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[ 0.3344],
         [ 0.3745],
         [ 0.6965]],

        [[ 0.3272],
         [ 0.3763],
         [-0.2349]],

        [[-0.0532],
         [-0.1071],
         [-0.1283]],

        [[ 0.3466],
         [ 0.3664],
         [-0.3126]],

        [[ 0.0848],
         [ 0.2629],
         [-0.2871]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[0.5628],
         [0.8229],
         [1.3229]],

        [[0.8253],
         [0.9312],
         [0.6034]],

        [[0.8481],
         [0.6718],
         [1.2221]],

        [[0.5053],
         [1.2716],
         [0.6448]],

        [[0.8988],
         [1.0231],
         [1.0273]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.2730,  0.3545,  1.7776,  0.3496,  0.4035, -0.6460, -1.9842,
           0.0180],
         [-0.1982,  0.1841,  1.2576, -1.2078,  1.5015,  0.5836, -0.9146,
          -1.2061],
         [-0.3946, -1.2112, -0.8951,  1.3240,  1.0590, -0.3594,  1.3473,
          -0.8700]],

