# Layer Normalization
Typically performed to make back propogation smoother and faster
Rescales each of the features to ensure hat all features contribute proportionately


In [7]:
import torch 
from torch import nn

In [21]:
class LayerNormalization():
    def __init__(self,parameters_shape,eps = 1e-5):
        self.parameter_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))
    def forward(self,input):
        dims = [-(i+1) for i in range(len(self.parameter_shape))]
        mean = input.mean(dim = dims,keepdim = True)
        var = ((input - mean)**2).mean(dim = dims,keepdim = True)
        std = (var + self.eps).sqrt()
        y = (input - mean)/std
        out = self.gamma*mean + self.beta
        return out

In [26]:
batch_size = 3
sequence_length = 4
embedding_dim = 8
input = torch.randn(sequence_length,batch_size,embedding_dim)
input

tensor([[[-1.6066e+00, -1.2144e-01,  4.8077e-01,  6.1586e-01,  2.3421e-01,
           1.7061e-01,  2.4429e-04, -1.8443e+00],
         [-2.9803e-01, -6.9081e-01, -9.7087e-01, -2.1962e-01, -7.1620e-01,
           1.8909e-01, -5.5205e-01,  1.9166e+00],
         [ 1.0647e+00,  1.0995e+00, -1.4656e+00, -1.2400e+00, -1.0252e+00,
           5.6818e-01,  1.1265e+00, -1.4623e+00]],

        [[-6.1685e-02, -7.5046e-01,  7.1222e-01, -1.0652e+00,  3.9705e-01,
           8.2210e-01,  3.6211e-01, -3.0005e-01],
         [ 1.1059e+00,  9.4058e-01, -5.4890e-01, -1.6709e-01, -5.3037e-01,
           1.0509e+00, -1.6510e+00, -9.3317e-01],
         [-7.0923e-01,  1.4314e+00, -7.7552e-01, -7.4886e-01, -3.3261e-03,
           1.5952e-01,  1.0939e+00, -7.7413e-01]],

        [[ 1.0554e+00,  1.1050e-01,  3.6262e-01,  2.4911e-01,  1.5173e+00,
           3.3672e-02, -7.6105e-01, -2.3047e-01],
         [ 3.5254e-01, -3.5780e-01,  6.7674e-01, -4.5833e-01,  5.0088e-01,
           4.9306e-01, -8.9852e-01, -1.2132e+0

In [27]:
layer_norm = LayerNormalization(input.size()[-1:])
    

In [28]:
out = layer_norm.forward(input)

In [29]:
out

tensor([[[-0.2588, -0.2588, -0.2588, -0.2588, -0.2588, -0.2588, -0.2588,
          -0.2588],
         [-0.1677, -0.1677, -0.1677, -0.1677, -0.1677, -0.1677, -0.1677,
          -0.1677],
         [-0.1668, -0.1668, -0.1668, -0.1668, -0.1668, -0.1668, -0.1668,
          -0.1668]],

        [[ 0.0145,  0.0145,  0.0145,  0.0145,  0.0145,  0.0145,  0.0145,
           0.0145],
         [-0.0917, -0.0917, -0.0917, -0.0917, -0.0917, -0.0917, -0.0917,
          -0.0917],
         [-0.0408, -0.0408, -0.0408, -0.0408, -0.0408, -0.0408, -0.0408,
          -0.0408]],

        [[ 0.2921,  0.2921,  0.2921,  0.2921,  0.2921,  0.2921,  0.2921,
           0.2921],
         [-0.1131, -0.1131, -0.1131, -0.1131, -0.1131, -0.1131, -0.1131,
          -0.1131],
         [ 0.1475,  0.1475,  0.1475,  0.1475,  0.1475,  0.1475,  0.1475,
           0.1475]],

        [[ 0.0157,  0.0157,  0.0157,  0.0157,  0.0157,  0.0157,  0.0157,
           0.0157],
         [-0.2610, -0.2610, -0.2610, -0.2610, -0.2610, -0.2610, 