<a href="https://colab.research.google.com/github/adil-1522/TRANSFORMER_pytorch/blob/main/LayerNormalization_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import torch
from torch import nn

In [8]:
inputs=torch.tensor([[[0.2,0.1,0.3],[0.5,0.1,0.1]]])
B,S,E = inputs.size()
inputs = inputs.reshape(S,B,E)
inputs.size()

torch.Size([2, 1, 3])

In [10]:
parameter_shape = inputs.size()[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))


In [11]:
gamma.size(),beta.size()

(torch.Size([1, 3]), torch.Size([1, 3]))

In [12]:
dims = [-(i+1) for i in range(len(parameter_shape))]

In [13]:
dims

[-1, -2]

In [14]:
mean = inputs.mean(dim=dims,keepdim = True)
mean.size()

torch.Size([2, 1, 1])

In [15]:
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [16]:
var = ((inputs-mean)**2).mean(dim=dims,keepdim=True)
epsilon = 1e-5
std = (var+epsilon).sqrt()
std

tensor([[[0.0817]],

        [[0.1886]]])

In [18]:
y=(inputs-mean)/std
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [19]:
out=gamma*y+beta
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

**Class**

In [24]:
import torch
from torch import nn

class LayerNormalization():
  def __init__(self,parameter_shape,eps=1e-5):
    self.parameter_shape = parameter_shape
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(parameter_shape))
    self.beta = nn.Parameter(torch.zeros(parameter_shape))

  def forward(self,input):
    dims = [-(i+1) for i in range(len(parameter_shape))]
    mean = input.mean(dim=dims,keepdim = True)
    print(f"Mean \n ({mean.size()}): \n {mean}")
    var = ((input-mean)**2).mean(dim=dims,keepdim=True)
    std = (var+self.eps).sqrt()
    print(f"Standard Deviation \n ({std.size()}): \n {std}")
    y = (input-mean)/std
    print(f"y \n ({y.size()}): \n {y}")
    out = self.gamma*y+self.beta
    print(f"out \n ({out.size()}): {out}")
    return out


In [26]:
batch_size = 3
sentence_length = 5
embedding_dim = 8
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")


input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.5897,  1.4293,  1.6763,  1.8592, -0.1880, -0.1936,  0.1586,
           0.4671],
         [ 1.9739, -1.7182,  0.0953,  0.2840, -0.6275,  0.8292,  1.0407,
          -0.6550],
         [-0.9478,  0.4424,  0.6518, -0.2782, -0.8746,  0.9107,  0.5408,
           1.3074]],

        [[ 0.0946, -1.3505,  1.1117,  0.5447, -0.3188, -1.2377,  0.0636,
          -0.3960],
         [ 2.2841,  0.6063, -0.2040,  0.1069, -1.7385,  0.7208, -1.7329,
           0.8244],
         [ 0.0400, -1.1448,  0.6123, -0.0143,  0.9370,  0.3978, -1.1225,
          -0.6048]],

        [[ 0.4680,  0.9843,  0.0078,  0.9950, -0.1463, -0.6562,  1.1002,
           0.6854],
         [ 1.8417,  0.9190, -1.2468, -0.6705, -1.1556, -0.6820,  0.8783,
           0.4110],
         [ 0.4326, -0.5072, -0.4974, -0.3480,  0.5087,  0.2125,  3.0681,
           0.9062]],

        [[ 0.4317, -0.7807,  0.7469, -0.6179, -0.2498,  1.7079, -0.8894,
           0.2271],
         [ 1.0359, -0.3672, 

In [27]:
layer_norm = LayerNormalization(inputs.size()[-1:])

In [28]:
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 1, 1])): 
 tensor([[[ 0.3164]],

        [[-0.0634]],

        [[ 0.3129]],

        [[ 0.2686]],

        [[ 0.3525]]])
Standard Deviation 
 (torch.Size([5, 1, 1])): 
 tensor([[[0.9419]],

        [[0.9634]],

        [[0.9638]],

        [[0.9187]],

        [[0.9876]]])
y 
 (torch.Size([5, 3, 8])): 
 tensor([[[-0.9620,  1.1814,  1.4437,  1.6378, -0.5355, -0.5415, -0.1675,
           0.1600],
         [ 1.7596, -2.1600, -0.2347, -0.0345, -1.0021,  0.5444,  0.7689,
          -1.0312],
         [-1.3421,  0.1338,  0.3561, -0.6313, -1.2644,  0.6309,  0.2382,
           1.0520]],

        [[ 0.1640, -1.3360,  1.2197,  0.6311, -0.2651, -1.2190,  0.1318,
          -0.3453],
         [ 2.4367,  0.6951, -0.1460,  0.1767, -1.7388,  0.8139, -1.7330,
           0.9215],
         [ 0.1073, -1.1226,  0.7013,  0.0509,  1.0383,  0.4787, -1.0993,
          -0.5620]],

        [[ 0.1610,  0.6966, -0.3165,  0.7077, -0.4765, -1.0055,  0.8169,
           0.3865],
         [ 1.5863

In [29]:
out[0].mean(),out[1].std()

(tensor(-3.9736e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))