**Normalizing activations with layer normalization**

Layer normalization is typically applied before and after the multi-head attention module, and once more before the final output layer.

*The main idea behind layer normalization is to adjust the activations (outputs) of a neural network layer so that they have a mean of 0 and a variance of 1 (also known as unit variance).*

In [19]:
import torch
import torch.nn as nn

# Seed the global RNG first so that both the random batch and the layer's
# initial weights come out the same on every run.
torch.manual_seed(123)

# A toy batch: 2 samples, 5 features each.
batch_example = torch.randn(2, 5)

# Linear projection from 5 input features to 6 output features, then ReLU.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)
out = layer(batch_example)

print(out)
print(out.shape)


tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
torch.Size([2, 6])


In [34]:
# Row-wise statistics of the raw layer output, before any normalization.
#
# Choosing `dim`:
#   * `out` is 2D, [rows, columns]. dim=-1 (the same axis as dim=1 here)
#     reduces over the columns and yields one value per row.
#   * dim=0 would instead reduce over the rows and yield one value per column.
#   * For a 3D tensor [batch_size, num_tokens, embedding_size], dim=-1 still
#     means "the last dimension", so the same code works without switching
#     to dim=2.
# keepdim=True keeps the reduced axis as size 1, so the results have shape [2, 1].
# Note: Tensor.var defaults to the unbiased (Bessel-corrected) estimator.
mean = out.mean(dim=-1, keepdim=True)  # mean of the 1st and 2nd row
var = out.var(dim=-1, keepdim=True)    # variance of the 1st and 2nd row

print(mean)
print(var)


tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [35]:
# Apply z-score normalization to each row of `out`.
#
# The pre-normalization statistics are recomputed here under their own names
# rather than read from the globals `mean`/`var`, because this cell rebinds
# those two names below. Reading them would make the cell non-idempotent: on a
# second run it would subtract a mean of ~0 and divide by sqrt(~1), returning
# `out` essentially unchanged.
raw_mean = out.mean(dim=-1, keepdim=True)
raw_var = out.var(dim=-1, keepdim=True)
out_norm = (out - raw_mean) / torch.sqrt(raw_var)

# Statistics of the normalized rows; the following cells print these as
# `mean` and `var`.
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)


In [36]:
# Show the normalized activations followed by their per-row mean and variance.
print(out_norm, mean, var, sep="\n")


tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [37]:
# Turn off scientific notation in tensor printing so that near-zero means
# are shown as 0.0000 instead of values like 9.9341e-09.
# This is a global print setting and also affects later cells.
torch.set_printoptions(sci_mode=False)

print(mean, var, sep="\n")

tensor([[0.0000],
        [0.0000]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


**A layer normalization class**

In [42]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each slice along the last dimension is shifted to zero mean and scaled to
    unit variance (using the biased variance estimate), then transformed by a
    learnable per-feature ``scale`` and ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the number of
            entries in the ``scale`` and ``shift`` parameters.
        eps: Small constant added to the variance before taking the square
            root, to avoid division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply scale/shift.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift
    


In [44]:
# Sanity-check the LayerNorm module on the two 5-feature samples.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

# Use the biased variance (unbiased=False) here as well, matching the
# estimator that LayerNorm.forward divides by.
var = out_ln.var(dim=-1, keepdim=True, unbiased=False)
mean = out_ln.mean(dim=-1, keepdim=True)

print(mean, var, sep="\n")


tensor([[-0.0000],
        [ 0.0000]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
