### GPT architecture part 2: Layer Normalization

In [1]:
import torch
import torch.nn as nn

# Seed the global RNG first: the random batch and the Linear layer's initial
# weights are both drawn from it, in exactly this order.
torch.manual_seed(123)

# Toy input: 2 samples with 5 features each.
batch_example = torch.randn(2, 5)

# Linear projection 5 -> 6 followed by a ReLU non-linearity.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [2]:
### The neural network layer we have coded consists of a linear layer followed by a nonlinear activation function (ReLU).

In [3]:
# Per-row statistics over the feature axis (dim=-1). keepdim=True keeps each
# result as a column tensor (one value per row) so it can broadcast against `out`.
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [4]:
### The first row in the mean tensor above contains the mean value for the first input row, and the second row contains
### the mean for the second input row.
### Using keepdim=True in operations like mean or variance calculation ensures that the output tensor retains the same number of 
### dimensions as the input tensor, even though the operation reduces the tensor along the dimension specified via dim.
### For instance, without keepdim=True, the returned mean tensor would be a 1-dimensional tensor with two elements, [0.1324, 0.2170],
### instead of a 2x1-dimensional matrix [[0.1324], [0.2170]].

In [5]:
# Normalize each row of `out` to zero mean and unit variance.
# The row statistics are recomputed from `out` under their own names instead of
# being read from `mean`/`var`, because this cell reassigns `mean`/`var` below.
# Reading them here would make a re-run of this cell normalize with the
# already-normalized statistics (mean ~0, variance 1) and hand back `out` unchanged.
row_mean = out.mean(dim=-1, keepdim=True)
row_var = out.var(dim=-1, keepdim=True)
out_norm = (out - row_mean) / torch.sqrt(row_var)

# Statistics of the normalized outputs. They stay bound to `mean`/`var`
# because the next cell prints those names.
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("Normalized layer outputs:\n", out_norm)
print("Mean:\n", mean)
print("Variance:\n", var)

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [6]:
# Turn off scientific notation for tensor printing so the near-zero mean
# (shown as 9.9341e-09 in the previous output) is displayed as 0.0000.
# This is a global setting and affects every tensor printed after this cell.
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [11]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each slice along the last dimension of the input is normalized to zero mean
    and unit variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` are 1-D parameters of this length.
        eps: Small constant added to the variance before taking the square
            root, to avoid division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to ones/zeros, so the affine transform starts as the identity.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``x`` normalized along its last dimension, then scaled and shifted.

        Args:
            x: Tensor whose last dimension broadcasts against ``scale``/``shift``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1 (no Bessel's correction).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [12]:
### eps is a small constant that is added to the variance to prevent division by zero during normalization.
### The scale and shift are two trainable parameters (of the same dimension as the input) that the LLM automatically adjusts 
### during training if it is determined that doing so would improve the model's performance on its training task.
### This allows the model to learn appropriate scaling and shifting that best suit the data it is processing.

In [13]:
### In our variance calculation, we set unbiased=False as an implementation detail.
### This means we divide by the number of inputs 'n' in the variance formula.
### This approach does not apply Bessel's correction, which typically uses n-1 instead of n in the denominator to adjust for bias
### in sample variance estimation.
### For large scale LLMs where the embedding dimension 'n' is significantly large, the difference between using 'n' and 'n-1' 
### is practically negligible.
### We chose this approach to ensure compatibility with the GPT-2 model's normalization layers, because it reflects TensorFlow's
### default behavior, and TensorFlow was used to implement the original GPT-2 model.

In [14]:
# Run the LayerNorm module on the raw batch (5 features per row) and confirm
# that each output row has mean ~0 and variance ~1.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

# The variance is measured with unbiased=False, the same setting the module uses.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, keepdim=True, unbiased=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
