In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

In [2]:
class LayerNormalization2(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias
    


In [13]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps = 1e-5):
        super().__init__()
        self.parameter_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i+1) for i in range(len(self.parameter_shape))]

        mean = inputs.mean(dim = dims, keepdim = True)
        print(f"Mean ({mean.size()})")

        var = ((inputs-mean)**2).mean(dim = dims,keepdim = True)

        std = (var + self.eps).sqrt()
        print(f"Standard Deviation  ({std.size()})")

        y = (inputs - mean) / std
        print(f"y: {y.size()}")

        out = self.gamma * y + self.beta
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        
        print(f"out: {out.size()}")
        return out


In [None]:
batch_size = 3
sentence_length = 5
embedding_dim = 8
inputs = torch.randn(sentence_length, batch_size, embedding_dim)
print(inputs.shape)
print(inputs)

In [27]:
inputs = torch.Tensor([[[0.2,0.1,0.3],[0.5,0.1,0.1]]])
B, S, E = inputs.size()
inputs = inputs.reshape(S, B,E)
inputs.size()

torch.Size([2, 1, 3])

In [34]:
parameter_shape = inputs.size()[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))
print(parameter_shape)
gamma.size(), beta.size()


torch.Size([1, 3])


(torch.Size([1, 3]), torch.Size([1, 3]))

In [30]:
dims = [-(i+1) for i in range(len(parameter_shape))]
dims # batch and embedding layer

[-1, -2]

In [37]:
mean = inputs.mean(dim = dims ,keepdim=True)
mean.size()

torch.Size([2, 1, 1])

In [38]:
var = ((inputs - mean)**2).mean(dim = dims, keepdim=True)
epsilon = 1e-5
std = (var + epsilon).sqrt()
std

tensor([[[0.0817]],

        [[0.1886]]])

In [40]:
y = (inputs - mean) / std
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [41]:
out = gamma * y + beta
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

In [67]:
class LayerNormalization(nn.Module):
    def __init__(self, parameter_shape, eps=1e-5):
        super().__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameter_shape))
        self.beta = nn.Parameter(torch.zeros(parameter_shape))

    def forward(self, x):
        dims = [-(i+1)for i in range(len(self.parameter_shape))]
        print(f"Dimensions: {dims}")

        mean = x.mean(dim = dims , keepdim = True)
        print(f"Mean: {mean}")

        var = ((x - mean)**2).mean(dim = dims, keepdim = True) 
        print(f"Variance: {var}")

        std = (var + self.eps).sqrt()
        print(f"Standard Deviation: {std}")

        y = (x - mean) / std
        print(f"y: {y}") 

        out = self.gamma * y + self.beta
        print(f"out: {out}")
        
        return out

In [68]:
batch_size = 1
sentence_length = 2
embedding_dim =  3 
inputs = torch.Tensor([[[0.2,0.1,0.3],[0.5,0.1,0.1]]])
inputs = inputs.reshape(sentence_length, batch_size, embedding_dim)
print(inputs.shape)

torch.Size([2, 1, 3])


In [69]:
class LayerNormalization2(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        print(f"Mean: {mean}")
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        print(f"Standard Deviation: {std}")
        # eps is to prevent dividing by zero or when std is very small
        out = self.alpha * (x - mean) / (std + self.eps) + self.bias
        print(f"out: {out}")
        return out

In [70]:
ln = LayerNormalization(inputs.size()[-2:])
out = ln(inputs)
out.size()

Dimensions: [-1, -2]
Mean: tensor([[[0.2000]],

        [[0.2333]]])
Variance: tensor([[[0.0067]],

        [[0.0356]]])
Standard Deviation: tensor([[[0.0817]],

        [[0.1886]]])
y: tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])
out: tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)


torch.Size([2, 1, 3])

In [71]:
batch_size = 1
sentence_length = 2
embedding_dim =  3 
inputs = torch.Tensor([[[0.2,0.1,0.3],[0.5,0.1,0.1]]])
inputs = inputs.reshape(sentence_length, batch_size, embedding_dim)
print(inputs.shape)
ln = LayerNormalization2(inputs.size()[-1])
ln.forward(inputs).size()

torch.Size([2, 1, 3])
Mean: tensor([[[0.2000]],

        [[0.2333]]])
Standard Deviation: tensor([[[0.1000]],

        [[0.2309]]])
out: tensor([[[ 0.0000, -1.0000,  1.0000]],

        [[ 1.1547, -0.5773, -0.5773]]], grad_fn=<AddBackward0>)


torch.Size([2, 1, 3])