In [4]:
import torch
from torch import nn

In [48]:
# Fixed-value alternative, kept for reference:
# inputs = torch.Tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])

# Random 2 x 3 x 4 example tensor; `inputs` and `random_tensor` name the same object.
random_tensor = torch.rand(2, 3, 4)
inputs = random_tensor

# Optional re-layout, currently disabled:
# B, S, E = inputs.size() # B is Batch Dimension, S is Sequence Length, E is Embedding Dimension
# inputs = inputs.reshape(S, B, E)
inputs.shape

torch.Size([2, 3, 4])

In [49]:
# Average over the last two axes in one call; keepdim=True keeps them as size-1 axes.
mean = torch.mean(inputs, dim=(-1, -2), keepdim=True)
mean.shape, mean

(torch.Size([2, 1, 1]),
 tensor([[[0.5824]],
 
         [[0.5446]]]))

In [50]:
# Same reduction with the two axes listed in the opposite order.
mean = torch.mean(inputs, dim=(-2, -1), keepdim=True)
mean.shape, mean

(torch.Size([2, 1, 1]),
 tensor([[[0.5824]],
 
         [[0.5446]]]))

In [24]:
# The learnable parameters span the last two axes of `inputs`.
parameter_shape = inputs.shape[-2:]
# gamma: learnable scale, initialised to all ones.
gamma = nn.Parameter(torch.ones(parameter_shape))
# beta: learnable shift, initialised to all zeros.
beta = nn.Parameter(torch.zeros(parameter_shape))

In [25]:
# Show the shapes of the scale and shift parameters side by side.
gamma.shape, beta.shape

(Parameter containing:
 tensor([[1., 1., 1.]], requires_grad=True),
 torch.Size([1, 3]))

In [30]:
# Quick check of what a two-step range yields: prints 0, then 1.
for i in (0, 1):
    print(i)

0
1


In [8]:
# One negative axis index per parameter dimension, counting back from the last axis: -1, -2, ...
dims = list(range(-1, -len(parameter_shape) - 1, -1))

In [9]:
# Display the axis indices that the reductions below will use.
dims 

[-1, -2]

In [13]:
# Re-check the shape of the tensor being normalized.
inputs.shape

torch.Size([2, 1, 3])

In [15]:
# Mean over the axes listed in `dims`, kept as size-1 axes so it broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [11]:
# Display the mean values computed over the axes in `dims`.
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [22]:
# Variance: mean of squared deviations over the same axes as `mean`.
var = torch.mean((inputs - mean).pow(2), dim=dims, keepdim=True)
# Small constant added under the square root so the later division never hits zero.
epsilon = 1e-5
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [23]:
# Standardize: subtract the mean, then divide by the standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [26]:
# Apply the learnable scale (gamma) and shift (beta) to the standardized values.
out = torch.add(torch.mul(gamma, y), beta)
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## Putting It All Together

In [51]:
import torch 
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of a tensor.

    The input is standardized (zero mean, unit variance) across its last
    ``len(parameters_shape)`` dimensions, then scaled by ``gamma`` and shifted
    by ``beta``. Both are learnable and have shape ``parameters_shape``.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters,
    so they show up in ``.parameters()`` and ``.state_dict()``. Calling
    ``.forward(x)`` directly keeps working.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over;
            also the shape of ``gamma`` and ``beta``.
        eps: Small constant added to the variance before the square root to
            avoid division by zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))   # scale, starts at 1
        self.beta = nn.Parameter(torch.zeros(parameters_shape))   # shift, starts at 0

    def forward(self, input):
        """Normalize ``input`` and print each intermediate tensor.

        Args:
            input: Tensor whose trailing dimensions broadcast against
                ``parameters_shape``.

        Returns:
            A tensor with the same shape as ``input``.
        """
        # Negative indices of the trailing axes being normalized: -1, -2, ...
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the `input` argument here, not a same-named notebook global.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        # Population (biased) variance over the normalized axes.
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()} = \n {y})")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}): \n {out}")
        return out

In [52]:
# Sizes of the demo tensor.
batch_size, sentence_length, embedding_dim = 3, 5, 8
# Standard-normal samples laid out as (sentence_length, batch_size, embedding_dim).
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

print(f"Input \n ({inputs.size()}) = \n {inputs}")

Input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.4038,  0.4701, -0.7077, -0.5308, -0.0911,  1.0892,  0.6218,
          -1.3178],
         [ 0.0333, -0.2147, -0.2740,  0.4524,  0.6405, -0.9766, -0.8086,
          -1.2669],
         [-2.0655,  1.5353,  0.2845, -0.5863, -1.2331, -1.0358, -0.6745,
          -2.0683]],

        [[ 2.6855,  0.9859, -1.3735, -0.7226,  0.2717,  0.4650, -0.1177,
          -0.9257],
         [ 0.5514, -1.3410, -0.5067, -1.7234, -0.1031,  0.7908, -1.0715,
           1.0788],
         [ 0.2456,  2.0981,  1.3984,  0.4814, -0.2051, -0.2127,  0.0275,
          -0.0486]],

        [[ 1.6775,  0.3853, -1.3209, -0.8189,  0.6491, -0.6232,  0.1418,
           0.6868],
         [-0.1612,  1.8456, -0.2629, -0.5832, -0.0241,  0.0343, -0.3490,
          -0.8445],
         [-0.6611,  0.8946, -0.9291,  0.7850, -0.2915,  0.0955, -1.0408,
           0.3849]],

        [[-0.1459, -0.4396, -1.0301, -0.5471, -0.0491, -0.1092, -1.3591,
          -0.2347],
         [-0.0585,  0.0697, 

In [53]:
# Normalize over the last two dimensions of `inputs`. With the
# (sentence_length, batch_size, embedding_dim) layout used for `inputs`,
# that is (batch_size, embedding_dim).
# NOTE(review): layer normalization is usually applied over the embedding
# dimension only (`inputs.size()[-1:]`); including the batch axis mixes
# statistics across samples. Confirm this is intended.
layer_norm = LayerNormalization(inputs.size()[-2:])

In [54]:
# Run the normalization; forward() prints the mean, standard deviation,
# standardized values and final output as it goes.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 1, 1])): 
 tensor([[[-0.3804]],

        [[ 0.1137]],

        [[-0.0138]],

        [[-0.4010]],

        [[-0.1818]]])
Standard Deviation 
 (torch.Size([5, 1, 1])): 
 tensor([[[0.8976]],

        [[1.0500]],

        [[0.8009]],

        [[0.8385]],

        [[0.8844]]])
y 
 (torch.Size([5, 3, 8]) = 
 tensor([[[-0.0261,  0.9474, -0.3647, -0.1676,  0.3223,  1.6372,  1.1165,
          -1.0444],
         [ 0.4608,  0.1845,  0.1185,  0.9277,  1.1373, -0.6642, -0.4771,
          -0.9877],
         [-1.8774,  2.1342,  0.7407, -0.2294, -0.9500, -0.7302, -0.3277,
          -1.8804]],

        [[ 2.4494,  0.8307, -1.4163, -0.7964,  0.1505,  0.3346, -0.2204,
          -0.9899],
         [ 0.4169, -1.3854, -0.5908, -1.7496, -0.2064,  0.6449, -1.1287,
           0.9191],
         [ 0.1257,  1.8899,  1.2235,  0.3502, -0.3036, -0.3109, -0.0821,
          -0.1546]],

        [[ 2.1116,  0.4982, -1.6320, -1.0052,  0.8276, -0.7609,  0.1943,
           0.8746],
         [-0.1841

In [8]:
# Extended unpacking: `a` takes the first item, the starred `b` collects the rest as a list.
a, *b = 1, 2, 3
print(a)
print(b)
# Star in a call spreads the list into separate positional arguments.
print(*b)

1
[2, 3]
2 3


In [13]:
class A:
    """Root of the demo hierarchy; prints "a" when constructed."""

    def __init__(self):
        print("a", end="")

class B(A):
    """Extends A; runs the parent initializer first, then prints "b"."""

    def __init__(self):
        super().__init__()
        print("b", end="")

class C(B):
    """Extends B; runs the parent initializer first, then prints "c"."""

    def __init__(self):
        super().__init__()
        print("c", end="")

class D(C):
    """Extends C; runs the parent initializer first, then prints "d"."""

    def __init__(self):
        super().__init__()
        print("d", end="")

# Instantiates the base class A, so only A.__init__ runs and just "a" is printed.
# NOTE(review): the variable is named `d`, but D() is what would run the whole
# initializer chain and print "abcd". Confirm which was intended.
d = A()

a

In [8]:
import torch
# Sinusoidal positional encoding:
#   PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
#   PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))
d_model = 512               # width of each encoding vector (even, so sin/cos pair up)
max_sequence_length = 200   # number of positions to encode

# Even feature indices 0, 2, 4, ..., one per sin/cos pair.
_2i = torch.arange(0, d_model, 2).float()
denominator = torch.pow(10000, _2i / d_model)
# Column vector of positions so it broadcasts against the row of denominators.
position = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
even = torch.sin(position / denominator)
odd = torch.cos(position / denominator)
# Pair sin/cos along a new last axis, then flatten so they interleave:
# [sin_0, cos_0, sin_1, cos_1, ...].
stacked = torch.stack([even, odd], dim=2)
PE = torch.flatten(stacked, start_dim=1, end_dim=2)
print(f"odd size: {odd.size()}, stacked size: {stacked.size()}, PE size: {PE.size()}")

odd size: torch.Size([200, 256]), stacked size: torch.Size([200, 256, 2]), PE size: torch.Size([200, 512])


In [9]:
# The integers 1..24 arranged as 2 blocks of 3 rows x 4 columns.
stacked = torch.arange(1, 25).reshape(2, 3, 4)
# Merge axes 1 and 2 so each block becomes a single row of 12 values.
PE = stacked.flatten(start_dim=1, end_dim=2)
PE.shape

torch.Size([2, 12])

In [43]:
even = torch.tensor([[2, 4, 6], [8, 10, 12]])
print(even)
odd = torch.tensor([[1, 3, 5], [7, 9, 11]])
print(odd)
# Pair matching entries along a new trailing axis: (2, 3) -> (2, 3, 2).
stacked = torch.stack((even, odd), dim=-1)
print(stacked)
# Fold the pairs back into each row, interleaving the two sources.
PE = stacked.flatten(start_dim=1)
PE

tensor([[ 2,  4,  6],
        [ 8, 10, 12]])
tensor([[ 1,  3,  5],
        [ 7,  9, 11]])
tensor([[[ 2,  1],
         [ 4,  3],
         [ 6,  5]],

        [[ 8,  7],
         [10,  9],
         [12, 11]]])


tensor([[ 2,  1,  4,  3,  6,  5],
        [ 8,  7, 10,  9, 12, 11]])

In [9]:
import torch
from torch import nn

# Other ways to build the same two-layer stack, kept for reference:
#net = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10)); net

#l = [nn.Linear(10, 10), nn.Linear(10, 10)]
#net = nn.Sequential(*l); net

# Create each layer separately. `[nn.Linear(10, 10)] * 2` repeats one and the
# same module object, so both positions would share a single weight and bias.
l = [nn.Linear(10, 10) for _ in range(2)]
net = nn.Sequential(*l)

# Append a dropout layer under the next index name and display the model.
# NOTE(review): Dropout2d drops whole channels of feature maps; after Linear
# layers plain nn.Dropout is the usual choice. Confirm this is intended.
net.add_module('2', nn.Dropout2d(0.4)); net

Sequential(
  (0): Linear(in_features=10, out_features=10, bias=True)
  (1): Linear(in_features=10, out_features=10, bias=True)
  (2): Dropout2d(p=0.4, inplace=False)
)