In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# LayerNorm over a trailing dimension of size 64; inputs are laid out as
# (batch_size, features).
layer_norm = nn.LayerNorm(normalized_shape=64)
x = torch.randn(32, 64)  # 32 samples with 64 features each

# Every sample (row) is standardized on its own, using only that row's
# 64 feature values; the result keeps the input shape (32, 64).
output = layer_norm(x)

# Compare the first row before and after normalization: the raw row has
# arbitrary statistics, the normalized row has mean ≈ 0 and std ≈ 1.
# Tensor.std() applies Bessel's correction by default, whereas LayerNorm
# divides by the biased variance, so the printed std lands slightly above 1.
for row in (x[0], output[0]):
    print(row.mean(), row.std())

tensor(-0.1854) tensor(1.0781)
tensor(-7.4506e-09, grad_fn=<MeanBackward0>) tensor(1.0079, grad_fn=<StdBackward0>)


In [2]:
# Two row vectors whose Euclidean lengths are 5 and sqrt(2).
x = torch.tensor([[3.0, 4.0], [1.0, 1.0]])  # 2 vectors
# torch.norm is deprecated; torch.linalg.vector_norm is its documented
# replacement for vector norms (2-norm by default, taken per row via dim=1).
print(torch.linalg.vector_norm(x, dim=1))  # [5.0, 1.414] - original lengths

# F.normalize divides each row by its L2 norm, giving unit-length rows that
# keep their original direction.
normalized = F.normalize(x, dim=1)
print(normalized)           # [[0.6, 0.8], [0.707, 0.707]]
print(torch.linalg.vector_norm(normalized, dim=1))  # [1.0, 1.0] - unit length

tensor([5.0000, 1.4142])
tensor([[0.6000, 0.8000],
        [0.7071, 0.7071]])
tensor([1.0000, 1.0000])


In [3]:
# Stand-in token tensor: 2 sequences (batch) of 4 positions each.
toks = torch.randn(2, 4)
print(toks.shape)

N, L = toks.shape  # N = batch size, L = sequence length

# Build one row of position indices 0..L-1 on the same device as `toks`,
# add a leading batch axis, then broadcast that row across all N sequences.
# expand() only broadcasts the size-1 axis; -1 keeps the length-L axis as is.
pos_ids = torch.arange(L, device=toks.device)[None, :].expand(N, -1)
print(pos_ids)


torch.Size([2, 4])
tensor([[0, 1, 2, 3],
        [0, 1, 2, 3]])


In [5]:
# Re-inspect the dimensions of `toks` (batch, sequence length) from the cell above.
toks.size()

torch.Size([2, 4])