In [1]:
import os
import random
import torch
import numpy as np

np.set_printoptions(suppress=True)
torch.set_printoptions(sci_mode=False)

def setup_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    CPU and CUDA generators, and configures cuDNN for deterministic behavior.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Global generators: Python stdlib, NumPy, PyTorch (CPU + every CUDA device).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; it is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

    # `deterministic = True` alone is not sufficient: with benchmarking enabled
    # cuDNN may auto-tune and select a different convolution algorithm per run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this affects
    # subprocesses spawned later rather than the already-running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    
setup_seed()

In [2]:
# Re-seed so that re-running this cell draws the same y and X every time.
setup_seed()

# Size of the last dimension; also passed to LayerNorm in a later cell.
normalized_shape = 128

# Random target tensor (drawn first, so it consumes the RNG stream before X).
y = torch.randn(32, normalized_shape)

# Random input batch with 32 rows of `normalized_shape` values each.
X = torch.randn(32, normalized_shape)
X

tensor([[-0.5197,  1.8524,  1.8365,  ...,  1.6612, -2.6223, -0.4319],
        [-0.1010, -0.4388, -1.9775,  ..., -1.8040,  1.8377, -0.7219],
        [ 0.6289, -0.0685, -1.2115,  ...,  0.1415,  1.8377, -0.2277],
        ...,
        [-0.1587,  1.6984, -0.0560,  ...,  0.1716,  0.8127, -0.6369],
        [-1.3467,  0.6522, -1.3508,  ..., -0.4601,  0.1815,  0.1850],
        [ 0.7205, -0.2833,  0.0937,  ...,  0.5409,  0.6940,  1.8563]])

In [3]:
# LayerNorm with default settings over a last dimension of size `normalized_shape`.
builtin_layer_norm = torch.nn.LayerNorm(normalized_shape)
# Forward pass on X; the result is attached to the autograd graph (see grad_fn in the output).
builtin_layer_norm(X)

tensor([[    -0.6643,      1.5780,      1.5630,  ...,      1.3973,
             -2.6518,     -0.5813],
        [    -0.0014,     -0.3333,     -1.8453,  ...,     -1.6748,
              1.9036,     -0.6115],
        [     0.7076,     -0.1397,     -1.5283,  ...,      0.1155,
              2.1761,     -0.3331],
        ...,
        [    -0.1107,      1.7803,     -0.0060,  ...,      0.2256,
              0.8785,     -0.5976],
        [    -1.4359,      0.7016,     -1.4403,  ...,     -0.4878,
              0.1983,      0.2020],
        [     0.6567,     -0.3300,      0.0406,  ...,      0.4802,
              0.6306,      1.7731]], grad_fn=<NativeLayerNormBackward0>)

In [4]:
# Materialize the layer's learnable parameters into a list so they can be indexed later.
params = list(builtin_layer_norm.parameters())

params

[Parameter containing:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 

In [5]:
# Clear gradients left over from a previous execution of this cell; without
# this, every re-run adds another copy of the gradient into `.grad`.
builtin_layer_norm.zero_grad()

# Sum of squared differences between the normalized input and the target y.
error = ((builtin_layer_norm(X) - y) ** 2).sum()
# Populate `.grad` on the layer's weight and bias.
error.backward()


In [6]:
# Display the scalar loss computed in the previous cell.
error

tensor(8320.5303, grad_fn=<SumBackward0>)

In [7]:
# Gradient of the loss with respect to the first entry of `params`.
params[0].grad

tensor([ 56.8991,  53.0642,  64.7571,  77.4292,  51.8013,  34.5774,  68.3558,
         84.9231,  70.2401,  41.9221,  60.5019,  61.8583,  87.3300,  86.7715,
         57.0997,  64.7705,  92.9782,  52.2718,  76.3927,  53.3727,  70.0271,
         44.7156,  48.2270,  63.1499,  73.0357,  46.2417,  59.2834,  62.1753,
         24.4343,  48.2572,  56.3098, 102.8503,  49.3163,  80.5047,  90.1483,
         65.5341,  63.3679,  81.3550,  48.1293,  90.1231,  62.5487,  84.8202,
         61.5340,  67.0921,  65.6296,  60.5733,  30.6680,  82.4995,  75.8112,
         58.3419,  53.1004,  66.8580,  48.7943,  57.5068,  83.4672,  83.0650,
         43.8958,  47.5017,  70.3146,  62.3127,  74.2883,  25.7525, 106.4405,
         94.0248,  76.6682,  68.9015,  52.6952,  52.3498,  57.4855,  61.0286,
         76.4142,  56.4901,  50.5821,  42.0063,  66.4569,  68.4997,  70.5565,
         71.3253,  52.9540,  56.5001,  58.2040,  76.3519,  81.1820,  69.1193,
         48.6353,  62.8500,  63.9958,  49.4465,  87.0537,  92.39

In [8]:
X.shape

torch.Size([32, 128])

In [9]:
# The parameter values still show their initial ones/zeros: backward() only filled `.grad`.
params

[Parameter containing:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 

In [10]:
# Preview what one plain gradient-descent step would produce for each
# parameter, without modifying the layer itself.
learning_rate = 0.1

with torch.no_grad():
    for p in builtin_layer_norm.parameters():
        new_val = p - learning_rate * p.grad
        print(new_val)
        # Dry run only: the in-place write-back is intentionally left disabled.
        # p.copy_(new_val)

tensor([-4.6899, -4.3064, -5.4757, -6.7429, -4.1801, -2.4577, -5.8356, -7.4923,
        -6.0240, -3.1922, -5.0502, -5.1858, -7.7330, -7.6772, -4.7100, -5.4771,
        -8.2978, -4.2272, -6.6393, -4.3373, -6.0027, -3.4716, -3.8227, -5.3150,
        -6.3036, -3.6242, -4.9283, -5.2175, -1.4434, -3.8257, -4.6310, -9.2850,
        -3.9316, -7.0505, -8.0148, -5.5534, -5.3368, -7.1355, -3.8129, -8.0123,
        -5.2549, -7.4820, -5.1534, -5.7092, -5.5630, -5.0573, -2.0668, -7.2500,
        -6.5811, -4.8342, -4.3100, -5.6858, -3.8794, -4.7507, -7.3467, -7.3065,
        -3.3896, -3.7502, -6.0315, -5.2313, -6.4288, -1.5753, -9.6441, -8.4025,
        -6.6668, -5.8902, -4.2695, -4.2350, -4.7485, -5.1029, -6.6414, -4.6490,
        -4.0582, -3.2006, -5.6457, -5.8500, -6.0556, -6.1325, -4.2954, -4.6500,
        -4.8204, -6.6352, -7.1182, -5.9119, -3.8635, -5.2850, -5.3996, -3.9446,
        -7.7054, -8.2396, -6.7857, -3.5167, -6.2178, -4.5104, -7.1073, -6.8145,
        -5.9424, -4.6234, -7.5101, -6.02

In [11]:
# Same parameters as above, paired with the names they are registered under.
list(builtin_layer_norm.named_parameters())

[('weight',
  Parameter containing:
  tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1.], requires_grad=True)),
 ('bias',
  Parameter containing:
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [12]:
# List the buffers registered on the layer (the output shows there are none).
list(builtin_layer_norm.buffers())

[]

In [13]:
# NOTE(review): read_image is imported here but not used in the cells shown.
from torchvision.io import read_image
from torchvision.models import convnext_tiny, ConvNeXt_Tiny_Weights


# Step 1: Initialize model with the best available weights
# (the first run downloads the checkpoint into the local torch hub cache).
weights = ConvNeXt_Tiny_Weights.DEFAULT
model = convnext_tiny(weights=weights)


Downloading: "https://download.pytorch.org/models/convnext_tiny-983f1562.pth" to /Users/citizen2/.cache/torch/hub/checkpoints/convnext_tiny-983f1562.pth
100.0%


In [26]:
# Display the module tree of the loaded ConvNeXt model.
model

ConvNeXt(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=96, out_features=384, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=384, out_features=96, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=

In [24]:
# Buffers of features[0][1], the LayerNorm2d in the first block (none are listed).
list(model.features[0][1].named_buffers())

[]

In [25]:
# Named parameters of the same LayerNorm2d module, here with pretrained values.
list(model.features[0][1].named_parameters())

[('weight',
  Parameter containing:
  tensor([     0.0175,      0.1579,      0.3161,      0.0979,      0.4666,
               0.0013,      0.2442,      0.1314,      0.1714,      0.2489,
               0.3636,     -0.0003,      0.1290,      0.1783,      0.2120,
               0.0738,      0.1426,      0.0313,      0.0967,      0.4619,
               0.4981,      0.0504,      0.1325,      0.2337,      0.0017,
               0.3482,      0.1150,      0.0483,      0.0127,      0.1527,
               0.0127,      0.0217,      0.0262,     -0.0001,      0.1772,
               0.1443,      0.3130,      0.1146,      0.0276,      0.1730,
               0.1023,      0.2916,      0.0211,     -0.0002,      0.0561,
               0.0005,      0.0001,     -0.0002,      0.0643,      0.0477,
               0.3716,      0.0588,      0.0961,      0.0003,      0.3510,
               0.1468,      0.0544,      0.0454,     -0.0003,      0.0244,
               0.0010,      0.2185,      0.0298,      0.2375,   

In [29]:
# Peek at the first registered parameter only (name, then values).
first_entry = next(iter(model.named_parameters()), None)
if first_entry is not None:
    name, p = first_entry
    print(name)
    print(p)

features.0.0.weight
Parameter containing:
tensor([[[[    -0.1019,     -0.1258,     -0.0777,     -0.0484],
          [     0.0491,     -0.0058,     -0.0272,      0.0074],
          [     0.0055,     -0.0166,     -0.0004,     -0.0102],
          [     0.0575,      0.0633,      0.1328,      0.1206]],

         [[    -0.0151,     -0.0420,     -0.0025,      0.0220],
          [     0.0817,      0.0309,      0.0183,      0.0206],
          [    -0.0447,     -0.0466,     -0.0218,     -0.0682],
          [    -0.0263,     -0.0124,      0.0550,      0.0047]],

         [[     0.0467,      0.0268,      0.0762,      0.0656],
          [     0.0819,      0.0573,      0.0403,      0.0252],
          [    -0.0495,     -0.0425,     -0.0063,     -0.0838],
          [    -0.0804,     -0.0695,      0.0060,     -0.0555]]],


        [[[     0.0233,      0.0212,      0.0454,     -0.0047],
          [     0.0011,     -0.0746,      0.0673,      0.0662],
          [    -0.0066,     -0.0007,      0.0017,     

In [31]:
# Named buffers across the whole model (the output shows an empty list).
list(model.named_buffers())

[]

In [33]:
# state_dict of the LayerNorm layer: only 'weight' and 'bias' entries appear.
builtin_layer_norm.state_dict()

OrderedDict([('weight',
              tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1.])),
             ('bias',
              tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0

In [36]:
# For contrast: BatchNorm1d's state_dict additionally holds running_mean,
# running_var and num_batches_tracked alongside weight and bias.
torch.nn.BatchNorm1d(5).state_dict()

OrderedDict([('weight', tensor([1., 1., 1., 1., 1.])),
             ('bias', tensor([0., 0., 0., 0., 0.])),
             ('running_mean', tensor([0., 0., 0., 0., 0.])),
             ('running_var', tensor([1., 1., 1., 1., 1.])),
             ('num_batches_tracked', tensor(0))])

In [38]:
# Names of every entry in the ConvNeXt state_dict.
model.state_dict().keys()

odict_keys(['features.0.0.weight', 'features.0.0.bias', 'features.0.1.weight', 'features.0.1.bias', 'features.1.0.layer_scale', 'features.1.0.block.0.weight', 'features.1.0.block.0.bias', 'features.1.0.block.2.weight', 'features.1.0.block.2.bias', 'features.1.0.block.3.weight', 'features.1.0.block.3.bias', 'features.1.0.block.5.weight', 'features.1.0.block.5.bias', 'features.1.1.layer_scale', 'features.1.1.block.0.weight', 'features.1.1.block.0.bias', 'features.1.1.block.2.weight', 'features.1.1.block.2.bias', 'features.1.1.block.3.weight', 'features.1.1.block.3.bias', 'features.1.1.block.5.weight', 'features.1.1.block.5.bias', 'features.1.2.layer_scale', 'features.1.2.block.0.weight', 'features.1.2.block.0.bias', 'features.1.2.block.2.weight', 'features.1.2.block.2.bias', 'features.1.2.block.3.weight', 'features.1.2.block.3.bias', 'features.1.2.block.5.weight', 'features.1.2.block.5.bias', 'features.2.0.weight', 'features.2.0.bias', 'features.2.1.weight', 'features.2.1.bias', 'feature

In [39]:
# Display the ConvNeXt module tree again for reference.
model

ConvNeXt(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=96, out_features=384, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=384, out_features=96, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=

In [22]:
import torch
import torch.nn as nn

class CustomLayerNorm(nn.Module):
    """Hand-written layer normalization matching ``torch.nn.LayerNorm``.

    Computes ``gamma * (x - mean) / sqrt(var + eps) + beta``, where the mean
    and the biased variance are taken over the trailing dimensions described
    by ``normalized_shape``.

    Args:
        normalized_shape: Size of the trailing dimension(s) to normalize over.
            An int means a single trailing dimension of that size; a sequence
            of ints means that many trailing dimensions.
        eps: Constant added to the variance, inside the square root, for
            numerical stability.

    Attributes:
        gamma: Learnable scale, initialized to ones, shape ``normalized_shape``.
        beta: Learnable shift, initialized to zeros, shape ``normalized_shape``.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super(CustomLayerNorm, self).__init__()
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(self.normalized_shape))
        self.beta = nn.Parameter(torch.zeros(self.normalized_shape))

    def forward(self, x):
        """Normalize ``x`` over its trailing ``normalized_shape`` dimensions.

        Args:
            x: Tensor whose trailing dimensions match ``normalized_shape``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Reduce over the last len(normalized_shape) dimensions.
        dims = tuple(range(-len(self.normalized_shape), 0))
        mean = x.mean(dim=dims, keepdim=True)
        var = x.var(dim=dims, keepdim=True, unbiased=False)
        # eps belongs inside the square root: `std + eps` gives noticeably
        # different results whenever the variance is comparable to eps.
        return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta

# Width of the normalized (last) dimension used by both layers and the input.
normalized_shape = 128

# Reference implementation first, then the hand-written one under test.
builtin_layer_norm = nn.LayerNorm(normalized_shape)
custom_layer_norm = CustomLayerNorm(normalized_shape)

# Mirror the reference layer's affine parameters into the custom layer so the
# comparison below only measures the normalization arithmetic.
with torch.no_grad():
    for target, source in (
        (custom_layer_norm.gamma, builtin_layer_norm.weight),
        (custom_layer_norm.beta, builtin_layer_norm.bias),
    ):
        target.copy_(source)

# Random batch of 32 rows to run through both layers.
input_tensor = torch.randn(32, normalized_shape)

output_custom = custom_layer_norm(input_tensor)
output_builtin = builtin_layer_norm(input_tensor)

# Report whether the two outputs agree to within an absolute tolerance of 1e-6.
outputs_match = torch.allclose(output_custom, output_builtin, atol=1e-6)
print(outputs_match)


True
