In [1]:
from importlib.metadata import version

print("torch version:", version("torch"))

torch version: 2.7.1


In [2]:
from gpt import TransformerBlock

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

block = TransformerBlock(GPT_CONFIG_124M)
print(block)

TransformerBlock(
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)


In [3]:
for name, param in block.named_parameters():
    print(f"name:{name} {param.shape}")

name:att.W_query.weight torch.Size([768, 768])
name:att.W_key.weight torch.Size([768, 768])
name:att.W_value.weight torch.Size([768, 768])
name:att.out_proj.weight torch.Size([768, 768])
name:att.out_proj.bias torch.Size([768])
name:ff.layers.0.weight torch.Size([3072, 768])
name:ff.layers.0.bias torch.Size([3072])
name:ff.layers.2.weight torch.Size([768, 3072])
name:ff.layers.2.bias torch.Size([768])
name:norm1.scale torch.Size([768])
name:norm1.shift torch.Size([768])
name:norm2.scale torch.Size([768])
name:norm2.shift torch.Size([768])


In [17]:
print(param.data)
print(param.grad)
print(param.name)

tensor([-2.8660e-03,  7.6987e-03, -1.9468e-03,  2.9360e-03, -2.1866e-03,
         1.3394e-02,  7.6078e-03, -1.4768e-02, -2.5272e-03, -1.2794e-02,
         1.7550e-02, -1.5238e-02, -7.0145e-03, -2.2412e-03, -4.6024e-03,
         1.0892e-02,  1.3499e-02, -1.5189e-02, -1.5211e-02, -8.6266e-03,
         1.2799e-02,  4.2199e-03,  3.2212e-03, -4.0765e-03,  5.2852e-03,
         6.1208e-04,  4.1661e-03,  5.2963e-03, -6.1528e-03,  1.2753e-03,
        -6.1978e-03, -1.3765e-02, -1.1962e-02, -1.4039e-03, -7.2276e-03,
         1.4661e-02, -3.7850e-03,  5.3649e-03, -1.2704e-02,  1.1500e-02,
         3.4307e-03, -2.4888e-04,  1.0896e-02, -1.7567e-02, -1.6188e-02,
         5.7100e-03,  9.6828e-03, -1.1620e-02, -6.0314e-03, -8.4283e-03,
        -1.1384e-02, -1.7943e-02,  1.5920e-02, -5.5909e-03,  1.4460e-03,
         7.1554e-04,  6.2540e-03,  1.6037e-02, -1.1734e-02,  7.5017e-03,
        -9.4770e-03, -4.1816e-03,  1.3061e-03, -1.5165e-02, -1.3627e-02,
        -2.8658e-03, -1.5341e-02, -1.3703e-02, -5.7

In [27]:
# parameters in ff layer.

total_params_ff = 0
for name, param in block.ff.named_parameters():
    print(f"name:{name} {param.shape} {param.numel()}")
    total_params_ff += param.numel()

print(f"total_params_ff: {total_params_ff}")

name:layers.0.weight torch.Size([3072, 768]) 2359296
name:layers.0.bias torch.Size([3072]) 3072
name:layers.2.weight torch.Size([768, 3072]) 2359296
name:layers.2.bias torch.Size([768]) 768
total_params_ff: 4722432


In [29]:
total_params_ff = 0
for name, param in block.att.named_parameters():
    print(f"param: {param}")
    print(f"name:{name} {param.shape} {param.numel()}")
    total_params_ff += param.numel()

print(f"total_params_ff: {total_params_ff}")

param: Parameter containing:
tensor([[ 0.0269, -0.0357,  0.0185,  ...,  0.0225,  0.0077,  0.0098],
        [ 0.0138,  0.0178, -0.0014,  ..., -0.0185,  0.0259,  0.0013],
        [ 0.0018, -0.0109, -0.0175,  ..., -0.0296, -0.0310,  0.0069],
        ...,
        [-0.0221,  0.0220,  0.0075,  ...,  0.0160,  0.0090,  0.0129],
        [-0.0188,  0.0214,  0.0054,  ..., -0.0273, -0.0259, -0.0176],
        [ 0.0308, -0.0120,  0.0039,  ...,  0.0250,  0.0041, -0.0063]],
       requires_grad=True)
name:W_query.weight torch.Size([768, 768]) 589824
param: Parameter containing:
tensor([[-0.0186, -0.0168,  0.0238,  ...,  0.0282,  0.0206,  0.0032],
        [-0.0162,  0.0246,  0.0093,  ..., -0.0042,  0.0003,  0.0118],
        [-0.0324,  0.0263,  0.0020,  ...,  0.0292,  0.0176,  0.0265],
        ...,
        [-0.0347,  0.0154,  0.0333,  ..., -0.0017, -0.0148, -0.0141],
        [-0.0323, -0.0197, -0.0047,  ..., -0.0031,  0.0094,  0.0072],
        [ 0.0232,  0.0223,  0.0168,  ..., -0.0318, -0.0242,  0.0342]

In [30]:
import torch
import torch.nn as nn

# Create a simple linear layer
linear = nn.Linear(in_features=10, out_features=5, bias=True)


In [35]:
linear.weight?

[0;31mType:[0m        Parameter
[0;31mString form:[0m
Parameter containing:
           tensor([[ 0.2548,  0.1334, -0.0462,  0.0847, -0.0305, -0.1611,  0.1566, -0. <...> 52,  0.2111, -0.1356,  0.1570, -0.2800, -0.2906,
           -0.0413, -0.0559]], requires_grad=True)
[0;31mLength:[0m      5
[0;31mFile:[0m        ~/miniconda3/envs/pytorch_gpu_0710/lib/python3.10/site-packages/torch/nn/parameter.py
[0;31mDocstring:[0m  
A kind of Tensor that is to be considered a module parameter.

Parameters are :class:`~torch.Tensor` subclasses, that have a
very special property when used with :class:`Module` s - when they're
assigned as Module attributes they are automatically added to the list of
its parameters, and will appear e.g. in :meth:`~Module.parameters` iterator.
Assigning a Tensor doesn't have such effect. This is because one might
want to cache some temporary state, like last hidden state of the RNN, in
the model. If there was no such class as :class:`Parameter`, these
temporari