<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/LLMArch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Hyperparameters shared by every model built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # number of rows in the token embedding / width of the output logits
    context_length=1024,   # number of rows in the positional embedding table
    emb_dim=768,           # width of the token and positional embeddings
    n_heads=12,            # number of attention heads
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # dropout probability
    qkv_bias=False,        # forwarded to the attention module as its qkv_bias argument
)

In [3]:
class DummyGPTModel(nn.Module):
  """Skeleton of the GPT architecture with placeholder blocks and norm.

  The embeddings, dropout and output projection are real layers; the
  transformer blocks and the final normalization are the pass-through
  ``DummyTransformerBlock`` / ``DummyLayerNorm`` stand-ins.

  Args:
    config: mapping providing "vocab_size", "emb_dim", "context_length",
      "drop_rate" and "n_layers".
  """

  def __init__(self, config):
    super().__init__()
    vocab_size = config["vocab_size"]
    emb_dim = config["emb_dim"]

    self.tok_emb = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = nn.Embedding(config["context_length"], emb_dim)
    self.drop_emb = nn.Dropout(config["drop_rate"])

    placeholder_blocks = [DummyTransformerBlock(config) for _ in range(config["n_layers"])]
    self.trf_blocks = nn.Sequential(*placeholder_blocks)

    self.final_norm = DummyLayerNorm(emb_dim)
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, x):
    """Map a 2-D tensor of token ids to logits with a trailing vocab_size axis."""
    # Unpacking into two names requires x to be exactly 2-D.
    _, num_tokens = x.shape
    token_vectors = self.tok_emb(x)
    # One position id per column of x, created on the same device as x.
    position_vectors = self.pos_emb(torch.arange(num_tokens, device=x.device))

    hidden = self.drop_emb(token_vectors + position_vectors)
    hidden = self.trf_blocks(hidden)
    hidden = self.final_norm(hidden)
    return self.out_head(hidden)

class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Args:
    emb_dim: size of the last dimension of the inputs; also the length of the
      learnable ``scale`` (initialised to ones) and ``shift`` (initialised to
      zeros) vectors.
    eps: small constant added to the variance before the square root to avoid
      division by zero. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply scale and shift."""
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by N rather than N - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift


class DummyTransformerBlock(nn.Module):
  """Placeholder transformer block: holds no layers and returns its input as-is."""

  def __init__(self, config):
    # `config` is accepted so the constructor call matches a real block, but it is not read.
    super().__init__()

  def forward(self, x):
    """Return ``x`` unchanged (the very same object)."""
    return x

class DummyLayerNorm(nn.Module):
  """Placeholder normalization layer: holds no parameters and returns its input as-is."""

  def __init__(self, config):
    # NOTE: despite its name, DummyGPTModel passes config["emb_dim"] (an int) here,
    # not the config dict. The argument is not read, so either works.
    super().__init__()

  def forward(self, x):
    """Return ``x`` unchanged (the very same object)."""
    return x


In [4]:
import tiktoken

In [5]:
# Encode two sample prompts and stack them into a single (2, num_tokens) batch.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
# torch.stack needs equally long rows; both prompts encode to four tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [6]:
# Seed before construction so the randomly initialised weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# One row of vocab_size logits per input token.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Ouptut shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


In [7]:
# Manual walk-through of normalization on a tiny example:
# two rows of five random features pushed through Linear(5, 6) + ReLU.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(batch_example)
print(out)
# Per-row statistics over the last dimension; keepdim=True keeps them broadcastable against `out`.
# NOTE: var() is called without `unbiased`, so torch's default applies here, whereas
# LayerNorm.forward passes unbiased=False explicitly.
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Var:\n", var)
# Standardise each row; 1e-5 guards against division by zero.
out_norm = (out - mean) / torch.sqrt(var + 1e-5)
# `mean` and `var` are re-bound to the statistics of the normalized rows.
mean, var = out_norm.mean(dim=-1, keepdim=True), out_norm.var(dim=-1, keepdim=True)
print("Normalized:\n", out_norm)
print("Mean:\n", mean)
print("Var:\n", var)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Var:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)
Normalized:
 tensor([[ 0.6157,  1.4123, -0.8717,  0.5871, -0.8717, -0.8717],
        [-0.0189,  0.1121, -1.0875,  1.5171,  0.5647, -1.0875]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[-1.9868e-08],
        [ 1.9868e-08]], grad_fn=<MeanBackward1>)
Var:
 tensor([[0.9996],
        [0.9997]], grad_fn=<VarBackward0>)


In [8]:
# Sanity-check the LayerNorm module on the 2x5 example batch from the previous cell.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
# unbiased=False matches the variance estimator used inside LayerNorm.forward.
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, keepdim=True, unbiased=False)
print("Normalized:\n", out_ln)
print("Mean:\n", mean)
print("Var:\n", var)

Normalized:
 tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
Mean:
 tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Var:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [126]:
class GELU(nn.Module):
  """GELU activation computed with the tanh-based formula.

  Evaluates 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
  element-wise; the module has no parameters.
  """

  def __init__(self):
    super().__init__()

  def forward(self, x):
    """Apply the activation element-wise; the result has the same shape as ``x``."""
    sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    gate = 1 + torch.tanh(sqrt_two_over_pi * cubic_term)
    return 0.5 * x * gate
class FeedForward(nn.Module):
  """Position-wise MLP: Linear(emb_dim -> 4 * emb_dim), GELU, Linear(4 * emb_dim -> emb_dim).

  Args:
    config: mapping providing "emb_dim".
  """

  def __init__(self, config):
    super().__init__()
    emb_dim = config["emb_dim"]
    hidden_dim = 4 * emb_dim
    self.layers = nn.Sequential(
        nn.Linear(emb_dim, hidden_dim),
        GELU(),
        nn.Linear(hidden_dim, emb_dim),
    )

  def forward(self, x):
    """Run ``x`` through the expand / activate / project stack."""
    return self.layers(x)


class ExampleDeepNeuralNetwork(nn.Module):
  """Stack of Linear + GELU layers with optional shortcut (residual) connections.

  Args:
    layer_sizes: sequence of feature sizes. One Linear + GELU layer is built
      for every consecutive pair, so ``len(layer_sizes) - 1`` layers are
      created (five layers for the six-entry list used in this notebook).
    use_shortcut: when True, a layer's input is added to its output whenever
      the two shapes match.
  """

  def __init__(self, layer_sizes, use_shortcut):
    super().__init__()
    self.use_shortcut = use_shortcut
    # Layers are created front to back, so weight initialisation consumes the
    # RNG in the same order as writing each layer out by hand.
    self.layers = nn.ModuleList([
        nn.Sequential(nn.Linear(in_features, out_features), GELU())
        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:])
    ])

  def forward(self, x):
    """Apply every layer in order, adding the shortcut where enabled and shapes match."""
    for layer in self.layers:
      layer_output = layer(x)
      if self.use_shortcut and x.shape == layer_output.shape:
        x = x + layer_output
      else:
        # Shapes differ (or shortcuts are disabled): no residual can be added.
        x = layer_output
    return x


def print_gradients(model, x):
  """Run one forward/backward pass and print the mean absolute gradient of each weight.

  The loss is the mean squared error between the model output and an all-zero
  target of the same shape.

  Args:
    model: an ``nn.Module`` to evaluate.
    x: input tensor passed to ``model``.
  """
  # Drop gradients left by earlier backward passes so that repeated calls on the
  # same model report this pass only instead of an accumulated sum.
  model.zero_grad()

  output = model(x)
  # zeros_like keeps the target on the output's device and dtype and matches its shape.
  target = torch.zeros_like(output)
  loss_fn = nn.MSELoss()
  loss = loss_fn(output, target)
  loss.backward()

  for name, param in model.named_parameters():
    # Parameters that did not take part in the forward pass have no gradient.
    if 'weight' in name and param.grad is not None:
      print(f"{name} has gradient mean of: {param.grad.abs().mean().item()}")

In [10]:
# Smoke-test the feed-forward module: the output keeps the (2, 3, emb_dim) shape of the input.
# No seed is set in this cell, so the printed values depend on the RNG state left by earlier cells.
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.randn(2, 3, GPT_CONFIG_124M["emb_dim"])
print(x.shape)
out_ffn = ffn(x)
print(out_ffn)

torch.Size([2, 3, 768])
tensor([[[-0.3716, -0.2019,  0.1961,  ..., -0.2702,  0.0683,  0.2355],
         [ 0.0063,  0.0706,  0.4140,  ...,  0.1767, -0.0537, -0.1279],
         [ 0.1751, -0.1655, -0.1007,  ..., -0.3212,  0.2659, -0.1650]],

        [[ 0.3045,  0.0826,  0.1324,  ..., -0.2859, -0.0285, -0.2395],
         [-0.1005,  0.2507,  0.0544,  ..., -0.2401,  0.0284, -0.2866],
         [ 0.2492, -0.0038, -0.3012,  ..., -0.1008, -0.0374,  0.0472]]],
       grad_fn=<ViewBackward0>)


In [127]:
# Compare gradient magnitudes of the same five-layer network with and without shortcuts.
layer_sizes = [3,3,3,3,3,1]
sample_input = torch.tensor([[1.,0.,-1.]])
# Re-seeding before each construction gives both models identical initial weights.
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)
print("No skip connections. Vanishing gradients:\n")
print_gradients(model_without_shortcut, sample_input)


torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)
print("\nWith skip connections. Non-vanishing gradients:\n")
print_gradients(model_with_shortcut, sample_input)


No skip connections. Vanishing gradients:

layers.0.0.weight has gradient mean of: 0.00020173584925942123
layers.1.0.weight has gradient mean of: 0.00012011159560643137
layers.2.0.weight has gradient mean of: 0.0007152040489017963
layers.3.0.weight has gradient mean of: 0.0013988736318424344
layers.4.0.weight has gradient mean of: 0.005049645435065031

With skip connections. Non-vanishing gradients:

layers.0.0.weight has gradient mean of: 0.22169791162014008
layers.1.0.weight has gradient mean of: 0.20694105327129364
layers.2.0.weight has gradient mean of: 0.32896995544433594
layers.3.0.weight has gradient mean of: 0.2665732204914093
layers.4.0.weight has gradient mean of: 1.3258540630340576


In [149]:
from attention import MultiHeadAttention

class TransformerBlock(nn.Module):
  """Transformer block with two residual sub-layers: attention, then feed-forward.

  Each sub-layer is computed as ``x + dropout(sublayer(norm(x)))``, i.e. the
  normalization is applied before the sub-layer, inside the shortcut.

  Args:
    config: mapping providing "emb_dim", "context_length", "drop_rate",
      "n_heads" and "qkv_bias".
  """

  def __init__(self, config):
    super().__init__()
    emb_dim = config["emb_dim"]

    # MultiHeadAttention comes from the project-local `attention` module.
    self.attn = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=config["context_length"],
        dropout=config["drop_rate"],
        num_heads=config["n_heads"],
        qkv_bias=config["qkv_bias"],
    )
    self.ff = FeedForward(config)

    # Two separate norms: each sub-layer learns its own scale and shift.
    self.norm1 = LayerNorm(config["emb_dim"])
    self.norm2 = LayerNorm(config["emb_dim"])

    # A single dropout module is reused on both residual branches.
    self.drop_shortcut = nn.Dropout(config["drop_rate"])

  def forward(self, x):
    """Apply the attention and feed-forward sub-layers; the output shape equals the input shape."""
    # Attention sub-layer with shortcut.
    attn_branch = self.attn(self.norm1(x))
    x = x + self.drop_shortcut(attn_branch)

    # Feed-forward sub-layer with shortcut.
    ff_branch = self.ff(self.norm2(x))
    x = x + self.drop_shortcut(ff_branch)
    return x

In [144]:
# Shape check: a transformer block must return a tensor of the same shape as its input.
torch.manual_seed(123)
# Random input with shape (2, 4, 768); 768 matches GPT_CONFIG_124M["emb_dim"].
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
out = block(x)
print("Input shape:", x.shape)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [161]:
from attention import MultiHeadAttention
class GPTModel(nn.Module):
  """GPT-style model: embeddings, a stack of transformer blocks, final norm, output head.

  Args:
    config: mapping providing "vocab_size", "emb_dim", "context_length",
      "drop_rate" and "n_layers", plus whatever ``TransformerBlock`` reads.
  """

  def __init__(self, config):
    super().__init__()
    vocab_size = config["vocab_size"]
    emb_dim = config["emb_dim"]

    self.tok_emb = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = nn.Embedding(config["context_length"], emb_dim)
    self.drop_emb = nn.Dropout(config["drop_rate"])

    blocks = [TransformerBlock(config) for _ in range(config["n_layers"])]
    self.trf_blocks = nn.Sequential(*blocks)

    self.final_norm = LayerNorm(emb_dim)
    # Projects each hidden vector to one logit per vocabulary entry (no bias term).
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, x):
    """Map a 2-D tensor of token ids to logits with a trailing vocab_size axis."""
    # Unpacking into two names requires x to be exactly 2-D.
    _, num_tokens = x.shape
    token_vectors = self.tok_emb(x)
    # One position id per column of x, created on the same device as x.
    position_vectors = self.pos_emb(torch.arange(num_tokens, device=x.device))

    hidden = self.drop_emb(token_vectors + position_vectors)
    hidden = self.trf_blocks(hidden)
    hidden = self.final_norm(hidden)
    return self.out_head(hidden)

class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Note: a class with the same name is also defined in an earlier cell of this
  notebook; this later definition is the one in effect once this cell has run.

  Args:
    emb_dim: size of the last dimension of the inputs; also the length of the
      learnable ``scale`` (initialised to ones) and ``shift`` (initialised to
      zeros) vectors.
    eps: small constant added to the variance before the square root to avoid
      division by zero. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply scale and shift."""
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by N rather than N - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift

class TransformerBlock(nn.Module):
  """Transformer block with two residual sub-layers: attention, then feed-forward.

  Each sub-layer is computed as ``x + dropout(sublayer(norm(x)))``, i.e. the
  normalization is applied before the sub-layer, inside the shortcut.

  Note: a class with the same name is also defined in an earlier cell of this
  notebook; this later definition is the one in effect once this cell has run.

  Args:
    config: mapping providing "emb_dim", "context_length", "drop_rate",
      "n_heads" and "qkv_bias".
  """

  def __init__(self, config):
    super().__init__()
    emb_dim = config["emb_dim"]

    # MultiHeadAttention comes from the project-local `attention` module.
    self.attn = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=config["context_length"],
        dropout=config["drop_rate"],
        num_heads=config["n_heads"],
        qkv_bias=config["qkv_bias"],
    )
    self.ff = FeedForward(config)

    # Two separate norms: each sub-layer learns its own scale and shift.
    self.norm1 = LayerNorm(config["emb_dim"])
    self.norm2 = LayerNorm(config["emb_dim"])

    # A single dropout module is reused on both residual branches.
    self.drop_shortcut = nn.Dropout(config["drop_rate"])

  def forward(self, x):
    """Apply the attention and feed-forward sub-layers; the output shape equals the input shape."""
    # Attention sub-layer with shortcut.
    attn_branch = self.attn(self.norm1(x))
    x = x + self.drop_shortcut(attn_branch)

    # Feed-forward sub-layer with shortcut.
    ff_branch = self.ff(self.norm2(x))
    x = x + self.drop_shortcut(ff_branch)
    return x

In [162]:
# Seed before construction so the randomly initialised weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
print("Input Batch:\n", batch)
# One row of vocab_size logits per input token.
logits = model(batch)
print("\nOutput shape:\n", logits.shape)
print(logits)

Input Batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Ouptut shape:
 torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [175]:
# Parameter census for the full model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
# Both matrices have shape (vocab_size, emb_dim), but they are separate parameters.
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

# Per-block breakdown: feed-forward versus attention parameters.
block = TransformerBlock(GPT_CONFIG_124M)
ff_params = sum(p.numel() for p in block.ff.parameters())
print(f"Total number of ff_parameters: {ff_params:,}")
aa_params = sum(p.numel() for p in block.attn.parameters())
print(f"Total number of aa_parameters: {aa_params:,}")

# Use each parameter's actual element size instead of assuming 4 bytes (float32),
# so the estimate stays correct if the model is cast to another dtype.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
mb = total_size_bytes / (1024 * 1024)
print(f"Total size of parameters: {mb:.2f} MB")

Total number of parameters: 163,009,536
Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])
Total number of ff_parameters: 4,722,432
Total number of aa_parameters: 2,360,064
Total size of parameters: 621.83 MB


In [177]:
def generate_text_simple(model, idx,  max_new_tokens, context_size):
  """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

  Args:
    model: callable mapping a 2-D tensor of token ids to logits indexed as
      (row, position, vocabulary entry).
    idx: 2-D tensor of token ids to continue; it is not modified in place.
    max_new_tokens: number of tokens to append to every row.
    context_size: at most this many trailing tokens are fed to the model per step.

  Returns:
    A tensor holding ``idx`` followed by the generated token ids along dim 1.
  """
  for _step in range(max_new_tokens):
    # Keep only the most recent `context_size` tokens as model input.
    window = idx[:, -context_size:]
    with torch.no_grad():
      all_logits = model(window)

    # Only the prediction at the last position decides the next token.
    last_logits = all_logits[:, -1, :]
    next_token_probs = torch.softmax(last_logits, dim=-1)
    # keepdim=True yields one column per row so it can be concatenated below.
    next_token = torch.argmax(next_token_probs, dim=-1, keepdim=True)
    idx = torch.cat((idx, next_token), dim=1)
  return idx

In [187]:
# Encode a prompt, generate six more tokens greedily, and decode the result.
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
# Wrap the token list in an outer list to get a batch of one row.
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor:", encoded_tensor)

# Evaluation mode, so the generated tokens do not depend on dropout.
model.eval()
out = generate_text_simple(
    model,
    encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)
print(out)
# Take the single row of the batch back out before decoding.
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

encoded: [15496, 11, 314, 716]
encoded_tensor: tensor([[15496,    11,   314,   716]])
tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Hello, I am Featureiman Byeswickattribute argue
