<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/LLMArch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocab size
    "context_length": 1024,   # context lnegth
    "emb_dim": 768,
    "n_heads": 12,            # Number of attention heads
    "n_layers": 12,           # Number of layers
    "drop_rate": 0.1,         # Dropout rate
    "qkv_bias": False         # Query-Key-Value bias
}

In [3]:
class DummyGPTModel(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.tok_emb = nn.Embedding(config["vocab_size"], config["emb_dim"])
    self.pos_emb = nn.Embedding(config["context_length"], config["emb_dim"])
    self.drop_emb = nn.Dropout(config["drop_rate"])
    self.trf_blocks = nn.Sequential(*[DummyTransformerBlock(config) for _ in range(config["n_layers"])])
    self.final_norm = DummyLayerNorm(config["emb_dim"])
    self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

  def forward(self, x):
    batch_size, seq_len = x.shape
    tok_emb = self.tok_emb(x)
    pos_emb = self.pos_emb(torch.arange(seq_len, device=x.device))
    x = tok_emb + pos_emb
    x = self.drop_emb(x)
    x = self.trf_blocks(x)
    x = self.final_norm(x)
    logits = self.out_head(x)
    return logits

class LayerNorm(nn.Module):
  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift


class DummyTransformerBlock(nn.Module):
  def __init__(self, config):
    super().__init__()
  def forward(self, x):
    return x

class DummyLayerNorm(nn.Module):
  def __init__(self, config):
    super().__init__()
  def forward(self, x):
    return x


In [4]:
import tiktoken

In [5]:
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim = 0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [6]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Ouptut shape:", logits.shape)
print(logits)

Ouptut shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0448,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


In [7]:
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(batch_example)
print(out)
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Var:\n", var)
out_norm = (out - mean) / torch.sqrt(var + 1e-5)
mean, var = out_norm.mean(dim=-1, keepdim=True), out_norm.var(dim=-1, keepdim=True)
print("Normalized:\n", out_norm)
print("Mean:\n", mean)
print("Var:\n", var)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Var:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)
Normalized:
 tensor([[ 0.6157,  1.4123, -0.8717,  0.5871, -0.8717, -0.8717],
        [-0.0189,  0.1121, -1.0875,  1.5171,  0.5647, -1.0875]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[-1.9868e-08],
        [ 1.9868e-08]], grad_fn=<MeanBackward1>)
Var:
 tensor([[0.9996],
        [0.9997]], grad_fn=<VarBackward0>)


In [8]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, keepdim=True, unbiased=False)
print("Normalized:\n", out_ln)
print("Mean:\n", mean)
print("Var:\n", var)

Normalized:
 tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
Mean:
 tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Var:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [9]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))
class FeedForward(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.layers = nn.Sequential(nn.Linear(config["emb_dim"], 4 * config["emb_dim"]),
                                GELU(),
                                nn.Linear(4 * config["emb_dim"], config["emb_dim"]))
  def forward(self, x):
    return self.layers(x)


class ExampleDeepNeuralNetwork(nn.Module):
  def __init__(self, layer_sizes, use_shortcut):
    super().__init__()
    self.use_shortcut = use_shortcut
    self.layers = nn.ModuleList()
    # for i in range(len(layer_sizes)-1):
    #   self.layers.append(nn.Sequential(nn.Linear(layer_sizes[i], layer_sizes[i + 1]), GELU()))
    #   #print(i)
    self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])
  def forward(self, x):
    for layer in self.layers:
      layer_output = layer(x)
      if self.use_shortcut and x.shape == layer_output.shape:
        x = x + layer_output
      else:
        x = layer_output
    return x


def print_gradients(model, x):
  output = model(x)
  target = torch.tensor([[0.]])
  loss = nn.MSELoss()
  loss = loss(output, target)
  loss.backward()

  for name, param in model.named_parameters():
    if 'weight' in name:
      print(f"{name} has gradient mean of: {param.grad.abs().mean().item()}")

In [10]:
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.randn(2, 3, GPT_CONFIG_124M["emb_dim"])
print(x.shape)
out_ffn = ffn(x)
print(out_ffn)

torch.Size([2, 3, 768])
tensor([[[-0.3731, -0.2161,  0.1972,  ..., -0.2462,  0.0535,  0.2413],
         [ 0.0069,  0.0609,  0.3952,  ...,  0.1626, -0.0415, -0.1237],
         [ 0.1569, -0.1565, -0.0789,  ..., -0.3007,  0.2389, -0.1702]],

        [[ 0.2887,  0.0783,  0.1038,  ..., -0.2605, -0.0504, -0.2268],
         [-0.0889,  0.2274,  0.0563,  ..., -0.2062,  0.0148, -0.2420],
         [ 0.2520, -0.0005, -0.2848,  ..., -0.0739, -0.0354,  0.0410]]],
       grad_fn=<ViewBackward0>)


In [11]:
layer_sizes = [3,3,3,3,3,1]
sample_input = torch.tensor([[1.,0.,-1.]])
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)
print("No skip connections. Vanishing gradients:\n")
print_gradients(model_without_shortcut, sample_input)


torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)
print("\nWith skip connections. Non-vanishing gradients:\n")
print_gradients(model_with_shortcut, sample_input)


No skip connections. Vanishing gradients:

layers.0.0.weight has gradient mean of: 0.00020173584925942123
layers.1.0.weight has gradient mean of: 0.00012011159560643137
layers.2.0.weight has gradient mean of: 0.0007152040489017963
layers.3.0.weight has gradient mean of: 0.0013988736318424344
layers.4.0.weight has gradient mean of: 0.005049645435065031

With skip connections. Non-vanishing gradients:

layers.0.0.weight has gradient mean of: 0.22169792652130127
layers.1.0.weight has gradient mean of: 0.20694108307361603
layers.2.0.weight has gradient mean of: 0.3289699852466583
layers.3.0.weight has gradient mean of: 0.2665732204914093
layers.4.0.weight has gradient mean of: 1.3258541822433472


In [12]:
from attention import MultiHeadAttention

class TransformerBlock(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.attn = MultiHeadAttention(d_in = config["emb_dim"], d_out=config["emb_dim"],
                                   context_length = config["context_length"],
                                   dropout = config["drop_rate"],
                                   num_heads = config["n_heads"],
                                   qkv_bias =config["qkv_bias"])

    # using two separate norms since they have different scale weights and biases
    self.ff = FeedForward(config)
    self.norm1 = LayerNorm(config["emb_dim"])
    self.norm2 = LayerNorm(config["emb_dim"])
    self.drop_shortcut = nn.Dropout(config["drop_rate"])

  def forward(self, x):

    # shortcut for attention block
    shortcut = x
    x = self.norm1(x)
    x = self.attn(x)
    x = self.drop_shortcut(x)
    x = shortcut + x

    # shortcut for feed forward block
    shortcut = x
    x = self.norm2(x)
    x = self.ff(x)
    x = self.drop_shortcut(x)
    x = shortcut + x
    return x

torch version: 2.6.0+cu124
torch.Size([]) torch.Size([3])
torch.Size([]) torch.Size([3])
torch.Size([]) torch.Size([3])
torch.Size([]) torch.Size([3])
torch.Size([]) torch.Size([3])
torch.Size([]) torch.Size([3])
This is atten_weights_2:  tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
x2:  tensor([0.5500, 0.8700, 0.6600])
Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]], requires_grad=True) Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]], requires_grad=True) Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
 

In [13]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
out = block(x)
print("Input shape:", x.shape)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [14]:
from attention import MultiHeadAttention
class GPTModel(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.tok_emb = nn.Embedding(config["vocab_size"], config["emb_dim"])
    self.pos_emb = nn.Embedding(config["context_length"], config["emb_dim"])
    self.drop_emb = nn.Dropout(config["drop_rate"])
    self.trf_blocks = nn.Sequential(*[TransformerBlock(config) for _ in range(config["n_layers"])])
    self.final_norm = LayerNorm(config["emb_dim"])
    self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

  def forward(self, x):
    batch_size, seq_len = x.shape
    tok_emb = self.tok_emb(x)
    pos_emb = self.pos_emb(torch.arange(seq_len, device=x.device))
    x = tok_emb + pos_emb
    x = self.drop_emb(x)
    x = self.trf_blocks(x)
    x = self.final_norm(x)
    logits = self.out_head(x)
    return logits

class LayerNorm(nn.Module):
  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift

class TransformerBlock(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.attn = MultiHeadAttention(d_in = config["emb_dim"], d_out=config["emb_dim"],
                                   context_length = config["context_length"],
                                   dropout = config["drop_rate"],
                                   num_heads = config["n_heads"],
                                   qkv_bias =config["qkv_bias"])

    # using two separate norms since they have different scale weights and biases
    self.ff = FeedForward(config)
    self.norm1 = LayerNorm(config["emb_dim"])
    self.norm2 = LayerNorm(config["emb_dim"])
    self.drop_shortcut = nn.Dropout(config["drop_rate"])

  def forward(self, x):

    # shortcut for attention block
    shortcut = x
    x = self.norm1(x)
    x = self.attn(x)
    x = self.drop_shortcut(x)
    x = shortcut + x

    # shortcut for feed forward block
    shortcut = x
    x = self.norm2(x)
    x = self.ff(x)
    x = self.drop_shortcut(x)
    x = shortcut + x
    return x

In [15]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
print("Input Batch:\n", batch)
logits = model(batch)
print("\nOuptut shape:\n", logits.shape)
print(logits)

Input Batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Ouptut shape:
 torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [16]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

block = TransformerBlock(GPT_CONFIG_124M)
ff_params = sum(p.numel() for p in block.ff.parameters())
print(f"Total number of ff_parameters: {ff_params:,}")
aa_params = sum(p.numel() for p in block.attn.parameters())
print(f"Total number of aa_parameters: {aa_params:,}")

total_size_bytes = total_params * 4  # assume floating point 32, so 4 bytes per parameter
mb = total_size_bytes / (1024 * 1024)
print(f"Total size of parameters: {mb:.2f} MB")

Total number of parameters: 163,009,536
Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])
Total number of ff_parameters: 4,722,432
Total number of aa_parameters: 2,360,064
Total size of parameters: 621.83 MB


In [17]:
def generate_text_simple(model, idx,  max_new_tokens, context_size):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -context_size:]
    with torch.no_grad():
      logits = model(idx_cond)

    logits = logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)
    idx_next = torch.argmax(probs, dim=-1, keepdim=True)
    idx = torch.cat((idx, idx_next), dim=1)
  return idx

In [18]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor:", encoded_tensor)

model.eval()
out = generate_text_simple(model, encoded_tensor, max_new_tokens=6, context_size=GPT_CONFIG_124M["context_length"])
print(out)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

encoded: [15496, 11, 314, 716]
encoded_tensor: tensor([[15496,    11,   314,   716]])
tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Hello, I am Featureiman Byeswickattribute argue


In [43]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocab size
    "context_length": 256,   # context lnegth
    "emb_dim": 768,
    "n_heads": 12,            # Number of attention heads
    "n_layers": 12,           # Number of layers
    "drop_rate": 0.1,         # Dropout rate
    "qkv_bias": False         # Query-Key-Value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

def text_to_token_ids(text, tokenizer):
  encoded_tensor = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
  encoded_tensor = torch.tensor(encoded_tensor).unsqueeze(0)
  return encoded_tensor
def token_id_to_text(token_ids, tokenizer):
  decoded_text = tokenizer.decode(token_ids.squeeze(0).tolist())
  return decoded_text

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")
encoded_tensor = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(model, encoded_tensor, max_new_tokens=10, context_size=GPT_CONFIG_124M["context_length"])
print(token_id_to_text(token_ids, tokenizer))


Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [20]:
import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [21]:
print(text_data[:99])
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print(f"Total characters: {total_characters}")
print(f"Total tokens: {total_tokens}")


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
Total characters: 20479
Total tokens: 5145


In [66]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
print("Train data length:", len(train_data))
val_data = text_data[split_idx:]
print("Val data length:", len(val_data))

Train data length: 18431
Val data length: 2048


In [80]:
from tokenization import createDataLoaderV1
torch.manual_seed(123)
# BOTH CONSUME random numbers!!!!!! Thus if you set shuffle=True for val_loader they will both consume random numbers from the same RNG, so val_loader
# indirectly affects train_loader!!!
train_loader = createDataLoaderV1(train_data,
                                  batch_size=2,
                                  max_length=GPT_CONFIG_124M["context_length"],
                                  stride=GPT_CONFIG_124M["context_length"],
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=0)
val_loader = createDataLoaderV1(val_data,
                                batch_size=2,
                                max_length=GPT_CONFIG_124M["context_length"],
                                stride=GPT_CONFIG_124M["context_length"],
                                shuffle=False,
                                drop_last=False,
                                num_workers=0)
print(len(val_loader))
print("Train_loader")
for x, y in train_loader:
  print(x.shape, y.shape)
print("\nVal_loader")
for x, y in val_loader:
  print(x.shape, y.shape)


def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)
  loss = nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
  return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0
  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(len(data_loader), num_batches)
  for i, (input_batch, target_batch) in enumerate(data_loader):   # BOTH CONSUME random numbers!!!!!!
    if i < num_batches:
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      total_loss += loss.item()
    else:
      break
  return total_loss / num_batches




1
Train_loader
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Val_loader
torch.Size([2, 256]) torch.Size([2, 256])


In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)
with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model, device)
  val_loss = calc_loss_loader(val_loader, model, device)
print(f"Train loss: {train_loss:.10f}")
print(f"Val loss: {val_loss:.10f}")

cuda
Train loss: 10.9875834783
Val loss: 10.9811048508


In [81]:
def evaluate_model(model, train_loader, val_loader, eval_iter, device):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
  model.train()
  return train_loss, val_loss




def text_to_token_ids(text, tokenizer):
  encoded_tensor = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
  encoded_tensor = torch.tensor(encoded_tensor).unsqueeze(0)
  return encoded_tensor


def token_id_to_text(token_ids, tokenizer):
  decoded_text = tokenizer.decode(token_ids.squeeze(0).tolist())
  return decoded_text


def generate_text_simple(model, idx,  max_new_tokens, context_size):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -context_size:]
    with torch.no_grad():
      logits = model(idx_cond)

    logits = logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)
    idx_next = torch.argmax(probs, dim=-1, keepdim=True)
    idx = torch.cat((idx, idx_next), dim=1)
  return idx

def generate_and_print_sample(model, tokenizer, start_context, device):
  model.eval()
  context_size = model.pos_emb.weight.shape[0]    # num_positions

  encoded_tensor = text_to_token_ids(start_context, tokenizer).to(device)
  with torch.no_grad():
    token_ids = generate_text_simple(model, encoded_tensor, max_new_tokens=50, context_size=context_size)

  decoded_text = token_id_to_text(token_ids, tokenizer)
  print(decoded_text.replace("\n", " "))
  model.train()


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq,
                       eval_iter, start_context, tokenizer):
  train_losses, val_losses, track_tokens_seen = [], [], []
  tokens_seen, global_step = 0, -1

  for epoch in range(num_epochs):
    model.train()   # enables dropout and batchnorm. Only affects layers that already exist in the model, so if no dropout module in model, nothing to enable
    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()
      tokens_seen += input_batch.numel()
      global_step += 1


      if (global_step % eval_freq == 0):
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, eval_iter, device)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(tokens_seen)
        print(f"Epoch {epoch + 1} (step {global_step:06d}) | Train loss {train_loss:.3f} | Val loss {val_loss:.3f} ")

    generate_and_print_sample(model, tokenizer, start_context, device)
  return train_losses, val_losses, track_tokens_seen

In [79]:
# def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
#                        eval_freq, eval_iter, start_context, tokenizer):
#     # Initialize lists to track losses and tokens seen
#     train_losses, val_losses, track_tokens_seen = [], [], []
#     tokens_seen, global_step = 0, -1

#     # Main training loop
#     for epoch in range(num_epochs):
#         model.train()  # Set model to training mode

#         for input_batch, target_batch in train_loader:
#             optimizer.zero_grad() # Reset loss gradients from previous batch iteration
#             loss = calc_loss_batch(input_batch, target_batch, model, device)
#             loss.backward() # Calculate loss gradients
#             optimizer.step() # Update model weights using loss gradients
#             tokens_seen += input_batch.numel()
#             global_step += 1

#             # Optional evaluation step
#             if global_step % eval_freq == 0:
#                 train_loss, val_loss = evaluate_model(
#                     model, train_loader, val_loader, device, eval_iter)
#                 train_losses.append(train_loss)
#                 val_losses.append(val_loss)
#                 track_tokens_seen.append(tokens_seen)
#                 print(f"Ep {epoch+1} (Step {global_step:06d}): "
#                       f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

#         # Print a sample text after each epoch
#         generate_and_print_sample(
#             model, tokenizer, device, start_context
#         )

#     return train_losses, val_losses, track_tokens_seen


# def evaluate_model(model, train_loader, val_loader, device, eval_iter):
#     model.eval()
#     with torch.no_grad():
#         train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
#         val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
#     model.train()
#     return train_loss, val_loss
# def token_ids_to_text(token_ids, tokenizer):
#   decoded_text = tokenizer.decode(token_ids.squeeze(0).tolist())
#   return decoded_text

# def generate_and_print_sample(model, tokenizer, device, start_context):
#     model.eval()
#     context_size = model.pos_emb.weight.shape[0]
#     encoded = text_to_token_ids(start_context, tokenizer).to(device)
#     with torch.no_grad():
#         token_ids = generate_text_simple(
#             model=model, idx=encoded,
#             max_new_tokens=50, context_size=context_size
#         )
#     decoded_text = token_ids_to_text(token_ids, tokenizer)
#     print(decoded_text.replace("\n", " "))  # Compact print format
#     model.train()
torch.manual_seed(42)  # IMPORTANT — reset all randomness here
train_loader1 = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader1 = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
val_loader2 = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

# def evaluate_model(model, train_loader, val_loader, eval_iter, device):
#   model.eval()
#   with torch.no_grad():
#     train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
#     val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
#   model.train()
#   return train_loss, val_loss

model.eval()
with torch.no_grad():
    train_loss_1, val_loss_1 = evaluate_model(model, train_loader1, val_loader1, 5, device)

model.eval()
with torch.no_grad():
    train_loss_2, val_loss_2 = evaluate_model(model, train_loader1, val_loader2, 5, device)

print("Train loss with unshuffled val_loader:", train_loss_1, val_loss_1)
print("Train loss with shuffled val_loader:", train_loss_2, val_loss_2)

Train loss with unshuffled val_loader: 0.5846027433872223 6.443647384643555
Train loss with shuffled val_loader: 0.590851879119873 6.443647384643555


In [82]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10
start_context = "Every effort moves you"
train_losses, val_losses, tokens_seen = train_model_simple(model, train_loader, val_loader,
                                                           optimizer, device=device, num_epochs=num_epochs, eval_freq=5, eval_iter=5,
                                                           start_context=start_context, tokenizer = tokenizer)

Epoch 1 (step 000000) | Train loss 9.818 | Val loss 9.930 
Epoch 1 (step 000005) | Train loss 8.066 | Val loss 8.336 
Every effort moves you,,,,,,,,,,,,.                                     
Epoch 2 (step 000010) | Train loss 6.623 | Val loss 7.053 
Epoch 2 (step 000015) | Train loss 6.047 | Val loss 6.605 
Every effort moves you, and,, and,,,,,,, and,.                                   
Epoch 3 (step 000020) | Train loss 5.532 | Val loss 6.507 
Epoch 3 (step 000025) | Train loss 5.399 | Val loss 6.389 
Every effort moves you, and to the to the of the to the, and I had. Gis, and, and, and, and, and, and I had the, and, and, and, and, and, and, and, and, and
Epoch 4 (step 000030) | Train loss 4.895 | Val loss 6.280 
Epoch 4 (step 000035) | Train loss 4.648 | Val loss 6.304 
Every effort moves you.  "I the picture.                    "I"I the picture"I had the the honour of the picture and I had been the picture of
Epoch 5 (step 000040) | Train loss 4.023 | Val loss 6.165 
Every effort m

In [88]:
model.eval()
token_ids = generate_text_simple(model, text_to_token_ids("Every effort moves you", tokenizer).to(device), max_new_tokens=25, context_size=GPT_CONFIG_124M["context_length"])
print(token_id_to_text(token_ids, tokenizer))

vocab = {
    "closer" : 0,
    "every" : 1,
    "effort" : 2,
    "forward" : 3,
    "inches" : 4,
    "moves" : 5,
    "pizza" : 6,
    "toward": 7,
    "you" : 8
}

inverse_vocab = {v : k for k, v in vocab.items()}
print(vocab)
print("\nInverse Vocab:\n", inverse_vocab)

Every effort moves you?"

"Yes--quite insensible to the irony. She wanted him vindicated--and by me!"


{'closer': 0, 'every': 1, 'effort': 2, 'forward': 3, 'inches': 4, 'moves': 5, 'pizza': 6, 'toward': 7, 'you': 8}

Inverse Vocab:
 {0: 'closer', 1: 'every', 2: 'effort', 3: 'forward', 4: 'inches', 5: 'moves', 6: 'pizza', 7: 'toward', 8: 'you'}
