# **Pretraining an LLM on Unlabeled Data**

In [1]:
from importlib.metadata import version

pkgs = ["matplotlib", 
        "numpy", 
        "tiktoken", 
        "torch",
        "tensorflow" # For OpenAI's pretrained weights since earlier models were trained in TensorFlow
       ]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.10.1
numpy version: 2.2.5
tiktoken version: 0.9.0
torch version: 2.7.0
tensorflow version: 2.19.0


## **1. Evaluating Generative Text Models**

--prose--

In [20]:
import torch
from utils.components import GPTModel

# GPT configuration from previos NBs
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 1024, # Retaining original context length
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(42)
model = GPTModel(GPT_CONFIG_124M)
model.eval(); # Dropout disabled during inference

In [21]:
import tiktoken
from utils.components import generate_text_simple

# Pipeline functions
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # Removing batch dim
    return tokenizer.decode(flat.tolist())

start_content = "In the grim darkness of the far future"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_content, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 In the grim darkness of the far future� lobster bump tobacco Unknown confused WEEKirtual Shares sod


### **1.1 Calculating The Text Generation Loss i.e. Cross-Entropy and Perplexity**

In [22]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [23]:
with torch.no_grad():
    logits = model(inputs)
    
probs = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary
print(probs.shape) # Shape ==> (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [24]:
token_ids = torch.argmax(probs, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[41449],
         [ 1838],
         [39347]],

        [[10719],
         [ 7417],
         [18879]]])


In [25]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Yad makes php


In [26]:
# Token probabilities corresponding to the target indices
text_idx = 0
target_probs_1 = probs[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probs_1)

text_idx = 1
target_probs_2 = probs[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probs_2)

Text 1: tensor([1.0347e-05, 3.0490e-05, 3.4867e-05])
Text 1: tensor([2.4436e-05, 1.0729e-05, 6.7333e-06])


In [27]:
# Compute log of all token probabilities
log_probs = torch.log(torch.cat((target_probs_1, target_probs_2)))
print(log_probs)

tensor([-11.4788, -10.3981, -10.2640, -10.6195, -11.4425, -11.9084])


In [28]:
# Computing the average log probability of each token
avg_log_probs = torch.mean(log_probs)
print(avg_log_probs)

tensor(-11.0186)


In [29]:
# Minimize the negative average log-probability
neg_avg_log_prob = avg_log_probs * -1
print(neg_avg_log_prob)

tensor(11.0186)


In [30]:
# Prior to implementing cross entropy, we should check the shape of logits and targets
# Logits shape --> (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.shape)

# Targets have shape (batch_size, num_tokens)
print("Targets shape:", targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [31]:
# Flattening above tensors for cross entropy. Flattening is done by combining over the batch dimension
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

# Applying cross entropy loss
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)

# Printing shapes and cross entropy loss
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)
print("Cross-Entropy loss:", loss)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])
Cross-Entropy loss: tensor(11.0186)


In [32]:
# Calculating the perplexity 
perplexity = torch.exp(loss)
print("Perplexity:", perplexity)

Perplexity: tensor(60995.6328)


### **1.2 Calculating Traininig and Validation Losses** 

In [34]:
import os

file_path = "data/the-law-bastiat.txt"

with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

In [36]:
# First and last99 chars
print(text_data[:99], "\n")
print(text_data[-99:])

 The law perverted! The law—and, in its wake, all the collective forces of the nation—the law, I sa 

egun—reject all systems, and try liberty—liberty, which is an act of faith in God and in His work. 


In [37]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print("Characters: ", total_characters)
print("Tokens: ", total_tokens)

Characters:  95988
Tokens:  21939


In [38]:
from utils.components import create_dataloader_v1

# Training / validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(42)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [40]:
# Running sanity check
if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader!")
    
if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader!")

In [42]:
# Checking shapes of train and val loaders.
print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

Train loader:
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])
torch.Size([2, 1024]) torch.Size([2, 1024])

Validation loader:
torch.Size([2, 1024]) torch.Size([2, 1024])


In [43]:
# Checking token sizes
train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()
    
val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

Training tokens: 18432
Validation tokens: 2048
All tokens: 20480


In [44]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce num of batches to match the total number of batches in the data loader
        # in case values are exceeded.
        num_batches = min(num_batches, len(data_loader))
    for i, (input, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else: 
            break
    return total_loss / num_batches

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

torch.manual_seed(42)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
    
print("Training loss:", train_loss)
print("Validation loss:", val_loss)  

Training loss: 10.978011661105686
Validation loss: 10.971200942993164
