## Finetune an LLM for classification
- **Code created by**: Noor de Bruijn
- **Last edits**: Tuesday June 3rd, 2025

### Install and import packages

In [None]:
#Install packages
!pip install tqdm
!pip install torch
!pip install joblib

In [None]:
#Import packages
import joblib
import json
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import sys
import time
import torch
from tqdm import tqdm

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

print("Success")

### Check cwd, venv, Python version

In [None]:
#Report the working directory, the interpreter in use (venv path) and the Python version
for heading, value in (
    ("Current working directory:", os.getcwd()),
    ("\n Python executable (venv path):", sys.executable),
    ("\nPython version:", sys.version),
):
    print(heading)
    print(value)

## Stage 1. Data preparation

### Step 1. Download the data

In [None]:
#Check size of pkl file
#NOTE(review): no path is passed, so this lists the current working directory
!ls -lh 

#Check specific size
#NOTE(review): the path is an empty placeholder string; fill in the pkl file path,
#otherwise os.path.getsize fails because no such file exists
print(os.path.getsize(''))

In [None]:
#Path of the file to inspect (placeholder, fill in before running)
file_path = ""

#Try to read the file with joblib and report either the loaded type or the error
try:
    obj = joblib.load(file_path)
except Exception as load_error:
    print(f"Joblib load error: {load_error}")
else:
    print(type(obj))

In [None]:
#Open file
#Reuse `file_path` (defined in the previous cell) so the path is set in one place only.
#NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open(file_path, 'rb') as file:
    data = pickle.load(file)
print("Success")

### Step 2. Preprocess the data

#### Split the dataset (train, validation, test)

In [None]:
#Define function to create three datasets (train, validation, test)
def random_split(df, train_frac, validation_frac, random_state=123):
    """Shuffle ``df`` and split it into train, validation and test parts.

    Args:
        df: pandas DataFrame to split.
        train_frac: Fraction of the rows (0-1) that goes to the training set.
        validation_frac: Fraction of the rows (0-1) that goes to the validation set.
        random_state: Seed for the shuffle. The default (123) reproduces the
            split that was used before this parameter existed.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``. The test set receives
        every row that is left after the first two parts are taken.

    Raises:
        ValueError: If a fraction is negative or both fractions sum to more than 1.
    """
    if train_frac < 0 or validation_frac < 0:
        raise ValueError("train_frac and validation_frac must be non-negative")
    #Small tolerance so that float sums such as 0.9 + 0.1 are not rejected
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1")

    #frac=1 returns all rows in random order; the index is renumbered from 0
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    validation_end = train_end + int(n_rows * validation_frac)

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

#Apply function to current dataframe (70% train, 10% validation, remaining rows test)
#NOTE(review): `df` is not defined in any cell shown above (the pickle is loaded into `data`);
#presumably `df` should be that loaded DataFrame - confirm before running
train_df, validation_df, test_df = random_split(df, 0.7, 0.1)

#### Save datasets as separate CSV files

In [None]:
#Save the datasets as CSV files so they can be used later
#index=False (the documented boolean form) keeps the row index out of the files
for csv_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", validation_df),
    ("test.csv", test_df),
):
    split_df.to_csv(csv_name, index=False)

### Step 3. Create data loaders

#### Implement a PyTorch Dataset (specifies how data is loaded and preprocessed) 

In [None]:
class DS(Dataset):
    """Dataset of tokenized texts and their labels, read from a CSV file.

    The CSV file needs a "Text" and a "Label" column. Each text is encoded with
    ``tokenizer.encode`` and then brought to a common length of ``max_length``
    tokens: longer texts are truncated, shorter ones are padded.

    Args:
        csv_file: Path of the CSV file to read.
        tokenizer: Object with an ``encode(text)`` method returning a list of token ids.
        max_length: Number of tokens per example. ``None`` uses the length of
            the longest encoded text in this file.
        pad_token_id: Token id used for padding (50256 is <|endoftext|> in GPT2).
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

            #Truncation if text > max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        #Padding with <|endoftext|> = 50256 (GPT2); needed on both branches so
        #that every example has the same length and can be stacked into a batch
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        encoded = self.encoded_texts[index]
        label = self.data["Label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the token count of the longest encoded text (0 if there are none)."""
        return max((len(encoded_text) for encoded_text in self.encoded_texts), default=0)

In [None]:
#Build one DS per split from the CSV files written earlier.
#max_length=None lets each dataset use the length of its own longest encoded text.
#NOTE(review): `tokenizer` must already exist in the notebook namespace - it is not
#created in any of the cells shown here
train_dataset = DS(csv_file="train.csv", tokenizer=tokenizer, max_length=None)
validation_dataset = DS(csv_file="validation.csv", tokenizer=tokenizer, max_length=None)
test_dataset = DS(csv_file="test.csv", tokenizer=tokenizer, max_length=None)

#### Create PyTorch data loaders

In [None]:
#Setting various variables
num_workers = 0
batch_size = 8
torch.manual_seed(123)

#Dataloader for train_dataset
#Shuffled every epoch; the last incomplete batch is dropped so all training batches have equal size
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

#Dataloader for validation_dataset
#Evaluation loaders keep every example (drop_last=False) and a fixed order (shuffle=False),
#so metrics cover the whole split and are reproducible
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
)

#Dataloader for test_dataset
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
)

In [None]:
#Report how many batches each loader yields
for split_name, split_loader in (
    ("training", train_loader),
    ("validation", validation_loader),
    ("test", test_loader),
):
    print(f"{len(split_loader)} {split_name} batches")

## Stage 2. Model setup

### Step 4. Initialize the model

In [None]:
#Model to use (must be a key of model_configs) and an example prompt
MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Example text"

#Size-specific settings: embedding dimension, # of transformer blocks, # of attention heads
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

#Settings shared by all sizes, followed by the settings of the selected model
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
    **model_configs[MODEL],
}

### Step 5. Load pretrained weights

In [None]:
#Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
#Define GPTModel
#`torch.nn` is spelled out because the notebook imports `torch` but never binds the name `nn`.
#NOTE(review): `TransformerBlock` and `LayerNorm` are not defined in the cells shown here;
#they must exist before GPTModel is instantiated.
class GPTModel(torch.nn.Module):
    """GPT-style model: token + position embeddings, transformer blocks, output head.

    Args:
        cfg: Dict with the keys "vocab_size", "context_length", "emb_dim",
            "drop_rate" and "n_layers" (also passed on to every TransformerBlock).
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = torch.nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = torch.nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = torch.nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = torch.nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = torch.nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return the output-head logits for token ids ``in_idx`` of shape [batch_size, num_tokens]."""
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        #Positions 0..seq_len-1, created on the same device as the input
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [None]:
#Function to download original GPT2 weights
def download_and_load_gpt2(model_size, models_dir):
    """Download the GPT2 checkpoint files of one model size and load them.

    NOTE(review): `download_file`, `load_gpt2_params_from_tf_ckpt` and `tf`
    (TensorFlow) are not defined or imported in the cells shown here; they must
    be available before this function is called with a valid model size.

    Args:
        model_size: One of "124M", "355M", "774M", "1558M".
        models_dir: Directory in which a sub-directory per model size is created.

    Returns:
        Tuple ``(settings, params)``: the parsed ``hparams.json`` and the result
        of ``load_gpt2_params_from_tf_ckpt`` for the latest checkpoint.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
    """
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    backup_base_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # URLs always use "/"; os.path.join would insert "\" on Windows
        file_url = f"{base_url}/{model_size}/{filename}"
        backup_url = f"{backup_base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path, backup_url)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # Context manager so the hparams file handle is closed again
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as hparams_file:
        settings = json.load(hparams_file)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params

In [None]:
#Helper for load_weights_into_gpt. The notebook calls an `assign` function that is
#not defined in any visible cell, so the shape-checked assignment lives here.
def _assign_param(current, new_values):
    """Return ``new_values`` as a ``torch.nn.Parameter`` after checking its shape.

    Raises:
        ValueError: If ``new_values`` does not have the shape of ``current``.
    """
    if tuple(current.shape) != tuple(new_values.shape):
        raise ValueError(
            f"Shape mismatch. Left: {tuple(current.shape)}, Right: {tuple(new_values.shape)}")
    return torch.nn.Parameter(torch.tensor(new_values))


#Manually load pre-trained weights into custom GPT model (gpt) from set of stored parameters (params)
def load_weights_into_gpt(gpt, params):
    """Copy the arrays in ``params`` into the matching parameters of ``gpt``.

    Args:
        gpt: Model with ``pos_emb``, ``tok_emb``, ``trf_blocks``, ``final_norm``
            and ``out_head`` attributes; it is modified in place.
        params: Dict with the keys "wpe", "wte", "g", "b" and a list "blocks"
            holding the arrays of each transformer block.

    Raises:
        ValueError: If an array does not have the shape of the parameter it replaces.
    """
    gpt.pos_emb.weight = _assign_param(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = _assign_param(gpt.tok_emb.weight, params["wte"])

    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_blocks[b]
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        #"c_attn" stores query, key and value side by side along the last axis.
        #Weight arrays are transposed because they are stored in the opposite orientation.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        block.att.W_query.weight = _assign_param(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = _assign_param(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = _assign_param(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        block.att.W_query.bias = _assign_param(block.att.W_query.bias, q_b)
        block.att.W_key.bias = _assign_param(block.att.W_key.bias, k_b)
        block.att.W_value.bias = _assign_param(block.att.W_value.bias, v_b)

        block.att.out_proj.weight = _assign_param(
            block.att.out_proj.weight, attn_params["c_proj"]["w"].T)
        block.att.out_proj.bias = _assign_param(
            block.att.out_proj.bias, attn_params["c_proj"]["b"])

        #Feed-forward part: the layers at index 0 and 2 receive weights
        block.ff.layers[0].weight = _assign_param(
            block.ff.layers[0].weight, mlp_params["c_fc"]["w"].T)
        block.ff.layers[0].bias = _assign_param(
            block.ff.layers[0].bias, mlp_params["c_fc"]["b"])
        block.ff.layers[2].weight = _assign_param(
            block.ff.layers[2].weight, mlp_params["c_proj"]["w"].T)
        block.ff.layers[2].bias = _assign_param(
            block.ff.layers[2].bias, mlp_params["c_proj"]["b"])

        block.norm1.scale = _assign_param(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = _assign_param(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = _assign_param(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = _assign_param(block.norm2.shift, block_params["ln_2"]["b"])

    gpt.final_norm.scale = _assign_param(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = _assign_param(gpt.final_norm.shift, params["b"])
    #The output head receives the token-embedding array "wte"
    gpt.out_head.weight = _assign_param(gpt.out_head.weight, params["wte"])
    
    
#The weights are loaded in the next cell, once `params` has been downloaded and
#`model` has been created. The call that used to be here referenced `gpt` and
#`params` before either existed and raised a NameError.

In [None]:
#Define model size
#Derive it from MODEL (e.g. "gpt2-small (124M)" -> "124M") so that the downloaded
#checkpoint always matches the BASE_CONFIG that was built from the same MODEL
model_size = MODEL.split(" ")[-1].lstrip("(").rstrip(")")

settings, params = download_and_load_gpt2(
    model_size=model_size, models_dir="gpt2"
)
#Define the model
model = GPTModel(BASE_CONFIG)

#Load the weights
load_weights_into_gpt(model, params)

### Step 6. Modify model for fine-tuning

#### Adding a classification layer

In [None]:
#Set seed and num_classes variable
torch.manual_seed(123)
num_classes = 2

#Freeze the pretrained model first, so that only the layers enabled below are trained
for param in model.parameters():
    param.requires_grad = False

#Change out_head of the model
#This new output layer has requires_grad automatically set to True
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"],
    out_features=num_classes
)

#Make last transformer block and final normalization layer trainable (set requires_grad = True)
#The attribute is `requires_grad`; a misspelled name only creates an unused attribute
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

### Step 7. Implement evaluation utilities

#### Define functions for classification accuracy and loss

In [None]:
# I. Classification accuracy
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Return the fraction of correctly classified examples in ``data_loader``.

    The prediction for an example is the argmax over the model output at the
    last token position. The model is left in evaluation mode.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` that supports ``len()``.
        model: Model returning logits of shape [batch_size, num_tokens, num_classes].
        device: Device the batches are moved to before the forward pass.
        num_batches: Maximum number of batches to evaluate; ``None`` uses all.

    Returns:
        Accuracy between 0 and 1, or ``nan`` when no example was evaluated.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    with torch.no_grad():
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            #Only the logits of the last token are used for classification
            logits = model(input_batch)[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()

    #Same convention as calc_loss_loader: nan instead of a ZeroDivisionError
    if num_examples == 0:
        return float("nan")
    return correct_predictions / num_examples

In [None]:
# II. Classification loss
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of one batch, using the logits of the last token."""
    inputs, targets = input_batch.to(device), target_batch.to(device)
    last_token_logits = model(inputs)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean batch loss over the first ``num_batches`` batches.

    ``num_batches=None`` evaluates every batch; a larger value than the number
    of available batches is capped. An empty loader yields ``nan``.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")
    num_batches = available if num_batches is None else min(num_batches, available)

    running_loss = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(
            input_batch, target_batch, model, device
        ).item()
    return running_loss / num_batches

#### Determine classification accuracy

In [None]:
#Pick GPU when available, else CPU, and move the model there
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#Number of batches per dataset used for this estimate
num_batches = 10

torch.manual_seed(123)

#Accuracy on the train, validation and test loaders (evaluated in this order)
train_accuracy, validation_accuracy, test_accuracy = (
    calc_accuracy_loader(split_loader, model, device, num_batches=num_batches)
    for split_loader in (train_loader, validation_loader, test_loader)
)

#Show the accuracies as percentages
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {validation_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

#Important: To improve prediction accuracies, we need to fine-tune the model.

#### Determine classification loss 

In [None]:
#Loss on the first `num_batches` batches of each loader; no gradients are needed here
with torch.no_grad():
    train_loss, val_loss, test_loss = (
        calc_loss_loader(split_loader, model, device, num_batches=num_batches)
        for split_loader in (train_loader, validation_loader, test_loader)
    )

print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

## Stage 3. Model fine-tuning and usage

### Step 8. Fine-tune model

In [None]:
def train_classifier_simple(
        model, train_loader, validation_loader, optimizer, device,
        num_epochs, eval_freq, eval_iter):
    """Fine-tune ``model`` and track losses and accuracies along the way.

    Args:
        model: Model to train; updated in place.
        train_loader: Loader yielding ``(input_batch, target_batch)`` for training.
        validation_loader: Loader used for the validation loss and accuracy.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device on which the batches are processed.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Losses are evaluated every ``eval_freq`` training steps.
        eval_iter: Number of batches used for each loss/accuracy evaluation.

    Returns:
        Tuple ``(train_losses, validation_losses, train_accuracies,
        validation_accuracies, examples_seen)``. The losses hold one entry per
        evaluation step, the accuracies one entry per epoch, and
        ``examples_seen`` is the total number of training examples processed.
    """
    train_losses, validation_losses, train_accuracies, validation_accuracies = [], [], [], []
    #global_step starts at -1 so that the first training step is step 0
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        #Training loop
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            examples_seen += input_batch.shape[0]
            global_step += 1

            #Calculate losses
            if global_step % eval_freq == 0:
                train_loss, validation_loss = evaluate_model(
                    model, train_loader, validation_loader, device, eval_iter)
                train_losses.append(train_loss)
                validation_losses.append(validation_loss)

                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, "
                      f"Validation loss {validation_loss:.3f}")

        #Calculate accuracies
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        validation_accuracy = calc_accuracy_loader(validation_loader, model, device, num_batches=eval_iter)

        #Print accuracies
        print(f"Training accuracy: {train_accuracy*100:.2f}%")
        print(f"Validation accuracy: {validation_accuracy*100:.2f}%")

        #Append accuracies to list
        train_accuracies.append(train_accuracy)
        validation_accuracies.append(validation_accuracy)

    #examples_seen is returned as well: the training cell unpacks five values
    #and the plots use it for the second x-axis
    return train_losses, validation_losses, train_accuracies, validation_accuracies, examples_seen

In [None]:
def evaluate_model(model, train_loader, validation_loader, device, eval_iter):
    """Return ``(train_loss, validation_loss)`` measured on ``eval_iter`` batches each.

    The model is put into evaluation mode for the measurement and back into
    training mode afterwards; no gradients are recorded.
    """
    model.eval()

    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(split_loader, model, device, num_batches=eval_iter)
            for split_loader in (train_loader, validation_loader)
        )

    model.train()

    return losses

In [None]:
#Start the clock, fix the seed and set up optimizer and number of epochs
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5

#Run the fine-tuning; losses are evaluated every 50 steps on 5 batches
(
    train_losses,
    validation_losses,
    train_accuracies,
    validation_accuracies,
    examples_seen,
) = train_classifier_simple(
    model,
    train_loader,
    validation_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=50,
    eval_iter=5,
)

#Report the elapsed wall-clock time in minutes
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60

print(f"Training completed in {execution_time_minutes:.2f} minutes.")

### Step 9. Evaluate fine-tuned model

#### Plot classification loss

In [None]:
def plot_values(
    epochs_seen, examples_seen, train_values, validation_values, label = "loss"):
    """Plot training and validation values against epochs and examples seen.

    The figure is saved as "<label>-plot.pdf" and then shown.

    Args:
        epochs_seen: X values for the bottom axis (epochs).
        examples_seen: X values for the top axis (number of examples).
        train_values: Y values of the training curve.
        validation_values: Y values of the validation curve.
        label: Name of the plotted quantity; used in the legend, the y-axis
            label and the file name.
    """
    fig, ax1 = plt.subplots(figsize=(3,5))

    ax1.plot(epochs_seen, train_values, label=f"Training {label}")
    ax1.plot(epochs_seen, validation_values, linestyle = "-.", label=f"Validation {label}")

    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(label.capitalize())
    ax1.legend()

    #Second x-axis on top; the curve is drawn fully transparent (alpha=0) so that
    #it only sets the tick range of that axis
    ax2 = ax1.twiny()
    ax2.plot(examples_seen, train_values, alpha=0)
    ax2.set_xlabel("Examples seen")

    fig.tight_layout()
    plt.savefig(f"{label}-plot.pdf")
    plt.show()

In [None]:
#Evenly spaced x values, one per recorded loss (torch.linspace(start, end, steps))
loss_points = len(train_losses)
epochs_tensor = torch.linspace(0, num_epochs, loss_points)
examples_seen_tensor = torch.linspace(0, examples_seen, loss_points)

#Draw training vs. validation loss
plot_values(
    epochs_tensor,
    examples_seen_tensor,
    train_losses,
    validation_losses,
)

#### Plot classification accuracies

In [None]:
#Evenly spaced x values, one per recorded accuracy
accuracy_points = len(train_accuracies)
epochs_tensor = torch.linspace(0, num_epochs, accuracy_points)
examples_seen_tensor = torch.linspace(0, examples_seen, accuracy_points)

#Draw training vs. validation accuracy
plot_values(
    epochs_tensor,
    examples_seen_tensor,
    train_accuracies,
    validation_accuracies,
    label="accuracy",
)

#### Calculate performance metrics for the training, validation and test sets across the entire datasets

In [None]:
#Accuracy over all batches of each loader (no num_batches limit), evaluated in this order
train_accuracy, validation_accuracy, test_accuracy = (
    calc_accuracy_loader(split_loader, model, device)
    for split_loader in (train_loader, validation_loader, test_loader)
)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {validation_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

### Step 10. Use model on new data