## Stage 3: Fine Tuning the Model

Any foundational model is little more than a very highly skilled auto-complete; now comes the task of fine-tuning. This is where a foundational model is taught the specifics of the job that it will eventually do. For the purposes of this exercise I am going to train a simple classifier, using language inference for a job that you would assume would be done with conventional machine-learning methods. Secondly, I am going to make a simple agent model and also learn to score its accuracy using Ollama. It may be useful to refer to the first picture in My_GPT_construction as a guide to where I am.

**Objectives:**
- I need to download the fine tuning dataset and get it into dataloaders.
- Load an LLM to process this information, then train all appropriate weights.
- Evaluate and refine metrics to prove that the model is operating properly.

A lot of this code will be placeholder code; I will try to check that it works on one example, or find a way to get it to run through Colab. This is more of a reminder of the structure, because the hardware demands appear to be quite intense to produce something noteworthy.

Below is the example workflow, which I will have to follow twice: once for classification fine-tuning and once for agent fine-tuning.

![Fine Tuning](resources/fine_tuning.jpg)

The first thing that we need is data to work with, so the code below downloads a public set of SMS messages labeled as either spam or legitimate (ham). It contains 5,574 English messages.

# CLASSIFICATION FINE TUNING

In [143]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "resources/sms_spam_collection.zip"
extracted_path = "resources/"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the SMS spam archive, extract it and rename the data file to ``.tsv``.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final path of the data file (``str`` or ``Path``).
            If it already exists, nothing is downloaded.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Accept plain strings as well as Path objects.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # On a fresh checkout the target directories may not exist yet; without
    # them, open()/extractall()/rename() below fail with FileNotFoundError.
    Path(zip_path).parent.mkdir(parents=True, exist_ok=True)
    Path(extracted_path).mkdir(parents=True, exist_ok=True)
    data_file_path.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): this context disables TLS certificate verification (and
    # relies on a private `ssl` helper). Kept so the download keeps working on
    # machines without a CA bundle, but the archive is then unauthenticated --
    # prefer ssl.create_default_context() where certificates are installed.
    ssl_context = ssl._create_unverified_context()

    # Downloading the file
    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The archive member has no extension; add .tsv
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


resources/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [None]:
import pandas as pd

# load the data into a dataframe and clean it up; the label column needs to be numeric

df = pd.read_csv(data_file_path,sep="\t", header=None, names=["Label", "Text"])

# For better classification we want the same number of SPAM and HAM messages

def create_balanced_dataset(df):
    """Return a frame with as many "ham" rows as there are "spam" rows.

    All "spam" rows are kept; the "ham" rows are down-sampled at random
    (fixed seed 123) to the same count. The sampled "ham" rows come first
    in the result, followed by the "spam" rows.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Draw exactly one ham message per spam message
    ham_sample = ham_rows.sample(len(spam_rows), random_state=123)

    return pd.concat([ham_sample, spam_rows])

balanced_df = create_balanced_dataset(df)

# this was missed in the lecture that we need to numerically label

label_mapping = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(label_mapping)
print(balanced_df["Label"].value_counts())

Label
0    747
1    747
Name: count, dtype: int64


We will now split the data randomly into train, validation, and test sets (70%, 10%, 20%).

In [145]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` (seed 123) and cut it into train / validation / test frames.

    ``train_frac`` and ``validation_frac`` are fractions of the total row
    count; whatever remains after those two slices becomes the test set.
    The shuffled frame gets a fresh 0..n-1 index before slicing.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    total_rows = len(shuffled)
    first_cut = int(total_rows * train_frac)
    second_cut = first_cut + int(total_rows * validation_frac)

    return (
        shuffled.iloc[:first_cut],
        shuffled.iloc[first_cut:second_cut],
        shuffled.iloc[second_cut:],
    )

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# This just saves the dataset for use another time

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

The data loader will require all of the messages to be of the same length, therefore, you can either pad the shorter messages to the length of the longest one (preferred) or truncate all messages to an arbitrary number.

To do either of these things, here is the class construction.

In [146]:
import torch
from torch.utils.data import Dataset


class SpamDataset(Dataset):
    """Tokenised messages from a CSV file, all padded to one common length.

    The CSV is read with pandas and must provide a ``Text`` and a ``Label``
    column. Each text is encoded with ``tokenizer.encode``; sequences are cut
    to ``max_length`` when one is given and then right-padded with
    ``pad_token_id``. Without ``max_length`` the longest encoded text sets
    the length.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256, device="cpu"):
        self.data = pd.read_csv(csv_file)
        self.device = device

        # Tokenise every message once, up front
        self.encoded_texts = [tokenizer.encode(message) for message in self.data["Text"]]

        if max_length is not None:
            # Caller fixed the length: truncate anything longer
            self.max_length = max_length
            self.encoded_texts = [ids[:self.max_length] for ids in self.encoded_texts]
        else:
            self.max_length = self._longest_encoded_length()

        # Right-pad so every sequence has exactly max_length tokens
        self.encoded_texts = [
            ids + [pad_token_id] * (self.max_length - len(ids))
            for ids in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as long tensors on ``self.device``."""
        row_label = self.data.iloc[index]["Label"]
        token_tensor = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        label_tensor = torch.tensor(row_label, dtype=torch.long)
        return token_tensor.to(self.device), label_tensor.to(self.device)

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Token count of the longest encoded message."""
        return max(len(ids) for ids in self.encoded_texts)

In [147]:
import tiktoken
# We instantiate this class for all three data sets

tokenizer = tiktoken.get_encoding("gpt2")

train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)

val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

# The DataLoader import is the same as before; however,
# in this case the targets represent the class labels
# rather than the next tokens in the text

from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [148]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
    f"`max_length={BASE_CONFIG['context_length']}`"
)

In [149]:
from My_GPT import download_and_load_gpt2, load_weights_into_gpt, GPTModel

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()



File already exists and is up-to-date: gpt2/124M/checkpoint




File already exists and is up-to-date: gpt2/124M/encoder.json




File already exists and is up-to-date: gpt2/124M/hparams.json




File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2/124M/model.ckpt.index




File already exists and is up-to-date: gpt2/124M/model.ckpt.meta




File already exists and is up-to-date: gpt2/124M/vocab.bpe
Loading block 0
Loading block 1
Loading block 2
Loading block 3
Loading block 4
Loading block 5
Loading block 6
Loading block 7
Loading block 8
Loading block 9
Loading block 10
Loading block 11


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [150]:
# THIS IS A TEST CELL CAN BE REMOVED ONCE CONFIRMED WORKING
from My_GPT import generate_text_simple, text_to_token_ids, token_ids_to_text

text_1 = "Every effort moves you"

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)

print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


Now that we have successfully loaded the GPT-2 infrastructure, we want to adjust the output layer from a dimension of 50257 (the byte pair encoder vocabulary size) to the number of our classification types.

To start, we want to freeze all of the parameters to make the model non-trainable.

In [151]:
for param in model.parameters():
    param.requires_grad = False

In [152]:
num_classes = 2

model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)

Additionally, we configure the last transformer block and the final LayerNorm module,
which connects this block to the output layer, to be trainable
    

In [153]:
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True

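Once this code has run, the model gives two output values (logits) for the last token, and taking the argmax of them gives the predicted class. Similar to before, you can apply a softmax first if you want class probabilities; the argmax picks the same class either way, including when there are more classes.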

Here is a function that will do that

In [154]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Return the fraction of correctly classified examples in ``data_loader``.

    The model is switched to eval mode and the prediction for each example is
    the argmax over the logits of the last token position.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs that
            also supports ``len()``.
        model: Callable returning logits indexable as ``[:, -1, :]``.
        device: Device the batches are moved to before the forward pass.
        num_batches: Evaluate only the first ``num_batches`` batches;
            ``None`` means all of them.

    Returns:
        Accuracy as a float in ``[0, 1]``, or ``nan`` when no example was
        evaluated (empty loader or ``num_batches`` of 0), matching what
        ``calc_loss_loader`` returns for an empty loader.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # No gradients are needed anywhere in this loop
    with torch.no_grad():
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            logits = model(input_batch)[:, -1, :]  # Logits of last output token
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()

    # Guard against dividing by zero when nothing was evaluated
    if num_examples == 0:
        return float("nan")
    return correct_predictions / num_examples

In [155]:
# Device selection: prefer CUDA, then the Apple Silicon MPS backend, and fall
# back to the CPU. The original note for this cell reports MPS as roughly 2x
# faster than the CPU on an M3 MacBook Air, with identical CPU/MPS results on
# PyTorch 2.4 (earlier PyTorch versions may give different results on MPS).
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Running on {device} device.")

model.to(device)  # nn.Module.to() works in place, so `model = model.to(device)` is not needed

# Seed first: the training loader shuffles, so this keeps the evaluated batches reproducible
torch.manual_seed(123)

# Accuracy on the first 10 batches of each split, in train / validation / test order
train_accuracy, val_accuracy, test_accuracy = (
    calc_accuracy_loader(loader, model, device, num_batches=10)
    for loader in (train_loader, val_loader, test_loader)
)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Running on mps device.
Training accuracy: 46.25%
Validation accuracy: 45.00%
Test accuracy: 48.75%


In [158]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of one batch, using only the last token's logits.

    Both tensors are moved to ``device`` before the forward pass; the
    returned loss is a tensor, so the caller can call ``.backward()`` on it.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    # Only the final position is used for classification
    last_token_logits = model(inputs)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, targets)

In [159]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs that
            also supports ``len()``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Number of batches to average over; ``None`` means all.
            Values larger than the loader are clamped to its length.

    Returns:
        Mean batch loss as a float, or ``nan`` when there is nothing to
        average (empty loader, or ``num_batches`` of 0 or less).
    """
    if len(data_loader) == 0:
        return float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # Asking for zero batches previously divided by zero in the final line
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [161]:
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=8)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=8)
    test_loss = calc_loss_loader(test_loader, model, device, num_batches=8)

print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")


Training loss: 0.723
Validation loss: 0.851
Test loss: 0.773


In [162]:
# Overall the same as `train_model_simple` in chapter 5
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter):
    """Fine-tune ``model`` as a classifier and record losses and accuracies.

    Every ``eval_freq`` optimisation steps (counted across epochs, starting
    with step 0) the train and validation loss are estimated on
    ``eval_iter`` batches via ``evaluate_model`` and printed. After each
    epoch the train and validation accuracy are estimated on ``eval_iter``
    batches and printed as well.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``:
        the four history lists plus the total number of training examples
        processed.
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    examples_seen = 0
    global_step = -1  # so that the first processed batch is step 0

    for epoch_idx in range(num_epochs):
        model.train()  # training mode for this epoch

        for input_batch, target_batch in train_loader:
            # Standard step: clear old gradients, backpropagate, update weights
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            # Progress is tracked in examples (not tokens) and in global steps
            examples_seen += input_batch.shape[0]
            global_step += 1

            # Periodic loss report; skipped on all other steps
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            print(f"Ep {epoch_idx+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # End-of-epoch accuracy on a sample of eval_iter batches per split
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [163]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate train and validation loss on ``eval_iter`` batches each.

    The model is put into eval mode for the measurement, gradients are
    disabled, and the model is switched back to training mode before
    returning ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # Train loader first, then validation loader
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [164]:
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.723, Val loss 0.755
Ep 1 (Step 000050): Train loss 0.632, Val loss 0.642
Ep 1 (Step 000100): Train loss 0.479, Val loss 0.514
Training accuracy: 87.50% | Validation accuracy: 97.50%
Ep 2 (Step 000150): Train loss 0.303, Val loss 0.175
Ep 2 (Step 000200): Train loss 0.047, Val loss 0.042
Ep 2 (Step 000250): Train loss 0.029, Val loss 0.061
Training accuracy: 95.00% | Validation accuracy: 92.50%
Ep 3 (Step 000300): Train loss 0.114, Val loss 0.129
Ep 3 (Step 000350): Train loss 0.069, Val loss 0.030
Training accuracy: 97.50% | Validation accuracy: 97.50%
Ep 4 (Step 000400): Train loss 0.014, Val loss 0.033
Ep 4 (Step 000450): Train loss 0.091, Val loss 0.057
Ep 4 (Step 000500): Train loss 0.121, Val loss 0.035
Training accuracy: 97.50% | Validation accuracy: 100.00%
Ep 5 (Step 000550): Train loss 0.182, Val loss 0.022
Ep 5 (Step 000600): Train loss 0.040, Val loss 0.029
Training accuracy: 100.00% | Validation accuracy: 100.00%
Training completed in 3.40 m

The accuracy can also be computed across the whole data set instead of just a few batches; it comes out a little lower, but it still looks pretty damn good!

In [170]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 98.27%
Validation accuracy: 97.99%
Test accuracy: 96.33%


In [165]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify a single text as ``"spam"`` or ``"not spam"``.

    Args:
        text: The message to classify.
        model: Classifier exposing ``pos_emb.weight`` (its first dimension is
            taken as the supported context length) and returning logits
            indexable as ``[:, -1, :]``.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            token ids.
        device: Device the input tensor is created on.
        max_length: Length the input is truncated/padded to. It is clamped to
            the model's context length; ``None`` uses the full context
            length. Pass the training set's ``max_length`` to match the
            padding used during fine-tuning.
        pad_token_id: Token id used for right-padding.

    Returns:
        ``"spam"`` if the predicted label is 1, otherwise ``"not spam"``.
    """
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.pos_emb.weight.shape[0]
    # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake
    # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024)

    # Resolve the effective length once. Previously max_length=None crashed in
    # min(), and a max_length above the context length padded the input past
    # what the model supports.
    if max_length is None:
        target_length = supported_context_length
    else:
        target_length = min(max_length, supported_context_length)

    # Truncate sequences if they are too long
    input_ids = input_ids[:target_length]

    # Pad shorter sequences up to the target length
    input_ids = input_ids + [pad_token_id] * (target_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)  # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "spam" if predicted_label == 1 else "not spam"

Now we can plug in test spam to see what it classifies it as

In [166]:
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(classify_review(
    text_1, model, tokenizer, device, max_length=train_dataset.max_length
))

spam


In [None]:
text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

print(classify_review(
    text_2, model, tokenizer, device, max_length=train_dataset.max_length
))

not spam


In [169]:
# save the model state dict

torch.save(model.state_dict(), "review_classifier.pth")


In [None]:
# This is how the saved classifier weights can be loaded again.
# map_location="cpu" lets a checkpoint saved from an MPS/CUDA model load on a
# machine without that device (load_state_dict then copies the values into the
# model's own parameters). weights_only=True restricts unpickling to tensors
# and plain containers instead of arbitrary Python objects.

model_state_dict = torch.load("review_classifier.pth", map_location="cpu", weights_only=True)
model.load_state_dict(model_state_dict)

## INSTRUCTION FINE-TUNING

I am going to use two sets of instruction data for fine-tuning. The first has only 1,100 entries and comes from the book that this repo is based on. The second is a more comprehensive set (52,002 entries) that was used to train the 7B Stanford Alpaca model.

In [None]:
import json
import os
import urllib
import ssl

def download_and_load_file(file_path, url):
    """Return the parsed JSON at ``file_path``, downloading it from ``url`` first if needed.

    Args:
        file_path: Local path of the JSON file. If it does not exist, the
            content of ``url`` is downloaded and written there (UTF-8).
        url: Location to download the file from when it is not cached.

    Returns:
        The object produced by ``json.load`` for the file.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    if not os.path.exists(file_path):
        # NOTE(review): certificate verification is switched off here, so the
        # downloaded data is not authenticated. Kept for machines without a
        # CA bundle; drop the two overrides below where certificates work.
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

        # Make sure the target directory exists before writing into it
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with urllib.request.urlopen(url, context=ssl_context) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Single read path for both the fresh download and the cached file
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


# Paths are relative to the working directory, like the other files kept under
# resources/ in this notebook. The previous leading "/" pointed at the
# filesystem root, which is normally not writable.
file_path = "resources/instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Small instruction set from the book
data = download_and_load_file(file_path, url)

alpaca_path = "resources/alpaca_data.json"
alpaca_url = (
    "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca"
    "/refs/heads/main/alpaca_data.json"
)

# Larger Stanford Alpaca instruction set
alpaca_data = download_and_load_file(alpaca_path, alpaca_url)


print("Number of entries:", len(data))
print("Number of entries:", len(alpaca_data))


Number of entries: 1100
Number of entries: 52002


In [3]:
print("Example entry:\n", alpaca_data[1000])

Example entry:
 {'instruction': 'List 3 historical events related to the following country', 'input': 'Canada', 'output': "The three historical events related to Canada are the Dominion of Canada's establishment in 1867, the adoption of the Canadian Flag in 1965, and the signing of the Canada-U.S. Free Trade Agreement in 1988."}


In [4]:
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
val_portion = len(data) - train_portion - test_portion  # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


In [None]:
def format_input(entry):
    """Build the Alpaca-style prompt (instruction plus optional input) for one entry.

    Args:
        entry: Mapping with an ``"instruction"`` key and, optionally, an
            ``"input"`` key. A missing or empty ``"input"`` omits the
            ``### Input:`` section.

    Returns:
        The prompt text, without the ``### Response:`` part.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # .get() tolerates entries that carry no "input" field at all
    input_value = entry.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""

    return instruction_text + input_text

In [10]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Instruction/response entries, tokenised once when the dataset is built.

    Each entry is rendered as the prompt from ``format_input`` followed by a
    ``### Response:`` section holding ``entry['output']``, and then encoded
    with ``tokenizer.encode``. Items are returned exactly as the tokenizer
    produced them; no padding is applied here.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        # Prompt + response text for every entry, encoded up front
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __getitem__(self, index):
        """Return the encoded text stored at ``index``."""
        return self.encoded_texts[index]

    def __len__(self):
        """Number of entries in the underlying data."""
        return len(self.data)



In [12]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded ``(inputs, targets)`` tensors.

    Every sequence gets one ``pad_token_id`` appended as an end-of-text
    marker and is right-padded to the longest sequence in the batch.
    Targets are the inputs shifted by one position. In the targets, every
    ``pad_token_id`` after the first one is replaced by ``ignore_index``.
    Both tensors are optionally cut to ``allowed_max_length`` and moved to
    ``device``.
    """
    # +1 leaves room for the end-of-text token added to each sequence
    padded_length = max(len(sequence) + 1 for sequence in batch)

    input_rows, target_rows = [], []

    for sequence in batch:
        # New list: the end-of-text token plus padding, without touching the caller's list
        fill_count = padded_length - len(sequence)
        padded = sequence.copy() + [pad_token_id] * fill_count

        inputs = torch.tensor(padded[:-1])   # everything except the last token
        targets = torch.tensor(padded[1:])   # same sequence shifted by one

        # Keep the first pad token in the targets, replace the rest by ignore_index
        pad_positions = torch.nonzero(targets == pad_token_id).flatten()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        # Optional cap on the sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        input_rows.append(inputs)
        target_rows.append(targets)

    # Stack into batch tensors and move them to the requested device
    return torch.stack(input_rows).to(device), torch.stack(target_rows).to(device)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# The following lines select the Apple Silicon (MPS) backend when it is available and CUDA is not,
# which is much faster than on an Apple CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

if torch.cuda.is_available():
   device = torch.device("cuda")
elif torch.backends.mps.is_available():
   device = torch.device("mps")
else:
   device = torch.device("cpu")

print("Device:", device)

Device: mps


In [14]:
from functools import partial
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [17]:
from torch.utils.data import DataLoader
import tiktoken


num_workers = 0
batch_size = 8

torch.manual_seed(123)
tokenizer = tiktoken.get_encoding("gpt2")

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

We need to use a more powerful model than the small GPT-2 used above (here GPT-2 medium, 355M), because this task would fail with the small one; however, as said before, this will not run at all on my hardware.

In [18]:
from My_GPT import download_and_load_gpt2, GPTModel, load_weights_into_gpt

BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();



File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe
Loading block 0
Loading block 1
Loading block 2
Loading block 3
Loading block 4
Loading block 5
Loading block 6
Loading block 7
Loading block 8
Loading block 9
Loading block 10
Loading block 11
Loading block 12
Loading block 13
Loading block 14
Loading block 15
Loading block 16
Loading block 17
Loading block 18
Loading block 19
Loading block 20
Loading block 21
Loading block 22
Loading block 23


I cannot run this code at all on my machine, so I will have to stick together some pieces on Colab. I understand the intention of the code, but doing these steps on Colab is proving to be full of problems. I may look at an artificial-intelligence-specific cluster instead.

In [20]:
from My_GPT import calc_loss_batch, calc_loss_loader, train_model_simple, generate, token_ids_to_text, text_to_token_ids

device = "cpu"  # Set device to CPU because all sorts of errors using MPS
model.to(device)

torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, max_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, max_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Training loss: 3.825879430770874
Validation loss: 3.7619039535522463


In [21]:
import time

# Time the run with a monotonic clock: unlike time.time(), perf_counter()
# cannot jump if the system clock is adjusted during a long training run.
start_time = time.perf_counter()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 1

# The first validation example, formatted as a prompt, is passed as
# start_context to train_model_simple.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.perf_counter()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


Epoch 1/1 started.


Epoch 1/1:   0%|          | 0/116 [00:07<?, ?it/s, loss=3.811, tokens=488]


Evaluating at step 0...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:12<00:00,  6.41s/it]
Epoch 1/1:   1%|          | 1/116 [00:20<39:40, 20.70s/it, loss=3.811, tokens=488]

Epoch 1 (Step 000000): Train loss 2.637, Val loss 2.626


Epoch 1/1:   4%|▍         | 5/116 [01:52<34:06, 18.44s/it, loss=1.357, tokens=3320]


Evaluating at step 5...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:14<00:00,  7.31s/it]
Epoch 1/1:   5%|▌         | 6/116 [02:07<44:39, 24.36s/it, loss=1.357, tokens=3320]

Epoch 1 (Step 000005): Train loss 1.174, Val loss 1.102


Epoch 1/1:   9%|▊         | 10/116 [03:20<29:45, 16.84s/it, loss=0.835, tokens=6088]


Evaluating at step 10...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:14<00:00,  7.04s/it]
Epoch 1/1:   9%|▉         | 11/116 [03:34<35:32, 20.31s/it, loss=0.835, tokens=6088]

Epoch 1 (Step 000010): Train loss 0.872, Val loss 0.945


Epoch 1/1:  13%|█▎        | 15/116 [04:51<27:15, 16.19s/it, loss=0.975, tokens=8968]


Evaluating at step 15...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:21<00:00, 10.81s/it]
Epoch 1/1:  14%|█▍        | 16/116 [05:13<38:36, 23.16s/it, loss=0.975, tokens=8968]

Epoch 1 (Step 000015): Train loss 0.856, Val loss 0.906


Epoch 1/1:  17%|█▋        | 20/116 [07:28<43:32, 27.22s/it, loss=0.836, tokens=11912]


Evaluating at step 20...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:21<00:00, 10.90s/it]
Epoch 1/1:  18%|█▊        | 21/116 [07:50<55:38, 35.14s/it, loss=0.836, tokens=11912]

Epoch 1 (Step 000020): Train loss 0.776, Val loss 0.881


Epoch 1/1:  22%|██▏       | 25/116 [09:50<39:58, 26.36s/it, loss=0.806, tokens=14576]


Evaluating at step 25...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:21<00:00, 10.50s/it]
Epoch 1/1:  22%|██▏       | 26/116 [10:11<47:43, 31.82s/it, loss=0.806, tokens=14576]

Epoch 1 (Step 000025): Train loss 0.753, Val loss 0.859


Epoch 1/1:  26%|██▌       | 30/116 [12:01<35:20, 24.66s/it, loss=0.736, tokens=17232]


Evaluating at step 30...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.93s/it]
Epoch 1/1:  27%|██▋       | 31/116 [12:19<43:05, 30.42s/it, loss=0.736, tokens=17232]

Epoch 1 (Step 000030): Train loss 0.798, Val loss 0.836


Epoch 1/1:  30%|███       | 35/116 [14:07<34:20, 25.44s/it, loss=0.695, tokens=20088]


Evaluating at step 35...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:15<00:00,  7.70s/it]
Epoch 1/1:  31%|███       | 36/116 [14:22<36:20, 27.26s/it, loss=0.695, tokens=20088]

Epoch 1 (Step 000035): Train loss 0.715, Val loss 0.809


Epoch 1/1:  34%|███▍      | 40/116 [16:07<28:00, 22.11s/it, loss=0.745, tokens=22784]


Evaluating at step 40...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.73s/it]
Epoch 1/1:  35%|███▌      | 41/116 [16:25<37:13, 29.78s/it, loss=0.745, tokens=22784]

Epoch 1 (Step 000040): Train loss 0.672, Val loss 0.806


Epoch 1/1:  39%|███▉      | 45/116 [18:23<29:37, 25.03s/it, loss=0.623, tokens=26000]


Evaluating at step 45...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:21<00:00, 10.59s/it]
Epoch 1/1:  40%|███▉      | 46/116 [18:44<38:54, 33.35s/it, loss=0.623, tokens=26000]

Epoch 1 (Step 000045): Train loss 0.633, Val loss 0.790


Epoch 1/1:  43%|████▎     | 50/116 [20:30<27:30, 25.00s/it, loss=0.582, tokens=28816]


Evaluating at step 50...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:23<00:00, 11.75s/it]
Epoch 1/1:  44%|████▍     | 51/116 [20:54<33:19, 30.76s/it, loss=0.582, tokens=28816]

Epoch 1 (Step 000050): Train loss 0.662, Val loss 0.783


Epoch 1/1:  47%|████▋     | 55/116 [22:53<26:17, 25.87s/it, loss=0.856, tokens=31648]


Evaluating at step 55...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:24<00:00, 12.32s/it]
Epoch 1/1:  48%|████▊     | 56/116 [23:17<33:48, 33.82s/it, loss=0.856, tokens=31648]

Epoch 1 (Step 000055): Train loss 0.760, Val loss 0.764


Epoch 1/1:  52%|█████▏    | 60/116 [25:15<25:49, 27.67s/it, loss=0.517, tokens=34352]


Evaluating at step 60...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.95s/it]
Epoch 1/1:  53%|█████▎    | 61/116 [25:33<28:24, 30.99s/it, loss=0.517, tokens=34352]

Epoch 1 (Step 000060): Train loss 0.719, Val loss 0.743


Epoch 1/1:  56%|█████▌    | 65/116 [27:19<20:19, 23.92s/it, loss=0.608, tokens=36968]


Evaluating at step 65...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:18<00:00,  9.06s/it]
Epoch 1/1:  57%|█████▋    | 66/116 [27:37<23:44, 28.49s/it, loss=0.608, tokens=36968]

Epoch 1 (Step 000065): Train loss 0.652, Val loss 0.735


Epoch 1/1:  60%|██████    | 70/116 [29:23<17:24, 22.72s/it, loss=0.587, tokens=39832]


Evaluating at step 70...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:25<00:00, 12.82s/it]
Epoch 1/1:  61%|██████    | 71/116 [29:48<22:45, 30.35s/it, loss=0.587, tokens=39832]

Epoch 1 (Step 000070): Train loss 0.532, Val loss 0.729


Epoch 1/1:  65%|██████▍   | 75/116 [31:36<16:24, 24.01s/it, loss=0.898, tokens=42488]


Evaluating at step 75...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:18<00:00,  9.01s/it]
Epoch 1/1:  66%|██████▌   | 76/116 [31:54<18:25, 27.64s/it, loss=0.898, tokens=42488]

Epoch 1 (Step 000075): Train loss 0.569, Val loss 0.729


Epoch 1/1:  69%|██████▉   | 80/116 [33:15<11:40, 19.47s/it, loss=0.584, tokens=45088]


Evaluating at step 80...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.60s/it]
Epoch 1/1:  70%|██████▉   | 81/116 [33:33<13:45, 23.58s/it, loss=0.584, tokens=45088]

Epoch 1 (Step 000080): Train loss 0.605, Val loss 0.725


Epoch 1/1:  73%|███████▎  | 85/116 [34:59<09:37, 18.63s/it, loss=0.715, tokens=47736]


Evaluating at step 85...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:18<00:00,  9.12s/it]
Epoch 1/1:  74%|███████▍  | 86/116 [35:17<11:55, 23.86s/it, loss=0.715, tokens=47736]

Epoch 1 (Step 000085): Train loss 0.509, Val loss 0.710


Epoch 1/1:  78%|███████▊  | 90/116 [37:00<09:38, 22.24s/it, loss=0.651, tokens=50568]


Evaluating at step 90...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:20<00:00, 10.32s/it]
Epoch 1/1:  78%|███████▊  | 91/116 [37:21<11:46, 28.26s/it, loss=0.651, tokens=50568]

Epoch 1 (Step 000090): Train loss 0.562, Val loss 0.691


Epoch 1/1:  82%|████████▏ | 95/116 [39:16<08:45, 25.04s/it, loss=0.653, tokens=53600]


Evaluating at step 95...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.67s/it]
Epoch 1/1:  83%|████████▎ | 96/116 [39:33<10:06, 30.33s/it, loss=0.653, tokens=53600]

Epoch 1 (Step 000095): Train loss 0.500, Val loss 0.682


Epoch 1/1:  86%|████████▌ | 100/116 [41:17<06:11, 23.23s/it, loss=0.865, tokens=56560]


Evaluating at step 100...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:17<00:00,  8.74s/it]
Epoch 1/1:  87%|████████▋ | 101/116 [41:34<07:00, 28.06s/it, loss=0.865, tokens=56560]

Epoch 1 (Step 000100): Train loss 0.502, Val loss 0.677


Epoch 1/1:  91%|█████████ | 105/116 [43:11<04:03, 22.13s/it, loss=0.643, tokens=59440]


Evaluating at step 105...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:21<00:00, 10.75s/it]
Epoch 1/1:  91%|█████████▏| 106/116 [43:33<04:35, 27.58s/it, loss=0.643, tokens=59440]

Epoch 1 (Step 000105): Train loss 0.564, Val loss 0.670


Epoch 1/1:  95%|█████████▍| 110/116 [45:12<02:10, 21.75s/it, loss=0.722, tokens=62232]


Evaluating at step 110...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:19<00:00,  9.89s/it]
Epoch 1/1:  96%|█████████▌| 111/116 [45:32<02:13, 26.79s/it, loss=0.722, tokens=62232]

Epoch 1 (Step 000110): Train loss 0.555, Val loss 0.667


Epoch 1/1:  99%|█████████▉| 115/116 [47:09<00:21, 21.83s/it, loss=0.550, tokens=65080]


Evaluating at step 115...




Evaluating on training data...




Evaluating on validation data...


Evaluation: 100%|██████████| 2/2 [00:16<00:00,  8.16s/it]
Epoch 1/1: 100%|██████████| 116/116 [47:26<00:00, 24.54s/it, loss=0.550, tokens=65080]


Epoch 1 (Step 000115): Train loss 0.508, Val loss 0.664

EPOCH 1 COMPLETED - MODEL GENERATION:
Training completed in 47.67 minutes.


In [23]:
torch.manual_seed(123)

# Generate a response for a handful of held-out test entries and print it
# next to the reference answer for a manual comparison.
for entry in test_data[15:19]:
    input_text = format_input(entry)
    prompt_ids = text_to_token_ids(input_text, tokenizer).to(device)

    token_ids = generate(
        model=model,
        idx=prompt_ids,
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)

    # The decoded text starts with the prompt itself; cut it off, then remove
    # the response header so only the model's answer remains.
    completion = generated_text[len(input_text):]
    response_text = completion.replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Classify the following items as either solid, liquid, or gas.

### Input:
Mercury, oxygen, wood

Correct response:
>> Mercury - Liquid
Oxygen - Gas
Wood - Solid

Model response:
>> Solid: Mercury
Liquid: Oxygen
Gas: Wood
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert 3 kilometers to meters.

Correct response:
>> 3 kilometers is 3000 meters.

Model response:
>> 3 kilometers is 3 meters.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the sentence to use an indefinite pronoun.

### Input:
Someone left a note.

Correct response:
>> A note was left by someone.

Model response:
>> The note was left by someone.
------------------

This model is not very good; you can see that the loss is still pretty high.  It seems I need a lot more training, which I cannot do with the compute I have available, but I am happy it actually ran!

In [24]:
import re


# Derive the checkpoint name from CHOOSE_MODEL by removing spaces and
# parentheses, then persist only the state dict (weights), not the module.
model_tag = re.sub(r'[ ()]', '', CHOOSE_MODEL)
file_name = f"{model_tag}-sft.pth"
torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")

# Load model via
# model.load_state_dict(torch.load("gpt2-medium355M-sft.pth"))

Model saved as gpt2-medium355M-sft.pth


I have managed to train a model based on the Alpaca data set of agent questions, which totals approximately 52k examples.  I will now try to load this and see if it does any better.  However, on Colab I ran a standard Hugging Face version of GPT-2 because this custom implementation was too inefficient and would not run.

Therefore, to load it I will have to import and run it in a similar fashion.

In [32]:
# Import the correct model class from transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Build the stock gpt2-medium architecture and its matching tokenizer; the
# pretrained weights loaded here are overwritten by the checkpoint below.
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

# Overwrite the weights with the fine-tuned state dict, mapping every tensor
# onto the CPU while loading.
checkpoint_path = "model_state_dicts/gpt2_superfinetuned-sft.pth"
cpu_device = torch.device('cpu')
model.load_state_dict(torch.load(checkpoint_path, map_location=cpu_device))

# Evaluation mode; left as the final expression so the notebook shows the repr.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [55]:
def text_to_token_ids_hugging_face(text, tokenizer):
    """Encode ``text`` and return the token ids as a ``(1, n_tokens)`` long tensor.

    The tokenizer only needs an ``encode`` method. When that method declares an
    ``allowed_special`` parameter (the tiktoken-style call used elsewhere in
    this notebook), ``'<|endoftext|>'`` is whitelisted so the end-of-text
    marker can be encoded; otherwise ``encode(text)`` is called on its own
    (the HuggingFace-style call).

    The previous ``hasattr(tokenizer, 'encode')`` test could not tell the two
    apart: every usable tokenizer has ``encode``, so the ``allowed_special``
    branch was never taken.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, ...)`` returning a list of ids.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` with a leading batch dimension
        of size 1.
    """
    import inspect  # local import keeps this notebook cell self-contained

    try:
        encode_params = inspect.signature(tokenizer.encode).parameters
    except (TypeError, ValueError):
        # Signature cannot be introspected; fall back to the plain call.
        encode_params = {}

    if "allowed_special" in encode_params:
        token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    else:
        token_ids = tokenizer.encode(text)

    return torch.tensor([token_ids], dtype=torch.long)

def generate_hugging_face(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively append up to ``max_new_tokens`` tokens to ``idx``.

    Args:
        model: Module with parameters, callable on a ``(batch, seq)`` id tensor.
            Its output may be a raw logits tensor, an object with a ``.logits``
            attribute, or a tuple whose first element is the logits.
        idx: ``(batch, seq)`` tensor of prompt token ids; moved to the device
            of the model's first parameter.
        max_new_tokens: Upper bound on the number of generated tokens.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: ``0.0`` selects greedy argmax decoding; values above zero
            scale the logits and sample from the softmax distribution.
        top_k: If given, only the ``top_k`` highest logits per row stay
            eligible (clamped to the vocabulary size).
        eos_id: If given, generation stops (without appending) as soon as every
            row in the batch produces this id in the same step.

    Returns:
        The ``(batch, seq + generated)`` tensor of prompt plus generated ids.
    """
    device = next(model.parameters()).device
    idx = idx.to(device)

    for _ in range(max_new_tokens):
        # Crop the running sequence to the model's supported context window.
        idx_cond = idx[:, -context_size:]

        with torch.no_grad():
            logits = model(idx_cond)
            if hasattr(logits, 'logits'):
                logits = logits.logits
            elif isinstance(logits, tuple):
                logits = logits[0]
            # Only the prediction for the final position is needed.
            logits = logits[:, -1, :]

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension so each row is compared against its
            # own threshold; a (batch,) threshold mis-broadcasts for batch > 1.
            min_val = top_logits[:, -1:]
            neg_inf = torch.tensor(float("-inf"), device=device)
            logits = torch.where(logits < min_val, neg_inf, logits)

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if eos_id is not None and (idx_next == eos_id).all():
            break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [57]:
torch.manual_seed(123)

# Same qualitative check as before, now driven through the Hugging Face model
# and tokenizer: generate, strip the echoed prompt, print next to the target.
for entry in test_data[:3]:
    input_text = format_input(entry)
    prompt_ids = text_to_token_ids_hugging_face(input_text, tokenizer).to("cpu")

    token_ids = generate_hugging_face(
        model=model,
        idx=prompt_ids,
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256,
    )

    flat_ids = token_ids.flatten().tolist()
    generated_text = tokenizer.decode(flat_ids, skip_special_tokens=True)

    # Remove the prompt prefix and the response header from the decoded text.
    completion = generated_text[len(input_text):]
    response_text = completion.replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.

Model response:
>> The car is very fast.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> The type of cloud associated with thunderstorms is a convective cloud.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Name the author of 'Pride and Prejudice'.

Correct response:
>> Jane Austen.

Model response:
>> The author of Pride and Prejudice is

I had to completely rewrite the infrastructure to get this to run, only to find that the results were equally poor.  I cannot run it any better at the moment: this was 3 epochs of 50k agentic questions, but it has not made a massive improvement to the answers.

To close there are some tests where I can use an LLM to train and rate the performance of this LLM.  This is becoming very prevalent because it means that you can ask and answer a huge number of questions to train a model whilst keeping consistent scoring of loss.

To do this I have downloaded Ollama, and the code from the lecture is below.  I do not know whether this will run at all.

In [None]:
import psutil

def check_if_running(process_name):
    """Return True if any running process name contains ``process_name``.

    Args:
        process_name: Substring to look for in each process's name.

    Returns:
        ``True`` as soon as one matching process is found, ``False`` otherwise.
    """
    return any(
        # psutil can report a name of None (e.g. when access to the process is
        # denied); treat that as an empty name instead of raising TypeError.
        process_name in (proc.info["name"] or "")
        for proc in psutil.process_iter(["name"])
    )

ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
# Report the result of the scan above instead of scanning the process table a
# second time, which costs extra work and could disagree with the check.
print("Ollama running:", ollama_running)

In [None]:
import urllib.request

def query_model(
    prompt,
    model="llama3",
    url="http://localhost:11434/api/chat"
):
    """POST a single-turn chat request to ``url`` and return the reply text.

    The response body is read line by line; each line is parsed as JSON and
    the ``["message"]["content"]`` fragments are concatenated in order.

    Args:
        prompt: Text sent as the sole user message.
        model: Model name placed in the request payload.
        url: Chat endpoint to call (defaults to a local server on port 11434).

    Returns:
        The concatenated content of all response lines.
    """
    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Settings below are required for deterministic responses
        "options": {"seed": 123, "temperature": 0, "num_ctx": 2048},
    }

    # Serialize the payload to UTF-8 JSON bytes and wrap it in a POST request.
    request = urllib.request.Request(
        url,
        data=json.dumps(request_body).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/json")

    fragments = []
    with urllib.request.urlopen(request) as response:
        # readline() yields b"" once the body is exhausted, which ends the loop.
        for raw_line in iter(response.readline, b""):
            message = json.loads(raw_line.decode("utf-8"))
            fragments.append(message["message"]["content"])

    return "".join(fragments)

In [None]:
# Ask the judge model to grade the first three stored model responses against
# the dataset's reference answers.
for entry in test_data[:3]:
    formatted_input = format_input(entry)
    reference = entry['output']
    candidate = entry["model_response"]

    prompt = (
        f"Given the input `{formatted_input}` "
        f"and correct output `{reference}`, "
        f"score the model response `{candidate}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
    )

    print("\nDataset response:")
    print(">>", reference)
    print("\nModel response:")
    print(">>", candidate)
    print("\nScore:")
    # The judge is queried only here, after the headers above are printed.
    print(">>", query_model(prompt))
    print("\n-------------------------")