In [1]:
from importlib.metadata import version

# Report the installed version of every package this notebook relies on
pkgs = [
    "numpy",       # Needed by both PyTorch and TensorFlow
    "matplotlib",  # Plotting
    "tiktoken",    # Tokenizer
    "torch",       # Deep learning library
    "tqdm",        # Progress bars
    "tensorflow",  # Used for OpenAI's pretrained weights
]
for p in pkgs:
    installed = version(p)
    print(f"{p} version: {installed}")

numpy version: 2.0.2
matplotlib version: 3.10.3
tiktoken version: 0.9.0
torch version: 2.7.1
tqdm version: 4.67.1
tensorflow version: 2.19.0


In [2]:
import json
import os
import urllib


def download_and_load_file(file_path, url):
    """Load a JSON file, downloading it from `url` first if it is not cached.

    Args:
        file_path: Local path of the JSON file. If nothing exists at this
            path, the content of `url` is downloaded and written there.
        url: Location to fetch the file from when `file_path` is missing.

    Returns:
        The object produced by `json.load` for the file at `file_path`.
    """
    # A plain `import urllib` does not load the `request` submodule, so
    # `urllib.request` is only available if something else imported it.
    # Import it explicitly to make the download path work on a fresh kernel.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # The book originally contained an "else" clause here that re-read the
    # file into `text_data`; it is unnecessary because the file is always
    # parsed from disk below.

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data


# Local cache path and remote source of the instruction dataset
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Downloads only when `file_path` does not exist yet, then parses the JSON
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [3]:
def format_input(entry):
    """Build the prompt text (instruction plus optional input) for one entry.

    Args:
        entry: Mapping with an "instruction" key and, optionally, an "input"
            key. A missing or empty "input" produces no "### Input:" section.

    Returns:
        The prompt string: a fixed preamble, the "### Instruction:" section,
        and the "### Input:" section when the entry has non-empty input.
        The response section is not included.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # .get() treats entries without an "input" key like entries with empty input
    input_value = entry.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""

    return instruction_text + input_text

In [4]:
# Assemble and show the full training text (prompt + response) for one entry
sample = data[50]
model_input = format_input(sample)
desired_response = f"\n\n### Response:\n{sample['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [5]:
# Carve the dataset into contiguous train / test / validation slices
n_entries = len(data)
train_portion = int(n_entries * 0.85)  # first 85% -> training
test_portion = int(n_entries * 0.1)    # next 10% -> testing
val_portion = n_entries - (train_portion + test_portion)  # the rest (~5%) -> validation

test_end = train_portion + test_portion
train_data = data[:train_portion]
test_data = data[train_portion:test_end]
val_data = data[test_end:]

In [6]:
# Inspect the first training entry (keys: instruction, input, output)
train_data[0]

{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.',
 'input': 'freind --> friend',
 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}

In [7]:
# Print the two pieces that make up one training text separately:
# the formatted prompt and the response section
entry = data[0]
instruction_plus_input = format_input(entry)
print(instruction_plus_input)
response_text = f"\n\n### Response:\n{entry['output']}"
print(response_text)
# Left commented out: `tokenizer` is only created in a later cell
# tokenizer.encode(response_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend


### Response:
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".


In [8]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset whose items are the token IDs of fully formatted entries.

    Every entry is rendered as prompt (via `format_input`) followed by a
    "### Response:" section and tokenized once at construction time, so
    indexing is a plain list lookup.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        # Tokenize everything up front
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __getitem__(self, index):
        """Return the pre-tokenized text stored at `index`."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.data)

In [9]:
import tiktoken
# GPT-2 encoding provided by tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# "<|endoftext|>" is passed via `allowed_special` so it is encoded as a special
# token; the printed ID is the value used as `pad_token_id` in later cells
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [10]:
# Walk through the padding logic by hand for a single item of a toy batch
batch = [
    [0, 1, 2],                 # 3 tokens
    [10, 11],                  # 2 tokens
    [20, 21, 22, 23],          # 4 tokens
    [30, 31, 32]               # 3 tokens
]

# Longest item (4 tokens) plus one slot for the appended end-of-text token
batch_max_length = max(len(item)+1 for item in batch)
print(batch_max_length)
item = batch[0]

pad_token_id = 50256
# Work on a copy so the list stored in `batch` is left untouched
new_item = item.copy()
new_item += [pad_token_id]
print(new_item)

# Right-pad with the same token ID up to the common length
padded = (
    new_item + [pad_token_id] *
    (batch_max_length - len(new_item))
)
print(padded)

5
[0, 1, 2, 50256]
[0, 1, 2, 50256, 50256]


In [11]:
# Example to demonstrate this
def modify_without_copy(item):
    """Append 999 to `item` in place and return it; the caller's list changes too."""
    item += [999]  # This modifies the original!
    return item

def modify_with_copy(item):
    """Return a copy of `item` with 999 appended; `item` itself is left unchanged."""
    new_item = item.copy()
    new_item += [999]  # This only modifies the copy
    return new_item

In [12]:
# Test it: start from a fresh list and show its initial content
original_list = [1, 2, 3]
print("Original before:", original_list)


Original before: [1, 2, 3]


In [13]:
# The copy-based helper returns the extended list without touching the original
x = modify_with_copy(original_list)
print(original_list)
print(x)

[1, 2, 3]
[1, 2, 3, 999]


In [14]:
# `item` from the padding walkthrough is unchanged because the padding used a copy
item

[0, 1, 2]

In [15]:
import copy

# Simple case (no nested objects) - shallow and deep copy behave the same
simple_list = [1, 2, 3]
shallow = simple_list.copy()
deep = copy.deepcopy(simple_list)  # created for comparison; not used further in this cell

shallow.append(4)
print("Original:", simple_list)  # [1, 2, 3] - unchanged
print("Shallow:", shallow)  # [1, 2, 3, 4]

Original: [1, 2, 3]
Shallow: [1, 2, 3, 4]


In [16]:
# Complex case (nested objects) - difference becomes clear
nested_list = [[1, 2], [3, 4], [5, 6]]
shallow = nested_list.copy()  # new outer list, but the inner lists are shared
deep = copy.deepcopy(nested_list)

# Modify a nested element
shallow[0].append(999)  # This affects the original too!
print("Original after shallow modification:", nested_list)  # [[1, 2, 999], [3, 4], [5, 6]]
print("Shallow:", shallow)  # [[1, 2, 999], [3, 4], [5, 6]]

Original after shallow modification: [[1, 2, 999], [3, 4], [5, 6]]
Shallow: [[1, 2, 999], [3, 4], [5, 6]]


In [17]:
# Reset and try deep copy
nested_list = [[1, 2], [3, 4], [5, 6]]
deep = copy.deepcopy(nested_list)  # inner lists are copied as well
deep[0].append(999)  # This does NOT affect the original
print("Original after deep modification:", nested_list)    # [[1, 2], [3, 4], [5, 6]]
print("Deep:", deep)  # [[1, 2, 999], [3, 4], [5, 6]]

Original after deep modification: [[1, 2], [3, 4], [5, 6]]
Deep: [[1, 2, 999], [3, 4], [5, 6]]


In [18]:
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad a batch of token-ID lists to equal length and stack them (inputs only).

    Each sequence gets one `pad_token_id` appended (the <|endoftext|> token),
    is right-padded with `pad_token_id` to the longest sequence length + 1,
    and then loses its last element again. Every row therefore ends up as
    long as the longest sequence in the batch.

    Args:
        batch: Collection of token-ID lists; it is iterated twice.
        pad_token_id: Token ID used for the appended token and for padding.
        device: Device the stacked tensor is moved to.

    Returns:
        Tensor with one row per sequence in `batch`, located on `device`.
    """
    # One slot more than the longest sequence; the surplus slot is cut off
    # again below (it becomes relevant in the later collate versions)
    padded_width = max(map(len, batch)) + 1

    rows = []
    for tokens in batch:
        # Copy first so the caller's list is not modified
        extended = tokens.copy()
        extended.append(pad_token_id)
        fill = [pad_token_id] * (padded_width - len(extended))
        row = extended + fill
        # row[:-1] removes the extra slot introduced by the +1 above
        rows.append(torch.tensor(row[:-1]))

    return torch.stack(rows).to(device)

In [19]:
# Toy batch with sequences of different lengths (5, 2 and 3 tokens);
# `batch` is reused by the following collate demonstrations
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = (
    inputs_1,
    inputs_2,
    inputs_3
)

# Shorter sequences are right-padded with 50256 up to the longest length
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [20]:
def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad a batch of token-ID lists and build input and shifted target tensors.

    Each sequence gets one `pad_token_id` appended (the <|endoftext|> token)
    and is right-padded with `pad_token_id` to the longest sequence length + 1.
    Inputs are the padded row without its last element; targets are the same
    row without its first element, i.e. the inputs shifted by one position.

    Args:
        batch: Collection of token-ID lists; it is iterated twice.
        pad_token_id: Token ID used for the appended token and for padding.
        device: Device both stacked tensors are moved to.

    Returns:
        Tuple `(inputs_tensor, targets_tensor)` with one row per sequence.
    """
    # Longest sequence plus one slot for the appended token
    padded_width = max(map(len, batch)) + 1

    input_rows, target_rows = [], []
    for tokens in batch:
        # Copy first so the caller's list is not modified
        extended = tokens.copy()
        extended.append(pad_token_id)
        row = extended + [pad_token_id] * (padded_width - len(extended))

        input_rows.append(torch.tensor(row[:-1]))   # everything but the last token
        target_rows.append(torch.tensor(row[1:]))   # everything but the first token

    stacked_inputs = torch.stack(input_rows).to(device)
    stacked_targets = torch.stack(target_rows).to(device)
    return stacked_inputs, stacked_targets

In [21]:
# Targets are the inputs shifted by one position; padding is still 50256 everywhere
inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [22]:
# Earlier, longer variant kept for reference (not executed):
# targets = torch.tensor([10, 50256, 50256, 50256, 20])  # Some targets with padding tokens
# targets_2 = torch.tensor([100, 50256, 50256, 20, 30])  # Some targets with padding tokens
# targets = torch.stack([targets_1, targets_2])

# Minimal 2x2 example: one row with a single padding token, one row with only padding
targets_1 = torch.tensor([10, 50256])
targets_2 = torch.tensor([50256, 50256])  # Some targets with padding tokens
targets = torch.stack([targets_1, targets_2])

pad_token_id = 50256
ignore_index = -100
print(targets.shape)
print(targets)

torch.Size([2, 2])
tensor([[   10, 50256],
        [50256, 50256]])


In [23]:
# 1-D example containing exactly one padding token
targets = torch.tensor([10, 50256])
mask = targets == 50256  # [False, True]
indices = torch.nonzero(mask)  # tensor([[1]]) - one row per match

In [24]:
# Boolean mask: True where the target equals the padding token ID
mask

tensor([False,  True])

In [25]:
# torch.nonzero returns a 2-D tensor of indices, here a single match at index 1
x = torch.nonzero(mask)
x

tensor([[1]])

In [26]:
# With a single match, squeeze() reduces the result to a 0-d tensor (see output),
# while numel() still reports one element
indices = torch.nonzero(mask).squeeze()
print(indices)
print(indices.numel())

tensor(1)
1


In [27]:
# For a 2-D tensor, each row of the result is the (row, column) index of a non-zero entry
tensor_2d = torch.tensor([[4, 5], [3, 0]])
coords = torch.nonzero(tensor_2d)
print(coords)  # the zero at position (1, 1) is not listed

tensor([[0, 0],
        [0, 1],
        [1, 0]])


In [28]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-ID sequences into padded input and target tensors.

    Each sequence gets one `pad_token_id` appended (the <|endoftext|> token)
    and is right-padded with `pad_token_id` to the longest sequence length + 1.
    Inputs are the padded row without its last element, targets the row without
    its first element. In the targets, every padding position after the
    appended end-of-text token is replaced by `ignore_index`.

    Padding positions are identified by position (everything after the
    appended token), not by value, so a content token that happens to equal
    `pad_token_id` is never mistaken for padding.

    Args:
        batch: Collection of token-ID sequences (lists or tuples); it is
            iterated twice. Must not be empty.
        pad_token_id: Token ID used for the appended token and for padding.
        ignore_index: Value written into the target padding positions.
        allowed_max_length: If not None, inputs and targets are truncated to
            at most this many tokens.
        device: Device both stacked tensors are moved to.

    Returns:
        Tuple `(inputs_tensor, targets_tensor)` with one row per sequence.

    Raises:
        ValueError: If `batch` is empty (raised by `max`).
    """
    # Find the longest sequence in the batch (+1 for the appended token)
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        # list() copies the sequence, so the caller's data is not modified,
        # and also accepts tuples
        new_item = list(item)
        # Add an <|endoftext|> token
        new_item.append(pad_token_id)
        # Pad sequences to batch_max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))

        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding token in targets by ignore_index.
        # The appended end-of-text token sits at target position len(item) - 1,
        # so everything from len(item) onwards is padding. For an empty
        # sequence the first target position is kept, matching the previous
        # value-based behaviour.
        first_ignored = max(len(item), 1)
        targets[first_ignored:] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [29]:
# Same toy batch as in the draft collate demonstrations
batch

([0, 1, 2, 3, 4], [5, 6], [7, 8, 9])

In [30]:
# In the targets, only the first padding token per row is kept; the rest become -100
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [31]:
# Cross-entropy loss over two examples with two classes each
logits_1 = torch.tensor(
    [[-1.0, 1.0],  # 1st training example
     [-0.5, 1.5]]  # 2nd training example
)
targets_1 = torch.tensor([0, 1])  # class index per example


loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [32]:
# Same two examples plus a third one; the printed loss differs from loss_1
logits_2 = torch.tensor(
    [[-1.0, 1.0],
     [-0.5, 1.5],
     [-0.5, 1.5]]  # New 3rd training example
)
targets_2 = torch.tensor([0, 1, 1])

loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)

tensor(0.7936)


In [33]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [37]:
from functools import partial

from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

# Bind the target device with functools.partial instead of a lambda.
# A lambda cannot be pickled, so the loader would fail as soon as
# num_workers > 0 is used with a start method that pickles the collate
# function (e.g. "spawn"); a partial of a module-level function can be.
# NOTE(review): with num_workers > 0 the collate function runs in worker
# processes; moving tensors to a CUDA device there is discouraged by the
# PyTorch documentation, so keep num_workers = 0 or collate on the CPU.
train_collate_fn = partial(custom_collate_fn, device=device)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=train_collate_fn,
    shuffle=True,     # reshuffle the training entries every epoch
    drop_last=True,   # drop a trailing batch smaller than batch_size
    num_workers=num_workers
)

In [39]:
# Print shape and device of the first batches; the sequence length varies per
# batch because each batch is padded to its own longest sequence.
# `inputs` and `targets` stay bound to the last batch iterated here and are
# printed by the following cells.
print("Train loader:")
for i, (inputs, targets) in enumerate(train_loader):
    print(f"inputs.shape: {inputs.shape}, device: {inputs.device}")
    print(f"targets.shape: {targets.shape}, device: {targets.device}")
    if i > 10:
        break  # Stop once the first 12 batches (i = 0..11) have been printed

Train loader:
inputs.shape: torch.Size([8, 61]), device: cuda:0
targets.shape: torch.Size([8, 61]), device: cuda:0
inputs.shape: torch.Size([8, 57]), device: cuda:0
targets.shape: torch.Size([8, 57]), device: cuda:0
inputs.shape: torch.Size([8, 61]), device: cuda:0
targets.shape: torch.Size([8, 61]), device: cuda:0
inputs.shape: torch.Size([8, 68]), device: cuda:0
targets.shape: torch.Size([8, 68]), device: cuda:0
inputs.shape: torch.Size([8, 65]), device: cuda:0
targets.shape: torch.Size([8, 65]), device: cuda:0
inputs.shape: torch.Size([8, 63]), device: cuda:0
targets.shape: torch.Size([8, 63]), device: cuda:0
inputs.shape: torch.Size([8, 68]), device: cuda:0
targets.shape: torch.Size([8, 68]), device: cuda:0
inputs.shape: torch.Size([8, 69]), device: cuda:0
targets.shape: torch.Size([8, 69]), device: cuda:0
inputs.shape: torch.Size([8, 80]), device: cuda:0
targets.shape: torch.Size([8, 80]), device: cuda:0
inputs.shape: torch.Size([8, 63]), device: cuda:0
targets.shape: torch.Size([

In [41]:
# First input row of the last batch from the loop above; padded with 50256 at the end
print(inputs[0])

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198,  9487,  1958,   262,  1708,  3709,   355,
         2035,  4735,    11,  8122,    11,   393,  3623,    13,   198,   198,
        21017, 23412,    25,   198, 23709,    11, 19443,    11,  5053,  1505,
          198,   198, 21017, 18261,    25,   198, 23709,   532, 15831,   198,
           34,  2364,  1453,   532, 21020,   198, 12621,  1505,   532, 14345,
        50256, 50256, 50256], device='cuda:0')


In [42]:
# Matching target row: shifted by one token, padding after the first 50256 replaced by -100
print(targets[0])

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198,  9487,  1958,   262,  1708,  3709,   355,  2035,
         4735,    11,  8122,    11,   393,  3623,    13,   198,   198, 21017,
        23412,    25,   198, 23709,    11, 19443,    11,  5053,  1505,   198,
          198, 21017, 18261,    25,   198, 23709,   532, 15831,   198,    34,
         2364,  1453,   532, 21020,   198, 12621,  1505,   532, 14345, 50256,
         -100,  -100,  -100], device='cuda:0')


In [44]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel
# from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt


# Settings shared by all model sizes
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Size-specific settings, keyed by a display name that ends in "(<size>)"
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

# Merge the chosen size-specific settings into the shared configuration (in place)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Extract the size tag from the display name, e.g. "355M" from "gpt2-medium (355M)"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Build the model from the merged configuration and load the downloaded weights into it
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();  # trailing semicolon suppresses the cell output

2025-08-11 05:52:49.176028: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-11 05:52:49.748337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754916769.986984   59031 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754916770.043974   59031 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754916770.506231   59031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 