In [1]:
from importlib.metadata import version

# Print the installed version of each package this notebook depends on.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",  # For OpenAI's pretrained weights
]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.10.3
numpy version: 2.0.2
tiktoken version: 0.9.0
torch version: 2.7.1
tensorflow version: 2.19.0


In [3]:
import torch
from previous_chapters import GPTModel
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel

# Configuration dictionary handed to GPTModel below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,    # Vocabulary size
    context_length=256,  # Shortened context length (orig: 1024)
    emb_dim=768,         # Embedding dimension
    n_heads=12,          # Number of attention heads
    n_layers=12,         # Number of layers
    drop_rate=0.1,       # Dropout rate
    qkv_bias=False,      # Query-key-value bias
)

# Seed PyTorch's RNG before the model is built.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [5]:
import tiktoken
from previous_chapters import generate_text_simple

# Alternatively:
# from llms_from_scratch.ch04 import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a single-sample batch of token IDs.

    Args:
        text: String to tokenize. The ``<|endoftext|>`` marker is permitted
            as a special token.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: for an empty ID list torch.tensor() would otherwise fall
    # back to float32, which is not a usable token-index type.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor holding one token-ID sequence back into text.

    Args:
        token_ids: Tensor with one sequence, either carrying a leading batch
            dimension of size 1 (``(1, num_tokens)``) or already 1-D.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        The string produced by ``tokenizer.decode``.
    """
    if token_ids.dim() > 1:
        flat = token_ids.squeeze(0)  # remove batch dimension
    else:
        # A 1-D tensor has no batch dimension to remove. Squeezing a
        # single-token 1-D tensor would collapse it to 0-D, and .tolist()
        # would then hand a bare int (not a list) to the tokenizer.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())


In [20]:
# Round-trip demo: text -> token IDs -> tensor -> Python list -> text,
# using a prompt that ends with the special end-of-text marker.
start_context = "Every effort moves you <|endoftext|>"
tokenizer = tiktoken.get_encoding("gpt2")

# The special marker is explicitly permitted through `allowed_special`.
# NOTE(review): a plain `tokenizer.encode(start_context)` is presumably
# rejected because the text contains a special token — confirm in the tiktoken docs.

encoded = tokenizer.encode(start_context, allowed_special={'<|endoftext|>'})
print(f"encoded:{encoded}")

# Decode one ID at a time to see which text piece each token stands for
print("Token ID -> Text mapping:")
for i, token_id in enumerate(encoded):
    token_text = tokenizer.decode([token_id])
    print(f"Token {i}: ID {token_id} -> '{token_text}'")

# 1-D tensor of the IDs (no batch dimension is added here)
encoded_tensor = torch.tensor(encoded)
print(f"encoded_tensor:{encoded_tensor}")

# Back to a plain list, then decode the whole sequence; the decoded
# string is the cell's displayed result
encoded_tensor_list = encoded_tensor.tolist()
print(f"encoded_tensor_list:{encoded_tensor_list}")
tokenizer.decode(encoded_tensor_list)

encoded:[6109, 3626, 6100, 345, 220, 50256]
Token ID -> Text mapping:
Token 0: ID 6109 -> 'Every'
Token 1: ID 3626 -> ' effort'
Token 2: ID 6100 -> ' moves'
Token 3: ID 345 -> ' you'
Token 4: ID 220 -> ' '
Token 5: ID 50256 -> '<|endoftext|>'
encoded_tensor:tensor([ 6109,  3626,  6100,   345,   220, 50256])
encoded_tensor_list:[6109, 3626, 6100, 345, 220, 50256]


'Every effort moves you <|endoftext|>'

In [21]:
# Prompt for generation (this time without the end-of-text marker)
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Ask for 10 additional tokens; the printed result holds the 4 prompt IDs
# followed by the 10 generated ones
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print(f"generated token_ids:{token_ids}")
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

generated token_ids:tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853,
          5308,  3398, 13174, 43071]])
Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [22]:
# Two toy input sequences: a batch of 2 samples with 3 token IDs each
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

# Targets are the inputs shifted one position to the left, i.e. the token
# that should follow each input position
targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [24]:
# Inference-only forward pass, so gradient tracking is switched off
with torch.no_grad():
    logits = model(inputs)

print(f"logits.shape:{logits.shape}")  # (2, 3, 50257)

# Normalise along the last axis to turn the scores into probabilities
probas = logits.softmax(dim=-1)
print(f"probas.shape:{probas.shape}")  # (2, 3, 50257)

logits.shape:torch.Size([2, 3, 50257])
probas.shape:torch.Size([2, 3, 50257])


In [27]:
# Most likely token ID at every position; keepdim=True keeps the reduced
# last axis as a trailing dimension of size 1
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print(f"Predicted token IDs shape: {token_ids.shape}")  # (2, 3, 1)

print(f"Predicted token IDs: {token_ids}")

Predicted token IDs shape: torch.Size([2, 3, 1])
Predicted token IDs: tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [28]:
# Show the expected next-token IDs for comparison with the predicted ones
targets

tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])

In [31]:
# Predictions for the first sample, flattened from (3, 1) to a 1-D tensor
token_ids[0].flatten()

tensor([16657,   339, 42826])

In [32]:
# Decode the first sample's targets and predictions side by side; the
# predictions are flattened to 1-D first because token_ids[0] is (3, 1)
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [33]:
# For each sample, look up the probability assigned to the target token at
# positions 0, 1 and 2: position i is paired with targets[text_idx][i]
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

# Same lookup for the second sample
text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [34]:
# Build broadcastable index grids so that a single indexing expression
# selects the target-token probability at every (sample, position) pair
batch_size, seq_len = targets.shape
batch_indices = torch.arange(batch_size)[:, None]  # Shape: (2, 1)
seq_indices = torch.arange(seq_len)[None, :]       # Shape: (1, 3)

# Element [b, t] of the result is probas[b, t, targets[b, t]]
target_probas = probas[batch_indices, seq_indices, targets]
print("Target probabilities shape:", target_probas.shape)  # (2, 3)
print("Target probabilities:")
print(target_probas)

Target probabilities shape: torch.Size([2, 3])
Target probabilities:
tensor([[7.4540e-05, 3.1061e-05, 1.1563e-05],
        [1.0337e-05, 5.6776e-05, 4.7559e-06]])


In [40]:
# Same lookup via gather: along the last axis, pick the entry indexed by each
# target ID, then drop the size-1 axis that gather leaves behind
target_probas_gather = probas.gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)

In [42]:
# Display the gathered target probabilities, shape (2, 3)
target_probas_gather

tensor([[7.4540e-05, 3.1061e-05, 1.1563e-05],
        [1.0337e-05, 5.6776e-05, 4.7559e-06]])

In [43]:
# Join the per-sample target probabilities into one 1-D tensor of six values
torch.cat((target_probas_1, target_probas_2))

tensor([7.4540e-05, 3.1061e-05, 1.1563e-05, 1.0337e-05, 5.6776e-05, 4.7559e-06])

In [44]:
# Natural log of the six target-token probabilities (both samples concatenated)
log_probas = torch.cat((target_probas_1, target_probas_2)).log()
print(log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


In [None]:
# Calculate the average log probability over all target tokens
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-10.7940)


In [46]:
# Flip the sign to get the negative average log probability
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.7940)


In [47]:
# Logits have shape (batch_size, num_tokens, vocab_size); here (2, 3, 50257)
print("Logits shape:", logits.shape)

# Targets have shape (batch_size, num_tokens); here (2, 3)
print("Targets shape:", targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [49]:
# Merge the batch and token axes: one row of logits and one target ID
# per predicted position
logits_flat = logits.reshape(-1, logits.shape[-1])
targets_flat = targets.reshape(-1)

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [50]:
# Cross-entropy over the flattened logits and targets; the printed value
# equals the negative average log probability computed by hand earlier
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(10.7940)


In [52]:
# Perplexity is the exponential of the cross-entropy loss
perplexity = loss.exp()
print(perplexity)


tensor(48725.8203)


In [58]:

# Recompute the mean target log probability directly from the flattened
# tensors; its negation is the cross-entropy loss printed earlier
probas_flat = logits_flat.softmax(dim=-1)
target_probas_flat_gather = probas_flat.gather(dim=-1, index=targets_flat.unsqueeze(-1))

target_probas_flat_gather.log().mean()

tensor(-10.7940)