In [1]:
from importlib.metadata import version

pkgs = ["matplotlib", 
        "numpy", 
        "tiktoken", 
        "torch",
        "tensorflow" # For OpenAI's pretrained weights
       ]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.10.3
numpy version: 2.1.3
tiktoken version: 0.9.0
torch version: 2.7.0
tensorflow version: 2.19.0


In [2]:
import torch
from gpt import GPTModel
GPT_CONFIG_124M={
    "vocab_size":50257,
    "context_length":256,
    "emb_dim":468,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
}

In [3]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 468)
  (pos_emb): Embedding(256, 468)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=468, out_features=468, bias=False)
        (W_key): Linear(in_features=468, out_features=468, bias=False)
        (W_value): Linear(in_features=468, out_features=468, bias=False)
        (out_proj): Linear(in_features=468, out_features=468, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=468, out_features=1872, bias=True)
          (1): GELU()
          (2): Linear(in_features=1872, out_features=468, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [8]:
import tiktoken
from gpt import generate_text_simple
def text_to_token_ids(text,tokenizer):
    encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor=torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor
def token_ids_to_text(token_ids,tokenizer):
    flat=token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

In [9]:
start_context="Every effort moves you"
tokenizer=tiktoken.get_encoding("gpt2")
token_ids=generate_text_simple(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=10,context_size=GPT_CONFIG_124M['context_length'])

In [10]:
print("output text:\n", token_ids_to_text(token_ids,tokenizer))

output text:
 Every effort moves you behaves Nolan contact contributingchair guessesimil YellowArthur university


# Calculating the text generation loss: cross-entropy and perplexity

In [11]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) # "really like chocolate"]

In [12]:
with torch.no_grad():
    logits=model(inputs)
probas=torch.softmax(logits,dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [13]:
token_ids=torch.argmax(probas,dim=-1,keepdim=True)
print(token_ids)

tensor([[[37259],
         [28694],
         [33906]],

        [[36750],
         [43005],
         [13144]]])


In [14]:
print(f"Targets batch 1: {token_ids_to_text(targets[0],tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(),tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  rainy009Va


In [16]:
text_idx=0
target_probas_1=probas[text_idx,[0,1,2],targets[text_idx]]
print("Text 1:",target_probas_1)
text_idx=1
target_probas_2=probas[text_idx,[0,1,2],targets[text_idx]]
print("Text 2:",target_probas_2)

Text 1: tensor([4.9556e-06, 8.7060e-06, 2.5850e-05])
Text 2: tensor([9.7422e-06, 3.1885e-05, 3.1743e-05])


compute logarithm of all token probabilities

In [22]:
log_probas=torch.log(torch.cat((target_probas_1,target_probas_2)))
print(log_probas)

tensor([-12.2150, -11.6515, -10.5632, -11.5390, -10.3534, -10.3578])


calculate the average probabiity for each token

In [23]:
avg_log_probas=torch.mean(log_probas)
print(avg_log_probas)

tensor(-11.1133)


In [24]:
neg_avg_log_probas=avg_log_probas*-1
print(neg_avg_log_probas)

tensor(11.1133)


In [25]:
print("logitsa shape",logits.shape)
print("targets shape",targets.shape)

logitsa shape torch.Size([2, 3, 50257])
targets shape torch.Size([2, 3])


In [26]:
logits_flat=logits.flatten(0,1)
targets_flat=targets.flatten()
print("Flattened logits:",logits_flat.shape)
print("Flattened targets:",targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [27]:
#defining the loss functions
loss=torch.nn.functional.cross_entropy(logits_flat,targets_flat)
print(loss)

tensor(11.1133)


In [29]:
#perplexity of the model
perplexity=torch.exp(loss)
print(perplexity)

tensor(67058.9297)


# Calculating the training and validation set losses

In [30]:
import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [31]:
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [32]:
print(text_data[-99:])

it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."


In [33]:
total_characters=len(text_data)
total_tokens=len(tokenizer.encode(text_data))
print("Characters:",total_characters)
print("tokens:",total_tokens)

Characters: 20479
tokens: 5145


In [34]:
from dataloader import create_dataloader_v1

In [35]:
train_ratio=0.90
split_idx=int(train_ratio*len(text_data))
train_data=text_data[:split_idx]
val_data=text_data[split_idx:]

In [40]:
torch.manual_seed(123)
train_loader=create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
val_loader=create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

In [41]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [42]:
print("Train loader:")
for x,y in train_loader:
    print(x.shape,y.shape)
print("\nValidation loader:")
for x,y in val_loader:
    print(x.shape,y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [43]:
train_tokens=0
for input_batch,target_batch in train_loader:
    train_tokens+=input_batch.numel()
val_tokens=0
for input_batch,target_batch in val_loader:
    val_tokens+=input_batch.numel()
print("Training tokens:",train_tokens)
print("Validation tokens:",val_tokens)
print("All tokens:",train_tokens+val_tokens)

Training tokens: 4608
Validation tokens: 512
All tokens: 5120


In [47]:
def calc_loss_batch(input_batch,target_batch,model,device):
    input_batch,target_batch=input_batch.to(device),target_batch.to(device)
    logits=model(input_batch)
    loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
    return loss
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [48]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [49]:
model.to(device)
torch.manual_seed(123)
with torch.no_grad():
    train_loss=calc_loss_loader(train_loader,model,device)
    val_loss=calc_loss_loader(val_loader,model,device)
print("Training loss:",train_loss)
print("Validation loss:",val_loss)

Training loss: 11.001415040757921
Validation loss: 10.9550199508667
