### 5.1.1 Using GPT to generate text

In [5]:
%load_ext autoreload
%autoreload 2

from generate_text import generate_text

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import torch
from gpt_model import GPTModel

In [7]:
# Hyperparameters passed to GPTModel for this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # size of the token embedding table
    "context_length": 256,    # size of the position embedding table
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

# Seed first so the randomly initialised weights are the same on every run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# Kept as the last expression so the notebook displays the module tree.
model.eval()

GPTModel(
  (token_embeddings): Embedding(50257, 768)
  (position_embeddings): Embedding(256, 768)
  (drop_embeddings): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (transformer): MultiHeadAttention(
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForwardNetwork(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNormalization()
      (norm2): LayerNormalization()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )


In [8]:

import tiktoken
def text_to_token(text, tokenizer):
    """Encode ``text`` and return the token ids as a ``(1, num_tokens)`` tensor.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.

    Returns:
        An integer (``torch.long``) tensor with a leading batch dimension of 1.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: for an empty encoding torch.tensor([]) would infer
    # float32, which cannot be used as embedding indices downstream.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return encoded_tensor

def token_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids, either with a leading batch dimension
            of size 1 (``(1, num_tokens)``) or already one-dimensional.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of ids.
    """
    if token_ids.dim() > 1:
        # Drop the batch dimension added by text_to_token / the model.
        flat = token_ids.squeeze(0)
    else:
        # Already flat (or a scalar). Squeezing a one-element 1-D tensor would
        # collapse it to 0-dim and make tolist() return a bare int.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Extend the prompt with the (still untrained) model; the window passed as
# context_size is the context length the model was configured with.
token_ids = generate_text(
    model=model,
    idx=text_to_token(start_context, tokenizer),
    max_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
print("Output text:\n", token_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you 960esame WindsorFE Keith awaitedSer GaelListMine


### 5.1.2 Calculating text generation loss


In [10]:
# Two example inputs of three token ids each:
#   row 0: "every effort moves"
#   row 1: "I really like"
inputs = torch.tensor(
    [
        [16833, 3626, 6100],
        [40, 1107, 588],
    ]
)

# The targets are the inputs shifted one position to the left, i.e. for every
# input token the model should predict the token that follows it:
#   row 0: " effort moves you"
#   row 1: " really like chocolate"
targets = torch.tensor(
    [
        [3626, 6100, 345],
        [1107, 588, 11311],
    ]
)

In [12]:
# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(inputs)

# Turn the logits into a probability distribution over the last axis.
probas = logits.softmax(dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [15]:
# For every position in each of the two examples, take the id of the token
# with the highest predicted probability.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token ids:\n", token_ids)
print(f"Targets batch 1: {token_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_to_text(token_ids[0].flatten(), tokenizer)}")

Token ids:
 tensor([[[13207],
         [  552],
         [42826]],

        [[18236],
         [34817],
         [ 7055]]])
Targets batch 1:  effort moves you
Outputs batch 1: hole compNetflix


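We now want to evaluate how good these predictions are. To do so, we look up the probability the model assigned to each correct target token.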

In [18]:
# Probability assigned to the correct next token at each of the three
# positions of the first example ...
text_idx = 0
target_probas1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Target: 1\n", target_probas1)

# ... and of the second example.
text_idx = 1
target_probas2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Target: 2:\n", target_probas2)

Target: 1
 tensor([5.0549e-05, 2.7952e-05, 8.2801e-06])
Target: 2:
 tensor([1.2945e-05, 3.2755e-05, 5.2184e-06])


Training should maximize the probabilities shown above, i.e. push them towards 1.
To work with them more conveniently, we concatenate the two tensors into one and take the logarithm.

In [19]:
# Join the target probabilities of both examples, then move to log space.
log_probas = torch.cat((target_probas1, target_probas2)).log()
print(log_probas)

tensor([ -9.8926, -10.4850, -11.7017, -11.2548, -10.3265, -12.1633])


In [20]:
# Average log-probability over all six target tokens.
avg_log_probas = log_probas.mean()
print(avg_log_probas)

tensor(-10.9706)


In [21]:
# Flip the sign so that "better" corresponds to a smaller, positive number.
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.9706)


In [23]:
# Inspect both shapes before computing the loss: the logits carry one more
# (trailing) dimension than the targets.
print("Logits shape: ", logits.shape)
print("Target shape: ", targets.shape)

Logits shape:  torch.Size([2, 3, 50257])
Target shape:  torch.Size([2, 3])


For the cross_entropy loss function in PyTorch, we want to flatten these tensors
by combining them over the batch dimension:

In [25]:
# Merge the first two dimensions of the logits and flatten the targets
# completely, so each row of logits lines up with exactly one target id.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)
print("Flattened logits shape: ", logits_flat.shape)
print("Flattened targets shape: ", targets_flat.shape)

Flattened logits shape:  torch.Size([6, 50257])
Flattened targets shape:  torch.Size([6])


In [26]:
# cross_entropy works directly on the raw logits and the integer target ids.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print("Loss: ", loss)

Loss:  tensor(10.9706)


### 5.1.3 Training and validation losses

In [30]:
file_path = "the-verdict.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Size of the corpus, measured in characters and in tokenizer tokens.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters: ", total_characters)
print("total_tokens: ", total_tokens)

Characters:  20398
total_tokens:  5064


In [34]:
# Character-level split: the first 90% of the text is used for training,
# the remainder for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

print(len(train_data))
print(len(val_data))
# Sanity check: the two parts add up to the full text length.
print(len(train_data) + len(val_data))

18358
2040
20398


With the text divided into a training and a validation portion, we now use the data
loaders from text-processing.ipynb to prepare the batches for LLM training.

In [36]:
from dataloader_v1 import create_dataloader_v1
# Seed PyTorch's RNG before building the loaders (the training loader is
# created with shuffle=True).
torch.manual_seed(123)
# Training loader: window length and stride are both set to the model's
# context length -- presumably this yields non-overlapping chunks; confirm
# against create_dataloader_v1.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
# Validation loader: same windowing, but created with shuffle=False and
# drop_last=False.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)
# Iterate once over each loader and show the shape of every (input, target) batch.
print("Training loader:\n")
for x,y in train_loader:
    print(x.shape, y.shape)
print("validation loader: \n")
for x, y in val_loader:
    print(x.shape, y.shape)


Training loader:

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
validation loader: 

torch.Size([2, 256]) torch.Size([2, 256])


Next, we implement a utility function to calculate the cross entropy loss of a given
batch returned via the training and validation loader:

In [38]:
def calc_loss_batch(input_batch, output_batch, device, model):
    """Return the cross-entropy loss of ``model`` on one batch.

    Args:
        input_batch: Tensor of input token ids fed to ``model``.
        output_batch: Tensor of target token ids, one per input position.
        device: Device both tensors are moved to before the forward pass.
        model: Callable mapping the input batch to logits whose first two
            dimensions can be merged to line up with the flattened targets.

    Returns:
        The scalar loss tensor produced by ``torch.nn.functional.cross_entropy``.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = output_batch.to(device)
    predictions = model(inputs_on_device)
    # Merge the first two dimensions so every row of logits pairs with one target id.
    flat_logits = torch.flatten(predictions, start_dim=0, end_dim=1)
    flat_targets = torch.flatten(targets_on_device)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

In [40]:
def calc_loss_loader(data_loader, model ,device, num_batches=None):
    """Average the per-batch loss over (a prefix of) a data loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, output_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` (default)
            evaluates every batch; larger values are capped at
            ``len(data_loader)``.

    Returns:
        The mean batch loss as a tensor, or ``float('nan')`` when there is
        nothing to average (empty loader or ``num_batches <= 0``).
    """
    if len(data_loader) == 0:
        return float('nan')
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    if num_batches <= 0:
        # Nothing to average; avoid dividing by zero (or by a negative count).
        return float('nan')

    total_loss = 0
    for i, (input_batch, output_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, output_batch, device, model)
        # Detach so the running sum does not keep every batch's autograd graph
        # alive when this is called outside a torch.no_grad() block.
        total_loss += loss.detach()
    return total_loss / num_batches

In [42]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Evaluation only: no gradients are needed to measure the starting losses.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: tensor(10.9995)
Validation loss: tensor(11.0198)
