<a href="https://colab.research.google.com/github/aashu-0/llm-from-scratch/blob/main/llm_book_notes/05pretraining-gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the helper scripts from GitHub and switch into the notes directory,
# so that later cells can import the local modules and open files by relative path.
!git clone https://github.com/aashu-0/llm-from-scratch.git
%cd llm-from-scratch/llm_book_notes

Cloning into 'llm-from-scratch'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 60 (delta 27), reused 45 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 74.76 KiB | 602.00 KiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/llm-from-scratch/llm_book_notes


In [2]:
import sys
sys.path.append('/content/llm-from-scratch/llm_book_notes')

#### **Pretraining on unlabeled data**

To access weight of any layer : `layer_name.weight`

To access all model trainable parameters: `model.parameters()`

In [6]:
# 1. Text Generation
# Build the GPT model from a config dict and put it in evaluation mode.

import torch
from GPT import GPTModel

# Hyper-parameters handed to GPTModel. The printed module tree below confirms how
# several of them are used: Embedding(50257, 768) for tokens, Embedding(256, 768)
# for positions, Dropout(p=0.1), and bias=False on the W_query/W_key/W_value layers.
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 256,
    'emb_dim': 768,
    'n_heads':12,       # presumably the number of attention heads per block - confirm in GPT.py
    'n_layers': 12,     # presumably the number of TransformerBlock modules - confirm in GPT.py
    'drop_rate': 0.1,
    'qkv_bias': False
}

# Fixed seed so the random weight initialisation is reproducible.
torch.manual_seed(123)
model = GPTModel(config=GPT_CONFIG_124M)
# Evaluation mode; the cell's last expression also displays the module tree.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_featur

In [8]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m0.9/1.2 MB[0m [31m28.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [10]:
import tiktoken
from GPT import generate_text_simple

# helper functions: text -> token ids (text_to_token_ids) and token ids -> text (token_ids_to_text)
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a tensor with a leading batch axis.

    The ``<|endoftext|>`` marker is passed as an allowed special token.
    The result has shape ``[1, n_tokens]``.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # unsqueeze(0) adds the batch dimension.
    return torch.tensor(ids).unsqueeze(0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    A leading batch axis of size 1 is removed before decoding; a tensor
    without such an axis is decoded as-is.
    """
    flat_ids = token_ids.squeeze(0)
    return tokenizer.decode(flat_ids.tolist())

In [12]:
# Generate 10 new tokens from the (still untrained) model to check the
# text -> ids -> model -> ids -> text round trip end to end.
start_context = 'Every Effort Moves you'
tokenizer = tiktoken.get_encoding('gpt2')

# Prompt as a batch of one sequence of token ids.
input_ids = text_to_token_ids(start_context, tokenizer)

token_ids = generate_text_simple(
    model = model,
    idx = input_ids,
    max_new_tokens = 10,
    context_size= GPT_CONFIG_124M['context_length']
)

# Prompt plus continuation, decoded back to text.
output = token_ids_to_text(token_ids, tokenizer)

print(f'Input Text: {start_context}')
print(f'Output Text: {output}')

Input Text: Every Effort Moves you
Output Text: Every Effort Moves you finisheduxeHandle appropriation pigment cotton feellike poll liberate


In [13]:
# Token ids of two example sentences; the cell under "1. Text Generation" below
# uses the first three ids of each as inputs and the last three as shifted targets.
print(tokenizer.encode('every effort moves you'))
print(tokenizer.encode('I really like chocolate'))

[16833, 3626, 6100, 345]
[40, 1107, 588, 11311]


#### 1. Text Generation

In [15]:
# inputs and targets (shifting concept)
# Each row of `targets` is the matching row of `inputs` shifted left by one token,
# i.e. the token the model should predict at every position.
inputs = torch.tensor([[16833, 3626, 6100],
                       [40, 1107, 588]])

targets = torch.tensor([[3626, 6100, 345],
                        [1107, 588, 11311]])

In [16]:
# calculating probability scores
# no_grad: this is inference only, so no autograd graph is needed.
with torch.no_grad():
    logits = model(inputs)

# Softmax over the last axis turns each position's logits into a distribution over the vocabulary.
probas = torch.softmax(logits, dim=-1)
print(f'Logits: {logits}\n')
print(f'Probas Shape: {probas.shape}\n')   # [batch_size, n_tokens, vocab_size]
print(f'Probas: {probas}')

Logits: tensor([[[ 0.1113, -0.1057, -0.3666,  ...,  0.2843, -0.8824,  0.1074],
         [-0.6109, -0.5167, -0.7613,  ...,  0.5450, -1.0319, -0.2175],
         [ 0.5707, -0.6459, -0.0701,  ...,  0.7419, -0.1806, -0.2217]],

        [[-0.2968,  0.1949, -0.1649,  ..., -0.4867,  0.7218, -0.1714],
         [-0.8375,  0.0612, -0.4641,  ...,  0.2327, -0.3889, -0.0770],
         [ 0.5614,  0.6919,  0.8915,  ..., -0.9472,  1.2411, -0.2056]]])

Probas Shape: torch.Size([2, 3, 50257])

Probas: tensor([[[1.8849e-05, 1.5172e-05, 1.1687e-05,  ..., 2.2409e-05,
          6.9776e-06, 1.8776e-05],
         [9.1569e-06, 1.0062e-05, 7.8786e-06,  ..., 2.9090e-05,
          6.0103e-06, 1.3571e-05],
         [2.9877e-05, 8.8507e-06, 1.5741e-05,  ..., 3.5456e-05,
          1.4094e-05, 1.3526e-05]],

        [[1.2561e-05, 2.0538e-05, 1.4332e-05,  ..., 1.0389e-05,
          3.4784e-05, 1.4239e-05],
         [7.2731e-06, 1.7864e-05, 1.0565e-05,  ..., 2.1206e-05,
          1.1390e-05, 1.5559e-05],
         [2.949

In [17]:
# Greedy pick: index of the highest-probability vocabulary entry at every position.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(f'Token IDs: {token_ids}')
print(f'Token IDs Shape: {token_ids.shape}')

Token IDs: tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
Token IDs Shape: torch.Size([2, 3, 1])


In [18]:
# token ids to output text
# Compare what the untrained model predicted for the first sequence with the
# tokens it should have predicted.

target_batch_1 = token_ids_to_text(targets[0], tokenizer)
# flatten() drops the trailing size-1 axis kept by argmax(keepdim=True).
output_batch_1 = token_ids_to_text(token_ids[0].flatten(), tokenizer)

print(f'Targets Batch 1 : {target_batch_1}')
print(f'Output Batch 1: {output_batch_1}')

Targets Batch 1 :  effort moves you
Output Batch 1:  Armed heNetflix


#### 2. Text Evaluation

1. `Logits`

2. `Probabilities`

3. `Target Probabilities`

4. `Log Probabilities`

5. `Average Log Probability`

6. `Negative avg log probability`

In [19]:
# Target token ids of the two sequences, shown for reference.
targets[0], targets[1]

(tensor([3626, 6100,  345]), tensor([ 1107,   588, 11311]))

In [20]:
# getting the probability scores corresponding to target tokens
# Target Probabilities
# probas[b, [0,1,2], targets[b]] pairs position i with target id i, so it picks
# one probability per position: the one assigned to the correct next token.

text_idx = 0
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
print(f'Text 1 probability: {target_probas_1}')

text_idx = 1
target_probas_2 = probas[text_idx, [0,1,2], targets[text_idx]]
print(f'Text 2 probability: {target_probas_2}')

Text 1 probability: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2 probability: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [21]:
# Log of every target-token probability (both texts concatenated into one vector).
log_probas = torch.cat((target_probas_1, target_probas_2)).log()
log_probas

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])

Why take the log of the probability scores?
1. stronger penalization for incorrect predictions.
    - `prob = 0.9 => -log(0.9) = 0.10`
    - `prob = 0.01 => -log(0.01) = 4.6`
2. makes the loss function convex
3. prevents numerical underflow (a product of many small probabilities becomes a sum of logs)
4. handles zero prob, avoiding undefined gradients

In [22]:
# Mean log-probability over all target tokens.
avg_log_probas = log_probas.mean()
avg_log_probas

tensor(-10.7940)

In [23]:
# Flip the sign: the negative average log-probability is the quantity to minimise.
neg_avg_log_prob = -avg_log_probas
neg_avg_log_prob

tensor(10.7940)

`Cross Entropy Loss`: negative log-likelihood

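takes care of steps `2` to `6` (it takes the raw logits and the targets and returns the negative average log probability directly)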

In [25]:
# Shapes before flattening: logits have a trailing vocabulary axis, targets do not.
print(f'Logits Shape: {logits.shape}')
print(f'Targets Shape: {targets.shape}')

Logits Shape: torch.Size([2, 3, 50257])
Targets Shape: torch.Size([2, 3])


Why flatten?

because `cross_entropy()` expects `targets` to be 1D`[N]` and `logits` as 2D tensor`[N,C]`

where
* `N` = number of samples (batch_size * seq_length)

* `C` = num of classes (vocab_size)

In [26]:
# Merge the batch and sequence axes so that every token position becomes one row.
logits_flat = logits.flatten(start_dim=0, end_dim=1)
targets_flat = targets.reshape(-1)

print(f'Flattened Logits Shape: {logits_flat.shape}')
print(f'Targets Logits Shape: {targets_flat.shape}')

Flattened Logits Shape: torch.Size([6, 50257])
Targets Logits Shape: torch.Size([6])


In [27]:
import torch.nn.functional as F
# cross_entropy takes the raw logits, not probabilities; the printed value matches
# the manually computed negative average log probability above (10.7940).
loss = F.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(10.7940)


**Perplexity**
- measures how well the prob distribution predicted by the model matches the actual distribution of words in the dataset
- it quantifies uncertainty or randomness in predictions
- lower = better model, higher = worse model
- `perplexity = torch.exp(loss)`
- more interpretable than raw_loss

#### Training and Validation losses

In [28]:
# fetching the raw text from the 'the-verdict.txt' file (path is relative to the current directory)

file_path = 'the-verdict.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Peek at the first 50 characters to confirm the file was read.
text_data[:50]

'I HAD always thought Jack Gisburn rather a cheap g'

In [29]:
# Size of the corpus in characters and in GPT-2 tokens.
total_chars = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print(f'Total number of characters: {total_chars}')
print(f'Total number of tokens: {total_tokens}')

Total number of characters: 20479
Total number of tokens: 5145


In [32]:
# 90/10 train/validation split.
# NOTE: the split index is computed on characters (len(text_data)), not on tokens.
train_split = 0.9
split_idx = int(train_split* len(text_data))

train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# (uncomment to inspect) train_data, val_data

In [33]:
# training and validation dataloader
# Both loaders use windows of `context_length` tokens with stride == max_length
# (presumably non-overlapping windows - confirm in CustomDatasetDataloader.py).

from CustomDatasetDataloader import create_dataloders_v1
# Seed so the shuffled training order is reproducible.
torch.manual_seed(123)

# Training loader: shuffled, incomplete final batch dropped.
train_dataloader = create_dataloders_v1(
    txt=train_data,
    batch_size= 2,
    max_length= GPT_CONFIG_124M['context_length'],
    stride = GPT_CONFIG_124M['context_length'],
    drop_last= True,
    shuffle = True,
    num_workers = 0)

# Validation loader: fixed order, keeps every batch.
val_dataloader = create_dataloders_v1(
    txt=val_data,
    batch_size= 2,
    max_length= GPT_CONFIG_124M['context_length'],
    stride = GPT_CONFIG_124M['context_length'],
    drop_last= False,
    shuffle = False,
    num_workers = 0)

In [37]:
# Sanity check: print the (input, target) shapes of every training batch.
for x, y in train_dataloader:
  print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [38]:
# Same sanity check for the validation batches.
for x,y in val_dataloader:
  print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])


There are 9 training set batches with shape `[2, 256]` (i.e., two samples of 256 tokens each) and 1 validation batch with shape `[2, 256]`.


In [39]:
# cross entropy loss for a given batch
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` before the forward pass. The first two
    axes of the logits and all axes of the targets are flattened before being
    handed to ``F.cross_entropy``.
    """
    inputs_on_device, targets_on_device = input_batch.to(device), target_batch.to(device)
    batch_logits = model(inputs_on_device)
    return F.cross_entropy(batch_logits.flatten(0, 1), targets_on_device.flatten())

In [41]:
# computing loss over all the batches
# using num_batches so that we can specify lesser number of batches

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the average per-batch loss over (a prefix of) ``data_loader``.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: model passed through to ``calc_loss_batch``.
        device: device passed through to ``calc_loss_batch``.
        num_batches: optional cap on the number of batches to evaluate;
            ``None`` evaluates every batch. Values larger than the loader
            are clipped to ``len(data_loader)``.

    Returns:
        The mean batch loss as a Python float, or ``nan`` when there is nothing
        to evaluate (empty loader, or ``num_batches`` <= 0).
    """
    if len(data_loader) == 0:
        return float('nan')

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # A zero/negative cap would otherwise divide by zero (or by a negative count).
    if num_batches <= 0:
        return float('nan')

    total_loss = 0.0
    # zip with range() stops after num_batches without pulling an extra batch from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches  # avg loss


In [42]:
# Use the GPU when available and move the model there before evaluating.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Baseline losses of the untrained model; no_grad because nothing is optimised here.
with torch.no_grad():
    train_loss = calc_loss_loader(train_dataloader, model, device)
    val_loss = calc_loss_loader(val_dataloader, model, device)

print(f'Training loss: {train_loss}')
# NOTE: the label says "Testing", but this value is computed on val_dataloader (validation loss).
print(f'Testing loss: {val_loss}')

Training loss: 10.98758316040039
Testing loss: 10.981104850769043


the loss values are high as we haven't trained our model yet.

#### Training function

In [43]:
# function for pretraining llm

def train_model_simple(model,train_dataloader, val_dataloader, optimizer,
                       device, num_epochs, eval_freq, eval_iter, start_context,
                       tokenizer):
    """Train ``model`` for ``num_epochs`` epochs, evaluating periodically.

    Args:
        model: the model to train (moved between train/eval mode by the helpers).
        train_dataloader: iterable of ``(input_batch, target_batch)`` training batches.
        val_dataloader: validation batches, used only for evaluation.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device on which batches are processed.
        num_epochs: number of passes over ``train_dataloader``.
        eval_freq: evaluate every ``eval_freq`` optimisation steps (step 0 included).
        eval_iter: maximum number of batches per loader used for each evaluation.
        start_context: prompt used for the sample printed after every epoch.
        tokenizer: tokenizer used to encode/decode the sample text.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one
        entry per evaluation step.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    # global_step starts at -1 so that the first update is step 0 and triggers an evaluation.
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_dataloader:
            optimizer.zero_grad()  # clear gradients left over from the previous step
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            # evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_dataloader,
                                                      val_dataloader, device,
                                                      eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)

                track_tokens_seen.append(tokens_seen)
                # '06d' zero-pads the step; a precision ('.06d') is not allowed for integers.
                print(f'Epoch: {epoch+1} | Step: {global_step:06d}')
                print(f'Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')

        # prints a sample text after each epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


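`evaluate_model` -> computes and returns the train and val losses (over at most `eval_iter` batches each, with gradients disabled); the training loop calls it every `eval_freq` steps and prints the result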

In [44]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over at most ``eval_iter`` batches each.

    The model is put in eval mode for the measurement, gradients are disabled,
    and the model is switched back to train mode before returning.
    """
    model.eval()
    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

`generate_and_print_sample`: takes a text snippet as input, converts into token_ids -> feeds it to the llm -> generate a text sample using `generate_text_simple`

In [None]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens from ``start_context`` and print them on one line.

    Args:
        model: model exposing ``pos_emb`` (its first dimension is used as the context size).
        tokenizer: tokenizer used to encode the prompt and decode the output.
        device: device the encoded prompt is moved to before generation.
        start_context: prompt text.

    The model is switched to eval mode for generation and back to train mode afterwards.
    """
    model.eval()
    # Number of positions in the positional embedding table = supported context size.
    context_size = model.pos_emb.weight.shape[0]
    # Move the encoded prompt tensor to the device (the tokenizer itself has no .to()).
    encoded = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        token_ids = generate_text_simple(model, idx=encoded, max_new_tokens=50,
                                         context_size=context_size)

    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace('\n', ' '))  # keep the sample on a single line
    model.train()

Training a `GPTModel` for 10 epochs.

optimizer: `AdamW`

lr = `0.0004`

weight_decay = `0.1`

In [None]:
# Start from freshly initialised weights so the run is reproducible.
torch.manual_seed(123)
model = GPTModel(config=GPT_CONFIG_124M)

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay = 0.1)

num_epochs = 10
# The loaders were created above as train_dataloader / val_dataloader; the third
# result is bound to `tokens_seen`, the name the plotting cell reads.
train_losses, val_losses, tokens_seen = train_model_simple(model,
                                                           train_dataloader,
                                                           val_dataloader,
                                                           optimizer,
                                                           device,
                                                           num_epochs,
                                                           eval_freq=5,
                                                           eval_iter=5,
                                                           start_context="Every effort moves you",
                                                           tokenizer= tokenizer)

Plotting Training and Validation losses

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# MaxNLocator -> tries to select a reasonable number of ticks at integer values instead of decimal points


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss against epochs, with a second x-axis in tokens seen.

    Args:
        epochs_seen: x positions (in epochs) of the recorded losses.
        tokens_seen: x positions (in tokens) of the same recordings.
        train_losses: recorded training losses.
        val_losses: recorded validation losses.
    """
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Loss curves against epochs on the primary x-axis.
    ax1.plot(epochs_seen, train_losses, label='Training Loss')
    ax1.plot(epochs_seen, val_losses, linestyle='-.', label='Validation Loss')
    ax1.set(xlabel='Epochs', ylabel='Loss')
    ax1.legend(loc='upper right')
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer epoch ticks

    # Twin x-axis sharing the y-axis; the invisible plot only serves to
    # scale and label that axis in tokens.
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel('Tokens seen')

    fig.tight_layout()
    plt.show()



# x positions in epochs: evenly spaced from 0 to num_epochs, one per recorded loss value.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

#### Decoding Strategies
ways to introduce randomness in the output.

1. *temperature scaling*

2. *top-k sampling*

In [None]:
# print output text using our pretrained model

# Generate on the CPU, in evaluation mode.
model.to('cpu')
model.eval()

tokenizer = tiktoken.get_encoding('gpt2')
# 25 new tokens continuing the prompt.
token_ids = generate_text_simple(model= model,
                                 idx= text_to_token_ids('Every effort moves you', tokenizer),
                                 max_new_tokens= 25,
                                 context_size= GPT_CONFIG_124M['context_length'])


print(f'Output text: {token_ids_to_text(token_ids, tokenizer)}')

currently our pretrained llm will generate same outputs even if we run the above cell multiple times...keeping the `start_context` same.

in `generate_text_simple` we select the token with the maximum probability as the next token... this is known as greedy decoding.

now, we will use probabilistic sampling for next token selection using `torch.multinomial`

In [None]:
# illustration
# Toy 9-word vocabulary used to contrast greedy decoding with sampling.

import torch

vocab ={"closer": 0,
        "every": 1,
        "effort": 2,
        "forward": 3,
        "inches": 4,
        "moves": 5,
        "pizza": 6,
        "toward": 7,
        "you": 8}

# Reverse mapping: token id -> word.
inv_vocab = {v:k for k, v in vocab.items()}


# let's say the model returns the following next-token logits given start_context = 'Every effort moves you'
next_token_logits = torch.tensor([4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79])

probas = torch.softmax(next_token_logits, dim=0)
# Greedy decoding: always take the most probable token.
next_token_id = torch.argmax(probas).item()
inv_vocab[next_token_id], probas

('forward',
 tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
         1.0120e-04, 3.5758e-01, 4.0122e-03]))

In [None]:
# let's do the same using probabilistic sampling
# multinomial draws a token id at random, weighted by `probas`; the seed makes the draw repeatable.

torch.manual_seed(123)
next_token_id = torch.multinomial(probas, num_samples=1).item()
inv_vocab[next_token_id]

'toward'

In [None]:
def print_sampled_tokens(probas, num_samples=1_000, id_to_token=None):
    """Sample token ids from ``probas`` and print how often each token was drawn.

    Args:
        probas: 1-D tensor of token probabilities.
        num_samples: number of independent draws (default 1000).
        id_to_token: mapping from token id to its text; defaults to the
            notebook-level ``inv_vocab``.

    One line is printed per entry of ``probas``, including tokens that were never sampled.
    """
    if id_to_token is None:
        id_to_token = inv_vocab

    torch.manual_seed(123)  # fixed seed so the printed counts are repeatable
    sample = [torch.multinomial(probas, num_samples=1).item() for _ in range(num_samples)]

    # minlength keeps one bin per token even if the highest ids were never drawn.
    sampled_ids = torch.bincount(torch.tensor(sample), minlength=len(probas))

    for i, freq in enumerate(sampled_ids):
        print(f'{freq} X {id_to_token[i]}')

# Empirical token frequencies when sampling from the toy distribution.
print_sampled_tokens(probas)

71 X closer
2 X every
0 X effort
544 X forward
2 X inches
1 X moves
0 X pizza
376 X toward
4 X you


`temperature scaling` -> logits divided by some number > 0

In [None]:
def softmax_with_temp(logits, temp):
    """Return the softmax (over axis 0) of ``logits`` divided by ``temp``."""
    return torch.softmax(logits / temp, dim=0)

In [None]:
# import matplotlib.pyplot as plt

# Temperatures to compare: 1 (unchanged), 0.1 (sharper), 5 (flatter).
# A temperature of 0 is not valid here: it would divide the logits by zero.
temps = [1, 0.1, 5]
scaled_probas = [softmax_with_temp(next_token_logits, t) for t in temps]

x = torch.arange(len(vocab))
bar_width = 0.15

fig, ax = plt.subplots(figsize= (5,3))
# One group of bars per token, one bar per temperature (offset by bar_width).
for i, t in enumerate(temps):
    rects = ax.bar(x+ i*bar_width, height=scaled_probas[i], width= bar_width,
                   label= f'Temperature= {t}')

ax.set_ylabel('Probability')
ax.set_xticks(x)
ax.set_xticklabels(vocab.keys(), rotation=90)
ax.legend()
plt.tight_layout()
plt.show()