# Pretraining on unlabeled data

In this notebook we will look at the following

![test](./ThisChapter.png)

We will copy over the ``GPTModel`` (and all of its dependencies) that we coded previously into this notebook.



In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention followed by a linear output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width; split evenly across the heads.
        context_length: Side length of the square causal mask, i.e. the
            longest sequence the mask can cover.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads; must divide ``d_out``.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        # Linear layer that mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (b, num_tokens, d_out) into (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return context vectors of shape (b, num_tokens, d_out) for ``x`` of shape (b, num_tokens, d_in)."""
        batch, seq_len, _ = x.shape

        q = self._split_heads(self.W_query(x), batch, seq_len)
        k = self._split_heads(self.W_key(x), batch, seq_len)
        v = self._split_heads(self.W_value(x), batch, seq_len)

        # Dot product of every query with every key: (b, num_heads, num_tokens, num_tokens)
        scores = q @ k.transpose(-2, -1)

        # The 2D mask broadcasts over the batch and head dimensions.
        hide_future = self.mask.bool()[:seq_len, :seq_len]
        scores = scores.masked_fill(hide_future, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax, then regularise with dropout.
        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        per_head_context = (weights @ v).transpose(1, 2)
        # Merge the heads back together: d_out = num_heads * head_dim
        merged = per_head_context.contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2 / torch.pi))
        inner = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(sqrt_two_over_pi * inner))

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last (normalised) dimension.
        eps: Small constant added to the variance to avoid division by zero.
    """

    def __init__(self, emb_dim, eps = 1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` to zero mean / unit variance along the last axis, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N-1, i.e. no Bessel's correction.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.shift + self.scale * normalised


class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand, apply GELU, project back.

    Args:
        cfg: Configuration mapping; only ``cfg["emb_dim"]`` is read.
        hidden_layer_dim_factor: Multiplier for the width of the hidden layer
            relative to ``emb_dim``.
    """

    def __init__(self, cfg, hidden_layer_dim_factor = 4):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = hidden_layer_dim_factor * width
        expand = nn.Linear(width, hidden_width)
        contract = nn.Linear(hidden_width, width)
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, in_batch):
        """Run ``in_batch`` through the three stacked layers; the last dimension is preserved."""
        out_batch = self.layers(in_batch)
        return out_batch

class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer is preceded by layer normalisation and followed by dropout
    and a residual (shortcut) connection.

    Args:
        cfg: Configuration mapping with the keys ``emb_dim``,
            ``context_length``, ``drop_rate``, ``n_heads`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        # The same dropout module is reused after both sub-layers.
        self.dropout_shortcut = nn.Dropout(cfg["drop_rate"])
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        self.ff = FeedForward(cfg)

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer; shape is unchanged."""
        # Attention sub-layer with its residual connection.
        attended = self.att(self.norm1(x))
        x = self.dropout_shortcut(attended) + x

        # Feed-forward sub-layer with its residual connection.
        transformed = self.ff(self.norm2(x))
        return self.dropout_shortcut(transformed) + x


class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks and an output head.

    Args:
        cfg: Configuration mapping with the keys ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.tok_emb = nn.Embedding(cfg["vocab_size"], width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm(width)
        self.out_head = nn.Linear(width, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch_size, seq_len) to logits of shape (batch_size, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        logits = self.out_head(hidden)
        return logits

Below we will instantiate the GPT model with the configuration of the smallest GPT-2 model (124M parameters); however, we will reduce the context length to 256 for faster training. Additionally, we will define two helper functions, ``text_to_token_ids`` and ``token_ids_to_text``.

In [10]:
# Hyper-parameters consumed by GPTModel and its sub-modules.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # rows of the token embedding and width of the output head
    "context_length": 256,  # we will use a smaller context length for faster training, original is 1024
    "emb_dim": 768,  # embedding width used throughout the model
    "n_heads": 12,  # attention heads per transformer block
    "n_layers": 12,  # number of stacked transformer blocks
    "drop_rate": 0.1,  # dropout probability for embeddings, attention weights and shortcuts
    "qkv_bias": False  # no bias term in the query/key/value projections
}
# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Evaluation mode disables dropout; as the last expression it also displays the module tree.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout_shortcut): Dropout(p=0.1, inplace=False)
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_featu

In [11]:
import tiktoken

def generate_text_simple(model, idx,
                          max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids of shape (batch_size, n_tokens) to
            logits of shape (batch_size, n_tokens, vocab_size).
        idx: Tensor of token ids with shape (batch_size, n_tokens).
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch_size, n_tokens + max_new_tokens).
    """
    sequence = idx
    for _ in range(max_new_tokens):
        # Only the most recent context_size tokens are visible to the model.
        window = sequence[:, -context_size:]
        with torch.no_grad():  # inference only, no gradients needed
            all_logits = model(window)
        # The logits at the last position predict the next token.
        next_probs = torch.softmax(all_logits[:, -1, :], dim=-1)
        next_token = next_probs.argmax(dim=-1, keepdim=True)  # (batch_size, 1)
        sequence = torch.cat((sequence, next_token), dim=-1)
    return sequence


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a tensor with a leading batch dimension."""
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    id_tensor = torch.tensor(ids)
    # Shape (n_tokens,) -> (1, n_tokens)
    return torch.unsqueeze(id_tensor, dim=0)

def token_ids_to_text(token_ids, tokenizer):
    """Drop the batch dimension of ``token_ids`` and decode the ids back to text with ``tokenizer``."""
    flat_ids = torch.squeeze(token_ids, 0).tolist()
    text = tokenizer.decode(flat_ids)
    return text

# Prompt that the (still untrained) model is asked to continue.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Greedily append 10 tokens, feeding at most context_length tokens to the model.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
    )
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


We can see that the output is not very meaningful; this is because the model is not trained yet. However, all our required components are in place.

We will next look at a loss metric for the generated output.

### Calculating the text generation loss

![test](./TextGenerationSummary.png)


In [12]:
import tiktoken
import torch

# Two short example inputs ...
input_text1 = "every effort moves"
input_text2 = "I really like"

# ... and the texts we would like the model to produce for them
# (each input shifted by one token, ending with the desired next token).
expected_output_text1 = " effort moves you"
expected_output_text2 = " really like chocolate"

tokenizer = tiktoken.get_encoding("gpt2")
# Stack the two (1, n_tokens) tensors into one (2, n_tokens) batch.
inputs = torch.vstack([text_to_token_ids(input_text1, tokenizer), text_to_token_ids(input_text2, tokenizer)])
print(f"Input token ids are:\n{inputs}")

targets = torch.vstack([text_to_token_ids(expected_output_text1, tokenizer), text_to_token_ids(expected_output_text2, tokenizer)])
print(f"Output token ids are:\n{targets}")


Input token ids are:
tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
Output token ids are:
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


Let's feed the inputs to the model and get the logits.

In [13]:
# Forward pass without gradient tracking; we only inspect the outputs here.
with torch.no_grad():
    logits = model(inputs)
print(f"Logits shape: {logits.shape}")
# Turn the logits into a probability distribution over the last (vocabulary) dimension.
probas = torch.softmax(logits, dim=-1)
print(f"Probs shape: {probas.shape}")

Logits shape: torch.Size([2, 3, 50257])
Probs shape: torch.Size([2, 3, 50257])


There are 2 sequences in the batch, each with 3 tokens, and each token has a probability distribution over the vocabulary of size 50257. What we need is the index of the maximum probability for each of these 3 tokens in both sequences.

Notice how we retain the third dimension by using ``keepdim=True``; if this wasn't provided, the result would have been of shape (2, 3) instead of (2, 3, 1).

In [14]:
# Index of the most probable vocabulary entry at every position; keepdim keeps the last axis.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print(f"token_ids are {token_ids} \n\nand has shape {token_ids.shape}")

token_ids are tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]]) 

and has shape torch.Size([2, 3, 1])


Let's decode and print these generated tokens; notice how the generated tokens are not the expected ones. We now need a loss function to measure how far off we are from the expected output. The goal is to increase the softmax probability of the expected output tokens.

With a vocabulary size of 50257, the chance probability of getting the correct token is 1/50257 = 0.0000199, this is very low.

In [15]:
# Compare the expected text with what the model predicted for the first sequence.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
# flatten() turns the (3, 1) prediction slice into a flat list of 3 ids for decoding.
print(f"Outputs batch 1:"
      f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


Remember that the target is what we expect the model to output. Let's look at the probabilities the model assigned to these expected tokens.

``probas`` has shape (2, 3, 50257) and ``targets`` has shape (2, 3); our goal while training is to increase the probabilities of these expected tokens relative to the other tokens.

In [16]:
# For each sequence, pick the probability the model assigned to the target token
# at every position: index by (sequence, position, target token id).
text_idx = 0
target_probas_1 = probas[text_idx, torch.arange(targets.shape[1]), targets[text_idx]]
text_idx = 1
target_probas_2 = probas[text_idx, torch.arange(targets.shape[1]), targets[text_idx]]
print("Text 1:", target_probas_1)
print("Text 2:", target_probas_2)


Text 1: tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


We will next calculate the negative log likelihood loss (NLLLoss) which is commonly used for classification problems

![Test](./NLL.png)

In [17]:
# Flatten and compute the log probabilities
log_probas = torch.log(torch.cat([target_probas_1, target_probas_2]))
print(log_probas)
#Compute the average negative log likelihood loss
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)
# We always minimise the loss, thus we take the negative, goal is to make this log_probas as close to 0 as possible
neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas)


tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(-10.7940)
tensor(10.7940)


PyTorch has a built-in ``cross_entropy`` loss function which combines the log softmax and ``NLLLoss`` in one function. We will use this to calculate the loss; recall that ``logits`` are the raw outputs of the model before applying softmax.

We will flatten the first two dimensions of the logits

In [18]:
print(f"Logits shape is {logits.shape}")
print(f"Targets shape is {targets.shape}")
# Merge the batch and token dimensions so that every row is one prediction.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)
# cross_entropy takes raw logits and applies log-softmax plus NLL internally.
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print("Cross entropy loss is:", loss)
# Perplexity is the exponential of the cross-entropy loss.
print("Perplexity: ",torch.exp(loss))

Logits shape is torch.Size([2, 3, 50257])
Targets shape is torch.Size([2, 3])
Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])
Cross entropy loss is: tensor(10.7940)
Perplexity:  tensor(48725.8203)


Perplexity: a common metric for evaluating language models is perplexity, which is the exponential of the cross-entropy loss. It provides a measure of how well the model predicts a sample; a lower perplexity indicates a better predictive model. In the above case, with a loss of ``10.7940``, the perplexity is ``torch.exp(10.7940) = 48725.8203``, which means that the model is about as unsure as if it were choosing uniformly among roughly 48725 tokens. This is close to the vocabulary size of 50257, i.e. close to chance level.

Next we will prepare the dataset to train our small model

In [20]:
# Load the raw training text from disk.
file_path = "./data/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()
# Report the corpus size in characters and in tokens of the current tokenizer.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


The dataloaders can be visualized as below

![test](./Dataloaders.png)

In [41]:
# Recreate the same data loaders we implemented in chapter 2

from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs for next-token prediction.

    The text is tokenised once. Every window of ``max_length`` tokens becomes
    an input, and the same window shifted one token to the right becomes its
    target.

    Args:
        text: Raw text to tokenise.
        tokenizer: Object with an ``encode(text)`` method returning a list of token ids.
        stride: Step, in tokens, between the starts of consecutive windows.
        max_length: Number of tokens in every input and target sequence.
    """

    def __init__(self, text, tokenizer, stride, max_length):
        self.input_ids = []
        self.target_ids = []
        tokens = tokenizer.encode(text)
        # A window starting at i needs tokens[i : i + max_length + 1], so the last
        # valid start is len(tokens) - max_length - 1. range() excludes its stop
        # value, hence the stop is len(tokens) - max_length; subtracting one more
        # would silently drop the final full window.
        for i in range(0, len(tokens) - max_length, stride):
            input_id = tokens[i: i + max_length]
            target_id = tokens[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_id))
            self.target_ids.append(torch.tensor(target_id))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input_ids, target_ids) pair of 1-D tensors."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(text, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over a ``GPTDataset`` of ``text`` tokenised with the "gpt2" encoding.

    Args:
        text: Raw text to split into training windows.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Step, in tokens, between consecutive windows.
        shuffle: Whether the loader shuffles the windows.
        drop_last: Whether an incomplete final batch is dropped.
        num_workers: Number of worker processes used by the loader.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDataset(
        text=text,
        tokenizer=gpt2_tokenizer,
        stride=stride,
        max_length=max_length,
    )
    loader = DataLoader(
        dataset=windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return loader

In [42]:
# Split data in train and validation set and create two data loaders
# The split is made on characters of the raw text, before tokenisation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# stride == max_length, so consecutive windows do not overlap.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
# Validation: keep every window and keep the order fixed.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

# Sanity check: print the (inputs, targets) shapes of every batch.
print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)



Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


Let us now implement the ``cross_entropy`` loss function for our given batch data and then the ``cross_entropy`` for the loader

In [47]:
import torch.nn as nn
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the mean cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Token ids of shape (batch_size, seq_len).
        target_batch: Target token ids of shape (batch_size, seq_len).
        model: Callable mapping the inputs to logits of shape
            (batch_size, seq_len, vocab_size).
        device: Device both batches are moved to before the forward pass.

    Returns:
        Scalar loss tensor.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)
    logits = model(inputs_on_device)
    # cross_entropy expects one row per prediction: merge batch and sequence axes.
    per_token_logits = logits.flatten(0, 1)       # (batch_size * seq_len, vocab_size)
    per_token_targets = targets_on_device.flatten()  # (batch_size * seq_len)
    return nn.functional.cross_entropy(per_token_logits, per_token_targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the average per-batch loss of ``model`` over ``data_loader``.

    Args:
        data_loader: Sized iterable yielding (input_batch, target_batch) pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means
            every batch; larger values are capped at ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to evaluate (empty loader or ``num_batches <= 0``).
    """
    available = len(data_loader)
    num_batches = available if num_batches is None else min(num_batches, available)
    # Covers the empty loader as well as num_batches <= 0, which would otherwise
    # divide by zero (or by a negative count) below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        # .item() converts the scalar loss tensor into a plain Python number.
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches

Let's apply this to the model before any training.

In [59]:
# Pick a device in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)
model.to(device)
# No gradients are needed to measure the loss of the untrained model.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Train loss:", train_loss)
print("Val loss:", val_loss)


Using device: mps
Train loss: 10.98758316040039
Val loss: 10.98110580444336


### Train an LLM

With all the nuts and bolts in place we will now train our LLM