# GPT For Bigram Language Model

Here we build a complete, working Generative Pre-trained Transformer (GPT) model from scratch, capable of generating text in the style of Shakespeare.

## 1. Setup and Hyperparameters

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# Training / model hyperparameters, read as module-level globals by the
# functions and classes defined in the cells below.
batch_size = 16  # how many independent sequences will we process in parallel
block_size = 32  # what is the maximum context length for predictions
max_iters = 5000  # how many training iterations
eval_interval = 100  # evaluate the loss every eval_interval iterations
learning_rate = 1e-3  # learning rate for optimization
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # how many batches to use for evaluation
n_embd = 64  # the embedding dimension
n_head = 4  # the number of heads in the multi-head attention model
n_layer = 4  # the number of layers in the transformer
dropout = 0.0  # the dropout rate

## 2. Data Loading and Tokenization

In [4]:
# Load the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
# Lookup tables between a character and its integer id (its index in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}
# encoder: take a string, output a list of integers
def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in ``stoi``; a character that is not in
    the vocabulary raises ``KeyError``.
    """
    return [stoi[ch] for ch in s]
# decoder: take a list of integers, output a string
def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Each id is looked up in ``itos``; an id that is not in the vocabulary
    raises ``KeyError``.
    """
    return ''.join(itos[token_id] for token_id in l)
# Train / validation split: encode the whole corpus once, then cut it in two.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # index of the first validation token (90% / 10%)
train_data, val_data = data[:n], data[n:]

## 3. Data Batching and Loss Estimation


### Data Loading Function (`get_batch`)


In [5]:
# data loading
def get_batch(split):
    """Sample a random batch of input sequences and their next-token targets.

    ``split == 'train'`` draws from ``train_data``; any other value draws
    from ``val_data``. ``batch_size`` start offsets are chosen uniformly at
    random, and for each one a window of ``block_size`` tokens is cut out as
    the input, with the same window shifted one position to the right as
    the target.

    Returns:
        A pair ``(x, y)`` of stacked windows, both moved to ``device``.
    """
    source = train_data if split == 'train' else val_data
    # The upper bound leaves room for the one-token lookahead of the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)

### Loss Estimation Function (`estimate_loss`)

In [6]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of ``model`` on the train and validation data.

    For each of the two splits, ``eval_iters`` random batches are drawn with
    ``get_batch`` and run through the model; the per-batch losses are
    averaged. Gradients are disabled by the decorator, and the model is put
    in evaluation mode for the duration and switched back to training mode
    before returning.

    Returns:
        A dict with keys ``'train'`` and ``'val'`` whose values are the mean
        losses as 0-dimensional tensors.
    """
    model.eval()
    mean_losses = {}
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            # The model returns (logits, loss); only the loss is needed here.
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_losses[split_name] = batch_losses.mean()
    model.train()
    return mean_losses

## 4. Transformer Model Architecture

### `Head` Module: A Single Head of Self-Attention

- `__init__:`
    - It creates three `nn.Linear` layers to produce the Query, Key, and Value vectors from the input with no bias term.
    - `self.register_buffer('tril', ...)`: This creates a lower-triangular matrix of ones called `tril`. A buffer is a tensor that is part of the module's state (it is saved with the model and moved between devices along with it) but is not a parameter, so it is not trained by the optimizer. This mask is essential for creating a decoder-style Transformer in which each token can only attend to itself and to previous tokens in the sequence.

- `forward:`
    - It calculates the attention scores (`wei`) by taking the dot product of queries and keys and scaling the result down by a square-root factor of the vector dimension (multiplying by `dim**-0.5`), which keeps the softmax from saturating.
    - `wei.masked_fill(...)`: This applies the causal `mask`. It prevents tokens from seeing future tokens by setting the attention scores for future positions to negative infinity.
    - `F.softmax`: Converts the scores into a probability distribution.
    - `wei @ v`: Performs the weighted aggregation of the value vectors to produce the output of the attention head.

In [7]:
# One head of self attention
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    The input is projected to key, query and value vectors of width
    ``head_size`` by three bias-free linear layers. Each position then
    aggregates the value vectors of itself and of earlier positions,
    weighted by the softmax of the scaled query-key dot products.

    Args:
        head_size: Width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)  # key projection
        self.query = nn.Linear(n_embd, head_size, bias=False)  # query projection
        self.value = nn.Linear(n_embd, head_size, bias=False)  # value projection
        # Lower-triangular mask of ones. Registered as a buffer so that it is
        # part of the module's state without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)  # dropout on the attention weights

    def forward(self, x):
        """Apply the attention head to ``x``.

        Args:
            x: Tensor of shape (B, T, C) with ``T <= block_size`` and
                ``C == n_embd``.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape  # also enforces a rank-3 input
        k = self.key(x)  # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Scaled dot-product scores. The scale is 1/sqrt(head_size), the width
        # of the vectors being dotted, not the embedding width of the input:
        # this keeps the variance of the scores near one at initialisation so
        # the softmax below does not start out needlessly flat or peaked.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Causal mask: positions after the current one get -inf, i.e. zero
        # weight after the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)  # each row is a distribution over positions
        wei = self.dropout(wei)
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out