# Chat GPT

In [None]:
#| default_exp bigram

In [None]:
#|export
import requests
from fastcore.all import *
from rich import print
import numpy
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

<torch._C.Generator>

In [None]:
#|export
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Read & Explore Data

In [None]:
#|export
path = urlsave('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', '../data/input.txt')

In [None]:
#|export
text = path.read_text(encoding='utf-8')

In [None]:
print("Length of the dataset ", len(text))

In [None]:
# lets look at the first 1000 characters 
print(text[:1000])

## Tokenization

In [None]:
#|export
chars = L(set(text)).sorted()
vocab_size = len(chars)

In [None]:
print(''.join(chars), vocab_size)

In [None]:
#|export
# Character-level tokenizer: each distinct character of the corpus gets one integer code.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer code
itos = {i: ch for ch, i in stoi.items()}  # integer code -> character (inverse of stoi)


def encode(s):
    """Encoder: take a string, output a list of integers (one per character).

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list (any iterable) of integers, output a string.

    Raises KeyError if an integer is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)

In [None]:
print(encode("hii there"))
print(decode(encode("hii there")))

In [None]:
#|export
# lets now encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)

In [None]:
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earlier will look like this to the GPT

## Train/Validation Split

In [None]:
#|export
# hold out the tail of the encoded corpus for validation
n = int(0.9 * len(data))  # split index: the first 90% is train, the remainder is val
train_data, val_data = data[:n], data[n:]

## Dataloader: Batches of chunks of data

In [None]:
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# One chunk of block_size + 1 tokens holds block_size training examples:
# every prefix of x is a context and the matching entry of y is its target.
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f'when input is {context} the target: {target}')

In [None]:
#|export
batch_size = 4 # number of independent sequences processed in parallel
block_size = 8 # maximum context length for predictions

def get_batch(split, batch_size = batch_size, block_size = block_size):
    """Sample a random batch of input/target windows from one data split.

    `split == "train"` reads from `train_data`; any other value reads from
    `val_data`.  Returns `(x, y)`, both stacked to (batch_size, block_size)
    and moved to `device`; `y` is the window that starts one position after
    the corresponding row of `x`.
    """
    source = train_data if split == "train" else val_data
    # one random start offset per sequence, leaving room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [None]:
# Draw one training batch and show it, then spell out every (context, target) pair it contains.
xb, yb = get_batch('train')
for title, batch in (('inputs', xb), ('targets', yb)):
    print(title)
    print(batch.shape)
    print(batch)
print('--------')

for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

In [None]:
print(xb) # our input to the transformer

## Simplest Baseline: Bigram Language Model, Loss, Generation

In [None]:
#|export
class BigramLanguageModel(nn.Module):
    """Bigram baseline: next-token logits are looked up from the current token alone.

    The only parameter is a (vocab_size, vocab_size) embedding table; the row
    selected by a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # row i of the table holds the logits for the token that follows token i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets` the logits keep shape (B, T, C) and the loss is
        None.  With (B, T) `targets` the logits are returned flattened to
        (B*T, C) together with their cross-entropy loss.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # cross_entropy takes one row of class scores per prediction, so merge batch and time
        batch, time, channels = logits.shape
        flat_logits = logits.view(batch * time, channels)
        flat_targets = targets.view(batch * time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            last_step = logits[:, -1, :]  # (B, C): only the final position predicts the next token
            probs = F.softmax(last_step, dim=-1)  # (B, C)
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx

In [None]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

In [None]:
#|export
def generate_from_the_model(m, max_new_tokens=100):
    """Sample `max_new_tokens` tokens from `m` and print the decoded text.

    Generation starts from a single context token with id 0.  `m` must
    provide `generate(idx, max_new_tokens=...)` returning a (B, T) tensor of
    token ids; the first row is decoded with `decode` and printed.
    """
    # Create the seed context on the device that holds the model's weights:
    # a CPU tensor passed to a model that was moved to CUDA raises a
    # device-mismatch error inside the embedding lookup.
    first_param = next(m.parameters(), None)
    seed_device = first_param.device if first_param is not None else None
    idx = torch.zeros((1, 1), dtype=torch.long, device=seed_device)
    with torch.no_grad():  # sampling only; no autograd graph is needed
        tokens = m.generate(idx, max_new_tokens=max_new_tokens)[0].tolist()
    print(decode(tokens))

## Training the Bigram Model

In [None]:
@torch.no_grad()
def estimate_loss(batch_size=32, eval_iters=200, m=None):
    """Estimate the mean loss on the 'train' and 'val' splits.

    batch_size: sequences per evaluation batch drawn with `get_batch`.
    eval_iters: number of random batches averaged per split.
    m: model to evaluate; when omitted, the module-level `model` is used,
       which is what this function always did before the parameter existed.

    Returns a dict with keys 'train' and 'val' holding the mean loss of each
    split as a 0-dim tensor.
    """
    net = model if m is None else m
    was_training = net.training
    net.eval() # setting to evaluation phase
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split, batch_size = batch_size)
                _, loss = net(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # put the model back in the mode it was in, even if a batch raised,
        # instead of unconditionally switching it to training mode
        net.train(was_training)
    return out

In [None]:
#|export
@torch.no_grad()
def _estimate_loss(model, batch_size, eval_iters):
    """Average the loss of `model` over `eval_iters` random batches for each split.

    Returns a dict with keys 'train' and 'val'.  The model is switched to
    eval mode while measuring and put back in its previous mode afterwards.
    """
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ('train', 'val'):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split, batch_size=batch_size)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out


def train(model, max_iters, batch_size = 32, eval_iters = 200, eval_interval = 200, lr = 1e-3):
    """Optimise `model` with AdamW on random batches from the training split.

    model: module whose call `model(xb, yb)` returns `(logits, loss)`.
    max_iters: number of optimisation steps.
    batch_size: sequences per batch, used for both training and evaluation.
    eval_iters: number of batches averaged per split for each loss report.
    eval_interval: print the train/val loss estimate every this many steps;
        0 or None disables the periodic report.
    lr: AdamW learning rate.

    Prints the periodic loss estimates and, when at least one step ran, the
    loss of the last training batch.  The model is left in training mode.
    """
    # create a PyTorch optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    model.train()
    loss = None  # stays None when max_iters is 0, so the final print can be skipped
    for step in range(max_iters):

        if eval_interval and step % eval_interval == 0:
            # evaluate the model that is being trained, not whatever the global `model` is
            losses = _estimate_loss(model, batch_size, eval_iters)
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch('train', batch_size=batch_size)

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    if loss is not None:
        print(loss.item())

In [None]:
#|export
model = BigramLanguageModel(vocab_size).to(device)
train(model, 15_000)

In [None]:
generate_from_the_model(model)

In [None]:
generate_from_the_model(model, 300)

In [None]:
generate_from_the_model(model, 700)

## Version 1
> Averaging past context with for loops, the weakest form of aggregation

### The mathematical trick in self-attention

In [None]:
# consider the following toy example:

B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# Running mean over time: xbow[b, t] is the average of x[b, 0], ..., x[b, t].
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t+1].mean(dim=0)  # average over the time axis of the prefix

In [None]:
x[0]

tensor([[-0.6693, -0.8456],
        [-0.6534,  0.8349],
        [-0.6808,  1.1112],
        [ 1.6128, -1.4807],
        [-0.5419, -0.4675],
        [ 0.1127, -0.1059],
        [-0.0641, -1.5335],
        [-0.3041,  1.6519]])

In [None]:
xbow[0]

tensor([[-0.6693, -0.8456],
        [-0.6613, -0.0053],
        [-0.6678,  0.3669],
        [-0.0977, -0.0950],
        [-0.1865, -0.1695],
        [-0.1366, -0.1589],
        [-0.1263, -0.3553],
        [-0.1485, -0.1044]])

## Version 2
> Matrix multiply as weighted aggregation

In [None]:
# Multiplying by an all-ones matrix sums the rows of b: every row of c is the column-wise total.
torch.manual_seed(42)
b = torch.randint(0, 10, (3, 2)).float()
a = torch.ones(3, 3)
c = torch.matmul(a, b)
for item in ('a=', a, '--', 'b=', b, 'c=', c):
    print(item)

In [None]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [None]:
# With a lower-triangular matrix of ones, row t of c is the sum of rows 0..t of b.
torch.manual_seed(42)
b = torch.randint(0, 10, (3, 2)).float()
a = torch.ones(3, 3).tril()
c = torch.matmul(a, b)
for item in ('a=', a, '--', 'b=', b, 'c=', c):
    print(item)

In [None]:
# Normalising each row of the triangular matrix to sum to 1 turns the running sum into a running mean.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
for item in ('a=', a, '--', 'b=', b, 'c=', c):
    print(item)

In [None]:
# The running mean again, as a single matmul with row-normalised lower-triangular weights.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) broadcasts over the batch: (B, T, T) @ (B, T, C) --> (B, T, C)

In [None]:
wei.shape

torch.Size([8, 8])

In [None]:
xbow2.shape

torch.Size([4, 8, 2])

In [None]:
torch.allclose(xbow2, xbow)

True

In [None]:
xbow, xbow2

(tensor([[[-6.6931e-01, -8.4557e-01],
          [-6.6134e-01, -5.3177e-03],
          [-6.6781e-01,  3.6686e-01],
          [-9.7657e-02, -9.5026e-02],
          [-1.8651e-01, -1.6953e-01],
          [-1.3665e-01, -1.5892e-01],
          [-1.2628e-01, -3.5530e-01],
          [-1.4851e-01, -1.0440e-01]],
 
         [[ 6.1571e-01,  6.3541e-01],
          [-6.0056e-01,  3.3687e-01],
          [-3.4395e-01,  4.4428e-01],
          [-4.7228e-01,  3.2352e-01],
          [-4.1381e-01,  3.5067e-01],
          [-2.2773e-01,  1.9506e-01],
          [-3.0587e-01,  1.4499e-01],
          [ 1.2853e-01,  2.8592e-01]],
 
         [[-2.7886e-04,  4.0268e-01],
          [-1.8299e-01,  3.5538e-01],
          [-5.2246e-01,  4.5156e-01],
          [ 3.1017e-02,  3.2158e-01],
          [-1.0164e-01,  3.3109e-01],
          [-9.4260e-02,  6.2927e-02],
          [ 5.2089e-02, -7.4155e-03],
          [ 1.2877e-01, -1.4536e-01]],
 
         [[-7.1193e-01,  4.9925e-01],
          [-2.2146e-01,  1.1845e+00],
   

## Version 3
> Adding softmax

In [None]:
tril = torch.tril(torch.ones(T, T)); tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
wei = torch.zeros((T, T)); wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
wei = wei.masked_fill(tril == 0, float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
wei = F.softmax(wei, dim=-1); wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

## Clean up the Model Code

In [None]:
n_embd = 32

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram model with an intermediate embedding of width `n_embd`.

    Token ids are embedded into `n_embd` channels and a linear head maps the
    embedding to `vocab_size` logits.  `vocab_size` and `n_embd` are read
    from module level when the model is constructed.
    """

    def __init__(self): # vocab_size comes from module level, not from an argument
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)  # token id -> embedding
        self.lm_head = nn.Linear(n_embd, vocab_size)  # embedding -> next-token logits

    def forward(self, idx, targets = None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets` the logits keep shape (B, T, vocab_size) and the
        loss is None.  With (B, T) `targets` the logits are returned
        flattened to (B*T, vocab_size) together with their cross-entropy loss.
        """
        logits = self.lm_head(self.token_embedding_table(idx))  # (B, T, vocab_size)
        if targets is None:
            return logits, None

        # cross_entropy takes one row of class scores per prediction, so merge batch and time
        batch, time, channels = logits.shape
        flat_logits = logits.view(batch * time, channels)
        flat_targets = targets.view(batch * time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            last_step = logits[:, -1, :]  # (B, C): only the final position predicts the next token
            probs = F.softmax(last_step, dim=-1)  # (B, C)
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx
        

In [None]:
# keep the weights on the same device that get_batch moves the batches to
model = BigramLanguageModel().to(device)

In [None]:
train(model, 15_000)

## Positional Encoding

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram model whose input is the sum of a token and a position embedding.

    `vocab_size`, `n_embd` and `block_size` are read from module level when
    the model is constructed.  The position table has `block_size` rows, so a
    single forward pass accepts at most `block_size` time steps.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)  # token id -> embedding
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # position -> embedding
        self.lm_head = nn.Linear(n_embd, vocab_size)  # embedding -> next-token logits

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets` the logits keep shape (B, T, vocab_size) and the
        loss is None.  With (B, T) `targets` the logits are returned
        flattened to (B*T, vocab_size) together with their cross-entropy loss.
        """
        B, T = idx.shape  # idx holds integer ids, so it is 2-D; there is no channel axis yet

        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        # build the positions on idx's own device so the lookup works wherever the batch lives
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd): the position embedding broadcasts over the batch
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to the (B, T) context `idx` and return it."""
        max_context = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # the position table only covers max_context steps, so feed at most
            # that many trailing tokens; a longer context would index past the table
            idx_cond = idx[:, -max_context:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

## Version 4
> Self Attention

In [None]:
# Uniform causal averaging written in attention form: mask the future, softmax, aggregate.
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

tril = torch.ones(T, T).tril()
# all-zero scores with the future set to -inf: softmax spreads weight evenly over the past
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf')).softmax(dim=-1)
out = torch.matmul(wei, x)

out.shape

torch.Size([4, 8, 32])

In [None]:
# A single attention head, scores only: the weights now depend on the data through key/query.
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)

k, q = key(x), query(x) # each (B, T, head_size)
scores = torch.matmul(q, k.transpose(-2, -1)) # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)

tril = torch.ones(T, T).tril()
wei = scores.masked_fill(tril == 0, float('-inf')).softmax(dim=-1) # hide the future, normalise each row
out = torch.matmul(wei, x)

out.shape

torch.Size([4, 8, 32])

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5553, 0.4447, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0024, 0.9931, 0.0045, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3056, 0.0135, 0.5681, 0.1128, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1010, 0.6721, 0.0109, 0.0859, 0.1300, 0.0000, 0.0000, 0.0000],
        [0.1368, 0.0013, 0.0382, 0.0324, 0.0061, 0.7852, 0.0000, 0.0000],
        [0.0440, 0.1134, 0.0289, 0.1427, 0.5070, 0.0722, 0.0917, 0.0000],
        [0.0259, 0.4333, 0.0183, 0.2266, 0.0400, 0.1100, 0.1283, 0.0176]],
       grad_fn=<SelectBackward0>)

In [None]:
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# let's see a single head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # a position may not attend to the future
wei = F.softmax(wei, dim=-1)

# aggregate the value projection, not the raw input: v is what each token
# contributes once another token attends to it
v = value(x) # (B, T, head_size)
out = wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)

out.shape

torch.Size([4, 8, 32])

## Scaled Self Attention
> Why divide by sqrt(head_size)?

In [None]:
# Scale the raw scores by head_size ** -0.5, then compare the variances of k, q and wei.
k, q = torch.randn(B, T, head_size), torch.randn(B, T, head_size)
scale = head_size ** -0.5
wei = torch.matmul(q, k.transpose(-2, -1)) * scale

In [None]:
k.var(), q.var(), wei.var()

(tensor(1.1065), tensor(1.1194), tensor(1.2355))

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])