In [2]:
# Read the whole training corpus from input.txt into a single string.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [3]:
print("# characters", len(text))

# characters 1115393


In [4]:
# Character-level vocabulary: every distinct character that occurs in the corpus, sorted
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
# Lookup tables between characters and their integer token ids
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into a list of integer token ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn an iterable of integer token ids back into a string."""
    return ''.join(itos[i] for i in l)


print(encode("hii there !"))
print(decode(encode("Hi there !")))

[46, 47, 47, 1, 58, 46, 43, 56, 43, 1, 2]
Hi there !


In [6]:
# encode the entire text dataset and store it in a torch.tensor
# (one integer token id per character; dtype long because these ids are later
# used as indices into an nn.Embedding table)
import torch
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)

torch.Size([1115393]) torch.int64


In [7]:
# Let's split the data into train and val sets:
# the first 90% of the tokens are used for training, the remaining 10% are held out
# for validation (a contiguous split, no shuffling).
n  = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
block_size = 8

train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# one single block of block_size + 1 tokens contains <block_size> training examples:
# y is x shifted by one position, so for every prefix x[:t + 1] the token to predict is y[t]

x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"when input in {context} the target: {target}")

when input in tensor([18]) the target: 47
when input in tensor([18, 47]) the target: 56
when input in tensor([18, 47, 56]) the target: 57
when input in tensor([18, 47, 56, 57]) the target: 58
when input in tensor([18, 47, 56, 57, 58]) the target: 1
when input in tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input in tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input in tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [10]:
torch.manual_seed(1337)
batch_size = 4  # how many independent sequences will we process in parallel
block_size = 8  # maximum context length for predictions

def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    `split` selects the source tensor: 'train' reads from train_data, any other
    value reads from val_data. Returns two (batch_size, block_size) tensors where
    the targets are the inputs shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # random start offsets; the upper bound leaves room for block_size + 1 tokens
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start: start + block_size])
        targets.append(source[start + 1: start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# draw one training batch and show the raw input / target tensors
xb, yb = get_batch('train')
print('inputs:\n', xb.shape, xb)
print('outputs:\n', yb.shape, yb)

print('---')
# spell out the (context -> target) pairs packed into one sequence of the batch;
# the `break` at the end stops the outer loop after the first sequence (b = 0)
for b in range(batch_size):
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")
    break

inputs:
 torch.Size([4, 8]) tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
outputs:
 torch.Size([4, 8]) tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
---
when input is [53] the target: 59
when input is [53, 59] the target: 6
when input is [53, 59, 6] the target: 1
when input is [53, 59, 6, 1] the target: 58
when input is [53, 59, 6, 1, 58] the target: 56
when input is [53, 59, 6, 1, 58, 56] the target: 47
when input is [53, 59, 6, 1, 58, 56, 47] the target: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40] the target: 59


In [11]:
# bigram model
# direct pytorch implementation

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram character model: next-token logits depend only on the current token.

    The whole model is one (vocab_size, vocab_size) embedding table; the row looked
    up for a token id is used directly as the logits for the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        """Compute next-token logits and, if targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            A (logits, loss) pair. Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and loss is
            a scalar tensor.
        """
        logits = self.token_embedding_table(idx) # (batch size, time(block_size), channels)

        # identity check: `== None` on a tensor goes through Tensor.__eq__ and only
        # works by accident; `is None` states the intent and cannot be overloaded
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) targets, so fold batch and time together
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor containing the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions; no targets are passed, so logits stays (B, T, C)
            logits, _loss = self(idx)
            # focus on the last time step only
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim = -1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T + 1)
        return idx
    
    
# instantiate the untrained model and run one batch through it
m = BigramLanguageModel(vocab_size)
# targets are supplied, so logits come back flattened to (B*T, vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
# reference point: predicting uniformly over 65 characters would give ln(65) ~= 4.17
print(loss.item())

# sample 100 new tokens starting from a single token with id 0 (batch of one) and decode them
print(decode(m.generate(torch.zeros((1,1),dtype = torch.long), max_new_tokens = 100)[0].tolist()))

torch.Size([32, 65])
4.894842624664307

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [12]:
# create a pyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [13]:
# NOTE: this rebinds the global batch_size that get_batch reads, so every later
# get_batch call in the notebook returns 32 sequences instead of 4
batch_size = 32
for steps in range(1000):

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: compute the loss on this batch
    logits, loss = m(xb, yb)
    # clear the gradients left over from the previous step before backpropagating
    optimizer.zero_grad(set_to_none= True)
    loss.backward()
    # update the parameters from the freshly computed gradients
    optimizer.step()

# loss of the last sampled batch only (a noisy estimate, not an average over the data)
print(loss.item())

3.666220188140869


In [14]:
print(decode(m.generate(torch.zeros((1,1),dtype = torch.long), max_new_tokens = 100)[0].tolist()))


Wh;;Sq.f ustNzknc
kwgOj$dhPWr,SV?hsusiKpgXXUh;Apmem d?hESXI.i;TrJgkiF-oKbXCAA -botrngFCHAUQkn$

pn$w


The mathematical trick in self-attention

In [15]:
# consider the following toy example

torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [16]:
# bag of words - a term used when a set of things is simply averaged
# initial approach to aggregating information: for every position t, average the
# vectors of the current token and all tokens preceding it
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t+1, C): positions 0..t inclusive
        xbow[b, t] = torch.mean(xprev, 0)


In [17]:
# since the double loop above is very inefficient, we use matrix multiplication instead
# the trick: a row-normalised lower-triangular matrix averages all positions up to and including t
wei = torch.tril(torch.ones (T, T)) # wei is short for weights
wei = wei/wei.sum(1, keepdim = True) # every row now sums to 1
xbow2 = wei @ x # (T, T) is broadcast to (B, T, T): (B, T, T) @ (B, T, C) ----> (B, T, C), multiplied in parallel along the batch

In [18]:
# version 3: build the same averaging weights with a masked softmax instead of
# normalising the triangular matrix directly
tril = torch.tril(torch.ones(T, T))
# start from all-zero scores and set every "future" position to -inf ...
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# ... so that softmax over each row gives uniform weights on the allowed positions
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

# the result is exactly the same, but the -inf formulation is more intuitive:
# the scores can later be replaced by data-dependent affinities



True

In [None]:
# version 4: self-attention ! We made it
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# lets see a single head perform self attention
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)

wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) ----> (B, T, T)

# wei is now data dependent (versions 1-3 started from constant scores, torch.zeros((T, T)))
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
# normalise over the LAST dimension (the keys) so that every query row sums to 1.
# wei is (B, T, T) here, so dim = 1 would normalise across the queries instead and
# the rows would no longer be valid averaging weights.
wei = F.softmax(wei, dim = -1)

v = value(x)    # (B, T, head_size)
out = wei @ v    # (B, T, T) @ (B, T, head_size) ----> (B, T, head_size)  (earlier versions aggregated x directly: wei @ x)

out.shape

# every single node (token) at each position emits three vectors - query, key and value
# query - what am I looking for
# key - what do I contain
# value - what do I pass on
# the product of query and key gives us wei (the affinities)
# the output shape changes because we now multiply wei with value instead of x
# this is also why we use softmax - to normalise the weights
# consider x to be private information to the token, v is the information that is aggregated and passed on


# NOTES
# 1. attention is a communication mechanism. It can be seen as nodes in a directed graph communicating with each other and passing information
# 2. There is no notion of space. Attention acts over a set of vectors. This is why we add positional embeddings
# 3. Examples across the batch dimension are processed completely independently and never talk to each other
# 4. in an encoder attention block, we can delete the line which does the masking, allowing all tokens to communicate with each other freely.
# 5. in a decoder attention block, we need the masking to prevent tokens from "cheating" by looking ahead.
# 6. self attention means that the keys, queries and values all come from the same place (x in this case). In cross attention, queries come from one place, and keys and values come from another place (as is the case in an encoder decoder block, for example).
# 7. scaled attention divides wei by sqrt(head_size) before applying softmax; this keeps the dot products from growing large, which would saturate the softmax and lead to vanishing gradients (not applied in this cell)


torch.Size([4, 8, 16])

In [22]:
# layer normalization - a normalization strategy that normalizes the features within a single layer
# we normalize the rows of the input rather than the columns



class LayerNorm1d:
    """Minimal layer normalisation along dimension 1 of the input.

    For a (N, dim) input every row (one example) is normalised to zero mean and
    unit variance over its features, independently of the other rows, and then
    scaled by gamma and shifted by beta.
    """

    def __init__(self, dim, eps = 1e-5):
        # eps keeps the denominator away from zero for (near-)constant rows.
        # It has to be representable in float32: a value such as 1e-51 rounds to 0
        # when added to a float32 tensor and a constant row then yields 0/0 = nan.
        self.eps = eps
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalise x along dimension 1 and apply the gamma/beta affine transform."""
        xmean = x.mean(1, keepdim = True) # per-row mean over the features
        # per-row variance over the features; Tensor.var applies Bessel's correction
        # by default (torch.nn.LayerNorm uses the biased estimator instead)
        xvar = x.var(1, keepdim = True)

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)       # normalize to unit variance
        self.out = self.gamma * xhat + self.beta

        # Unlike batch norm there are no running mean/variance buffers to update:
        # the statistics never span across examples, so training and inference
        # compute exactly the same thing.
        return self.out

    def parameters(self):
        """Return the trainable tensors [gamma, beta]."""
        return [self.gamma, self.beta]


In [None]:
# In the original paper, add & norm is applied after every attention block and every MLP block.
# In practice it is now more common to apply the layer norm before the transformation (the "pre-norm" formulation).