In [1]:
import torch, torch.nn as nn, numpy as np, gzip
from torch.nn import functional as F

# hyperparameters for the first (bigram) experiments
# NOTE: batch_size, block_size and learning_rate are reassigned in later cells
# before the first training loop runs, so the values here are only starting points
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
learning_rate = 1e-2 # step size handed to the optimizer
max_iters = 3000 # number of iterations of the training loop
eval_interval = 300 # the training loop reports train/val loss every this many iterations
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when PyTorch can see one
eval_iters = 200 # number of batches averaged per split by estimate_loss()

In [2]:
# original corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# NOTE(review): the code below reads a local gzip-compressed file 'tiny_shakespeare.txt.gz',
# presumably input.txt after gzip -- confirm that file sits next to the notebook
with gzip.open('tiny_shakespeare.txt.gz', 'rb') as f: text = f.read().decode('utf-8')

In [None]:
print(text[:1000])

In [3]:
len(text)

1115394

In [4]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars), '\nvocabsize=', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 
vocabsize= 65


In [5]:
# create a mapping from characters to integers (and back)
stoi = {ch: i for i, ch in enumerate(chars)}  # string to integer
itos = dict(enumerate(chars))                 # integer to string


def encode(s):
    """Encoder: take a string, output a list of integers (one per character).

    Raises KeyError if s contains a character that is not in chars.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integers, output the corresponding string.

    Raises KeyError if an integer is not a valid index into chars.
    """
    return ''.join(itos[i] for i in l)

In [6]:
itos[0], itos[1], itos[2], '...', itos[63], itos[64]

('\n', ' ', '!', '...', 'y', 'z')

In [7]:
stoi['y']

63

In [8]:
print(encode('hi there'))
print(decode(encode('hi there')))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [9]:
#encode all the shakespeare text
data = torch.tensor(encode(text), dtype=torch.long)
data.shape, data.dtype

(torch.Size([1115394]), torch.int64)

In [10]:
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [11]:
# Train and test splits
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [12]:
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [13]:
# lots of examples in every block
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context= x[:t+1]
    target = y[t]
    print(f'when input is {context} the target: {target}')

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [14]:
torch.manual_seed(1337) # seed torch's RNG so the random batch offsets drawn below are repeatable

batch_size = 4 # independent sequences processed in parallel (overrides the 32 set in the first cell)
block_size = 8 # max context length for predictions

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    split selects the corpus: 'train' reads train_data, any other value reads
    val_data. Returns two (batch_size, block_size) tensors on `device`; the
    target window is the input window shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence, leaving room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [15]:
xb,yb = get_batch('train')
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [16]:
for bl in xb:
    print(decode(bl.tolist()).replace('\n','|'))
print()
for bl in yb:
    print(decode(bl.tolist()).replace('\n','|')) #'\n' to '|' just to see

Let's he
for that
nt that 
MEO:|I p

et's hea
or that 
t that h
EO:|I pa


In [17]:
print('inputs:\n',xb.shape,'\n',xb,'targets:\n',yb.shape,'\n',yb)

#4x8 array contains 32 independent examples

for b in range(batch_size):
    print()
    for t in range(block_size):
        context = xb[b,:t+1]
        target =  yb[b,t]
        print(f'when input is {context.tolist()}, the target: {target}')

inputs:
 torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]]) targets:
 torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

when input is [24], the target: 43
when input is [24, 43], the target: 58
when input is [24, 43, 58], the target: 5
when input is [24, 43, 58, 5], the target: 57
when input is [24, 43, 58, 5, 57], the target: 1
when input is [24, 43, 58, 5, 57, 1], the target: 46
when input is [24, 43, 58, 5, 57, 1, 46], the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43], the target: 39

when input is [44], the target: 53
when input is [44, 53], the target: 56
when input is [44, 53, 56], the target: 1
when input is [44, 53, 56, 1], the target: 58
when input is [44, 53, 56, 1, 58], the target: 46
when 

In [18]:
# super simple bigram model: just a lookup table of next-token scores
class BigramLanguageModel(nn.Module):
    """Bigram model, first version: forward pass only (no loss, no generation)."""

    def __init__(self, vocab_size):
        super().__init__()
        # row i of the table holds the next-token logits for token i
        # (vocab_size x vocab_size, i.e. 65x65 for this corpus)
        table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)
        self.token_embedding_table = table

    def forward(self, idx, targets=None):
        """Return next-token scores of shape (B,T,C) for integer indices idx of shape (B,T).

        targets is accepted so the call matches the later versions of this
        class, but this version does not use it.
        """
        return self.token_embedding_table(idx)

#(B,T,C) = (batch, time, channel) = 4, 8, 65

In [19]:
# nn.Embedding(vocab_size, vocab_size) just returns the 65 numbers for each of the 4x8 inputs
# get_batch() moves xb/yb to `device`, so the model must live there too;
# a CPU model fed CUDA indices raises a device-mismatch error
m = BigramLanguageModel(vocab_size).to(device)
out = m(xb, yb)
out.shape # predictions for all the positions

torch.Size([4, 8, 65])

Now calculate a training loss, doesn't yet have a generate function:

In [20]:
# super simple bigram model, now also returning the training loss
class BigramLanguageModel(nn.Module):
    """Bigram model, second version: forward pass plus cross-entropy loss."""

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position and optionally compute the loss.

        idx and targets are both (B,T) tensors of integers.
        Returns (logits, loss):
          - targets given: logits flattened to (B*T, C), loss is the mean cross entropy
          - targets None:  logits left as (B,T,C), loss is None
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            # the signature declares targets optional, so honour that instead
            # of failing on None.view(...)
            return logits, None

        B, T, C = logits.shape
        # cross entropy wants (B*T, C) logits and (B*T,) targets, so flatten via .view()
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

In [21]:
#4x8 flattened to 32
# xb/yb come from get_batch() and are on `device`; put the model there as well
m = BigramLanguageModel(vocab_size).to(device)
logits,loss = m(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.7288, grad_fn=<NllLossBackward0>))

In [22]:
#expected loss for random guessing -log(1/65) = log(65)
np.log(65), torch.log(torch.tensor(65))

(4.174387269895637, tensor(4.1744))

Now add the generate:

In [23]:
# super simple bigram model, third version: adds sampling
class BigramLanguageModel(nn.Module):
    """Bigram model with a loss and an autoregressive generate() method."""

    def __init__(self, vocab_size):
        super().__init__()
        # lookup table: row i holds the next-token logits for token i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss) for integer indices idx of shape (B,T).

        Without targets (as in generate() below) the logits keep shape (B,T,C)
        and loss is None. With (B,T) targets the logits are flattened to
        (B*T, C) and loss is their cross entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            return logits, None

        n_batch, n_time, n_chan = logits.shape
        flat_logits = logits.view(n_batch * n_time, n_chan)
        flat_targets = targets.view(n_batch * n_time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Extend the (B,T) index tensor idx by max_new_tokens sampled tokens.

        Returns the extended (B, T+max_new_tokens) tensor.
        """
        for _ in range(max_new_tokens):
            step_logits, _ = self(idx)
            # bigram model: only the last time step matters -> (B, C)
            last_step = step_logits[:, -1, :]
            next_probs = F.softmax(last_step, dim=-1) # (B, C)
            # draw one token per sequence -> (B, 1)
            sampled = torch.multinomial(next_probs, num_samples=1)
            # grow the running sequence to (B, T+1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx

In [24]:
# now with generate; moved to `device` because xb/yb (from get_batch) and the
# generation context used later are created on `device`
m = BigramLanguageModel(vocab_size).to(device)
logits,loss = m(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.2793, grad_fn=<NllLossBackward0>))

In [25]:
# generate from the model (it has not been trained yet)
context = torch.zeros((1, 1), dtype=torch.long, device=device) #B=T=1, start with single 0 = \n as context
print(decode(m.generate(context, max_new_tokens=100)[0].tolist())) # sample 100 new tokens and decode them


MufR3!yC NHKGY?C3eDEGMp-VCcZN3aoRN&powfXmB.EOX.aFl3sitGj&WcGpoEfRvHLbR?
JxHofRX,MccAWBTF.ALF$mQ;!GY:


Not yet trained, and only looking at embedding value for last element in string so not expecting anything coherent, but will use same function to look further back in history.

Now define optimizer to train it:

In [26]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)

batch_size=32
# create a PyTorch optimizer
learning_rate = 1e-3 # 1e-2, 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [27]:
# utility function to report losses

@torch.no_grad()  # evaluation only: no autograd bookkeeping, so less memory and faster
def estimate_loss():
    """Estimate the mean loss of the global `model` on the train and val splits.

    Draws eval_iters random batches per split via get_batch() and averages
    their losses. The model is put in eval mode for the estimate and switched
    back to train mode before returning.

    Returns a dict with keys 'train' and 'val' whose values are 0-dim tensors.
    """
    model.eval()  # no effect for the plain bigram model (no dropout/batchnorm layers)
    mean_loss = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[i] = batch_loss.item()
        mean_loss[split] = per_batch.mean()
    model.train()
    return mean_loss

In [28]:
# the loop variable is `step` rather than `iter`: a top-level `iter` would shadow
# the builtin iter() for every later cell in the kernel session
for step in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()  #average loss over multiple batches, less noisy
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop gradients left over from the previous step
    loss.backward()
    optimizer.step()

step 0: train loss 2.4617, val loss 2.4792
step 300: train loss 2.4618, val loss 2.4839
step 600: train loss 2.4563, val loss 2.4819
step 900: train loss 2.4632, val loss 2.4863
step 1200: train loss 2.4580, val loss 2.4896
step 1500: train loss 2.4514, val loss 2.4844
step 1800: train loss 2.4564, val loss 2.4768
step 2100: train loss 2.4591, val loss 2.4953
step 2400: train loss 2.4593, val loss 2.4709
step 2700: train loss 2.4576, val loss 2.4834


In [29]:
# generate from trained model, reasonable-ish for bigram model?
context = torch.zeros((1, 1), dtype=torch.long, device=device) #B=T=1, 0 = \n
print(decode(m.generate(context, max_new_tokens=400)[0].tolist())) # get 400 tokens


Fan cher t wou ty s cer thaper could ul thas end wallouled
ARI:
Sake,XFinndichaivece lix; ckik:

ADWhanduno thiow t io nag buty:
Fo dithar athin' rve t s Hibrd le ord, GLI he dordorthing us t w
AMLINENGoulisf th nerd y wid ackerur de
Tom de ash les ar3CHUSSI sobesht towe.
I wr S:

TMEE n; ede d INCO s ho y th o wie:
TExend ano parr areas.
Whatwndud I suced hist yorig gn beald mbucike, hagor.
LORKI


**Note:** why does minimizing the loss by backpropagation lead to the bigram probability distribution in this case?

Consider a single letter, say 'a'. Many other letters appear after 'a' in the training set so the loss can never be zero. Suppose the empirical (via number counts) distribution of letters following 'a' is some set of probabilities $q_i$, and suppose the probabilities predicted by the model are $p_i$. Then the loss, when character $i$ is the correct answer, is $-\log p_i$. In the training set, $i$ is correct a fraction $q_i$ of the time, so the expected loss is $-\sum q_i \log p_i$, with the constraint that $\sum_i p_i =1$. Using a Lagrange multiplier to impose the constraint, and taking derivatives with respect to the $p_i$ and setting them to zero gives the unique minimum, with $p_i=q_i$ for all $i$. The same is true for the predictions of letters following any other letter, so minimizing the cross-entropy loss in this simple case leads to predictions that converge on the empirical bigram probability distribution.

# mathematical trick in self-attention

In [30]:
B,T,C = 4,8,2 #batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

Want tokens to talk to each other, but
tokens should only communicate with earlier ones, so for the 5th token take the average over the previous ones (weak form of interaction, but will be improved)

In [31]:
# version 1: explicit loops
# We want xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))  # "bag of words": average of the current and all earlier tokens
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1,C): tokens 0..t of sequence b
        xbow[b,t] = torch.mean(xprev, 0) # average over the time dimension

In [32]:
x[0]

tensor([[ 0.5881, -0.8298],
        [-0.6216,  0.1701],
        [ 0.8077,  0.4167],
        [-1.9930,  0.5087],
        [-0.7838, -2.1713],
        [-0.0594,  0.5931],
        [-0.8105, -0.4394],
        [ 0.6405,  0.9083]])

In [33]:
xbow[0] # each row is the average of the rows of x[0] up to and including that position

tensor([[ 0.5881, -0.8298],
        [-0.0168, -0.3298],
        [ 0.2580, -0.0810],
        [-0.3047,  0.0664],
        [-0.4005, -0.3811],
        [-0.3437, -0.2187],
        [-0.4104, -0.2503],
        [-0.2790, -0.1054]])

In [34]:
# lower triangular
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [35]:
a = torch.tril(torch.ones(3, 3))
a / torch.sum(a, 1, keepdim=True)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [36]:
#trick using matrix multiplication
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
#a = torch.ones(3, 3)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=\n',a,'\n--\nb=', b, '\n--\nc=', c)

a=
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]]) 
--
b= tensor([[2., 7.],
        [6., 4.],
        [6., 5.]]) 
--
c= tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [37]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
# (T,T) @ (B,T,C) broadcast
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [38]:
xbow[0], xbow2[0]

(tensor([[ 0.5881, -0.8298],
         [-0.0168, -0.3298],
         [ 0.2580, -0.0810],
         [-0.3047,  0.0664],
         [-0.4005, -0.3811],
         [-0.3437, -0.2187],
         [-0.4104, -0.2503],
         [-0.2790, -0.1054]]),
 tensor([[ 0.5881, -0.8298],
         [-0.0168, -0.3298],
         [ 0.2580, -0.0810],
         [-0.3047,  0.0664],
         [-0.4005, -0.3811],
         [-0.3437, -0.2187],
         [-0.4104, -0.2503],
         [-0.2790, -0.1054]]))

In [39]:
tril = torch.tril(torch.ones(T, T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [40]:
torch.zeros((T,T)).masked_fill(tril == 0, float('-inf'))

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [41]:
F.softmax(torch.zeros((T,T)).masked_fill(tril == 0, float('-inf')), dim=-1)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [42]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))  #interaction strength, affinities will be data dependent
wei = wei.masked_fill(tril == 0, float('-inf'))  #from past can't communicate
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x  #aggregate values depending how interesting find each other => self-attention
torch.allclose(xbow, xbow3)

True

implement the weighted aggregation using matrix multiplication `@`:<br>
some tokens will find other tokens more or less interesting

determining affinities in this data-dependent way is called 'self-attention'<br>
for each character, we learn three vectors (sets of numbers in same embedding dimension `head_size`):<br>

query vector = a character "broadcasts" what it's looking for<br>
key vector = other characters "respond" with what they contain<br>
value vector = what the other characters contribute (the quantity to be weighted in the sum)

multiply key and query together pairwise and sum (dot product) gives the context-dependent affinities.<br>
If key and query are aligned, then learn more about that token by giving greater weight to its contribution (its value vector).<br>

In [43]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16

key = nn.Linear(C, head_size, bias=False) #so matrix mult with fixed weights
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)

#all produce key and query, no communication yet, but now all queries dot product all keys
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
#wei no longer starts at zero

# now do what we did before, except no longer  #wei = torch.zeros((T,T))
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))  #implements past only, delete for encoder block!
wei = F.softmax(wei, dim=-1)

#out = wei @ x  # not x, which would be (B,T,T) @ (B, T, C) -> B,T,C

v = value(x) # (B, T, 16)
out = wei @ v  # (B,T,T) @ (B, T, 16) -> B, T, head_size

out.shape # (B, T, head_size)

torch.Size([4, 8, 16])

In [44]:
wei[0] # no longer constant

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Consider the last row above. The eighth token knows what content it has, knows what position it's in, and based on that, creates a query for what it's looking for, metaphorically say<br>
"i'm a vowel, in this position, am looking for any consonants at positions up to four"

All the other nodes in turn emit keys, one might be "I am a consonant, and am at position 4"<br>
That query/key pair has a large dot product, so they find each other and create a high affinity .2297 indicating that the 4th position is interesting to the last one.<br>
the info that the 4th position will give is stored in its v.<br>
Through the softmax, etc, above that info is aggregated into the 8th position output of the transformer.

Unlike convolution, there's no notion of space -- different positions can communicate fully (for decoders used to generate, with all past positions; and with all past and future positions for encoders used to analyze text for translation.)

Self-attention is when the same source x produces the query/keys/values.<br>
In cross-attention, it can come from different source (e.g., from an encoder if translating between languages).

Dividing the query$\cdot$key by the sqrt of the head_size keeps the variance constant so that multiple repeat stages don't cause the outputs to either explode or diminish excessively (preventing useful backpropagation).

'Multi-head attention' just means include `n_head` separate attention blocks running in parallel, whose output is then concatenated together in a big wide vector of length `n_head*head_size` for the next layer of number crunching.

(Karpathy) Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [45]:
# the single-head computation from above, packaged as a module (now with scaling and dropout)
class Head(nn.Module):
    """ one head of causal (masked) self-attention """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width (n_embd) down to head_size
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask of ones, registered as a buffer rather than a parameter
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (batch, time-step, channels) to (batch, time-step, head size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B,T,hs)
        k = self.key(x)    # (B,T,hs)
        v = self.value(x)  # (B,T,hs)

        # attention scores ("affinities"), scaled by 1/sqrt(head size)
        scale = k.shape[-1] ** -0.5
        scores = q @ k.transpose(-2, -1) * scale  # (B,T,hs) @ (B,hs,T) -> (B,T,T)

        # decoder block: a position must not see later positions, so their
        # scores become -inf and receive zero weight after the softmax
        is_future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(is_future, float('-inf'))  # (B,T,T)

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B,T,T)
        # weighted aggregation of the values
        return weights @ v  # (B,T,T) @ (B,T,hs) -> (B,T,hs)

In [46]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by the number of heads, so size the
        # projection input from the real width instead of assuming n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on x, concatenate along the last dim, project to n_embd, apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads*head_size)
        out = self.dropout(self.proj(out))                   # (B, T, n_embd)
        return out

class FeedForward(nn.Module):
    """ per-position MLP: widen 4x, ReLU, project back to n_embd, dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd  # width of the intermediate "computation" layer
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is unchanged."""
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: attention (communication), then an MLP (computation) """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        # each head gets an equal (integer) share of the embedding width
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Pre-norm residual block: layer norm is applied before each sub-layer."""
        attended = x + self.sa(self.ln1(x))  # residual (skip) connection around attention
        return attended + self.ffwd(self.ln2(attended))  # and around the MLP

In [47]:
# no longer a simple bigram model (the class name is kept from the earlier cells):
# token + position embeddings feeding a stack of transformer blocks
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Sized by the notebook globals vocab_size, block_size, n_embd, n_head and
    n_layer as they are at construction time.
    """

    def __init__(self):
        super().__init__()
        # learned embeddings for token identity and for position within the context
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss) for integer indices idx of shape (B,T).

        Without targets the logits have shape (B,T,vocab_size) and loss is None.
        With (B,T) targets the logits are flattened to (B*T, vocab_size) and
        loss is their cross entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # build the positions on the input's own device rather than the
        # notebook-global `device`, which need not match where the model lives
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend the (B,T) index tensor idx by max_new_tokens sampled tokens.

        Returns the extended (B, T+max_new_tokens) tensor.
        """
        # context length this instance was built with; the global block_size is
        # reassigned later in the notebook and may no longer match this model
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # crop idx to the last context_len tokens
            idx_cond = idx[:, -context_len:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [48]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel? -> 64
block_size = 32 # what is the maximum context length for predictions?         -> 256 (to analyze and predict 257)
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3  # -> 3e-4 (for bigger net)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64  # -> 384
n_head = 4  # -> 6  (384 / 6 = 64 dim heads)
n_layer = 4  # -> 6
dropout = 0.0 # -> .2

In [49]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

0.209729 M parameters


In [50]:
##### don't rerun this in class, would take a few min
# the loop variable is `step` rather than `iter`: a top-level `iter` would shadow
# the builtin iter() for every later cell in the kernel session
for step in range(max_iters):

    # every once in a while (and on the final step) evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop gradients left over from the previous step
    loss.backward()
    optimizer.step()

step 0: train loss 4.3469, val loss 4.3500
step 100: train loss 2.6628, val loss 2.6695
step 200: train loss 2.5108, val loss 2.5142
step 300: train loss 2.4100, val loss 2.4193
step 400: train loss 2.3371, val loss 2.3464
step 500: train loss 2.2739, val loss 2.2872
step 600: train loss 2.2130, val loss 2.2417
step 700: train loss 2.1677, val loss 2.1927
step 800: train loss 2.1268, val loss 2.1538
step 900: train loss 2.0979, val loss 2.1382
step 1000: train loss 2.0536, val loss 2.1016
step 1100: train loss 2.0315, val loss 2.0833
step 1200: train loss 1.9965, val loss 2.0603
step 1300: train loss 1.9699, val loss 2.0470
step 1400: train loss 1.9536, val loss 2.0300
step 1500: train loss 1.9263, val loss 2.0085
step 1600: train loss 1.9193, val loss 2.0084
step 1700: train loss 1.9059, val loss 2.0053
step 1800: train loss 1.8737, val loss 1.9727
step 1900: train loss 1.8594, val loss 1.9813
step 2000: train loss 1.8533, val loss 1.9687
step 2100: train loss 1.8459, val loss 1.9558


In [51]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Duke lest thingmnager of
Gloust the son: fullsse
So be-luts, of we bive mads; my sin their me?

Purnising:
O, in the plosonest rail
Liking your Strand peant ou,
And then ic if or stiegs, let siding recoice with a mock.

Clast:
Nown that with much.

MARCIUS:
I powal; nors, then! jup
Come with may. Any, good you
'tise much beholderens sproness, and power it,
The's of my but I will gape. There's the vole of your gary.

YORK:
For not my lie this, or is be of doth vometuce mut live live
Fill no makes; my peace in your spight.
The sun or thoot not, sir, tonge
To wice to awavater'd and these upon a pleases.

Fightle of any yield. York will young
To friend ChrisTance ento murds to flue; God he but onest friends,
What once begencess friend: foll.

DUKE VINCIO:
Ay, his slows bralishery tendeming.

PETRUCHIO:
Sisno: I cannot the keep! as possed in like gelenengel,
So fortund an or zery with 'old love: give,
No wescome, much going true mie them;
Qead steech your, sir? it is fraiess,
Whose yurip
I

The above was trained on a laptop, no special hardware, and is recognizably Shakespeare-like.

Further improvement requires increasing some of the parameters, e.g., increase batchsize from 16 to 64, increase blocksize from 32 to 256 (i.e., use interrelations of 256 character sequence to predict the 257th), increase from 64 to 384 dimensional character embeddings, increase from 4 to 6 transformer heads, from 4 to 6 layers, and for the bigger network tune down the learning rate and add some dropout.

In [52]:
#with the even bigger hyperparams commented out above, -> 10M
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel? (was 16)
block_size = 256 # what is the maximum context length for predictions?     (was 32) (to analyze and predict 257)
learning_rate = 3e-4  # (was 1e-3) 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 384  # was 64
n_head = 6  # (was 4)  (384 / 6 = 64 dim heads)
n_layer = 6  # (was 4)
dropout = 0.2 # -> (was 0)

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.788929 M parameters


50x more parameters than the above 200k<br>
run the larger model on a GPU (Karpathy ran on an A100 GPU, and doesn't recommend trying on a CPU),<br>
and the 1.82 loss came down to 1.48, with even more Shakespeare-like generation:

In [53]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

#...
#1.82 loss goes down to 1.48 (15 min on Karpathy's A100 GPU, don't try on CPU ...):

The top in a world by susphoring grace.

LUCIO:
We muse hath resistes him so sovere: son't his other wrough
stands of coverent sh'd: he has here, and stand it
and poor exceeder or a Henry's last, stay
not in faith, forewell's base of graves, thanks, happy comparel,
warmentfully: may as face by the courst, that strangth
errise hath breathed. Hastings come to Valenting.

HERMIONE:
Well have been bolly poor late
Is the lords.

ABELLA:
Let's found: I will kind him;
I do braw'sy him business wherein far his face.

LUCENTIO:
He is last afford: make him diseably to London,
Take him great Hastings, boldness in his natic keeps,
To oftragn lost me ready glust through the house.
Why chose that I dares it be a Montague.

MONTAGUE:
Woe's Claudly Haste of his own at last the Volscient,
And seen'd helpit: bearn to do it be, and most hop,
Miscause's more conterar than without this lambs
Shall down appla fortune flight flowers.

FRIAR LAUAURENCE:
His son, do your morself, that leaven your honours
Suffe