# Let's build a generative pretrained transformer, from scratch!

Transformer: a type of neural net architecture.\
Dataset: shakespeare dataset


#### SETUP

In [3]:
import urllib.request
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filename = 'shakespeare.txt'
urllib.request.urlretrieve(url, filename)

('shakespeare.txt', <http.client.HTTPMessage at 0x7fc7c8777610>)

In [4]:
with open('./shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [9]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# all unique chars that occur in this dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f"Length: {vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length: 65


In [6]:
# tokenize the chars into ints by creating a lookup table; many other schemas for doing this
# google uses SentencePiece, and OpenAI tiktoken
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode('Hi!'))
print(decode(encode('Hi!')))

[20, 47, 2]
Hi!


In [8]:
# encode the entire text dataset and store it inside of a tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [9]:
# finally, let's separate out the data into a train and validation split

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

#### FEED DATA INTO TRANSFORMER

Instead of feeding all data into transformer at once, feed chunks of data into it piece by piece (blocks), with mini-batches

In [10]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size+1] # e.g. the next character after the block
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target is: {target}")


# we want the network to be used to seeing things all the way from context of 1 to block_size
# this is useful during inference where context / input can vary

When input is tensor([18]) the target is: 47
When input is tensor([18, 47]) the target is: 56
When input is tensor([18, 47, 56]) the target is: 57
When input is tensor([18, 47, 56, 57]) the target is: 58
When input is tensor([18, 47, 56, 57, 58]) the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


In [12]:
# for all of these blocks of data, we also have mini-batches for efficiency reasons
# these keep the GPUs busy, as they are very good for parallel processing of data that
# can process these in parallel

torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # get random positions to get blocks from
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
When input is [24] the target is: 43
When input is [24, 43] the target is: 58
When input is [24, 43, 58] the target is: 5
When input is [24, 43, 58, 5] the target is: 57
When input is [24, 43, 58, 5, 57] the target is: 1
When input is [24, 43, 58, 5, 57, 1] the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
When input is [44] the target is: 53
When input is [44, 53] the target is: 56
When input is [44, 53, 56] the target is: 1
When input is [44, 53, 56, 1] the target is: 58
When input is [44, 53, 56, 1, 58] 

In [21]:
# Start feeding this into the neural network; we'll use bigrams for simplicity
import torch
import torch.nn as nn 
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, n_emb):
        super().__init__()
        # each token directly reads off logits for the next token, from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        
        # add a positional encoding
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        # idx and target are both (B, T) tensor of integers
        B, T = idx.shape
        logits = self.token_embedding_table(idx) # (B, T, C)

        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C), holding both token identities and positions. Doesn't matter much in a Bigram model where history is only 1 step
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Pytorch takes in Channel as the second dimension, so we need to reshape
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T) # This will shape into a single dimension, data is maintained but shape changes

            loss = F.cross_entropy(logits, targets) # Measures quality of the logits wrt the targets

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx) # calls the forward function, without targets (no ground truth to compare against, just generating)
            # loss not used but we still pluck it out to make sure logits isn't a tuple containing both

            # focus only on the last time step
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# We should be able guess what the loss should be, because we have 65 possible vocab elements / unique chars
# ie. -ln(1/65) = 4.17
# because our actual loss is higher, this means we are not doing a good job at predicting the next character

idx = torch.zeros((1, 1), dtype=torch.long) # 1x1 tensor holding a zero, starting with uniline
output = m.generate(idx, max_new_tokens=100)[0].tolist() # [0] just makes sure it isn't a nested tensor, and tolist() converts the tensor to an array
print(decode(output)) # we defined an int to char mapping earlier. This will be awful as we haven't trained the model yet

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


What are B, T, and C?
- B: Minibatch size, the pieces of data being passed in the neural network vs the entire dataset in one go
- T: Temporal dimension, typically used in models that process sequential data, like text or time. This represents the sequence length or number of time steps in the data
- C: Feature dimension capturing different aspects of the input information

Note that in the generate function above, we're pushing in a longer context like before, but because this is a Bigram class we're actually only taking in one token of context. We'll want to implement a non-bigram model later to take full advantage of the history.

In [22]:
# Start training the model
# Use adam optimization, to start off
# This is an adaptive learning rate optimization algorithm computing individual learning rates for different parameters
# this is better than based stochastic gradient descent with a constant rate, because it adapts to the parameters; diff parameters can receive different-sized
# updates based on their importance and frequency in the data. This is especially useful for when there is sparse data
# ie. data with many zeros and few non-zero values

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
# often leads to faster convergence, eg. the point of minimum loss in fewer epochs or wall-clock time

In [32]:
batch_size = 32
for steps in range(10000):
    # sample a bunch of data
    xb, yb = get_batch('train') # function we defined earlier gets a random batch of data from the training set

    # evaluate the loss for bigram model we defined earlier and can call
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item()) # can also put this in the loop for debugging purposes
    

2.385589838027954


In [35]:
output = m.generate(idx, max_new_tokens=400)[0].tolist() 
print(decode(output))


Corngyoouremeesofoatcangiladrndo canon Sencit mmoue mong. cowheerakert prillajor m Enks be ck che Noisithend iclyothed, alyeset
NGETillatothers ma crg.
NThertinathineprs se d fat d'sacairu:
Theaive blshig; m Cam ancrthtinous hece meburgour ye.
Bu, d alles : ppe saces se,
FLe as, g s souranod'TE CLen w seseer urer? anthecach DUndooure sethe
FRDUS:
TAUpet m torenongs hiloroous heis RABe ve d owourit


#### Let's now move on from a bigram model, which doesn't use a transformer to have all tokens attend to all other tokens, to implementing such a capability.

In [41]:
# self-attention toy example

torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C) 

# one interaction type is 'summing / averaging' the previous tokens; this is very weak and you lose a lot of info, but it works for this toy example
# this is called a bag-of-words model

# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))

for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0) # 0th dimension in this case is the t dimension

print(x[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


In [39]:
print(xbow[0]) # moving down row by row, it's an average of all of the previous

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [54]:
# refactoring the above for loops O(n^2) to use a matrix multiplication trip
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 8)) # triangle arrangement. Down the line, these will be weights, so we can do weighted aggregation
a = a / torch.sum(a, dim=1, keepdim=True) # normalize the rows to sum to 1

print(f'A = {a}')

c = a @ x # matrix multiplication
# print(f'\nB = {x}')
print(f'\nC = {c}')

# We can see here by essentially multiplying the two matrices where the first is normliazed, we get the same result as the for loop

A = tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])

C = tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420]]])


In [57]:
# refactoring above matrix multiplication to also use softmax.

tril = torch.tril(torch.ones(T, T)) # ensures no communication with 'future' tokens. We delete this in an 'encoder' attention block; decoder blocks are usually
# used in autoregressive settings, like language modelling

wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # by setting something to infinity, that means it will be zero after softmax
wei = F.softmax(wei, dim=-1)

print(wei)

xbow2 = wei @ x
torch.allclose(xbow, xbow2)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [62]:
# finally, let's refactor the above to use self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# we'll avoid setting wei to all zeros; now it should actually be a learnable parameter. Self-attention will solve this
# every single node / token at each position emits two vectors
# 1) query which roughly indicates what it is looking for, and 2) key is what they contain
# doing a dot product between the all query and key vectors will give us a attention score, giving us wei

# each token independetly emits a query and key vector; no communication has happened yet
# self-attention: key and values are produced from the same source as queries. In cross attention k and v might come from a separate encoder module
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

# enabling communication / defining the affinities between these
wei = q @ k.transpose(-2, -1) * C**(-0.5) # (B, T, T) scaled attention

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
# wei = wei.masked_fill(tril == 0, float('-inf')) # by setting something to infinity, that means it will be zero after softmax
# wei = F.softmax(wei, dim=-1)
# out = wei @ x
v = value(x)
out = wei @ v

In [63]:
wei[0]

tensor([[-0.3116, -0.2300,  0.0999,  0.3821, -0.1887,  0.3470,  0.1903, -0.0801],
        [-0.5893, -0.2927,  0.0184,  0.5972, -0.3858,  0.1841, -0.0098,  0.0517],
        [-0.1808, -0.2229,  0.0135, -0.0674, -0.1740, -0.2528,  0.0132, -0.1688],
        [ 0.1385, -0.1417, -0.0595, -0.1502, -0.0990, -0.2069, -0.2285, -0.1814],
        [-0.2221,  0.0033, -0.1393, -0.2334,  0.3600,  0.1527,  0.0657,  0.1637],
        [-0.0553,  0.4270, -0.0195, -0.1755,  0.5913, -0.4460,  0.2508,  0.2156],
        [ 0.1923,  0.3474, -0.0463, -0.0558,  0.1077,  0.2230, -0.0969,  0.1423],
        [-0.3190, -0.0729, -0.1468,  0.1043, -0.1412, -0.1035,  0.1137,  0.1114]],
       grad_fn=<SelectBackward0>)

#### Let's now insert a single self-attention block to our network

In [257]:
n_embd=65
dropout = 0.2 # 20% of intermediate calculations are dropped out

In [247]:
class Head(nn.Module): # single head of self-attention
    def __init__(self, head_size, n_embd):        
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [149]:
# Start feeding this into the neural network; we'll use bigrams for simplicity
import torch
import torch.nn as nn 
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        # each token directly reads off logits for the next token, from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        
        # add a positional encoding
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        self.sa_head = Head(n_embd) # TODO: not a real todo, just to highlight changes

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # idx and target are both (B, T) tensor of integers
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C), holding both token identities and positions. Doesn't matter much in a Bigram model where history is only 1 step
        x = self.sa_head(x) # applying a single head of self-attention
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Pytorch takes in Channel as the second dimension, so we need to reshape
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T) # This will shape into a single dimension, data is maintained but shape changes
            loss = F.cross_entropy(logits, targets) # Measures quality of the logits wrt the targets

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # make sure we don't accidentally get an idx outside of block size
            idx_cond = idx[:, -block_size:] # (B, T)

            # get predictions
            logits, loss = self(idx_cond) # calls the forward function, without targets (no ground truth to compare against, just generating)
            # loss not used but we still pluck it out to make sure logits isn't a tuple containing both

            # focus only on the last time step
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# We should be able guess what the loss should be, because we have 65 possible vocab elements / unique chars
# ie. -ln(1/65) = 4.17
# because our actual loss is higher, this means we are not doing a good job at predicting the next character

idx = torch.zeros((1, 1), dtype=torch.long) # 1x1 tensor holding a zero, starting with uniline
output = m.generate(idx, max_new_tokens=100)[0].tolist() # [0] just makes sure it isn't a nested tensor, and tolist() converts the tensor to an array
print(decode(output)) # we defined an int to char mapping earlier. This will be awful as we haven't trained the model yet

torch.Size([256, 65])
tensor(4.2145, grad_fn=<NllLossBackward0>)

w
$,IEJULJT;noQEmTyiA&'!fL aqp x,3bSTAE OvsOw:Os$ !,LMM,?vJdoq?BeW$zYDJaJiPKdYlP'cQLrmjWX$xaFbAFB!Gu


In [258]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
max_iters = 15000
eval_interval = 500
eval_iters = 200

In [None]:
batch_size = 32
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        out = {}
        m.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = m(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        m.train()
        print(f"step {iter}: train loss {out['train']:.4f}, val loss {out['val']:.4f}")

    # sample a bunch of data
    xb, yb = get_batch('train') # function we defined earlier gets a random batch of data from the training set

    # evaluate the loss for bigram model we defined earlier and can call
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [154]:
output = m.generate(idx, max_new_tokens=500)[0].tolist() # [0] just makes sure it isn't a nested tensor, and tolist() converts the tensor to an array
print(decode(output)) # we defined an int to char mapping earlier. This will be awful as we haven't trained the model yet


et fem of, ards ome hume
Yor ad angn wat amn wa ncor ty les's
I
Winod ty! Waut thes an
Andsl keey tul, emicopr, dizet cand my,
You:
I owing uinasrt thay she st ds tho; nst ph ua stes hath broe Hasincotr by frith sin,
Thif are fthe gth ndis boucanghe rt:
Hof hay fo.

GROMEENONREES:

Loullousprs prr intde hovau ofte, cth the avelllel?

BAUIn femy he fat ther g
DUSerclanee a babtly datretaleeve tupre Glogitoo' ehy tin wingg abthat:
Ded.

KINT:
Thy tepr
ickso, yopisitsthe whand-mn mmish.

RKID is hr


#### Let's try implementing multi-headed attention

Residual / skip connections help solve some optimization errors with deep neural nets (like the one we are starting to build with blocks). 

You fork off from a residual pathway, perform some computation, and then project that back to the pathways through 'pluses'. The gradients from the loss hop from every addition node to the input (+).

In [250]:
# scale up the model
batch_size = 64 # independent sequences to proces in parallel
block_size = 256 # maximum content length for predictions, up from 'eight
n_head = 6
n_embd = 384 # before it was 65; with 6 heads, we found a multiple

In [249]:
# communication
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, e_embd=n_embd):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd) # residual connection projection back to residual pathway
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [240]:
# finally, we have layernorm. This is a normalization technique that normalizes the activations of the network; very similar to batchnorm
# except it normalizes the activations of the network at each layer, rather than normalizing the activations of the network at each batch
# ie. we normalize the rows, not the columns

# VERY LITTLE ABOUT TRANSFORMERS IS ACTUALLY NEW; IT'S JUST A COMBINATION OF EXISTING TECHNIQUES
# HOWEVER, THIS DOES DEPART A BIT FROM THE ORIGINAL PAPER
# Now it's common to apply layernorm before the transformation (ie self attention and feedback forward in the block)

class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean -- this is 0 in batchnorm
    xvar = x.var(1, keepdim=True) # batch variance -- this is 0 in batchnorm
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [241]:
# computation, interspersed through the network. All of the tokens independently compute
# note in this paper: https://arxiv.org/abs/1512.03385, authors mention
# that while dmodel = 512, the inner-layer has dff = 2048. Different parameters are used
# layer to layer. Therefore, we should actually multiply the dmodel by 4 for the inner layer
# this is a typical transformer design pattern

class FeedForward(nn.Module):
    # simple linear layer followed by a nonlinearity (ie. 0s for negatives)

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            # I can also put nn.Linear(n_embd, n_embd) here instead of self.proj
        )
        self.proj = nn.Linear(4*n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = self.net(x)
        out = self.dropout(self.proj(out))
        return out

In [253]:
# transformer: communication -> computation -> communication -> computation -> ...
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # adding in 'x +' for residual connections, the sa is the forked off computation
        x = x + self.ffwd(self.ln2(x))
        return x

I have some overfitting in training (as seen in the diff between training and val loss) so I will also include some dropout.

In [None]:
# Start feeding this into the neural network; we'll use bigrams for simplicity
import torch
import torch.nn as nn 
from torch.nn import functional as F
torch.manual_seed(1337)

class GPTLanguageModel(nn.Module):
    def __init__(self, n_embd): # head_size is n_embd (dimensionality of model), but divided not by # heads
        super().__init__()
        # each token directly reads off logits for the next token, from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        
        # add a positional encoding
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # self.sa_heads = MultiHeadAttention(6, n_embd // 6)
        # self.ffwd = FeedForward(n_embd)

        self.blocks = nn.Sequential(
            Block(n_embd, n_head),
            Block(n_embd, n_head),
            Block(n_embd, n_head),
            nn.LayerNorm(n_embd) # also add this right before the end of the transformer
        )

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # idx and target are both (B, T) tensor of integers
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C), holding both token identities and positions. Doesn't matter much in a Bigram model where history is only 1 step
        # x = self.sa_heads(x) 
        # # Feedforward gives tokens time to *think* before making a prediction via logits
        # x = self.ffwd(x) # multiplied by 4?
        x = self.blocks(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Pytorch takes in Channel as the second dimension, so we need to reshape
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T) # This will shape into a single dimension, data is maintained but shape changes
            loss = F.cross_entropy(logits, targets) # Measures quality of the logits wrt the targets

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # make sure we don't accidentally get an idx outside of block size
            idx_cond = idx[:, -block_size:] # (B, T)

            # get predictions
            logits, loss = self(idx_cond) # calls the forward function, without targets (no ground truth to compare against, just generating)
            # loss not used but we still pluck it out to make sure logits isn't a tuple containing both

            # focus only on the last time step
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

    
m2 = GPTLanguageModel(n_embd)
logits, loss = m2(xb, yb)
print(logits.shape)
print(loss)

In [260]:
# optimizer2 = torch.optim.AdamW(m2.parameters(), lr=1e-3)
optimizer2 = torch.optim.AdamW(m2.parameters(), lr=3e-4) # reducing learning rate after scaling up the model

In [263]:
max_iters = 5000
eval_interval = 100
eval_iters = 200

In [None]:
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        out = {}
        m2.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = m2(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        m2.train()
        print(f"step {iter}: train loss {out['train']:.4f}, val loss {out['val']:.4f}")

    # sample a bunch of data
    xb, yb = get_batch('train') # function we defined earlier gets a random batch of data from the training set

    # evaluate the loss for bigram model we defined earlier and can call
    logits, loss = m2(xb, yb)
    optimizer2.zero_grad(set_to_none=True)
    loss.backward()
    optimizer2.step()

In [None]:
idx = torch.zeros((1, 1), dtype=torch.long) # 1x1 tensor holding a zero, starting with uniline
output = m2.generate(idx, max_new_tokens=500)[0].tolist() # [0] just makes sure it isn't a nested tensor, and tolist() converts the tensor to an array
print(decode(output)) # we defined an int to char mapping earlier. This will be awful as we haven't trained the model yet

Find the GPU-version here: https://colab.research.google.com/drive/1rI22Stc8px3af_atuK8a17Zi2ahaMkEB