In [1]:
# read into text
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [2]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)
print(''.join(chars))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [3]:
# map char (str) to integer by index in the chars list
# ambigous ordering, losing a data dimension in proximity?
# tradeoff between size of vocab and dimension of embedding
ctoi = {ch:i for i,ch in enumerate(chars)}
itoc = {i:ch for i,ch in enumerate(chars)}
# lambda = mini functions
encode = lambda s: [ctoi[c] for c in s]
decode = lambda l: ''.join([itoc[i] for i in l])

In [4]:
# encode entire dataset
# store in torch.Tensor (multi-d array optimized for computation)
import torch
# torch.long = 64-bit integers (cannot use floats for certain downstream ops)
# tensor = array w/ autograd, gpu accel, vectorized/parallelized
data = torch.tensor(encode(text), dtype=torch.long)


In [5]:
# split data into 90% train vs. 10% validation
# no shuffle! since order matters
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [6]:
# only train transformer on chunks of max length = block_size
block_size = 8
train_data[:block_size+1]
# the +1 accounts for how 9 integers contain 8 "examples" of input-output pairs

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [7]:
x = train_data[:block_size] # all the training data, not always all used
y = train_data[1:block_size+1] # offset by 1, targets for each position, outputs for each "example"
for t in range(block_size):
  context = x[:t+1] # context window increases
  target = y[t]
  print(f"Input: {context}, Target: {target}")


Input: tensor([18]), Target: 47
Input: tensor([18, 47]), Target: 56
Input: tensor([18, 47, 56]), Target: 57
Input: tensor([18, 47, 56, 57]), Target: 58
Input: tensor([18, 47, 56, 57, 58]), Target: 1
Input: tensor([18, 47, 56, 57, 58,  1]), Target: 15
Input: tensor([18, 47, 56, 57, 58,  1, 15]), Target: 47
Input: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Target: 58


In [12]:
# stack up blocks so GPU can process in parallel
torch.manual_seed(1337)

batch_size = 4 # num independent sequences processed in parallel
block_size = 8 # max context length for predictions
# context size isn't constant (it's a max) to juice out the max examples
# causal masking = preventing attention to future tokens
# also helps the model infer when context provided < max
# autoregressive framework
# this is called prefix training & it's free with causal masking

def get_batch(split):
  # generate small batch of data (input & output)
  data = train_data if split == 'train' else val_data
  # get random position (-block_size) to account for valid starting positions
  # get {batch_size} number of these random offsets
  ix = torch.randint(len(data)-block_size, (batch_size,))
  # torch.stack stacks up into rows of 4x8 tensor
  # 32 examples, x and y just hold the end points
  x = torch.stack([data[i:i+block_size] for i in ix]) # row = context block
  y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # x shifted by 1
  return x,y

xb, yb = get_batch('train')

In [13]:
# this gets the actual full examples
for b in range(batch_size): # batch dimension
  for t in range(block_size): # time dimension
    context = xb[b, :t+1]
    target = yb[b, t] # will be used for loss function later on
    print(f"input: {context.tolist()}, target: {target}")


input: [24], target: 43
input: [24, 43], target: 58
input: [24, 43, 58], target: 5
input: [24, 43, 58, 5], target: 57
input: [24, 43, 58, 5, 57], target: 1
input: [24, 43, 58, 5, 57, 1], target: 46
input: [24, 43, 58, 5, 57, 1, 46], target: 43
input: [24, 43, 58, 5, 57, 1, 46, 43], target: 39
input: [44], target: 53
input: [44, 53], target: 56
input: [44, 53, 56], target: 1
input: [44, 53, 56, 1], target: 58
input: [44, 53, 56, 1, 58], target: 46
input: [44, 53, 56, 1, 58, 46], target: 39
input: [44, 53, 56, 1, 58, 46, 39], target: 58
input: [44, 53, 56, 1, 58, 46, 39, 58], target: 1
input: [52], target: 58
input: [52, 58], target: 1
input: [52, 58, 1], target: 58
input: [52, 58, 1, 58], target: 46
input: [52, 58, 1, 58, 46], target: 39
input: [52, 58, 1, 58, 46, 39], target: 58
input: [52, 58, 1, 58, 46, 39, 58], target: 1
input: [52, 58, 1, 58, 46, 39, 58, 1], target: 46
input: [25], target: 17
input: [25, 17], target: 27
input: [25, 17, 27], target: 10
input: [25, 17, 27, 10], targe

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embd = 32

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # learnable embedding table = matrix lookup table (vocab_size, vocab_size)
    # row i = logits/unnormalized scores/prob distr for next token given curr token = i
    # param 1 = num_embeddings (# unique tok in vocab)
    # param 2 = embedding_dim (size of vec representing a token)
    # usually embedding_dim < num_embeddings, but we have small vocab 
    # self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    # decompose 
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    # positional encoding
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    # go to token embed -> logits
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B,T=idx.shape
    # idx and targets are both (B,T) tensor
    tok_embd = self.token_embedding_table(idx) # (B,T,n_embd)
    pos_embd = self.position_embedding_table(torch.arange(T)) # (T,n_embd)
    x = tok_embd + pos_embd # (B,T,C), addition works nicely
    # now x hold token identities & positions
    logits = self.lm_head(x) # (B,T,vocab_size) 

    if targets is None:
        loss = None
    else:
        # use built-in -log likelihood loss
        # pytorch wants C as second dim 
        # stretch out into 2D 
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
    return logits, loss
  def generate(self, idx, max_new_tokens):
    # max_new_tokens = how many new tokens we want to generate
    # idx = (B, T) array of indices rn
    for _ in range(max_new_tokens):
      # get ALL predictions
      logits, loss = self(idx)
      # model predicts logits for every position, we only care about next token
      # take only the last time step prediction
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax: logits -> probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from distribution to get next token
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx
m = BigramLanguageModel()
logits, loss = m(xb, yb)
print(loss)

# 1x1 tensor holding a 0, kickoff character
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJt-wBpm&yiltNCjeO3:Cx&vvMYW-txjuAd IRFbTpJ$zkZelxZtTlHNzdXXUiQQY:qFINTOBNLI,&oTigq z.c:Cq,SDXzetn3XVj


In [16]:
# setup before we train
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
# m.parameters() returns iterator over all knobs to tune (embedding matrices, biases, weights etd)
# each param has .data tensor & .grad tensor


In [18]:
batch_size = 32 # increased from before
block_size = 8
n_iter = 10000
import math
# lol lowk just keep running this
for steps in range(n_iter):
  # sample a new batch of data
  xb, yb = get_batch('train')
  # eval the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True) # zero out gradients!!!
  loss.backward() # get grads for all params
  optimizer.step() # using grad to update params
  if steps%1000 == 0:
    print("Iteration: " + str(steps))
    print("Loss: " + str(loss.item()))
    # on average, the model is choosing among *perplexity* plausible next tokens
    # since vocab size = 65, random guessing = ln(65)~4
    print("Perplexity: " + str(math.exp(loss.item())))
    print("====")


Iteration: 0
Loss: 2.5512022972106934
Perplexity: 12.822510987982028
====
Iteration: 1000
Loss: 2.4487292766571045
Perplexity: 11.573630488650798
====
Iteration: 2000
Loss: 2.4547176361083984
Perplexity: 11.64314548081702
====
Iteration: 3000
Loss: 2.453296184539795
Perplexity: 11.62660707046919
====
Iteration: 4000
Loss: 2.3911964893341064
Perplexity: 10.926559637065708
====
Iteration: 5000
Loss: 2.476576328277588
Perplexity: 11.900451353664693
====
Iteration: 6000
Loss: 2.4607365131378174
Perplexity: 11.713435462882147
====
Iteration: 7000
Loss: 2.3521885871887207
Perplexity: 10.508543439111865
====
Iteration: 8000
Loss: 2.375598669052124
Perplexity: 10.757451411940833
====
Iteration: 9000
Loss: 2.336932420730591
Perplexity: 10.34944009069425
====


In [19]:
# check out improvements to predictions after training
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))
# tokens still not talking to each other!


IOr with is aras Tht; thap
YORENGind hathinthethasiouro oulof I and s l SS:
Iffof; hat t-aroresere and; s fover;

AUMENGHALA:
A:
This;
I t.
NAn thal
Fiotha her owa
Fouidif toury aris ior yoress ane hit in,
O:
LETAUns.
Isat t  st far thas s sthasers t Bokerdace
My e
TENIris,
G oue, hon buime.
adive momo, warawoofe, M: atre deseeshen tar me ifukeshaceweag t io, d at.
KE: co ctisefang he t veswerde, t thises;
Bund wiemetiarele hen
le, be ad, jush we, withindire INENGRe' thovexpu a PESAn stlis wilur


### some clever math...

In [20]:
# toy example
torch.manual_seed(1089)
B,T,C=4,8,2 # batch time channels
x=torch.randn(B,T,C)
x.shape

# we want the 8 tokens to talk to each other
# info should only flow from past -> present not vice versa
# let's give the t-th tok the average of all toks preceding


torch.Size([4, 8, 2])

In [21]:
# bag of words
# VERSION 1 [naive]
# we want x[b,t]=mean{x[b,i] where i<t}
xbow1 = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # slice up to t -> (t,C)
        xbow1[b,t] = torch.mean(xprev,0)
# this works but is inefficient!
# matrix multiplication!

In [24]:
# VERSION 2 [matmul]
# lower triangular ones matrix
wei = torch.tril(torch.ones((T,T)))
# weighted sums 
wei = wei / wei.sum(1,keepdim=True)
# (B,T,T) @ (B,T,C) ----> (B,T,C)
xbow2 = wei @ x # batch multiply
print(wei)
torch.allclose(xbow1, xbow2)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [29]:
# VERSION 3 [softmax]
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T)) # begin @ 0
# make all elements where tril = 0 -> -inf 
# past cannot see the future
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
print(wei)
xbow3 = wei @ x
torch.allclose(xbow1, xbow3)
# we will use this one because we can learn the affinities!

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

### self-attention!

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single head of self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k=key(x) # (B,T,16)
q=query(x) # (B,T,16)
wei=k @ q.transpose(-2, -1) # (B,T,16) x (B,16,T) = (B,T,T)
 
tril = torch.tril(torch.ones(T,T)) # get rid of this to unmask for encoder
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v=value(x)
out = wei @ v
# out = wei @ x

# elements across batch dimensions never talk to each other
# no notion of space... yet
# "self" means KV and Q are produced from same tokens

In [41]:
# keep wei diffuse before softmaxxing
# otherwise will converge to one-hot
wei 

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
         [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
         [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
         [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4289, 0.5711, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5413, 0.1423, 0.3165, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0635, 0.8138, 0.0557, 0.0669, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4958, 0.0758, 0.2224, 0.0156, 0.1905, 0.0000, 0.0000, 0.0000],
         [0.3957, 0.112