In [15]:
# Load the full training corpus into memory as one string so it can be inspected.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [16]:
# Report the corpus size (len() of the loaded string, i.e. a character count).
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115393


In [17]:
# let's look at the first 1000 characters
# print(text[:1000])

In [18]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [19]:
# Character-level tokenizer: lookup tables between characters and integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if `l` contains an id that is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# round-trip check: decoding the encoding must give back the original string
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [20]:
import torch
import torch.nn as nn
import torch.optim
from torch.nn import functional as F
# Seed PyTorch's global RNG so the random draws in later cells are repeatable.
torch.manual_seed(1337) # for reproducibility

# Encode the whole corpus once and keep it as a tensor of integer (torch.long) token ids.
data = torch.tensor(encode(text), dtype=torch.long)
# print(data.shape, data.dtype)

In [21]:
# Contiguous hold-out split (no shuffling): the first 90% of the tokens are
# used for training, the remaining tail for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [22]:
# block_size: how many tokens are sliced out as one input sequence below.
block_size = 8
# Peek at block_size + 1 tokens: the extra token gives the last input position a next-token target.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [23]:
# One chunk of block_size tokens yields block_size training examples:
# every prefix of x is a context, and the token right after it is the target.
x = train_data[:block_size]      # inputs: the first block_size tokens
y = train_data[1:block_size+1]   # targets: the same window shifted one token ahead
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"input {context}, target: {target}")

input tensor([18]), target: 47
input tensor([18, 47]), target: 56
input tensor([18, 47, 56]), target: 57
input tensor([18, 47, 56, 57]), target: 58
input tensor([18, 47, 56, 57, 58]), target: 1
input tensor([18, 47, 56, 57, 58,  1]), target: 15
input tensor([18, 47, 56, 57, 58,  1, 15]), target: 47
input tensor([18, 47, 56, 57, 58,  1, 15, 47]), target: 58


In [24]:
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sampled sequence


def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. Reads the notebook globals `batch_size` and
    `block_size` at call time.

    Returns a pair `(x, y)` of stacked windows, `batch_size` rows of
    `block_size` tokens each, where row i of `y` is row i of `x` shifted one
    token ahead in the source data.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets; the upper bound keeps the shifted target window in range.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)


xb, yb = get_batch('train')
# print("inputs: ", xb.shape)
# print(xb)


# for b in range(batch_size):
#     for t in range(block_size):
#         context = xb[b, :t+1]
#         target = yb[b, t]
#         # print(f"input {context}, target: {target.item()}")

In [25]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row `i`
    holds the logits over the next token given that the current token is `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            `(logits, loss)`. Without targets, `logits` is (B, T, C) and
            `loss` is None. With targets, `logits` is flattened to (B*T, C)
            and `loss` is the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits with (N,) class indices, so
        # fold the batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so that non-contiguous targets are accepted too
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            # forward pass without targets -> (B, T, C) logits
            logits, _ = self(idx)
            # keep only the last time step: the prediction for the next token
            logits = logits[:, -1, :] # (B, C)
            # softmax turns the logits into a probability distribution
            probs = F.softmax(logits, dim=-1) # (B, C)
            # draw one token id per sequence from that distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append the sampled token to the running context
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
# Instantiate the bigram model and run one forward pass with targets on the sampled batch.
m = BigramLanguageModel(vocab_size)
# With targets supplied, forward returns the flattened logits together with the loss.
out, loss = m(xb, yb)
print(out.shape)
print(loss)


# Sample 100 new tokens from the freshly constructed model and decode them to text.
idx = torch.zeros((1, 1), dtype=torch.long) # start with a single token
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.9456, grad_fn=<NllLossBackward0>)

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [26]:
# create an AdamW optimizer over all parameters of the model `m` (learning rate 1e-3)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [27]:
# NOTE: this rebinds the global `batch_size` that get_batch reads, so every
# batch sampled from here on holds 32 sequences.
batch_size = 32
# range(1) performs a single optimisation step; raise the count to train for longer.
for steps in range(1):
    # sample a fresh random training batch
    xb, yb = get_batch('train')
    # print(xb.shape, yb.shape)
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    
    # backpropagation
    optimizer.zero_grad(set_to_none=True) # zero out gradients
    loss.backward() # compute gradients
    optimizer.step() # update parameters
    
# loss of the last batch processed, computed before that step's parameter update
print(loss.item())

4.601938247680664


In [28]:
# Sample another 100 tokens from `m`, again starting from the context `idx`, and decode them to text.
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


pZwkXNTgQTW.!qXrXAtfBPWSGEzEfqvmjGW?
NBHl;tJHMcKOjPyM.pLFSJQ;tGBY
QNxZnH Sm WyQOYZqIsqpYCyEFhMvV
FYa


In [54]:
# A single head of causal (masked) self-attention, demonstrated on random data.
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

head_size = 16

# bias-free linear projections from C channels to head_size
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)
v = value(x) # (B, T, head_size)

# raw affinity between every query position and every key position
# (note: no scaling by head_size is applied to the scores here)
wei = torch.matmul(q, k.transpose(-2, -1)) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)

# causal mask: a position may only attend to itself and earlier positions
tril = torch.tril(torch.ones(T, T)) # lower triangular matrix
wei = wei.masked_fill(tril == 0, float('-inf')) # blocked entries become 0 after softmax
wei = wei.softmax(dim=-1) # each row sums to 1

# weighted sum of the value vectors
out = torch.matmul(wei, v) # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

In [52]:
# Display wei[0] (trailing expression). Which `wei` this shows depends on the
# cell that last assigned it -- the execution counts in this notebook are out of order.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [34]:
# goal: xbow[b, t] = mean of x[b, i] over all i <= t
# (every position averages itself with everything before it)

# "bag of words" reference implementation with explicit loops
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t+1, C): positions 0..t of sequence b
        xbow[b, t] = torch.mean(xprev, dim=0)

In [41]:
# Toy example: multiplying by a row-normalised lower-triangular matrix
# computes running averages, i.e. the bag-of-words average as one matmul.
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True) # every row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b) # row i of c is the mean of rows 0..i of b
# print("a: ")
# print(a)
# print("b: ")
# print(b)
# print("c: ")
# print(c)

In [42]:
# version 2: the same causal average as one matrix multiply
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True) # row t spreads weight 1/(t+1) over positions 0..t
xbow2 = torch.matmul(wei, x) # (T, T) broadcasts over the batch: (B, T, T) @ (B, T, C) -> (B, T, C)

In [43]:
# Sanity check: the matrix-multiply result must agree with the explicit-loop result.
torch.allclose(xbow, xbow2) # check that both methods give the same result

True

In [44]:
# version 3: the same causal average, written as a masked softmax
tril = torch.tril(torch.ones(T, T))
# Start from all-zero scores so this cell does not depend on whatever `wei`
# an earlier cell left behind. Equal scores on the allowed positions make the
# softmax a plain average over them.
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # mask out the upper triangular part
wei = F.softmax(wei, dim=-1) # normalize to sum to 1
xbow3 = wei @ x # (T, T) @ (B, T, C) -> broadcast over batch -> (B, T, C)
torch.allclose(xbow, xbow3) # check that both methods give the same result

True