# 1. Load Corpus

In [3]:
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  9924k      0 --:--:-- --:--:-- --:--:-- 10.1M


In [1]:
# Read the whole corpus into memory as a single string.
with open("input.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [2]:
len(text)

1115394

In [3]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



# 2. Prepare Datasets

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order
vocab = sorted(set(text))
vocab_size = len(vocab)
print("".join(vocab), vocab_size, sep="\n")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
# Character-level tokenizer: lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = dict(enumerate(vocab))


def encode(s):
    """Map a string to a list of integer token ids, one id per character.

    Raises:
        KeyError: if `s` contains a character that is not in `vocab`.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return "".join(itos[i] for i in l)


# Round-trip check: decoding an encoded string must give the string back
print(encode("hii python"))
print(decode(encode("hii python")))

[46, 47, 47, 1, 54, 63, 58, 46, 53, 52]
hii python


In [6]:
# Training and validation data
import torch

# Encode the corpus once; the first 90% of the tokens train, the remainder validates
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [15]:
# Batches
import torch

torch.manual_seed(1337)  # fixed seed so the randomly sampled batches below are repeatable

batch_size = 4  # number of sequences get_batch stacks into one batch
block_size = 8  # length, in tokens, of each sequence get_batch slices out


def get_batch(split: str):
    """Sample a random batch of input/target sequences.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        (xb, yb): two (batch_size, block_size) tensors, where each row of `yb`
        is the matching row of `xb` shifted one position later in the data.
    """
    source = train_data if split == "train" else val_data
    # Upper bound keeps the one-step-shifted target slice inside the data
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)


xb, yb = get_batch("train")
for label, batch in (("inputs", xb), ("targets", yb)):
    print(label)
    print(batch.shape)
    print(batch)

print("----")

# Every prefix of a row is its own training example: the prefix predicts the token after it
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, : t + 1]
        target = yb[b, t]
        print(f"When the inputs are: {context.tolist()}, target: {target}")

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
When the inputs are: [24], target: 43
When the inputs are: [24, 43], target: 58
When the inputs are: [24, 43, 58], target: 5
When the inputs are: [24, 43, 58, 5], target: 57
When the inputs are: [24, 43, 58, 5, 57], target: 1
When the inputs are: [24, 43, 58, 5, 57, 1], target: 46
When the inputs are: [24, 43, 58, 5, 57, 1, 46], target: 43
When the inputs are: [24, 43, 58, 5, 57, 1, 46, 43], target: 39
When the inputs are: [44], target: 53
When the inputs are: [44, 53], target: 56
When the inputs are: [44, 53, 56], target: 1
When the inputs are: [44, 53, 56, 1], target: 58
When the inputs are: [44

# 3. BiGram

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1337)


class BiGramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The embedding table has one row per token id, and each row is used
    directly as the unnormalised scores (logits) of the token that follows.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()

        # (vocab_size, vocab_size): row i holds the logits for the successor of token i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position of `idx`.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without `targets`, logits is (B, T, C) and loss is
            None. With `targets`, logits is flattened to (B * T, C) and loss is
            the cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy wants the class dimension second, so batch and time
        # are folded into one dimension of B * T independent samples.
        # reshape (rather than view) also accepts non-contiguous inputs.
        logits = logits.reshape(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling is inference only; skip building an autograd graph
    def generate(self, idx, num_examples):
        """Extend `idx` by sampling `num_examples` new tokens for every sequence.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            num_examples: number of tokens to append to each sequence.

        Returns:
            (B, T + num_examples) tensor: the input followed by the sampled tokens.
        """
        for _ in range(num_examples):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # (B, C): only the last position predicts the new token
            probs = F.softmax(logits, dim=-1)  # (B, C): normalise over the vocabulary
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        return idx


m = BiGramLanguageModel(vocab_size=vocab_size)
xb, yb = get_batch("train")
logits, loss = m(xb, yb)
print(logits.shape, loss, sep="\n")

# Reference point: -ln(1 / vocab_size) is the loss of a uniform guess over the
# vocabulary, which the loss of the randomly initialised model is compared against.
print(-torch.log(torch.tensor(1 / vocab_size)))

# Inference: start from a single token with id 0 and sample 100 more
start = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(start, num_examples=100)
print(decode(generated[0].tolist()))

torch.Size([32, 65])
tensor(4.7051, grad_fn=<NllLossBackward0>)
tensor(4.1744)

pxMHoRFJa!JKmRjtXzfN:CERiC-KuDHoiMIB!o3QHN
,SPyiFhRKuxZOMsB-ZJhsucL:wfzLSPyZalylgQUEU cLq,SqV&vW:hhi


In [40]:
# Training
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


def train(epoch: int):
    """Run `epoch` optimisation steps on the global model `m`.

    Despite its name, `epoch` counts mini-batch steps: every iteration draws
    one fresh batch with `get_batch("train")` and applies one `optimizer` step.

    Args:
        epoch: number of optimiser steps to run; must be at least 1.

    Returns:
        The loss of the final mini-batch as a Python float.

    Raises:
        ValueError: if `epoch` is smaller than 1, since no loss would exist to report.
    """
    if epoch < 1:
        raise ValueError(f"epoch must be >= 1, got {epoch}")

    for _step in range(epoch):
        xb, yb = get_batch("train")
        _logits, loss = m(xb, yb)
        # Clear the gradients left by the previous step before computing new ones
        optimizer.zero_grad(set_to_none=True)

        loss.backward()
        optimizer.step()

    return loss.item()


print(train(1000))

2.658010721206665


In [42]:
# We see the pattern looks similar to Shakespeare
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), num_examples=500)[0].tolist()))


AU,
Gowist Weano th t?EX&jFumat the of laticond ionedrt, cIce ce n thive'dZKIve, avind n, shadYZTH!abIJ&--cothere, m;
Tr myoman iss no t Fk cois The fy.Ebe hysoimay atode isevV:ZUAElanon ishuromV&Qxl?ughen,

TRT:
Pl, nopomngoreppurkRCKpo, me s ft he te s I'tun tav&
Bnd or o ft ges, IN.

WARSio. IVT:
SSivk!
Hand, fatrarcugeQG, he ame s mymenosos and DKpue whea abvpangmymy dvin
CJins wW:CJm bYWCkSise mye adettatlyxMOLWAscotheirilknes thethin w,S$

IUzord;Wthimy my by ithes etavery y: hal; t d.
NYw


# 4. Attention Intuition

We have a tensor (4, 8, 2)

In [14]:
import torch

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

Let's now compute the cumulative average along the T-dimension.

> (Causal Attention) We want each element to have information only from itself and from the elements that come before it.

We want the result to be the same size as the original tensor: (4, 8, 2),

where each element xbow[b_i, t_j, c_k] is avg(x[b_i, t_0 : t_j + 1, c_k]), i.e. the average along the T-dimension over positions t_0 through t_j inclusive.

In [24]:
# Naive way: explicit Python loops over batch and time
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

xbow = torch.zeros(B, T, C)  # bag-of-words: running mean of x along T
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]  # (t + 1, C): positions 0..t inclusive
        xbow[b, t] = xprev.mean(dim=0)  # (C,)

print(x[0])
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


Now let's leverage matrix multiplication to do it efficiently, without for-loops.

In [25]:
# If we have a matrix all of 1s.
import torch

torch.manual_seed(1337)

mat_A = torch.ones(3, 3)
mat_B = torch.randint(0, 9, (3, 2), dtype=torch.float32)
print(mat_A, mat_B, sep="\n")

# Each all-ones row of mat_A adds up every row of mat_B, so every row of the
# product is the column-wise sum of mat_B.
print(mat_A @ mat_B)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[2., 3.],
        [3., 3.],
        [7., 0.]])
tensor([[12.,  6.],
        [12.,  6.],
        [12.,  6.]])


In [26]:
# What if we only want to sum up the elements before self (inclusive)?
import torch

torch.manual_seed(1337)

# A lower-triangular mask: row i keeps columns 0..i and zeroes the rest
mat_A = torch.ones(3, 3).tril()
mat_B = torch.randint(0, 9, (3, 2), dtype=torch.float32)
print(mat_A, mat_B, sep="\n")

# Row i of the product is now the sum of rows 0..i of mat_B (a running sum).
print(mat_A @ mat_B)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[2., 3.],
        [3., 3.],
        [7., 0.]])
tensor([[ 2.,  3.],
        [ 5.,  6.],
        [12.,  6.]])


In [27]:
# To have the average, we just need to normalize the triangular matrix.
import torch

torch.manual_seed(1337)

mat_A = torch.ones(3, 3).tril()
# Divide every row by its own sum so the weights in each row add up to 1
mat_A = mat_A / mat_A.sum(dim=1, keepdim=True)
mat_B = torch.randint(0, 9, (3, 2), dtype=torch.float32)
print(mat_A, mat_B, sep="\n")

# Row i of the product is now the average of rows 0..i of mat_B.
print(mat_A @ mat_B)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 3.],
        [3., 3.],
        [7., 0.]])
tensor([[2.0000, 3.0000],
        [2.5000, 3.0000],
        [4.0000, 2.0000]])


In [28]:
import torch

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# Row t of `wei` spreads a weight of 1 / (t + 1) evenly over positions 0..t
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
print(wei)

# (T, T) @ (B, T, C) broadcasts over the batch dimension -> (B, T, C)
xbow2 = wei @ x
assert torch.allclose(xbow, xbow2)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


Now let's use softmax to replace the manual normalization of the triangular matrix.

Notice that softmax first applies `exp()`, so we need to apply `log()` to our triangular matrix beforehand.

The 1s become 0s, and the 0s become `-inf`.

In [29]:
# Softmax
import torch
import torch.nn.functional as F

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

tril = torch.ones(T, T).tril()
# Start from all-zero scores and block every future position with -inf
wei = torch.zeros(T, T).masked_fill(tril == 0, float("-inf"))
print(wei)
# exp(0) = 1 and exp(-inf) = 0, so softmax turns each row into a uniform
# average over the positions that were not masked
wei = wei.softmax(dim=-1)
print(wei)

xbow3 = wei @ x
torch.allclose(xbow3, xbow)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

Single Head Self-Attention

In [30]:
# Q, K, V
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# ==== single head self-attention ====
head_size = 16
# The layers are created in the order key, query, value: each random
# initialisation draws from the seeded generator, so the order fixes the weights.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# Raw affinities between every query and every key
wei = q @ k.transpose(-2, -1)  # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
# Causal mask: position t may only look at positions 0..t
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float("-inf"))
# Normalise each row into attention weights
wei = wei.softmax(dim=-1)

# Aggregate the projected values `v` instead of the raw inputs `x`
out = wei @ v

out.shape

torch.Size([4, 8, 16])

1. Attention is a way of communication. The token at its current place can communicate with other tokens before or after it.

`q` is the query, `k` is the key, `v` is the value.

Every element asks for something with its query and advertises what it has with its key. Once it receives attention from another element, it hands over its value.


2. Attention has no notion of space or position. Thus one needs to embed the position information before doing attention. In contrast, a CNN is space-aware by nature.


3. In the example above we only attend to the tokens that come earlier in the sequence. This is "causal attention", used in decoders: the current token does not talk to future tokens. In general there is no such restriction, and a token can attend to tokens both before and after it (encoder).


4. Tokens only communicate with other tokens in the same sequence (the same element of the batch). There is no communication / attention between tokens that belong to different sequences in a batch.


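5. In the example above we have q, k, v all from the tokens themselves, i.e. self-attention. In some cases (cross-attention), the queries still come from the tokens themselves, while the keys and values come from a separate source, e.g. the output of an encoder.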

6. Scale. A coefficient of `head_size ** -0.5` is required to keep the variance close to 1, so that no element is too picky after softmax.

In [31]:
import torch

k = torch.randn((B, T, head_size))
q = torch.randn((B, T, head_size))

In [32]:
# variance is close to 1
k.var()

tensor(1.1016)

In [35]:
# variance is close to 1
q.var()

tensor(1.0290)

In [34]:
# No scaling -> variance is big
wei = q @ k.transpose(-2, -1)
wei.var()

tensor(16.8750)

In [36]:
# After scaling, the variance is close to 1.
wei = q @ k.transpose(-2, -1) * head_size**-0.5
wei.var()

tensor(1.0547)

In [45]:
# Softmax, which `wei` goes through, amplifies the largest element, and the
# effect grows with the scale of its inputs: compare the scores as-is and times 10.
scores = torch.tensor([0.2, 0.3, 0.5, 0.1, -0.1])
print(F.softmax(scores, dim=-1))
print(F.softmax(scores * 10, dim=-1))

# No single element should dominate the attention weights, especially at initialization.

tensor([0.1961, 0.2167, 0.2646, 0.1774, 0.1452])
tensor([0.0413, 0.1122, 0.8292, 0.0152, 0.0021])
