In [2]:
import torch

In [70]:
import torch.nn as nn
from torch.nn import functional as F

# Worked example: one head of causal (masked) self-attention

# Tensor axes used below:
#   batch    - number of samples processed simultaneously in one forward pass
#   time     - sequence length, i.e. number of time steps (tokens) in the input sequence
#   channels - embedding size, i.e. number of features (dimensionality) of each token
#              representation; corresponds to the size of the hidden layers in the transformer
batch_size, seq_len, embedding_dim = 4, 8, 32
x = torch.randn(batch_size, seq_len, embedding_dim)

# a single head performing self-attention: three bias-free linear projections of x
head_size = 16
key = nn.Linear(embedding_dim, head_size, bias=False)
query = nn.Linear(embedding_dim, head_size, bias=False)
value = nn.Linear(embedding_dim, head_size, bias=False)
k, q = key(x), query(x)  # each [4, 8, 16]

# Q @ K^T, scaled by 1/sqrt(head_size) -> [4, 8, 8]
raw_attention = (q @ k.transpose(-2, -1)) * head_size**-0.5

# causal mask: positions above the diagonal (later tokens) are set to -inf
# so that later tokens cannot influence earlier ones
tril = torch.tril(torch.ones(seq_len, seq_len))
masked_attention = raw_attention.masked_fill(tril == 0, float("-inf"))

# softmax over the key axis turns each row of scores into weights that sum to 1
attention_weights = masked_attention.softmax(dim=-1)

# weighted aggregation of the value vectors -> [4, 8, 16]
v = value(x)
attention = attention_weights @ v

attention.shape

torch.Size([4, 8, 16])

In [217]:
# --- hyperparameters ---

# batching
batch_size = 16  # number of independent sequences processed in parallel
block_size = 32  # maximum context length used for predictions

# model size
n_embd = 64    # embedding dimension
n_head = 4     # attention heads per transformer block
n_layer = 4    # number of transformer blocks
dropout = 0.0  # dropout probability handed to nn.Dropout

# optimisation
learning_rate = 1e-3
max_iterations = 50000  # number of steps in the training loop
device = "cuda" if torch.cuda.is_available() else "cpu"

# evaluation
evaluation_interval = 1000   # evaluate every this many training steps
evaluation_iterations = 200  # batches averaged per split for one loss estimate

# seed the default generator last so the cell still displays the Generator object
torch.manual_seed(1337)

<torch._C.Generator at 0x7fb6040d45f0>

In [66]:
# read the whole training corpus (UTF-8 text) into one string
with open("input.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

In [67]:
# Build a character-level tokenizer: every distinct character of `text` is one token.
chars = sorted(set(text))
vocab_size = len(chars)

# a token id is the position of its character in the sorted vocabulary
char_to_token = {char: token for token, char in enumerate(chars)}
token_to_char = dict(enumerate(chars))


def encode(word):
    """Convert a string into a list of integer token ids.

    Raises:
        KeyError: if `word` contains a character that does not occur in `text`.
    """
    return [char_to_token[char] for char in word]


def decode(tokens):
    """Convert an iterable of token ids into a list of single characters.

    The result is a list, not a string; use "".join(...) on it to get text.

    Raises:
        KeyError: if a token id is not in the vocabulary.
    """
    return [token_to_char[token] for token in tokens]

In [68]:
# round-trip check: decoding the encoded string gives back its characters
sample_tokens = encode("test")
decode(sample_tokens)

['t', 'e', 's', 't']

In [71]:
# tokenize the full corpus into a single 1-D tensor of int64 token ids
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data.size())


torch.Size([1115394])


In [73]:
# first 90% of the tokens are used for training, the remaining 10% are held out
n = int(0.9 * len(data))
data_train, data_test = data[:n], data[n:]

print(len(data_train), len(data_test))

1003854 111540


In [172]:
# generate batches of data
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: "train" reads from `data_train`; any other value reads from `data_test`.

    Returns:
        A pair (x, y) of tensors of shape (batch_size, block_size) on `device`,
        where y is x shifted one position to the right in the source data.
    """
    source = data_train if split == "train" else data_test
    # one random start offset per sequence; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        window = source[start:start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [173]:
class AttentionHead(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention.

    Reads the module-level settings `n_embd` (input feature size), `block_size`
    (size of the causal mask, i.e. the longest supported sequence) and `dropout`
    (dropout probability applied to the attention weights).

    Args:
        head_size: output feature size of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        # remembered so forward() scales by this head's own size
        # rather than by whatever module-level `head_size` happens to exist
        self.head_size = head_size
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # lower-triangular mask stored as a buffer: it moves with the module
        # across devices but is not a trainable parameter
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (batch_size, seq_len, n_embd), seq_len <= block_size.

        Returns:
            Tensor of shape (batch_size, seq_len, head_size).
        """
        _, seq_len, _ = x.shape
        k = self.key(x)    # (batch_size, seq_len, head_size)
        q = self.query(x)  # (batch_size, seq_len, head_size)

        # attention scores: Q @ K^T scaled by 1/sqrt(head_size)
        raw_attention = q @ k.transpose(-2, -1)  # (batch_size, seq_len, seq_len)
        raw_attention = raw_attention * self.head_size**-0.5

        # mask future positions with -inf so later tokens cannot influence earlier ones
        masked_attention = raw_attention.masked_fill(
            self.tril[:seq_len, :seq_len] == 0, float("-inf")
        )

        # normalise each row of scores into weights that sum to 1
        attention_weights = F.softmax(masked_attention, dim=-1)  # (batch_size, seq_len, seq_len)
        attention_weights = self.dropout(attention_weights)

        # weighted aggregation of the values
        v = self.value(x)  # (batch_size, seq_len, head_size)
        attention = attention_weights @ v  # (batch_size, seq_len, head_size)
        return attention

In [190]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, concatenated and projected.

    Reads the module-level settings `n_embd` (output feature size of the
    projection) and `dropout` (dropout probability applied after it).

    Args:
        num_heads: number of AttentionHead modules to run side by side.
        head_size: output feature size of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        # the concatenated head outputs are num_heads * head_size wide; this equals
        # n_embd only when n_embd is an exact multiple of num_heads, so size the
        # projection from the real input width instead of assuming n_embd
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x` and return the projected result.

        Returns a tensor whose last dimension is `n_embd`.
        """
        # combine all heads along the feature axis
        out = torch.cat([attentionHead(x) for attentionHead in self.heads], dim=-1)
        # mix the heads' outputs back into the embedding space
        out = self.dropout(self.proj(out))
        return out

In [191]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4 * n_embd, ReLU, project back to n_embd, dropout.

    The dropout probability is read from the module-level `dropout` setting.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP over the last dimension of `x`."""
        return self.net(x)

In [192]:
class Block(nn.Module):
    """Transformer block: self-attention (communication), then feed-forward (computation).

    Each sub-layer sees a layer-normalised copy of its input and its output is
    added back onto that input (residual connection).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [193]:
# Language model: token + position embeddings -> transformer blocks -> linear head (class is named BigramLanguageModel)

class BigramLanguageModel(nn.Module):
    """Token-level language model built from a stack of transformer blocks.

    Despite its name, the model is not a plain bigram lookup: token and position
    embeddings are passed through `n_layer` Block modules, a final LayerNorm and a
    linear head that produces next-token logits.

    Reads the module-level settings `vocab_size`, `n_embd`, `block_size`,
    `n_head` and `n_layer` at construction time.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one learned vector per position, so sequences are limited to block_size tokens
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (batch_size, seq_len) tensor of token ids, seq_len <= block_size.
            targets: optional (batch_size, seq_len) tensor of target token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (batch_size, seq_len, vocab_size) and loss is None. With targets,
            logits is flattened to (batch_size * seq_len, vocab_size) and loss is
            the cross-entropy over all positions.
        """
        batch_size, seq_len = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (batch_size, seq_len, n_embd)
        # build the positions on the same device as the input rather than on the
        # notebook-level `device`, so the model works wherever its inputs live
        positions = torch.arange(seq_len, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (seq_len, n_embd)
        x = tok_emb + pos_emb  # (batch_size, seq_len, n_embd)
        x = self.blocks(x)  # (batch_size, seq_len, n_embd)
        x = self.ln_f(x)  # (batch_size, seq_len, n_embd)
        logits = self.lm_head(x)  # (batch_size, seq_len, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, classes) logits and (N,) targets
            n_classes = logits.shape[-1]
            logits = logits.view(batch_size * seq_len, n_classes)
            targets = targets.view(batch_size * seq_len)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling yields integer ids, so no autograd graph is needed
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (batch_size, seq_len) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (batch_size, seq_len + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens (position table limit)
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # keep only the prediction for the last time step
            logits = logits[:, -1, :]  # (batch_size, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (batch_size, vocab_size)
            # sample the next token from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, seq_len + 1)
        return idx
        
        
       

In [194]:
# build the model and move it to the selected device;
# nn.Module.to returns the module itself, so `m` and `model` name the same object
model = BigramLanguageModel().to(device)
m = model

In [195]:
# print the number of parameters in the model
n_params = sum(p.numel() for p in m.parameters())
print(n_params, "parameters")

209729 parameters


In [196]:
# AdamW over every parameter of the model, using the configured learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [197]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the "train" and "val" splits.

    For each split, `evaluation_iterations` batches are drawn with `get_batch`
    and the resulting losses are averaged. The model is put in eval mode for the
    measurement and returned to whatever mode it was in before the call, even if
    an exception is raised part-way through.

    Returns:
        dict mapping "train" and "val" to a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ["train", "val"]:
            losses = torch.zeros(evaluation_iterations)
            for k in range(evaluation_iterations):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # restore the caller's mode instead of unconditionally switching to train
        model.train(was_training)
    return out

In [218]:
# Training loop. The counter is called `step` so that the builtin `iter`
# is not shadowed for the rest of the notebook.
for step in range(max_iterations):
    # every once in a while evaluate the loss on the train and val sets
    if step % evaluation_interval == 0 or step == max_iterations - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch("train")

    # forward pass, then clear old gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 1.6095, val loss 1.7770
step 1000: train loss 1.5246, val loss 1.7094
step 2000: train loss 1.5233, val loss 1.7182
step 3000: train loss 1.5094, val loss 1.6947
step 4000: train loss 1.5083, val loss 1.6935
step 5000: train loss 1.4989, val loss 1.6733
step 6000: train loss 1.4995, val loss 1.6758
step 7000: train loss 1.4798, val loss 1.6685
step 8000: train loss 1.4846, val loss 1.6762
step 9000: train loss 1.4719, val loss 1.6460
step 10000: train loss 1.4718, val loss 1.6702
step 11000: train loss 1.4496, val loss 1.6386
step 12000: train loss 1.4553, val loss 1.6546
step 13000: train loss 1.4534, val loss 1.6458
step 14000: train loss 1.4594, val loss 1.6474
step 15000: train loss 1.4449, val loss 1.6457
step 16000: train loss 1.4459, val loss 1.6498
step 17000: train loss 1.4396, val loss 1.6465
step 18000: train loss 1.4349, val loss 1.6425
step 19000: train loss 1.4217, val loss 1.6309
step 20000: train loss 1.4353, val loss 1.6442
step 21000: train loss 1.4

In [220]:
# prompt the model with "ROMEO:" (a batch containing one sequence) and print 2000 sampled tokens
prompt = "ROMEO:"
context = torch.tensor([[char_to_token[char] for char in prompt]], dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)[0].tolist()
print("".join(decode(generated)))

ROMEO:
Strude inform my poor.
Though I have real'd your mastrans; and knees you requirest
Have as many hurts of George,
Farewill's villain in the jury wind upon,
At chance thee, and hence to the denger
Lies are, and they and lick me: or God,
Commendance, tyll have word'st title, gives! Romeo!

COMINIUS:
I would indeed there is your lifest gayss
From a trait? well, therefore do with childimb and I
Happily now, and becomes of us!

QUEEN ELIZABETH:
I hard and my untroop,
Being and we shorter I mear-font he with themselves
from the bribed, my eands that have
will be indequer me to the died upon
Thereables in content, and I do long of with bed inderious dear.

First Murderer:
Not all day and behave! Lave your own Romeo's arm farewell'd.

Third Servingman:
Where will west, my care
Marcius ward, thy mirtion. For el there not should be long of the king to-n,
From it on. We hadeful song down,
By live, to those hath with the morning to wredged
For you; Buckingham, wearing that.

KING HENRY VI:
A