In [40]:
import datasets
import argparse
import torch

In [41]:
# hyperparameters
block_size = 8
bach_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [42]:
# load data
dataset = datasets.load_from_disk("dataset")

In [43]:
# encoding and decoding  
chars = sorted(set("\n\n".join(dataset["train"]["abc notation"]+dataset["validation"]["abc notation"])))
vocab_size = len(chars) 
chat2index = {ch:i for i, ch in enumerate(chars)}
index2chat = {i:ch for i, ch in enumerate(chars)}
encode = lambda x: [chat2index[c] for c in x]
decode = lambda x: "".join([index2chat[c] for c in x])

In [44]:
# encode training data
# dataset = dataset.map(lambda x: {"abc notation": encode(x["abc notation"])})
text = "\n\n".join(dataset["train"]["abc notation"]+dataset["validation"]["abc notation"])
training_data = torch.tensor(encode(text))
print(len(training_data))

63170498


In [45]:
# encode validation data
# dataset = dataset.map(lambda x: {"abc notation": encode(x["abc notation"])})
text = "\n\n".join(dataset["validation"]["abc notation"]+dataset["validation"]["abc notation"])
validation_data = torch.tensor(encode(text))
print(len(validation_data))

1247506


In [46]:
# example of training samples
b_size = 8
x = training_data[:b_size]
y = training_data[1:b_size+1]
for t in range(1, b_size):
    context = x[:t]
    target = y[t]
    print(context, "->", target)

tensor([56]) -> tensor(17)
tensor([56, 26]) -> tensor(0)
tensor([56, 26, 17]) -> tensor(44)
tensor([56, 26, 17,  0]) -> tensor(26)
tensor([56, 26, 17,  0, 44]) -> tensor(17)
tensor([56, 26, 17,  0, 44, 26]) -> tensor(15)
tensor([56, 26, 17,  0, 44, 26, 17]) -> tensor(24)


In [47]:
# bach generator
def get_batch(split, block_size=8, bach_size=32):
    if split == "train":
        data = training_data
    elif split == "validation":
        data = validation_data
    else:
        raise ValueError("split must be 'train' or 'validation'")
    start_idx = torch.randint(0, data.size(0) - block_size, (bach_size,))
    x = torch.stack([data[idx:idx+block_size] for idx in start_idx]).to(device)
    y = torch.stack([data[idx+1:idx+block_size+1] for idx in start_idx]).to(device)
    return x, y

In [48]:
torch.manual_seed(42)
x, y = get_batch("train")
print(x.shape, y.shape)
print('input')
print(x)
print('target')
print(y)

for b in range(bach_size):
    for t in range(block_size):
        context = x[b, :t+1]
        target = y[b, t]
        print(context, "->", target)

torch.Size([32, 8]) torch.Size([32, 8])
input
tensor([[67,  1, 92,  1, 68, 71, 71,  1],
        [92,  3, 36,  3,  1, 68, 18, 36],
        [34,  1, 92,  1, 67, 68, 67,  1],
        [61, 68,  1, 67, 69,  1, 67, 69],
        [34, 33,  1, 39, 34, 37, 65,  1],
        [ 1, 92,  1,  0,  1, 68, 34, 39],
        [ 3, 39,  3,  1, 34, 69,  1, 68],
        [67,  1, 34,  1, 33,  1, 92,  1],
        [ 1, 14, 36, 30, 36,  1,  8, 37],
        [37, 66,  3,  1, 37, 18,  1, 92],
        [ 1, 65, 18,  1, 92,  1, 66, 19],
        [39,  1, 92,  1, 34, 67, 68,  1],
        [92,  1, 38, 33,  1, 33, 15, 34],
        [ 1, 92,  1, 37, 33, 33,  1, 67],
        [ 3, 33, 23,  3,  1, 65, 18, 65],
        [68, 70, 68,  1, 92,  1,  8, 68],
        [92,  1, 38, 39, 33, 38,  1, 36],
        [26,  1, 33, 34,  1, 33, 18,  1],
        [ 1,  8, 69, 65,  9,  8, 71, 66],
        [ 1, 68, 19,  1, 92,  1, 69, 18],
        [71, 34, 70, 34,  1, 69, 68, 34],
        [ 1, 92,  3, 35,  3,  1, 67, 68],
        [ 1, 37, 18,  1, 92,  

## Bigram Model

In [49]:
import torch.nn as nn
import torch.nn.functional as F

In [50]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramModel, self).__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        B, T, C = logits.size()
        
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        else:
                loss = None
        return logits, loss
    
    def generate(self, idx, n):
        for _ in range(n):
            logits = self.token_embedding_table(idx)
            next_idx = torch.multinomial(F.softmax(logits[:, -1], dim=1), 1)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

In [51]:
x, y = get_batch("train")
m = BigramModel(vocab_size)
m.to(device)
logits, loss = m(x, y)  
print(logits.shape)
print(loss)

torch.Size([32, 8, 95])
tensor(4.9579, device='cuda:0', grad_fn=<NllLossBackward0>)


In [52]:
idx = torch.zeros(1, 1).long().to(device)
g = m.generate(idx, 100)
decode(g[0].tolist())

'\nJ2)[ *)yLorct0s$/T@\'>3zx1G~SbJD2)[Rt)1y&cV*_1"6&sa.x}j7SDM<\'^E?$67R)V\\N/u?ruzu5|<zEWYTv>V<H#A{j[($Z5'

### training

In [53]:
with torch.no_grad():
    def estimate_loss(model, eval_iters, block_size):
        out = {}
        model.eval()
        for split in dataset:
            losses = torch.zeros(eval_iters)
            for i in range(eval_iters):
                x, y = get_batch(split, block_size)
                _, loss = model(x, y)
                losses[i] = loss.item()
            out[split] = losses.mean()
        model.train()
        return out

In [54]:
n_iters = 1000
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)

In [55]:
for step in range(n_iters):
    x, y = get_batch("train")
    optimizer.zero_grad()
    logits, loss = m(x, y)
    loss.backward()
    optimizer.step()
    if step % (n_iters//10) == 0:
        losses = estimate_loss(m, 100, block_size)
        print(f"step: {step}, train loss: {losses['train']:.3f}, validation loss: {losses['validation']:.3f}")

step: 0, train loss: 4.983, validation loss: 4.966
step: 100, train loss: 4.845, validation loss: 4.832
step: 200, train loss: 4.715, validation loss: 4.712
step: 300, train loss: 4.603, validation loss: 4.587
step: 400, train loss: 4.472, validation loss: 4.473
step: 500, train loss: 4.368, validation loss: 4.357
step: 600, train loss: 4.252, validation loss: 4.237
step: 700, train loss: 4.142, validation loss: 4.142
step: 800, train loss: 4.051, validation loss: 4.046
step: 900, train loss: 3.949, validation loss: 3.935


In [56]:
idx = torch.zeros(1, 1).long().to(device)
g = m.generate(idx, 500)

decode(g[0].tolist()).replace("\n", "")

'Iw1_b&qn,@6ilm("61i~IS#Tk1/2LqP{pocT#Nwry#tAG104fj[&RFa;idMVZ48^vS*I~fTQ\'4 /A2t:NJ2ecfV]by$][BcXNJIzx[c/]i#@^]2o4!?,WY$,dK8B,D|SZ&U^m>Q+$,RS+\'>NL2\'Bg5f?@3_Rb".]3fT:3+|le^+t= c}\\PkxW;&z94Bd\';ikxJDh[238e?3wxh3$0x1"A2{&eJlJWu;&YpV<$cuB,Z&8mMis0]}4?y(17k`d5$0C^X9~Z6i,W6y*ayG:1it=i.cK8BfgOU4,\'dnY.$;iV$]L@z8TLJK8f.\\G3835}*r7PL@0igHG@t@B=k=B"-2c2yD>,,\'GEE3YQYQOr[2c>Nukx}dNVCQ3.(W7A2/TQm&uDF-[1a`Cdn7N(xdYpL(cet[Ue)^:_~1HJH$t!j_Ue5ts*2s:(c^0Fsk_xNh2:?ja!gb&eZyE2afgQy+A<ETMi(8-BGF+(GF>Ae2\\@#?87"cr+teZ'

## LanguageModel

In [107]:
# Basic  transformer components
class SelfAttention(nn.Module):
    def __init__(self, embed_size, head_size, dropout=0.1):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.head_size = head_size
        
        self.keys = nn.Linear(self.embed_size, self.head_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.head_size, bias=False)
        self.value = nn.Linear(self.embed_size, self.head_size, bias=False)
        
        self.register_buffer('tril', torch.tril(torch.ones((x.shape[1], x.shape[1]))))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        k = self.keys(x) # B, block_size, head_size
        q = self.queries(x) # B, block_size, head_size
        v = self.value(x)


        wei = q @ k.transpose(-2, -1) * self.head_size**-0.5  # (B, block_size, head_size) @ (B, head_size, block_size) -> (B, block_size, block_size)
        wei = wei.masked_fill(self.tril == 0, float('-inf')) # B, block_size, block_size
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ v
        return out
 
class Mlp(nn.Module):
    def __init__(self, embed_size, mlp_size, dropout=0.1):
        super(Mlp, self).__init__()
        self.embed_size = embed_size
        self.mlp_size = mlp_size
        self.mlp = nn.Sequential(
            nn.Linear(embed_size, mlp_size),
            nn.ReLU(),
            nn.Linear(mlp_size, embed_size),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.mlp(x)
    
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, head_size, n_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.head_size = head_size
        self.num_heads = n_heads
            
        self.attentions = nn.ModuleList([SelfAttention(embed_size, head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads * head_size, embed_size)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x):
        out = torch.cat([attn(x) for attn in self.attentions], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [112]:
# Block
class Block(nn.Module): 
    def __init__(self, embed_size, mlp_size, n_heads, dropout=0.1):
        super(Block, self).__init__()
        self.embed_size = embed_size
        self.head_size = embed_size // n_heads
        self.mlp_size = mlp_size
        self.n_heads = n_heads
        
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)
        self.mha = MultiHeadAttention(embed_size, self.head_size, n_heads, dropout)
        self.mlp = Mlp(embed_size, mlp_size, dropout)
        
    def forward(self, x):
        out = self.mha(x) + self.ln1(x)
        out = self.mlp(self.ln2(out)) + out
        return out

In [113]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_blocks=8, block_size=8, n_heads=8, dropout=0.1):
        super(LanguageModel, self).__init__()
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)
        self.positional_embedding_table = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.Sequential(*
                                    [Block(embedding_dim, embedding_dim*4, n_heads, dropout) for _ in range(n_blocks)],
                                    nn.LayerNorm(embedding_dim)
                                    )
        self.lm_head = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, idx, targets=None):
        token_embeddings = self.token_embedding_table(idx) # B, T, C
        positional_embeddings = self.positional_embedding_table(torch.arange(block_size).to(device) )# T, C
        x = token_embeddings + positional_embeddings # 
        x = self.blocks(x)
        logits = self.lm_head(x)
        
        B, T, C = logits.size()
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        else:
                loss = None
                
        return logits, loss
    
    def generate(self, idx, n):
        for _ in range(n):
            logits, _ = self(idx[:, -self.block_size:])
            next_idx = torch.multinomial(F.softmax(logits[:, -1], dim=1), 1)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

In [114]:
embedding_dim = 384
n_heads = 6
head_size = 32//n_heads 
block_size = 256
bach_size = 64
n_iters = 10000
lr = 3e-4
n_blocks = 6    
dropout = 0.2
m = LanguageModel(vocab_size=vocab_size,
                  embedding_dim=embedding_dim,
                  block_size=block_size,
                  n_heads=n_heads,
                  dropout=dropout,)
m.to(device)

optimizer = torch.optim.Adam(m.parameters(), lr=lr)

In [115]:
for step in range(n_iters+1):
    x, y = get_batch("train", block_size, bach_size)
    optimizer.zero_grad()
    logits, loss = m(x, y)
    loss.backward()
    optimizer.step()
    if step % (n_iters//10) == 0:
        losses = estimate_loss(m, 100, block_size=block_size)
        print(f"step: {step}, train loss: {losses['train']:.3f}, validation loss: {losses['validation']:.3f}")

step: 0, train loss: 4.291, validation loss: 4.268
step: 1000, train loss: 1.057, validation loss: 1.051
step: 2000, train loss: 0.920, validation loss: 0.924
step: 3000, train loss: 0.866, validation loss: 0.862
step: 4000, train loss: 0.834, validation loss: 0.841
step: 5000, train loss: 0.812, validation loss: 0.813
step: 6000, train loss: 0.797, validation loss: 0.798
step: 7000, train loss: 0.782, validation loss: 0.781
step: 8000, train loss: 0.769, validation loss: 0.775
step: 9000, train loss: 0.757, validation loss: 0.764
step: 10000, train loss: 0.749, validation loss: 0.754


In [118]:
def generate(model, prompt, n):
    idx = torch.tensor(encode(prompt)).unsqueeze(0).to(device)
    idx = model.generate(idx, n)
    return decode(idx[0].tolist())

In [119]:
prompt = """
L:1/8
Q:1/8=180
M:3/8
K:C
"""
print(generate(m, prompt, 128))

RuntimeError: The size of tensor a (27) must match the size of tensor b (256) at non-singleton dimension 1