In [None]:
import datasets
import argparse
import torch
import tqdm

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


# DATA

## Load data

In [3]:
# load data
dataset = datasets.load_from_disk("dataset")

## Tokenizer Encoding/Decoding

In [63]:
# encoding and decoding  
chars = sorted(set("\n\n".join(dataset["train"]["abc notation"]+dataset["validation"]["abc notation"])))
vocab_size = len(chars) 
print(f"vocab_size: {vocab_size}")
print(f"chars: {chars}")
chat2index = {ch:i for i, ch in enumerate(chars)}
index2chat = {i:ch for i, ch in enumerate(chars)}
encode = lambda x: [chat2index[c] for c in x]
decode = lambda x: "".join([index2chat[c] for c in x])

vocab_size: 95
chars: ['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']


## Train/Validation data envoding

In [5]:
# encode training data
# dataset = dataset.map(lambda x: {"abc notation": encode(x["abc notation"])})
text = "\n\n".join(dataset["train"]["abc notation"]+dataset["validation"]["abc notation"])
training_data = torch.tensor(encode(text))
print(len(training_data))

63170498


In [6]:
# encode validation data
# dataset = dataset.map(lambda x: {"abc notation": encode(x["abc notation"])})
text = "\n\n".join(dataset["validation"]["abc notation"]+dataset["validation"]["abc notation"])
validation_data = torch.tensor(encode(text))
print(len(validation_data))

1247506


## Data batch

In [8]:
# bach generator
def get_batch(split, block_size=8, bach_size=32):
    if split == "train":
        data = training_data
    elif split == "validation":
        data = validation_data
    else:
        raise ValueError("split must be 'train' or 'validation'")
    start_idx = torch.randint(0, data.size(0) - block_size, (bach_size,))
    x = torch.stack([data[idx:idx+block_size] for idx in start_idx]).to(device)
    y = torch.stack([data[idx+1:idx+block_size+1] for idx in start_idx]).to(device)
    return x, y

In [62]:
torch.manual_seed(42)
bach_size = 1
block_size = 48
x, y = get_batch("train", block_size=block_size, bach_size=bach_size)
for b in range(bach_size):
    for t in range(block_size):
        context = x[b, :t+1]
        target = y[b, t]
    print(context.tolist(), "->", target.item())
    print(decode(context.tolist()), "->", decode([target.item()]))

[18, 1, 34, 1, 34, 33, 34, 1, 92, 1, 68, 69, 70, 1, 71, 70, 71, 1, 92, 1, 69, 65, 65, 1, 65, 18, 1, 71, 1, 92, 1, 70, 68, 68, 1, 68, 18, 1, 67, 1, 92, 1, 34, 18, 1, 34, 1, 34] -> 33
2 B BAB | def gfg | eaa a2 g | fdd d2 c | B2 B B -> A


## Bigram Model

In [10]:
import torch.nn as nn
import torch.nn.functional as F

In [11]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramModel, self).__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        B, T, C = logits.size()
        
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        else:
                loss = None
        return logits, loss
    
    def generate(self, idx, n):
        for _ in range(n):
            logits = self.token_embedding_table(idx)
            next_idx = torch.multinomial(F.softmax(logits[:, -1], dim=1), 1)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

In [12]:
x, y = get_batch("train")
m = BigramModel(vocab_size)
m.to(device)
logits, loss = m(x, y)  
print(logits.shape)
print(loss)

torch.Size([32, 8, 95])
tensor(4.9579, device='cuda:0', grad_fn=<NllLossBackward0>)


In [13]:
idx = torch.zeros(1, 1).long().to(device)
g = m.generate(idx, 100)
decode(g[0].tolist())

'\nJ2)[ *)yLorct0s$/T@\'>3zx1G~SbJD2)[Rt)1y&cV*_1"6&sa.x}j7SDM<\'^E?$67R)V\\N/u?ruzu5|<zEWYTv>V<H#A{j[($Z5'

### training

In [14]:
with torch.no_grad():
    def estimate_loss(model, eval_iters, block_size):
        out = {}
        model.eval()
        for split in dataset:
            losses = torch.zeros(eval_iters)
            for i in range(eval_iters):
                x, y = get_batch(split, block_size)
                _, loss = model(x, y)
                losses[i] = loss.item()
            out[split] = losses.mean()
        model.train()
        return out

In [15]:
n_iters = 1000
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)

In [16]:
for step in range(n_iters):
    x, y = get_batch("train")
    optimizer.zero_grad()
    logits, loss = m(x, y)
    loss.backward()
    optimizer.step()
    if step % (n_iters//10) == 0:
        losses = estimate_loss(m, 100, block_size)
        print(f"step: {step}, train loss: {losses['train']:.3f}, validation loss: {losses['validation']:.3f}")

step: 0, train loss: 4.983, validation loss: 4.966
step: 100, train loss: 4.845, validation loss: 4.832
step: 200, train loss: 4.715, validation loss: 4.712
step: 300, train loss: 4.603, validation loss: 4.587
step: 400, train loss: 4.472, validation loss: 4.473
step: 500, train loss: 4.368, validation loss: 4.357
step: 600, train loss: 4.252, validation loss: 4.237
step: 700, train loss: 4.142, validation loss: 4.142
step: 800, train loss: 4.051, validation loss: 4.046
step: 900, train loss: 3.949, validation loss: 3.935


In [17]:
idx = torch.zeros(1, 1).long().to(device)
g = m.generate(idx, 500)

decode(g[0].tolist()).replace("\n", "")

'Iw1_b&qn,@6ilm("61i~IS#Tk1/2LqP{pocT#Nwry#tAG104fj[&RFa;idMVZ48^vS*I~fTQ\'4 /A2t:NJ2ecfV]by$][BcXNJIzx[c/]i#@^]2o4!?,WY$,dK8B,D|SZ&U^m>Q+$,RS+\'>NL2\'Bg5f?@3_Rb".]3fT:3+|le^+t= c}\\PkxW;&z94Bd\';ikxJDh[238e?3wxh3$0x1"A2{&eJlJWu;&YpV<$cuB,Z&8mMis0]}4?y(17k`d5$0C^X9~Z6i,W6y*ayG:1it=i.cK8BfgOU4,\'dnY.$;iV$]L@z8TLJK8f.\\G3835}*r7PL@0igHG@t@B=k=B"-2c2yD>,,\'GEE3YQYQOr[2c>Nukx}dNVCQ3.(W7A2/TQm&uDF-[1a`Cdn7N(xdYpL(cet[Ue)^:_~1HJH$t!j_Ue5ts*2s:(c^0Fsk_xNh2:?ja!gb&eZyE2afgQy+A<ETMi(8-BGF+(GF>Ae2\\@#?87"cr+teZ'

## LanguageModel

In [18]:
# Basic  transformer components
class SelfAttention(nn.Module):
    def __init__(self, embed_size, head_size, dropout=0.1, block_size=8):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.head_size = head_size
        
        self.keys = nn.Linear(self.embed_size, self.head_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.head_size, bias=False)
        self.value = nn.Linear(self.embed_size, self.head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
        k = self.keys(x) # B, block_size, head_size
        q = self.queries(x) # B, block_size, head_size
        v = self.value(x)

        tril = torch.tril(torch.ones(T, T)).to(x.device)    
        wei = q @ k.transpose(-2, -1) * self.head_size**-0.5  # (B, block_size, head_size) @ (B, head_size, block_size) -> (B, block_size, block_size)
        wei = wei.masked_fill(tril == 0, float('-inf')) # B, block_size, block_size
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ v
        return out
 
class Mlp(nn.Module):
    def __init__(self, embed_size, mlp_size, dropout=0.1):
        super(Mlp, self).__init__()
        self.embed_size = embed_size
        self.mlp_size = mlp_size
        self.mlp = nn.Sequential(
            nn.Linear(embed_size, mlp_size),
            nn.ReLU(),
            nn.Linear(mlp_size, embed_size),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.mlp(x)
    
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, head_size, n_heads, dropout=0.1, block_size=8):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.head_size = head_size
        self.num_heads = n_heads
        self.attentions = nn.ModuleList([SelfAttention(embed_size, head_size, block_size=block_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads * head_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([attn(x) for attn in self.attentions], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [19]:
# Block
class Block(nn.Module): 
    def __init__(self, embed_size, mlp_size, n_heads, dropout=0.1, block_size=8):
        super(Block, self).__init__()
        self.embed_size = embed_size
        self.head_size = embed_size // n_heads
        self.mlp_size = mlp_size
        self.n_heads = n_heads
        
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)
        self.mha = MultiHeadAttention(embed_size, self.head_size, n_heads, dropout, block_size)
        self.mlp = Mlp(embed_size, mlp_size, dropout)
        
    def forward(self, x):
        out = self.mha(x) + self.ln1(x)
        out = self.mlp(self.ln2(out)) + out
        return out

In [20]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_blocks=8, block_size=8, n_heads=8, dropout=0.1):
        super(LanguageModel, self).__init__()
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)
        self.positional_embedding_table = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.Sequential(*
                                    [Block(embedding_dim,
                                           embedding_dim*4,
                                           n_heads, dropout,
                                           block_size) for _ in range(n_blocks)],
                                    nn.LayerNorm(embedding_dim)
                                    )
        self.lm_head = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, x, targets=None):
        B, T = x.shape
        token_embeddings = self.token_embedding_table(x) # B, T, C
        positional_embeddings = self.positional_embedding_table(torch.arange(T).to(device) )# T, C
        x = token_embeddings + positional_embeddings # 
        x = self.blocks(x)
        logits = self.lm_head(x)
        
        B, T, C = logits.size()
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        else:
                loss = None
                
        return logits, loss
    
    def generate(self, idx, n):
        for _ in range(n):
            logits, _ = self(idx[:, -self.block_size:])
            next_idx = torch.multinomial(F.softmax(logits[:, -1], dim=1), 1)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

    def save_model(self, path):
        torch.save(self.state_dict(), path)
        
    def load_model(self, path):
        self.load_state_dict(torch.load(path))
          

In [21]:
embedding_dim = 384 
n_heads = 6
head_size = 32//n_heads 
block_size = 256 # context window size
bach_size = 64
n_iters = 1000
lr = 3e-4
n_blocks = 6    
dropout = 0.2
m = LanguageModel(vocab_size=vocab_size,
                  embedding_dim=embedding_dim,
                  block_size=block_size,
                  n_heads=n_heads,
                  dropout=dropout,)
m.to(device)
optimizer = torch.optim.Adam(m.parameters(), lr=lr)

In [22]:
import tqdm
with tqdm(total=len(n_iters+1), desc=f"Iter {epoch + 1}", unit="Iterations") as p_bar:
    for step in range(n_iters+1):
        x, y = get_batch("train", block_size, bach_size)
        optimizer.zero_grad()
        logits, loss = m(x, y)
        loss.backward()
        optimizer.step()
        if step % (n_iters//10) == 0 and step :
            losses = estimate_loss(m, 100, block_size=block_size)
            print(f"step: {step}, train loss: {losses['train']:.3f}, validation loss: {losses['validation']:.3f}")
        p_bar.update(1)
m.save_model(r"models/model.pth")

step: 100, train loss: 2.088, validation loss: 2.069
step: 200, train loss: 1.710, validation loss: 1.694
step: 300, train loss: 1.516, validation loss: 1.521
step: 400, train loss: 1.425, validation loss: 1.413
step: 500, train loss: 1.343, validation loss: 1.340
step: 600, train loss: 1.272, validation loss: 1.261
step: 700, train loss: 1.195, validation loss: 1.196
step: 800, train loss: 1.143, validation loss: 1.134
step: 900, train loss: 1.103, validation loss: 1.101
step: 1000, train loss: 1.065, validation loss: 1.077


In [25]:
def generate(model, prompt, n):
    encoded_prompt = torch.tensor(encode(prompt)).unsqueeze(0).to(device)
    print(encoded_prompt)
    out = model.generate(encoded_prompt, n)
    return decode(out[0].tolist())

In [27]:
prompt = """
L:1/8
Q:1/8=180
M:3/8
K:C
"""
m = LanguageModel(vocab_size=vocab_size,
                  embedding_dim=embedding_dim,
                  block_size=block_size,
                  n_heads=n_heads,
                  dropout=dropout)
# m.load_model(r"models/model.pth")
m.to(device)
print(generate(m, prompt, 128))

tensor([[ 0, 44, 26, 17, 15, 24,  0, 49, 26, 17, 15, 24, 29, 17, 24, 16,  0, 45,
         26, 19, 15, 24,  0, 43, 26, 35,  0]], device='cuda:0')

L:1/8
Q:1/8=180
M:3/8
K:C
jQPN3xf7ufQ1Xt`dzuC>
<I37";]#ap.}.x?JQCB}C'O: 1rRIZVqu$C:MTacts"xfa4_)]"|ke4#4f|G}O8qjU<rW7:HGG:4RC=&#yXH<:cPmy4<lL+z$9,Yyc*t"6@
