<a href="https://colab.research.google.com/github/alako/gpt/blob/main/poems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare data

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

### Data cleaning

In [105]:
with open('polish_poems.txt', 'r', encoding='utf-8') as f:
    text = f.read()


# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [106]:
print(chars)
print(vocab_size)

['\t', '\n', ' ', '!', '"', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x7f', '\x84', '\x92', '\x95', '\x96', '\x9f', '§', '\xad', '°', '·', '¼', 'Ä', 'É', 'Ï', 'Ó', '×', 'ß', 'à', 'á', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ù', 'ú', 'û', 'ü', 'ý', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'ċ', 'Č', 'č', 'ė', 'Ę', 'ę', 'ě', 'ĺ', 'Ľ', 'Ł', 'ł', 'Ń', 'ń', 'Ň', 'œ', 'ř', 'Ś', 'ś', 'ş', 'š', 'Ť', 'ť', 'ū', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ǃ', 'ə', 'ˇ', '̨', 'Ε', 'ά', 'ή', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ο', 'ρ', 'ς', 'τ', 'υ', 'χ', 'ω', 'А', 'В', 'О', 'С', 'Т', 'Ф', 'а', 'б', 'в', 'е', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 

In [107]:
counts = {c: 0 for c in chars}
for s in text:
  counts[s] += 1
sorted_counts = dict(sorted(counts.items(), key=lambda item: -item[1]))
print(sorted_counts)

{' ': 3802477, 'i': 1846927, 'a': 1720518, 'e': 1665847, 'o': 1374117, 'z': 1278910, 'n': 1001863, 'c': 896426, 'r': 889680, 's': 875419, 'w': 840943, 'y': 817039, '\n': 772895, 't': 688574, 'm': 687569, 'd': 672915, 'k': 643641, 'ł': 534310, ',': 530446, 'p': 473874, 'u': 439373, 'j': 430190, 'l': 389449, 'ę': 339355, 'b': 329027, 'g': 292754, 'ą': 285621, '.': 281838, 'h': 268230, 'ś': 212145, 'ż': 200165, 'ó': 179343, 'ć': 107779, '!': 84852, 'P': 71868, 'W': 70361, 'I': 67173, '—': 64218, 'N': 62745, 'A': 57070, 'T': 53165, 'ń': 51110, 'C': 50477, 'S': 47016, 'Z': 46334, 'J': 44806, 'B': 43638, 'O': 39246, 'K': 37590, ';': 34633, 'M': 33110, ':': 32205, 'D': 31244, '-': 30810, '?': 30710, 'G': 25994, 'f': 25180, 'ź': 23681, 'L': 23465, 'R': 18541, 'Ż': 14972, '"': 12303, 'é': 11392, 'U': 9809, 'Ś': 7996, 'H': 7235, 'E': 6759, '–': 4949, 'F': 3546, '1': 3347, 'Ł': 3333, '…': 2169, 'Y': 1846, '5': 1676, 'v': 1591, '2': 1514, '0': 1346, 'X': 1340, 'q': 1147, 'V': 1125, '3': 1057, '8':

In [108]:
garbage = [k for k, v in sorted_counts.items() if v<=200]
print(garbage)
garbage = {g: ' ' for g in garbage}

['\\', '\u2002', 'с', 'Q', '‑', 'к', 'р', '\t', 'у', 'ü', 'и', 'о', 'й', 'á', 'è', 'а', 'Ľ', 'ç', 'ö', 'à', 'ï', 'ä', '‒', '‘', '•', 'č', 'É', 'е', 'ô', '\x84', 'ò', 'Ť', 'š', 'н', 'â', 'ǃ', 'В', 'ь', 'ї', '×', 'ê', 'ñ', 'б', 'л', 'я', 'ť', 'С', 'Т', 'п', 'î', 'ă', 'Ž', '̨', '†', 'æ', 'ú', 'η', 'μ', 'А', '·', 'ν', 'ե', 'י', '✽', 'ë', 'ž', 'ε', 'ι', 'ο', 'ا', 'ر', 'ế', 'ệ', '\x95', '\x96', '°', 'Ä', 'í', 'ė', 'ě', 'œ', 'ə', 'γ', 'κ', 'λ', 'ς', 'τ', 'χ', 'О', 'м', 'ц', 'ա', 'հ', 'յ', 'ն', 'ր', 'س', 'ف', 'ی', 'ನ', 'ạ', '中', '文', '국', '어', '한', '\x7f', '\x92', '\x9f', '§', '¼', 'Ï', 'ß', 'ù', 'û', 'ý', 'ċ', 'Č', 'ĺ', 'Ň', 'ř', 'ş', 'ū', 'ˇ', 'Ε', 'ά', 'ή', 'β', 'δ', 'ζ', 'θ', 'ρ', 'υ', 'ω', 'Ф', 'в', 'ю', 'і', 'ґ', 'ִ', 'ב', 'ד', 'ע', 'ר', 'ש', 'ת', 'ب', 'ة', 'ع', 'ل', 'ي', 'অ', 'ম', 'য', 'স', '়', 'া', 'ী', 'ಕ', 'ಡ', '್', 'ọ', '\u2003', '―', '‹', '›', '日', '本', '語']


In [109]:
filtered = {key: value for key, value in sorted_counts.items() if value > 200}

In [110]:
print(sorted(list(filtered.keys())))
print(len(filtered))

['\n', ' ', '!', '"', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xad', 'Ó', 'é', 'ó', 'Ą', 'ą', 'Ć', 'ć', 'Ę', 'ę', 'Ł', 'ł', 'Ń', 'ń', 'Ś', 'ś', 'Ź', 'ź', 'Ż', 'ż', 'ſ', '–', '—', '’', '…']
97


In [111]:
mod = {'\xad': '-', "'" : '"', 'é': 'e', 'ſ': 's', '–': '-', '—': '-', '…': ' '}
mod.update(garbage)

In [112]:
c = [index for index, char in enumerate(text) if char == '…']
print(len(c))
c[:5]

2169


[93116, 93278, 93430, 93589, 94185]

In [113]:
for ix in c[:5]:
  print(f'*<*{text[ix-10:ix+10]}*>*')

*<* nadzieję!… nie tę l*>*
*<* nadzieję!… nie tę c*>*
*<*my odwagę!… nie tę j*>*
*<*my odwagę!… nie tę t*>*
*<*się zbroić…
Lecz nie*>*


In [114]:
text = ''.join([mod[c] if c in mod else c for c in text])
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '"', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ó', 'ó', 'Ą', 'ą', 'Ć', 'ć', 'Ę', 'ę', 'Ł', 'ł', 'Ń', 'ń', 'Ś', 'ś', 'Ź', 'ź', 'Ż', 'ż', '’']
90


### Prepare training set

In [115]:
print(text[:100])

Sukienkę miała w paseczki 
Perkalikową, 
We włosach polne kwiateczki, 
Twarzyczkę zawsze różową; 
Ni


In [116]:
len(text)

27082590

In [117]:
import re
text = re.sub(' +', ' ', text)
print(text[:100])
print(len(text))

Sukienkę miała w paseczki 
Perkalikową, 
We włosach polne kwiateczki, 
Twarzyczkę zawsze różową; 
Ni
27080306


In [118]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [119]:
train_data[:10]

tensor([37, 65, 55, 53, 49, 58, 55, 78,  1, 57])

## Model definition

In [125]:
# hyperparameters
batch_size = 64 # 16 # how many independent sequences will we process in parallel?
block_size = 256 #32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500 #100
learning_rate = 3e-4 #1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384 #64
n_head = 6 #4
n_layer = 6 #4
dropout = 0.2 # 0.0
# batch_size = 16 # how many independent sequences will we process in parallel?
# block_size = 32 # what is the maximum context length for predictions?
# max_iters = 5000
# eval_interval = 100
# learning_rate = 1e-3
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embd = 64
# n_head = 4
# n_layer = 4
# dropout = 0.0
# ------------

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval() # set to eval mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Heads(nn.Module):
    """ multiple heads of self-attention """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.keys = nn.Linear(n_embd, num_heads * head_size, bias=False)
        self.queries = nn.Linear(n_embd, num_heads * head_size, bias=False)
        self.values = nn.Linear(n_embd, num_heads * head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout_attn = nn.Dropout(dropout)
        self.dropout_resid = nn.Dropout(dropout)
        self.proj = nn.Linear(n_embd, n_embd) # needed for residual connection (to come back)
        self.nh = num_heads
        self.hs = head_size

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # C = nh*hs
        k = self.keys(x)   # (B, T, nh*hs)
        q = self.queries(x)   # (B, T, nh*hs)
        v = self.values(x)   # (B, T, nh*hs)
        k = k.view(B, T, self.nh, self.hs).transpose(1,2) # (B, nh, T, hs)
        q = q.view(B, T, self.nh, self.hs).transpose(1,2) # (B, nh, T, hs)
        v = v.view(B, T, self.nh, self.hs).transpose(1,2) # (B, nh, T, hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, nh, T, T)
        wei = F.softmax(wei, dim=-1) # (B, nh, T, T)
        wei = self.dropout_attn(wei)
        # perform the weighted aggregation of the values
        out = wei @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs) [16, 4, 32, 16]
        out = out.transpose(1,2).contiguous().view(B,T,self.nh*self.hs) # (B, nh, T, hs)->(B, T, nh, hs)->(B, T, nh*hs)
        out = self.dropout_resid(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        # self.sa = MultiHeadAttention(n_head, head_size)
        self.sa = Heads(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # residual connections x+ --> allows to propagate gradient!
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

## Train model

In [126]:
model = GPTModel()
model = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

10.808154 M parameters


In [127]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.6473, val loss 4.6498
step 500: train loss 2.3785, val loss 2.4001
step 1000: train loss 2.0576, val loss 2.0747
step 1500: train loss 1.8914, val loss 1.8974
step 2000: train loss 1.7897, val loss 1.7912
step 2500: train loss 1.7212, val loss 1.7233
step 3000: train loss 1.6686, val loss 1.6813
step 3500: train loss 1.6365, val loss 1.6467
step 4000: train loss 1.6150, val loss 1.6311
step 4500: train loss 1.5902, val loss 1.6062
step 4999: train loss 1.5740, val loss 1.5938

Stać ze straszki mażeńszego umdem, wtedy sto mama w suknosiu, głosił snu tym, co pszczonopodołując mu wiatr, lub tam dziś to trzymasz wy cendare: 
Słów sam, krew do prawdy było na boskim polu chłopię, nie spodziane matre.
W cichym ku sztukach miorzni drżał, gdy światom i strawy kurze po niemowie, zowieszakami Lewie take, 
Z uderzew wichle pchnął o rybach, a naszym czołem towarem, 
Iż się w im dzieło, o piersi! I izdą teraz strawy ubliska rumachy fal mowy krótkiem?
Cóż to Francelby do mojej pę