In [None]:
# Read the whole corpus into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# UTF-8 encode the text and expose every byte as an int; these are the base tokens.
tokens = text.encode("utf-8")
tokens = list(map(int, tokens))

# The first count is characters, the second is bytes. The byte count is larger
# whenever the text contains characters that need more than one UTF-8 byte.
print("lenght:", len(text))
print("---")
print("lenght:", len(tokens))

lenght: 6235521
---
lenght: 6252771


In [None]:
# Peek at the first ten byte values of the corpus.
print(tokens[:10])

[72, 97, 114, 114, 121, 32, 80, 111, 116, 116]


In [None]:
def get_stats(ids):
  """Count how often each adjacent pair of ids occurs.

  Returns a plain dict mapping (left, right) tuples to their number of
  occurrences; keys appear in order of first occurrence.
  """
  counts = {}
  for pos in range(len(ids) - 1):
    key = (ids[pos], ids[pos + 1])
    if key in counts:
      counts[key] += 1
    else:
      counts[key] = 1
  return counts

# Count adjacent byte pairs over the whole corpus and show only the most
# frequent ones; printing every distinct pair floods the cell output.
stats = get_stats(tokens)
top_pairs = sorted(((count, pair) for pair, count in stats.items()), reverse=True)
print(top_pairs[:20])

[(169077, (101, 32)), (141235, (32, 116)), (127879, (100, 32)), (125335, (104, 101)), (112851, (116, 104)), (98697, (116, 32)), (97390, (32, 97)), (93748, (105, 110)), (91570, (115, 32)), (81631, (32, 104)), (80536, (32, 115)), (74040, (101, 114)), (72483, (44, 32)), (67894, (32, 119)), (62771, (101, 100)), (59724, (114, 101)), (59049, (97, 110)), (55958, (121, 32)), (55674, (110, 32)), (54916, (111, 117)), (54104, (97, 114)), (53795, (110, 103)), (52472, (32, 111)), (50901, (111, 110)), (50087, (110, 100)), (46060, (104, 97)), (44962, (46, 32)), (44672, (114, 32)), (43480, (97, 116)), (42879, (111, 32)), (42334, (104, 105)), (42229, (116, 111)), (41920, (32, 98)), (40917, (103, 32)), (40032, (32, 105)), (39911, (111, 114)), (39150, (97, 115)), (37463, (101, 110)), (36638, (108, 101)), (34877, (32, 102)), (34843, (115, 116)), (33534, (101, 97)), (32845, (105, 115)), (32338, (105, 116)), (31526, (116, 101)), (31032, (32, 99)), (31031, (119, 97)), (29602, (108, 108)), (29552, (118, 101))

In [None]:
# Look up which character byte value 101 stands for.
chr(101)

'e'

In [None]:
# Most frequent pair: max() walks the dict keys and ranks them by their count.
top_pair = max(stats, key = stats.get)
top_pair

(101, 32)

In [None]:
def merge(ids, pair, idx):
  """Return a new list in which every occurrence of `pair` is replaced by `idx`.

  The scan runs left to right and matches do not overlap: after a replacement,
  matching resumes behind the two consumed elements.
  """
  total = len(ids)
  merged = []
  pos = 0
  while pos < total:
    is_match = pos + 1 < total and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
    merged.append(idx if is_match else ids[pos])
    pos += 2 if is_match else 1
  return merged

# Example
print("Example merge: ", merge([5,6,6,7,9,1], (6,7), 99))

# Replace the most frequent pair with 256, the first id above the byte range.
# Every replacement shortens the sequence by one element, so the printed
# difference is the number of replacements performed.
tokens2 = merge(tokens, top_pair, 256)
print("Lenght after one new encoding:", len(tokens2))
print("Lenght before new encoding: ", len(tokens))
print("Difference: ", len(tokens)-len(tokens2))

Example merge:  [5, 6, 99, 9, 1]
Lenght after one new encoding: 6083694
Lenght before new encoding:  6252771
Difference:  169077


In [None]:
vocab_size = 456 # target vocabulary size: 256 raw bytes + 200 learned merges
num_merges = vocab_size - 256
ids = list(tokens) # copy operation, the raw byte list stays untouched

# Greedy BPE training: repeatedly replace the most frequent adjacent pair with
# a fresh id. `merges` maps pair -> new id in the order the merges were learned.
merges = {}
for i in range(num_merges):
  stats = get_stats(ids)
  if not stats:
    break # fewer than two tokens left, nothing to merge
  pair = max(stats, key=stats.get)
  idx = 256 + i
  print(f"merging {pair} into new token -> {idx}")
  ids = merge(ids, pair, idx)
  merges[pair] = idx

merging (101, 32) into new token -> 256
merging (100, 32) into new token -> 257
merging (116, 104) into new token -> 258
merging (116, 32) into new token -> 259
merging (105, 110) into new token -> 260
merging (115, 32) into new token -> 261
merging (101, 114) into new token -> 262
merging (44, 32) into new token -> 263
merging (97, 110) into new token -> 264
merging (121, 32) into new token -> 265
merging (111, 117) into new token -> 266
merging (97, 114) into new token -> 267
merging (111, 110) into new token -> 268
merging (258, 256) into new token -> 269
merging (260, 103) into new token -> 270
merging (101, 257) into new token -> 271
merging (46, 32) into new token -> 272
merging (111, 32) into new token -> 273
merging (111, 114) into new token -> 274
merging (101, 110) into new token -> 275
merging (270, 32) into new token -> 276
merging (116, 273) into new token -> 277
merging (32, 115) into new token -> 278
merging (108, 108) into new token -> 279
merging (104, 105) into new to

In [None]:
# Sequence length before and after the merges; the ratio is the average number of raw bytes per token.
print(f"tokens length: {len(tokens)}\nids lenght: {len(ids)}\ncompression ratio: {len(tokens) / len(ids):.2f}X")

tokens length: 6252771
ids lenght: 3127569
compression ratio: 2.00X


In [None]:
# Token id -> bytes table. Ids below 256 are single bytes; every merged id is
# the concatenation of its two parts. This relies on `merges` being iterated in
# insertion order, so both parts already exist when a merged id is built.
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0,p1), idx in merges.items():
  vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids):
  """Turn a sequence of token ids back into a string.

  Every id is looked up in the module-level `vocab`; the byte strings are
  concatenated and decoded as UTF-8, with undecodable bytes replaced.
  """
  pieces = [vocab[token_id] for token_id in ids]
  return b"".join(pieces).decode("utf-8", errors="replace")

In [None]:
def encode(text):
  """Encode a string into token ids using the module-level `merges`.

  Starts from the UTF-8 bytes and keeps applying the earliest-learned merge
  (lowest id) that is present, until no learned pair remains.
  """
  ids = list(text.encode("utf-8"))
  never_learned = float("inf")
  while len(ids) >= 2:
    present = get_stats(ids)
    best = min(present, key=lambda p: merges.get(p, never_learned))
    if best not in merges:
      break # nothing else can be merged
    ids = merge(ids, best, merges[best])
  return ids

# Quick check of the encoder on a short string.
print(encode("Harry Potter!"))

[325, 80, 111, 397, 262, 33]


### Adding regex

In [None]:
import regex as re

# GPT-2 pre-tokenization pattern: a few English contractions, runs of letters,
# runs of digits, runs of other non-space characters (each of these three may
# start with one space), and runs of whitespace.
# \p{L} / \p{N} need the third-party `regex` module imported as `re` above.
gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

print(re.findall(gpt2pat, "Hello world, how are you?"))

['Hello', ' world', ',', ' how', ' are', ' you', '?']


In [None]:
from collections import Counter

# GPT-2 style tokenizer
# Pre-tokenization pattern, compiled once at cell level and used by tokenize().
gpt2pat = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
    re.UNICODE,
)

def tokenize(text):
    """Split `text` with `gpt2pat` and return one list of byte values per chunk.

    Prints the three intermediate representations (string chunks, UTF-8
    encoded chunks, integer lists) along the way.
    """
    chunks = gpt2pat.findall(text)
    print("Splitted text: ", chunks)
    encoded_chunks = []
    for chunk in chunks:
        encoded_chunks.append(chunk.encode("utf-8"))
    print("Tokenized text: ", encoded_chunks)
    byte_lists = [[int(value) for value in raw] for raw in encoded_chunks]
    print("Token list: ", byte_lists)
    return byte_lists

def get_stats(words):
    """Count adjacent symbol pairs inside each word.

    Pairs are only formed within a word, never across two neighbouring words.

    Args:
        words: iterable of sequences of symbol ids (one sequence per word).

    Returns:
        Counter mapping (a, b) to the number of occurrences summed over all words.
    """
    # Counter comes from the import at the top of this cell; no local import needed.
    pairs = Counter()
    for word in words:
        pairs.update(zip(word, word[1:]))
    return pairs

# Merge function: merges a given pair in one “word”
def merge_word(word, pair, new_sym):
    """Return a new list with every non-overlapping occurrence of `pair` in
    `word` replaced by `new_sym`, scanning left to right."""
    left, right = pair
    end = len(word)
    merged = []
    cursor = 0
    while cursor < end:
        symbol = word[cursor]
        if cursor + 1 < end and symbol == left and word[cursor + 1] == right:
            merged.append(new_sym)
            cursor += 2
            continue
        merged.append(symbol)
        cursor += 1
    return merged

# BPE
def learn_bpe(text, desired_merges, start_id=256):
    """Learn up to `desired_merges` BPE merges on the regex chunks of `text`.

    Pairs are counted and merged inside each chunk only. Learning stops early
    when no chunk holds a pair any more. Returns the merged chunks together
    with the pair -> new id map; ids are handed out from `start_id` upwards.
    """
    words = [list(chunk) for chunk in tokenize(text)]
    merges = {}

    for step in range(desired_merges):
        new_id = start_id + step
        pair_counts = get_stats(words)
        if not pair_counts:
            break           # nothing to merge
        top = max(pair_counts, key=pair_counts.get)
        merges[top] = new_id
        # rewrite every chunk with the newly learned symbol
        words = [merge_word(w, top, new_id) for w in words]
        print(f"Merge #{step+1}: {top} → {new_id}")

    return words, merges

# Example:
# 12 merges are requested; learn_bpe stops earlier if no pairs are left inside any chunk.
text1 = "Hello, world!"
final_words, merge_list = learn_bpe(text1, desired_merges=12)
print("Resulting words:", final_words)
print("Merge map:", merge_list)


Splitted text:  ['Hello', ',', ' world', '!']
Tokenized text:  [b'Hello', b',', b' world', b'!']
Token list:  [[72, 101, 108, 108, 111], [44], [32, 119, 111, 114, 108, 100], [33]]
Merge #1: (72, 101) → 256
Merge #2: (256, 108) → 257
Merge #3: (257, 108) → 258
Merge #4: (258, 111) → 259
Merge #5: (32, 119) → 260
Merge #6: (260, 111) → 261
Merge #7: (261, 114) → 262
Merge #8: (262, 108) → 263
Merge #9: (263, 100) → 264
Resulting words: [[259], [44], [264], [33]]
Merge map: {(72, 101): 256, (256, 108): 257, (257, 108): 258, (258, 111): 259, (32, 119): 260, (260, 111): 261, (261, 114): 262, (262, 108): 263, (263, 100): 264}


In [None]:
# Show the per-chunk byte lists for a short string.
print(tokenize("Hello, world"))

[[72, 101, 108, 108, 111], [44], [32, 119, 111, 114, 108, 100]]


### With encoder and decoder:

In [None]:
import regex as re
from collections import Counter

# — your GPT-2 style tokenizer & BPE learner ↓

# Pre-tokenization pattern used by tokenize() below; \p{L} / \p{N} require the `regex` module.
gpt2pat = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
    re.UNICODE,
)

def tokenize(text):
    """Split `text` with `gpt2pat` and return, for each chunk, the list of its
    UTF-8 byte values."""
    byte_lists = []
    for chunk in gpt2pat.findall(text):
        byte_lists.append(list(chunk.encode('utf-8')))
    return byte_lists

def get_stats(tokens):
    """Return a Counter of all adjacent (left, right) pairs in the flat
    sequence `tokens`."""
    return Counter(zip(tokens, tokens[1:]))

def merge_word(word, pair, new_sym):
    """Replace each non-overlapping occurrence of `pair` in `word` by `new_sym`.

    Prints one log line per call and returns the rewritten sequence as a new
    list.
    """
    left, right = pair
    print(f"   merging pair: {pair} → {new_sym!r}")
    merged = []
    skip_next = False
    for pos, symbol in enumerate(word):
        if skip_next:
            # second half of a pair that was just replaced
            skip_next = False
            continue
        if pos + 1 < len(word) and symbol == left and word[pos + 1] == right:
            merged.append(new_sym)
            skip_next = True
        else:
            merged.append(symbol)
    return merged

def _merge_chunk(chunk, pair, new_sym):
    """Return `chunk` (a tuple of ids) with each non-overlapping `pair` replaced by `new_sym`."""
    a, b = pair
    out, i = [], 0
    while i < len(chunk):
        if i < len(chunk) - 1 and chunk[i] == a and chunk[i + 1] == b:
            out.append(new_sym)
            i += 2
        else:
            out.append(chunk[i])
            i += 1
    return tuple(out)

def learn_bpe(text, desired_merges, start_id=256):
    """Learn BPE merges on `text`, never merging across regex chunk boundaries.

    The text is split by tokenize(); pair frequencies are summed over the whole
    corpus, but a pair only counts when both symbols lie inside the same chunk.
    Joining all chunks into one flat list would make the regex split a no-op,
    because pairs spanning two chunks would be counted and merged as well.

    Args:
        text: training corpus.
        desired_merges: maximum number of merges to learn.
        start_id: id given to the first learned merge; later merges count up.

    Returns:
        dict mapping (left, right) -> new id, in learning order. It may hold
        fewer than `desired_merges` entries if no pairs remain.
    """
    # Distinct chunk -> occurrence count. Working on distinct chunks gives the
    # same totals as scanning every occurrence, with far less work.
    chunk_freqs = Counter(tuple(w) for w in tokenize(text))
    merges = {}
    next_id = start_id

    for _ in range(desired_merges):
        stats = Counter()
        for chunk, freq in chunk_freqs.items():
            for pair in zip(chunk, chunk[1:]):
                stats[pair] += freq
        if not stats:
            break
        best = max(stats, key=stats.get)
        merges[best] = next_id
        # one progress line per learned merge
        print(f"   merging pair: {best} → {next_id!r}")
        merged_freqs = Counter()
        for chunk, freq in chunk_freqs.items():
            merged_freqs[_merge_chunk(chunk, best, next_id)] += freq
        chunk_freqs = merged_freqs
        next_id += 1

    return merges

# — build a vocab map so we can decode back ↓

def build_vocab(merges, start_id=256):
    """Build the id -> bytes table for a set of merges.

    Ids below `start_id` map to their single byte; each merged id maps to the
    concatenation of its two parts. `merges` must list a merge after the merges
    that produced its parts.
    """
    table = dict((byte, bytes([byte])) for byte in range(start_id))
    for pair, merged_id in merges.items():
        left, right = pair
        table[merged_id] = table[left] + table[right]
    return table

# — encoder applies all merges until none remain ↓

def encode(text, merges, start_id=256):
    """Encode `text` into token ids by applying the learned `merges`.

    Works on the raw UTF-8 bytes of the whole string. In every round the
    earliest-learned merge (smallest id) that occurs is applied; the loop ends
    when no learned pair is left. `start_id` is accepted but not used.
    """
    ids = list(text.encode('utf-8'))
    while True:
        # learned pairs that actually occur right now
        known = [p for p in get_stats(ids) if p in merges]
        if not known:
            return ids
        best = min(known, key=lambda p: merges[p])
        ids = merge_word(ids, best, merges[best])

# — decoder rebuilds the bytes and decodes ↓

def decode(ids, merges, start_id=256):
    """Map token ids back to text: look every id up in the table from
    build_vocab(), join the bytes and decode them as UTF-8 (invalid bytes are
    replaced)."""
    table = build_vocab(merges, start_id)
    pieces = [table[token_id] for token_id in ids]
    return b''.join(pieces).decode('utf-8', errors='replace')

# — demonstration ↓
# NOTE(review): `text` is not defined in this cell; it presumably comes from the
# first cell of the notebook — confirm it has been run.
merges = learn_bpe(text, desired_merges=200)

# Encode the first 100 characters and check that decoding restores them.
enc = encode(text[:100], merges)
print("Encoded IDs:", enc)

dec = decode(enc, merges)
print("Decoded text:", dec)
print("Round-trip ok?", dec == text[:100])


   merging pair: (101, 32) → 256
   merging pair: (100, 32) → 257
   merging pair: (116, 104) → 258
   merging pair: (116, 32) → 259
   merging pair: (105, 110) → 260
   merging pair: (115, 32) → 261
   merging pair: (101, 114) → 262
   merging pair: (44, 32) → 263
   merging pair: (97, 110) → 264
   merging pair: (121, 32) → 265
   merging pair: (111, 117) → 266
   merging pair: (97, 114) → 267
   merging pair: (111, 110) → 268
   merging pair: (258, 256) → 269
   merging pair: (260, 103) → 270
   merging pair: (101, 257) → 271
   merging pair: (46, 32) → 272
   merging pair: (111, 32) → 273
   merging pair: (111, 114) → 274
   merging pair: (101, 110) → 275
   merging pair: (270, 32) → 276
   merging pair: (116, 273) → 277
   merging pair: (32, 115) → 278
   merging pair: (108, 108) → 279
   merging pair: (104, 105) → 280
   merging pair: (104, 97) → 281
   merging pair: (264, 257) → 282
   merging pair: (111, 102) → 283
   merging pair: (114, 101) → 284
   merging pair: (119, 97) → 

In [None]:
# Inspect which byte sequence a single learned token id stands for.
decode([430], merges)

'her '

### GPT Pretraining

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from collections import Counter

import regex as re

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 500 # report train/val loss every this many steps
learning_rate = 3e-4 # learning rate passed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # batches averaged per split in estimate_loss()
n_embd = 384 # embedding width
n_head = 6 # attention heads per block; head size is n_embd // n_head
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability used by the attention and feed-forward layers
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# NOTE(review): the outputs recorded in this notebook look like a different
# corpus than the one named in the URL above — confirm which input.txt is meant.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# 1) learn your merges on the raw text
def get_stats(tokens):
    """Count every adjacent (left, right) pair of the sequence `tokens` and
    return the counts as a Counter."""
    pair_counts = Counter()
    pair_counts.update(zip(tokens, tokens[1:]))
    return pair_counts

def merge_word(tokens, pair, new_id):
    """Return a new list where each non-overlapping occurrence of `pair` in
    `tokens` (scanned left to right) is replaced by `new_id`."""
    left, right = pair
    size = len(tokens)
    merged = []
    pos = 0
    while pos < size:
        hit = pos < size - 1 and tokens[pos] == left and tokens[pos + 1] == right
        merged.append(new_id if hit else tokens[pos])
        pos += 2 if hit else 1
    return merged

def learn_bpe(text, desired_merges, start_id=256):
    """Learn BPE merges on `text` without merging across regex chunk boundaries.

    The text is split with the GPT-2 pattern and pair frequencies are summed
    over the whole corpus, but only for pairs inside one chunk. This matches
    encode_text(), which also merges chunk by chunk: a merge learned across two
    chunks (for example a letter followed by the space that starts the next
    chunk) could never be applied there and would waste a vocabulary slot.

    Args:
        text: training corpus.
        desired_merges: maximum number of merges to learn.
        start_id: id of the first learned merge; later merges count up.

    Returns:
        dict mapping (left, right) -> new id, in learning order. It may hold
        fewer than `desired_merges` entries if no pairs remain.
    """
    # split into GPT-2 regex tokens, then to bytes
    gpt2pat = re.compile(
      r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
      re.UNICODE
    )
    # Distinct chunk -> occurrence count. Working on distinct chunks gives the
    # same totals as scanning every occurrence, with far less work.
    chunk_freqs = Counter(
        tuple(part.encode("utf-8")) for part in gpt2pat.findall(text)
    )
    merges, next_id = {}, start_id

    for _ in range(desired_merges):
        stats = Counter()
        for chunk, freq in chunk_freqs.items():
            for pair in zip(chunk, chunk[1:]):
                stats[pair] += freq
        if not stats: break
        best = max(stats, key=stats.get)
        merges[best] = next_id
        merged_freqs = Counter()
        for chunk, freq in chunk_freqs.items():
            merged_freqs[tuple(merge_word(chunk, best, next_id))] += freq
        chunk_freqs = merged_freqs
        next_id += 1

    return merges

# 2) build encode/decode using those merges
def encode_text(text, merges, start_id=256):
    """Encode `text` into a flat list of token ids.

    The text is split with the GPT-2 pattern, each chunk is turned into its
    UTF-8 bytes, and the learned merges are applied inside the chunk in the
    order they were learned (ascending id).

    Args:
        text: string to encode.
        merges: dict mapping (left, right) -> merged id.
        start_id: accepted for symmetry with the other helpers; not used.

    Returns:
        list of token ids for the whole text.
    """
    # The pattern has to stay on ONE line. Without re.VERBOSE a line break
    # inside the raw string is matched literally, so a wrapped pattern makes the
    # punctuation alternative demand a newline plus indentation. findall() then
    # silently skips every punctuation character.
    gpt2pat = re.compile(
      r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
      re.UNICODE
    )
    # apply merges in order of their IDs
    ordered_merges = sorted(merges.items(), key=lambda kv: kv[1])
    # Identical chunks always encode identically, so each distinct chunk is
    # processed once and reused.
    encoded_chunks = {}
    ids = []
    for part in gpt2pat.findall(text):
        encoded = encoded_chunks.get(part)
        if encoded is None:
            encoded = list(part.encode("utf-8"))
            for pair, new_id in ordered_merges:
                encoded = merge_word(encoded, pair, new_id)
            encoded_chunks[part] = encoded
        # flatten into one long list of token-IDs
        ids.extend(encoded)
    return ids

def build_vocab(merges, start_id=256):
    """Return the id -> bytes lookup table: single bytes for ids below
    `start_id`, and for each merge the concatenated bytes of its two parts
    (merges are processed in dict order)."""
    lookup = {}
    for byte in range(start_id):
        lookup[byte] = bytes([byte])
    for (left, right), merged_id in merges.items():
        lookup[merged_id] = lookup[left] + lookup[right]
    return lookup

def decode_ids(ids, merges, start_id=256):
    """Convert token ids to text: resolve each id through build_vocab(), join
    the resulting bytes and decode as UTF-8, replacing invalid sequences."""
    lookup = build_vocab(merges, start_id)
    raw = b''.join(lookup[token_id] for token_id in ids)
    text_out = raw.decode("utf-8", errors="replace")
    return text_out

# 3) actually run it on your corpus
# (this re-reads the same file that was already loaded into `text` above)
with open('input.txt','r',encoding='utf-8') as f:
    text = f.read()

merges      = learn_bpe(text, desired_merges=10, start_id=256)
vocab_size  = 256 + len(merges)     # new vocab size
data_ids    = torch.tensor(encode_text(text, merges), dtype=torch.long)
# first 90% of the token ids are used for training, the remainder for validation
n = int(0.9 * len(data_ids))
train_data  = data_ids[:n].to(device)
val_data    = data_ids[n:].to(device)

# data loading
def get_batch(split):
    """Sample `batch_size` random windows of `block_size` tokens.

    Uses the training data for split == 'train' and the validation data
    otherwise. Returns (inputs, targets), where targets are the inputs shifted
    one position to the right; both are moved to `device`.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each split.

    The model is put in eval mode for the measurement and back in train mode
    afterwards. Returns a dict with the keys 'train' and 'val'.
    """
    model.eval()
    averages = {}
    for split in ['train', 'val']:
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[k] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

class Head(nn.Module):
    """One head of masked self-attention: each position attends only to itself
    and to earlier positions."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer, so it follows .to(device) without being a parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the head to `x` of shape (B, T, C) and return (B, T, head_size)."""
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)

        # attention scores, scaled by 1/sqrt(head_size)
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # -inf above the diagonal becomes weight 0 after softmax: no attention to later positions
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        # weighted sum of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by a linear projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # maps the concatenated head outputs back to the embedding width
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs along the last dimension, then project and apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """Position-wise MLP: linear layer to 4 * n_embd, ReLU, linear layer back
    to n_embd, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Run `x` through the MLP."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network,
    each applied to a layer-normalised input and added back residually."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # the embedding width is split evenly across the heads
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply both sub-layers; LayerNorm comes before each sub-layer."""
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    """Transformer language model over token ids.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final LayerNorm, and projected to `vocab_size`
    logits per position.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits, and the cross-entropy loss when `targets` is given.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Take the device from the input, not from the notebook-level `device`,
        # so the model keeps working after it has been moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` additional tokens after the context `idx`.

        Sampling runs without gradient tracking and in eval mode, so dropout
        does not perturb the predicted distribution. The previous train/eval
        mode is restored before returning.

        Args:
            idx: (B, T) tensor of context token ids.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor holding context and sampled tokens.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # the position table only covers block_size positions
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # only the prediction for the last position is needed
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`: a top-level `iter` would shadow the
# builtin iter() for every later cell of the notebook.
for step in range(max_iters):

    # report the averaged train/val loss periodically and on the last step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



10.943498 M parameters
step 0: train loss 5.6495, val loss 5.6529
step 500: train loss 1.7700, val loss 1.7908
step 1000: train loss 1.4696, val loss 1.5195
step 1500: train loss 1.3425, val loss 1.4057
step 2000: train loss 1.2757, val loss 1.3452
step 2500: train loss 1.2326, val loss 1.3182
step 3000: train loss 1.1977, val loss 1.2885
step 3500: train loss 1.1760, val loss 1.2743
step 4000: train loss 1.1567, val loss 1.2645
step 4500: train loss 1.1378, val loss 1.2516
step 4999: train loss 1.1210, val loss 1.2407


NameError: name 'decode' is not defined

In [None]:
# generate from the model
# The training loop leaves the model in train mode, so switch to eval mode
# (dropout off) and skip gradient tracking while sampling.
m.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode_ids(m.generate(context, max_new_tokens=500)[0].tolist(), merges))
    long_sample = decode_ids(m.generate(context, max_new_tokens=10000)[0].tolist(), merges)

# save 10000 tokens to a txt file
# Explicit UTF-8 because the decoded text may contain non-ASCII characters; the
# context manager closes the file once the text is written.
with open('more.txt', 'w', encoding='utf-8') as f:
    f.write(long_sample)

 e want them to get a phant very look aften and with at their darkness Dumbledore cutting himself straight in high skid breath and to Harry shooks his head in midaisfact Undid White at Harry's influence
Through keeping mulk of people trolleing rains again as he left to pincid
Harry said Dumbledore softly Come on I suppose you cannot have had entirely at school
Isn't each and now And my office when we were running up Hermione gave her just told us
Firenze into and twitches No people pages flashing
Maybe said Hermobing shaking 


10623