# Example (Byte-Level BPE Tokenization)

In [2]:
"""
Byte‑Level BPE demo:
* Learns merges from one ASCII text segment (one byte per character, so the
  text is split character by character).
* Stops when no adjacent pair occurs more than once or the target vocab size
  is reached.
* Prints step‑by‑step tokens and vocabulary with pandas.
"""
import sys
from collections import Counter
import pandas as pd

# Input text; the substring "low" occurs twice, so some adjacent pairs repeat
sentence = "low_lower"

# Vocabulary cap: merging stops once the vocabulary holds this many entries.
# A falsy value (e.g. 0) turns the cap off; a value at or below the initial
# vocabulary size means no merge is performed at all.
target_vocab_size = 8

# 1️⃣  initial segmentation: one token per character of the sentence
tokens = list(sentence)
vocab = set(tokens)

# One snapshot per step: (step number, token sequence, sorted vocabulary)
step = 0
history = [(step, list(tokens), sorted(vocab))]

# 2️⃣  keep merging until the cap is reached or no adjacent pair repeats
while not (target_vocab_size and len(vocab) >= target_vocab_size):
    pairs = Counter(zip(tokens, tokens[1:]))
    top = pairs.most_common(1)
    if not top or top[0][1] <= 1:
        break

    pair = top[0][0]  # most frequent adjacent pair
    new_tok = pair[0] + pair[1]

    # 3️⃣  left-to-right scan: each non-overlapping occurrence of the pair
    #     becomes one token. Every token is paired with its successor; None
    #     stands in after the last token and never matches a string.
    rebuilt = []
    absorbed = False  # True when the current token was consumed by a merge
    for cur, nxt in zip(tokens, tokens[1:] + [None]):
        if absorbed:
            absorbed = False
            continue
        if (cur, nxt) == pair:
            rebuilt.append(new_tok)
            absorbed = True
        else:
            rebuilt.append(cur)

    # The chosen pair occurs at least twice, so the scan merged at least once
    vocab.add(new_tok)
    tokens = rebuilt
    step += 1
    history.append((step, list(tokens), sorted(vocab)))

# 4️⃣  tabulate the recorded steps and print both views
df = pd.DataFrame(history, columns=["step", "tokens", "vocab"])
df["vocab_size"] = df["vocab"].map(len)
print(df[["step", "vocab", "vocab_size"]].to_string(index=False), end="\n\n")
print(df[["step", "tokens"]].to_string(index=False))


 step                       vocab  vocab_size
    0          [_, e, l, o, r, w]           6
    1      [_, e, l, lo, o, r, w]           7
    2 [_, e, l, lo, low, o, r, w]           8

 step                      tokens
    0 [l, o, w, _, l, o, w, e, r]
    1     [lo, w, _, lo, w, e, r]
    2         [low, _, low, e, r]
