In this chapter we build GPT tokenizer

byte-pair encoding is pretty simple:
* we determine a most popular pair of bytes
* we invent a new byte and replace the pair with the new byte
* repeat

In [11]:
import collections
import itertools


def byte_pair_encode(text: str, num_iterations: int) -> list[int]:
    encoded = list(text.encode('utf-8'))
    new_bytes = itertools.count(256)
    for _ in range(num_iterations):
        pair = find_most_popular_pair(encoded)
        encoded = replace(encoded, pair, next(new_bytes))
    return encoded


def find_most_popular_pair(encoded: list[int]) -> tuple[int, int]:
    counts = collections.Counter()
    prev = None
    for cur in encoded:
        if prev is not None:
            pair = (prev, cur)
            counts[pair] += 1
        prev = cur
    return counts.most_common(1)[0][0]


def replace(encoded: list[int], what: tuple[int, int], replacement: int):
    result = []
    state = []
    for cur in encoded:
        state.append(cur)
        if tuple(state) == what:
            result.append(replacement)
            state = []
        elif len(state) == 2:
            result.extend(state)
            state = []

    result.extend(state)
    return result

In [17]:
print(len(byte_pair_encode("приветики вам, хочу проверить byte-pair encoding", num_iterations=3)))

66
