In this chapter we build a GPT tokenizer.

Byte-pair encoding is pretty simple:
* we determine the most popular pair of adjacent bytes
* we invent a new byte (a new token id, starting at 256) and replace every occurrence of the pair with it
* repeat

In [27]:
import collections
import itertools


def byte_pair_encode(text: str, num_iterations: int) -> list[int]:
    """Compress ``text`` with up to ``num_iterations`` rounds of byte-pair merging.

    The text is first turned into its UTF-8 byte values. Each round finds the
    most frequent adjacent pair and substitutes every occurrence with a fresh
    token id; ids are handed out sequentially starting at 256, i.e. just past
    the range of a single byte.

    Args:
        text: The string to encode.
        num_iterations: Maximum number of merge rounds to perform.

    Returns:
        The sequence of token ids after merging. Values below 256 are raw
        bytes; values of 256 and above are merged pairs.
    """
    tokens = list(text.encode('utf-8'))
    for fresh_token in range(256, 256 + num_iterations):
        # With fewer than two tokens there is no pair left to merge.
        if len(tokens) < 2:
            break
        most_frequent = find_most_popular_pair(tokens)
        tokens = replace(tokens, most_frequent, fresh_token)
    return tokens


def find_most_popular_pair(encoded: list[int]) -> tuple[int, int]:
    """Return the adjacent pair of values that occurs most often in ``encoded``.

    Every position contributes a pair with its successor, so overlapping
    occurrences are all counted (three equal values in a row count as two
    occurrences of that pair). If several pairs share the highest count, the
    one encountered first wins.

    Args:
        encoded: Sequence of token ids; must contain at least two elements.

    Returns:
        The ``(left, right)`` pair with the highest count.

    Raises:
        IndexError: If ``encoded`` has fewer than two elements, because there
            is no pair to return.
    """
    # Pair each element with its right-hand neighbour.
    adjacent_pairs = zip(encoded, encoded[1:])
    frequencies = collections.Counter(adjacent_pairs)
    top_entries = frequencies.most_common(1)
    best_pair, _occurrences = top_entries[0]
    return best_pair


def replace(encoded: list[int], what: tuple[int, int], replacement: int):
    """Substitute each occurrence of the pair ``what`` with ``replacement``.

    The input is scanned left to right and matches do not overlap: once two
    elements have been merged, scanning resumes after them. The input list is
    not modified.

    Args:
        encoded: Sequence of token ids to scan.
        what: The ``(left, right)`` pair to look for.
        replacement: The token id emitted in place of each matched pair.

    Returns:
        A new list with every matched pair collapsed into ``replacement``.
    """
    output = []
    total = len(encoded)
    position = 0
    while position < total:
        has_successor = position + 1 < total
        if has_successor and (encoded[position], encoded[position + 1]) == what:
            # Consume both elements of the pair so matches never overlap.
            output.append(replacement)
            position += 2
        else:
            output.append(encoded[position])
            position += 1
    return output

In [37]:
byte_pair_encode("aaabdaaabac", num_iterations=3)

[258, 100, 258, 97, 99]

In [32]:
len(byte_pair_encode("приветики вам, хочу проверить byte-pair encoding", num_iterations=20))

39