# BPE Tokenizer Tests

In [1]:
import os
import sys
import tempfile

sys.path.insert(0, '.')
from tokenizer.bpe import BPETokenizer, get_byte_encoder, get_pairs
from tokenizer.trainer import BPE_Trainer

## 1. Test get_pairs()

In [2]:
# Each case pairs a symbol sequence with the set of adjacent pairs expected back.
pair_cases = [
    (['h', 'e', 'l', 'l', 'o'], {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}),
    (['a'], set()),  # a lone symbol has no neighbour, so no pairs
    (['a', 'b'], {('a', 'b')}),
]
for symbols, expected_pairs in pair_cases:
    assert get_pairs(symbols) == expected_pairs
print('get_pairs: PASSED')

get_pairs: PASSED


## 2. Test byte encoder covers all 256 bytes

In [3]:
# The byte encoder must provide exactly one entry per possible byte value.
enc = get_byte_encoder()
assert len(enc) == 256, f'Expected 256 mappings, got {len(enc)}'
assert all(0 <= k <= 255 for k in enc), 'Keys should be bytes 0-255'
# Distinct values mean the mapping can be inverted. Only uniqueness is asserted
# here; whether every mapped char is printable is not checked by this cell.
assert len(set(enc.values())) == 256, 'Mapped chars should be unique'
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print('byte_encoder: PASSED (256 unique mappings)')

byte_encoder: PASSED (256 unique mappings)


## 3. Train BPE on a small corpus

In [4]:
# Tiny, highly repetitive corpus so that a handful of merges is learned quickly.
corpus = (
    "the cat sat on the mat. "
    "the cat sat on the cat. "
    "the mat sat on the mat."
)
trainer = BPE_Trainer(num_merges=50)  # upper bound requested from the trainer
tok = trainer.train(corpus)

# Summarise what training produced.
vocab_size = len(tok.vocab)
merge_count = len(tok.merges)
leading_merges = tok.merges[:10]
print(f'Vocab size: {vocab_size}')
print(f'Merges learned: {merge_count}')
print(f'First 10 merges: {leading_merges}')

Merge 0/50: ('a', 't')
Training complete. Vocab size: 268, Merges: 12
Vocab size: 268
Merges learned: 12
First 10 merges: [('a', 't'), ('h', 'e'), ('t', 'he'), ('Ġ', 'the'), ('Ġ', 'c'), ('Ġc', 'at'), ('s', 'at'), ('Ġ', 'sat'), ('Ġ', 'o'), ('Ġo', 'n')]


## 4. Test encode/decode roundtrip

In [5]:
# Inputs reused by the save/load cell further down.
test_strings = ["the cat", "cat sat on mat", "the"]

# Encoding followed by decoding must give back the exact input text.
for text in test_strings:
    token_ids = tok.encode(text)
    restored = tok.decode(token_ids)
    print(f'{text!r:25s} -> {token_ids} -> {restored!r}')
    assert restored == text, f'Roundtrip failed: {text!r} != {restored!r}'

print('\nEncode/decode roundtrip: PASSED')

'the cat'                 -> [258, 261] -> 'the cat'
'cat sat on mat'          -> [66, 256, 263, 265, 267] -> 'cat sat on mat'
'the'                     -> [258] -> 'the'

Encode/decode roundtrip: PASSED


## 5. Test save/load roundtrip

In [6]:
# Persist the trained tokenizer, reload it into a fresh instance, and check that
# both produce identical ids. `os` and `tempfile` are imported in the notebook's
# top import cell rather than mid-notebook.
with tempfile.TemporaryDirectory() as tmpdir:
    # Files live in a throwaway directory that is removed when the block exits.
    vocab_path = os.path.join(tmpdir, 'vocab.json')
    merge_path = os.path.join(tmpdir, 'merges.txt')

    trainer.save(tok, vocab_path, merge_path)

    # Load into a fresh tokenizer
    tok2 = BPETokenizer()
    tok2.load_vocab_merges(vocab_path, merge_path)

    # Verify same encode output
    for s in test_strings:
        ids1 = tok.encode(s)
        ids2 = tok2.encode(s)
        assert ids1 == ids2, f'Mismatch for {s!r}: {ids1} vs {ids2}'

print('Save/load roundtrip: PASSED')

Save/load roundtrip: PASSED


## 6. Test `_bpe` with no merges (should return the characters space-separated, with nothing merged)

In [7]:
# A tokenizer constructed with no arguments, as in the original cell; the check
# below expects `_bpe` to leave the word as individual, space-joined characters.
untrained_tok = BPETokenizer()
pieces = untrained_tok._bpe('hello')
expected_pieces = 'h e l l o'
assert pieces == expected_pieces, f'Expected space-separated chars, got {pieces!r}'
print(f'_bpe no merges: PASSED ({pieces!r})')

_bpe no merges: PASSED ('h e l l o')
