## Wordy Motivation

A Byte Pair Encoding (BPE) tokenizer splits text into subword units based on the most frequent character pairs in a corpus,\
allowing it to balance vocabulary size and represent rare words efficiently. It starts with individual bytes or characters \
and repeatedly merges the most common adjacent pairs into new tokens until a fixed vocabulary size is reached.

Modern BPE tokenizers used for training and inference in large language models typically apply regex-based pretokenization. \
This step splits text into linguistically or visually meaningful chunks (like words or punctuation groups), preventing merges \
across token boundaries that could produce spurious or misleading tokens (e.g., treating "dog" and "dog!" as entirely different tokens).

Regex pretokenization also enables more efficient frequency counting: if a word like “text” appears 10 times, we can increment pair counts (like 't','e') by 10 directly.\
When a merge occurs (e.g., 't','e','x','t' → 'te','x','t'), only the keys in the frequency dictionary need to be updated — the total count remains the same, \
which simplifies and speeds up the BPE merge step.

In [2]:
import regex as re

In [None]:
# from here: https://github.com/openai/tiktoken/pull/234/files
# Using this pattern re.finditer will produce one pretoken per group
GPT2_SPLIT_PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
pattern = re.compile(GPT2_SPLIT_PAT)

## Pieces of Training

In [18]:
# mix of languages + emoji
text = "Привет, world! 😄 Let's go, 今日は."

In [29]:
# a bit larger piece of text
text = """Alice looked at the glowing sign: “Добро пожаловать!” — it blinked beneath a line of Chinese characters: 欢迎光临.

She typed quickly: `hello_世界123! :)` — mixing English, symbols, digits, and emojis into her message.  
The response came instantly: "Принято. ✅"  
She smiled, whispered «行吧», and pressed Send.
"""

In [42]:
# and another one from cs336 assignment
text = """ low low low low low lower lower widest widest widest newest newest newest newest newest newest"""

In [None]:
pattern.findall(text)

[' low',
 ' low',
 ' low',
 ' low',
 ' low',
 ' lower',
 ' lower',
 ' widest',
 ' widest',
 ' widest',
 ' newest',
 ' newest',
 ' newest',
 ' newest',
 ' newest',
 ' newest']

In [187]:
# as described above this dictionary will keep pretoken counts
# tuple(bytes('Привет', encoding='utf-8')) -> integer represented byte sequence for convenience
pretokens = dict()
for mt in pattern.finditer(text):
  pt = mt.group() # -> str; match will have one pretoken per group
  pt = tuple(pt.encode('utf-8'))
  pretokens[pt] = pretokens.get(pt, 0) + 1

next_ix = 256

In [188]:
merges = [int.to_bytes(i) for i in range(256)]

In [252]:
for pt, cnt in pretokens.items():
  print([merges[i] for i in pt], ':', cnt)

[b' ', b'w', b'i', b'd', b'est'] : 3
[b' low'] : 5
[b' low', b'e', b'r'] : 2
[b' ', b'ne', b'west'] : 6


In [253]:
pretokens

{(32, 119, 105, 100, 257): 3, (260,): 5, (260, 101, 114): 2, (32, 262, 261): 6}

In [254]:
# we need to iterate through pretokens to find pair frequencies
pair_counts = dict()
for pt, cnt in pretokens.items():
  for p in zip(pt, pt[1:]):
    pair_counts[p] = pair_counts.get(p, 0) + cnt

In [255]:
for p, cnt in pair_counts.items():
  print(merges[p[0]], '+' ,merges[p[1]], ":", cnt)

b' ' + b'w' : 3
b'w' + b'i' : 3
b'i' + b'd' : 3
b'd' + b'est' : 3
b' low' + b'e' : 2
b'e' + b'r' : 2
b' ' + b'ne' : 6
b'ne' + b'west' : 6


In [256]:
# find most frequent pair, ties resolved in lexicographical order
top_pair, top_cnt = max(pair_counts.items(), key=lambda it: [it[1], it[0]])

In [257]:
top_pair

(262, 261)

In [258]:
print(merges[top_pair[0]], ',', merges[top_pair[1]], '->', top_cnt)

b'ne' , b'west' -> 6


In [259]:
# merge `pair` to become `new_ix` if it's in the `seq`
def merge(seq, pair, new_ix):
  new_seq = []
  i = 0
  while i < len(seq):
    # check in range and if match
    if i+1 < len(seq) and (seq[i], seq[i+1]) == pair:
      new_seq.append(new_ix)
      i += 2 # correct step
    else:
      new_seq.append(seq[i]) # only current position
      i += 1
  return tuple(new_seq)

In [260]:
# Each merge introduces a new token (pair → new token) that wasn’t in the vocabulary before
# Pretoken keys are sequences of current tokens.
# Until you merge ('t', 'e') into 'te', there's no way 'te' appears as a unit inside any key
# Only keys that contain the exact pair ('t', 'e') in adjacent positions will be modified.
# The output of merge() depends deterministically on the input key.
# Therefore, at most one original key can produce any given new_pt in the merge step.
for pt in list(pretokens): # static copy of keys (prevents RuntimeError if we iterate original dict)
  new_pt = merge(pt, top_pair, next_ix)
  if new_pt != pt: # update only if we merged new index
    # even though we proved it can't happen (see above), we want this assertions and perhaps test against it
    # so we are sure not to mess up with implementation
    assert new_pt not in pretokens, f"Collision: {new_pt} already in pretokens"
    pretokens[new_pt] = pretokens.pop(pt) #  safe from key collisions under the BPE merge assumptions (see above)
  
# update merges
merges.append(merges[top_pair[0]] + merges[top_pair[1]])
next_ix += 1

In [262]:
# we can take a look into newly formed tokens
for i, bp in enumerate(merges[256:], 256):
  print(i, '->', bp)

256 -> b'st'
257 -> b'est'
258 -> b'ow'
259 -> b'low'
260 -> b' low'
261 -> b'west'
262 -> b'ne'
263 -> b'newest'


## Let's Put It All Together

In [None]:
# merge `pair` to become `new_ix` if it's in the `seq`
def merge(seq, pair, new_ix):
  new_seq = []
  i = 0
  while i < len(seq):
    # check in range and if match
    if i+1 < len(seq) and (seq[i], seq[i+1]) == pair:
      new_seq.append(new_ix)
      i += 2 # correct step
    else:
      new_seq.append(seq[i]) # only current position
      i += 1
  return tuple(new_seq)

In [263]:
merges = [int.to_bytes(i) for i in range(256)]

# as described above this dictionary will keep pretoken counts
# tuple(bytes('Привет', encoding='utf-8')) -> integer represented byte sequence for convenience
pretokens = dict()
for mt in pattern.finditer(text):
  pt = mt.group() # -> str; match will have one pretoken per group
  pt = tuple(pt.encode('utf-8'))
  pretokens[pt] = pretokens.get(pt, 0) + 1

next_ix = 256
num_merges = 10

In [264]:
sep = "==================================="
for _ in range(num_merges):
  print(sep)
  # show pretokens
  for pt, cnt in pretokens.items():
    print([merges[i] for i in pt], ':', cnt)
  
  # we need to iterate through pretokens to find pair frequencies
  pair_counts = dict()
  for pt, cnt in pretokens.items():
    for p in zip(pt, pt[1:]):
      pair_counts[p] = pair_counts.get(p, 0) + cnt
  # find most frequent pair, ties resolved in lexicographical order
  top_pair, top_cnt = max(pair_counts.items(), key=lambda it: [it[1], it[0]])
  print("top pair", merges[top_pair[0]], ',', merges[top_pair[1]], '->', top_cnt)
  
  # merge
  for pt in list(pretokens): # static copy of keys (prevents RuntimeError if we iterate original dict)
    new_pt = merge(pt, top_pair, next_ix)
    if new_pt != pt: # update only if we merged new index
      # even though we proved it can't happen (see above), we want this assertions and perhaps test against it
      # so we are sure not to mess up with implementation
      assert new_pt not in pretokens, f"Collision: {new_pt} already in pretokens"
      pretokens[new_pt] = pretokens.pop(pt) #  safe from key collisions under the BPE merge assumptions (see above)
  
  # update merges
  merges.append(merges[top_pair[0]] + merges[top_pair[1]])
  next_ix += 1

print(sep)
# we can take a look into newly formed tokens
for i, bp in enumerate(merges[256:], 256):
  print(i, '->', bp)

[b' ', b'l', b'o', b'w'] : 5
[b' ', b'l', b'o', b'w', b'e', b'r'] : 2
[b' ', b'w', b'i', b'd', b'e', b's', b't'] : 3
[b' ', b'n', b'e', b'w', b'e', b's', b't'] : 6
top pair b's' , b't' -> 9
[b' ', b'l', b'o', b'w'] : 5
[b' ', b'l', b'o', b'w', b'e', b'r'] : 2
[b' ', b'w', b'i', b'd', b'e', b'st'] : 3
[b' ', b'n', b'e', b'w', b'e', b'st'] : 6
top pair b'e' , b'st' -> 9
[b' ', b'l', b'o', b'w'] : 5
[b' ', b'l', b'o', b'w', b'e', b'r'] : 2
[b' ', b'w', b'i', b'd', b'est'] : 3
[b' ', b'n', b'e', b'w', b'est'] : 6
top pair b'o' , b'w' -> 7
[b' ', b'w', b'i', b'd', b'est'] : 3
[b' ', b'n', b'e', b'w', b'est'] : 6
[b' ', b'l', b'ow'] : 5
[b' ', b'l', b'ow', b'e', b'r'] : 2
top pair b'l' , b'ow' -> 7
[b' ', b'w', b'i', b'd', b'est'] : 3
[b' ', b'n', b'e', b'w', b'est'] : 6
[b' ', b'low'] : 5
[b' ', b'low', b'e', b'r'] : 2
top pair b' ' , b'low' -> 7
[b' ', b'w', b'i', b'd', b'est'] : 3
[b' ', b'n', b'e', b'w', b'est'] : 6
[b' low'] : 5
[b' low', b'e', b'r'] : 2
top pair b'w' , b'est' -> 6
[b' 

In [None]:
special_tokens = ["<|endoftext|>", "<|endofline|>"]
delim = "|".join(map(re.escape, special_tokens))

<\|endoftext\|>|<\|endofline\|>


In [25]:
with open("data/TinyStoriesV2-GPT4-valid.txt", mode='r', encoding="utf-8") as f:
  ts_text = f.read()

In [27]:
len(ts_text)

22493387

In [34]:
ts_text[:400]

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n<|endoftext|>\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red b'

In [28]:
stories = re.split(delim, ts_text)

In [29]:
len(stories)

27631

In [32]:
stories[2343]

'\nOnce upon a time, there was a little boy named Tim. Yesterday, he went to a shop with his mom. Tim saw a big, red ball. He wanted it so much. He asked his mom, "Can I have the ball, please?" His mom said, "Yes, you can have it."\nTim was so happy. He played with the ball all day. But then, he kicked the ball too hard. The ball flew away and hit a tree. The tree was hurt. The tree said, "Ouch! That hurt! Please be careful next time."\nTim felt sorry for the tree. He said, "I\'m sorry, tree. I will be more careful." He picked up the ball and took it home. Tim\'s mom ordered a new, soft ball for the tree. The tree was happy, and Tim learned to play more carefully. They all lived happily ever after.\n'

In [None]:
"".join(stories[:3])

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear them. 

In [None]:
"".join(stories[:3])

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear them. 

In [39]:
bytes('<|endoftext|>', encoding="utf-8")

b'<|endoftext|>'

In [40]:
t = "Hello<tok>world<tok><eod>!"
special_tokens = ["<tok>", "<eod>"]
delim = "|".join(map(re.escape, special_tokens))
re.sub(f"(?:{delim})+", " ", t)

'Hello world !'

## Optimization

1. We can do merge and count updates on only sequences really containing top pair -> need pair_to_sequence mapping Dict[tuple[int, int], Set[tuple[int, ...], ...]]

2. We can use heap to keep track of top pairs for O(log n) max pair look up. Real life training can lead to a really large number of unique pairs. Number of possible unique pairs depends on vocab size, which grows every merge iteration (256 -> vocab_size). If vocab_size = n, we have P(n, 2) = n! / (n - 2)! 

Important note: we shouldn't forget about updating counts for overlapping pairs before and after merge, as well as merged top pair itself


In [1]:
with open("data/TinyStoriesV2-GPT4-valid.txt", mode='r', encoding="utf-8") as f:
  ts_text = f.read()

In [4]:
text = ts_text[:200]

In [63]:
# we did step-by-step training on this text, so for the first test we should see that optimized
# version reproduces naive implementation results.
text = """ low low low low low lower lower widest widest widest newest newest newest newest newest newest"""

In [64]:
type(text), len(text)

(str, 95)

In [65]:
# same pre-token counter
from toksmith.tokenizer import Tokenizer

In [66]:
tok = Tokenizer()

In [67]:
# Data structures
# I am using byte-unit term to underline that tokens are formed as a sequences of units, which correspond to either
# one byte or multiple bytes
# 1. pretoken_count: Dict[tuple[int, ...], int] = pretokens (as byte-unit sequences) frequency dictionary
# 2. pairs_count: Dict[tuple[int, int], int] = count of all sequential pairs of byte-units in the train corpus 
# 3. pair_to_pretoken: Dict[tuple[int, int], Set[tuple[int, ...], ...]] = map a pair to all pretokens, which have it
# this data structure will allow us to only touch relevant pretokens during merge iteration (and not all of them as
# in our naive implementation).
# 4. max heap on List[tuple[int, tuple[int, int]], ...] = provides a O(log n) access to most frequent pair

In [68]:
import heapq

In [69]:
def _pretoken_count(text: str) -> dict[tuple[int, ...], int]:
    pretokens = dict()
    for mt in tok.pattern.finditer(text):
      pt = mt.group()  # -> str; match will have one pretoken per group
      pt = tuple(pt.encode('utf-8'))
      pretokens[pt] = pretokens.get(pt, 0) + 1
    return pretokens

### One Time Setup

In [71]:
# vocab if we need to print
vocab = {i : bytes([i]) for i in range(256)}
merges = []

In [72]:
# Setup pretoken count
pretoken_count = _pretoken_count(text)

In [73]:
for pt, cnt in pretoken_count.items():
  print([vocab[unit] for unit in pt], '->', cnt)

[b' ', b'l', b'o', b'w'] -> 5
[b' ', b'l', b'o', b'w', b'e', b'r'] -> 2
[b' ', b'w', b'i', b'd', b'e', b's', b't'] -> 3
[b' ', b'n', b'e', b'w', b'e', b's', b't'] -> 6


In [74]:
# Setup pairs_count and pair_to_pretoken
pairs_count = dict()
pair_to_pretoken = dict()
# basically the same code as in our _pairs_count
for pt, cnt in pretoken_count.items():
  for pair in zip(pt, pt[1:]):
    pairs_count[pair] = pairs_count.get(pair, 0) + cnt
    if pair in pair_to_pretoken:
      pair_to_pretoken[pair].add(pt)
    else:
      pair_to_pretoken[pair] = {pt}


In [77]:
for pair, cnt in pairs_count.items():
  print(pair, '->', '(', vocab[pair[0]], ',', vocab[pair[1]], ')', '->', cnt)

(32, 108) -> ( b' ' , b'l' ) -> 7
(108, 111) -> ( b'l' , b'o' ) -> 7
(111, 119) -> ( b'o' , b'w' ) -> 7
(119, 101) -> ( b'w' , b'e' ) -> 8
(101, 114) -> ( b'e' , b'r' ) -> 2
(32, 119) -> ( b' ' , b'w' ) -> 3
(119, 105) -> ( b'w' , b'i' ) -> 3
(105, 100) -> ( b'i' , b'd' ) -> 3
(100, 101) -> ( b'd' , b'e' ) -> 3
(101, 115) -> ( b'e' , b's' ) -> 9
(115, 116) -> ( b's' , b't' ) -> 9
(32, 110) -> ( b' ' , b'n' ) -> 6
(110, 101) -> ( b'n' , b'e' ) -> 6
(101, 119) -> ( b'e' , b'w' ) -> 6


In [78]:
pair_to_pretoken

{(32, 108): {(32, 108, 111, 119), (32, 108, 111, 119, 101, 114)},
 (108, 111): {(32, 108, 111, 119), (32, 108, 111, 119, 101, 114)},
 (111, 119): {(32, 108, 111, 119), (32, 108, 111, 119, 101, 114)},
 (119, 101): {(32, 108, 111, 119, 101, 114),
  (32, 110, 101, 119, 101, 115, 116)},
 (101, 114): {(32, 108, 111, 119, 101, 114)},
 (32, 119): {(32, 119, 105, 100, 101, 115, 116)},
 (119, 105): {(32, 119, 105, 100, 101, 115, 116)},
 (105, 100): {(32, 119, 105, 100, 101, 115, 116)},
 (100, 101): {(32, 119, 105, 100, 101, 115, 116)},
 (101, 115): {(32, 110, 101, 119, 101, 115, 116),
  (32, 119, 105, 100, 101, 115, 116)},
 (115, 116): {(32, 110, 101, 119, 101, 115, 116),
  (32, 119, 105, 100, 101, 115, 116)},
 (32, 110): {(32, 110, 101, 119, 101, 115, 116)},
 (110, 101): {(32, 110, 101, 119, 101, 115, 116)},
 (101, 119): {(32, 110, 101, 119, 101, 115, 116)}}

In [79]:
# Setup a heap of (-cnt, -pair), negation is to maintain it as max heap (python defaults to min heap)
# -pair is to maintain the same tie resolution as in our naive implementation
heap = [(-cnt, (-p[0], -p[1])) for p, cnt in pairs_count.items()]
heapq.heapify(heap)

In [80]:
heap

[(-9, (-115, -116)),
 (-9, (-101, -115)),
 (-7, (-111, -119)),
 (-8, (-119, -101)),
 (-7, (-108, -111)),
 (-6, (-110, -101)),
 (-6, (-101, -119)),
 (-3, (-105, -100)),
 (-3, (-100, -101)),
 (-7, (-32, -108)),
 (-2, (-101, -114)),
 (-6, (-32, -110)),
 (-3, (-32, -119)),
 (-3, (-119, -105))]

In [81]:
# Setup index
new_ix = 256

### Iteration Step

In [82]:
# One merge iteration
# Find most frequent pair. Note: the pair popped from the heap can be invalid (stale). 
# It can be so if we popping the not up-to-date (intermediary) pair count, pushed
# during processing of overlapping pairs.
# That's why we need to compare it with the actual state.
while True:
  neg_cnt, neg_p = heapq.heappop(heap)
  top_pair = (-neg_p[0], -neg_p[1])
  # if count is correct we are fine to break
  # if top_pair already left pairs counter, get returns None
  if pairs_count.get(top_pair) == -neg_cnt:
    break
  else: # invalid (stale) entry
    continue
print(top_pair, vocab[top_pair[0]], vocab[top_pair[1]])

(115, 116) b's' b't'


In [83]:
heap

[(-9, (-101, -115)),
 (-8, (-119, -101)),
 (-7, (-111, -119)),
 (-3, (-119, -105)),
 (-7, (-108, -111)),
 (-6, (-110, -101)),
 (-6, (-101, -119)),
 (-3, (-105, -100)),
 (-3, (-100, -101)),
 (-7, (-32, -108)),
 (-2, (-101, -114)),
 (-6, (-32, -110)),
 (-3, (-32, -119))]

In [84]:
def merge(seq, pair, new_ix):
  if not isinstance(pair, tuple) or len(pair) != 2:
    raise ValueError('`pair` must be a 2-tuple')
  new_seq = []
  out_pairs = []
  in_pairs = []
  i = 0
  while i < len(seq):
    # match branch
    if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
      new_seq.append(new_ix)
      # if our pair is (x,y), we want to keep track of all (u,x) and (y,v) pairs
      # as well as incoming (u, new_ix) and (new_ix, v)
      # add outgoing and incoming pairs if any
      if i - 1 >= 0:
        # have (u,x) and (u, new_ix)
        out_pairs.append((seq[i-1], seq[i]))
        in_pairs.append((seq[i-1], new_ix))
      if i + 2 < len(seq):
        # have (y,v) and (new_ix, v)
        out_pairs.append((seq[i+1], seq[i+2]))
        in_pairs.append((new_ix, seq[i+2]))
      i += 2  # correct step
    else:
      new_seq.append(seq[i])  # only current position
      i += 1
  return tuple(new_seq), out_pairs, in_pairs

In [85]:
def update_pair_count(pair, freq):
    c = pairs_count.get(pair)
    if c is not None:  # guard
      c += freq
      if c > 0:  # update
        pairs_count[pair] = c 
        # push it on heap to have up-to-date entry there
        heapq.heappush(heap, (-c, (-pair[0], -pair[1])))
      else:  # drop
        pairs_count.pop(pair)

def discard_pretoken(pt_seq, pair):
    adj_set = pair_to_pretoken.get(pair)
    if adj_set is not None:
      adj_set.discard(pt_seq)
      if adj_set:
        pair_to_pretoken[pair] = adj_set
      else:
        pair_to_pretoken.pop(pair)

def add_pretoken(pt_seq, pair):
  adj_set = pair_to_pretoken.get(pair)
  if adj_set is not None:
    adj_set.add(pt_seq)
  else:
    adj_set = {pt_seq}
  pair_to_pretoken[pair] = adj_set
    

In [86]:
if top_pair in pair_to_pretoken:
  # now we can walk through only pretoken sequences actually containing the top pair
  for pt in pair_to_pretoken[top_pair]:
    freq = pretoken_count.get(pt) # current pretoken frequency
    if freq is not None:
      new_pt, out_pairs, in_pairs = merge(pt, top_pair, new_ix)
      # update all our datastructures
      for op in out_pairs:
        update_pair_count(op, -freq)  # decrease count and update heap
        # discard old pretoken from the set of pretokens containing outgoing pair
        discard_pretoken(pt, op)
      for ip in in_pairs:
        # frequency of incoming pair has increased (+ update heap)
        update_pair_count(ip, freq)
        # add new pretoken to the set of pretokens containing incoming pair
        add_pretoken(pt, ip)
      # old pretoken pt is good to go
      pretoken_count.pop(pt)
      # new pretoken count update
      pretoken_count[new_pt] = pretoken_count.get(new_pt, 0) + freq
  # top pair doesn't exist anymore, so we need to clear its state
  pairs_count.pop(top_pair)
  pair_to_pretoken.pop(top_pair)


In [87]:
# update merges and vocab
merges.append(top_pair)
vocab[new_ix] = vocab[top_pair[0]] + vocab[top_pair[1]]

In [88]:
# increment next unit index
new_ix += 1

In [89]:
merges

[(115, 116)]

In [90]:
for pt, cnt in pretoken_count.items():
  print([vocab[unit] for unit in pt], '->', cnt)

[b' ', b'l', b'o', b'w'] -> 5
[b' ', b'l', b'o', b'w', b'e', b'r'] -> 2
[b' ', b'n', b'e', b'w', b'e', b'st'] -> 6
[b' ', b'w', b'i', b'd', b'e', b'st'] -> 3


In [91]:
for pair, cnt in pairs_count.items():
  print(pair, '->', '(', vocab[pair[0]], ',', vocab[pair[1]], ')', '->', cnt)

(32, 108) -> ( b' ' , b'l' ) -> 7
(108, 111) -> ( b'l' , b'o' ) -> 7
(111, 119) -> ( b'o' , b'w' ) -> 7
(119, 101) -> ( b'w' , b'e' ) -> 8
(101, 114) -> ( b'e' , b'r' ) -> 2
(32, 119) -> ( b' ' , b'w' ) -> 3
(119, 105) -> ( b'w' , b'i' ) -> 3
(105, 100) -> ( b'i' , b'd' ) -> 3
(100, 101) -> ( b'd' , b'e' ) -> 3
(32, 110) -> ( b' ' , b'n' ) -> 6
(110, 101) -> ( b'n' , b'e' ) -> 6
(101, 119) -> ( b'e' , b'w' ) -> 6


In [94]:
heap

[(-9, (-101, -115)),
 (-8, (-119, -101)),
 (-7, (-111, -119)),
 (-3, (-119, -105)),
 (-7, (-108, -111)),
 (-6, (-110, -101)),
 (-6, (-101, -119)),
 (-3, (-105, -100)),
 (-3, (-100, -101)),
 (-7, (-32, -108)),
 (-2, (-101, -114)),
 (-6, (-32, -110)),
 (-3, (-32, -119)),
 (-3, (-101, -115))]