# 2. Tokenization

## Byte-Pair Encoding (BPE)

In [1]:
import collections
import re

In [17]:
# Step 1: Build the initial vocab from the corpus (a list of sentences).
def build_vocab(corpus):
    """
    Create the starting BPE vocabulary for a corpus.

    Every sentence is stripped and split on whitespace. Each resulting word
    is turned into a tuple of its characters followed by the end-of-word
    marker '</w>'.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        collections.Counter mapping each symbol tuple to the number of times
        the corresponding word occurs in the corpus.
    """
    return collections.Counter(
        tuple(token) + ('</w>',)
        for line in corpus
        for token in line.strip().split()
    )

In [5]:
# Step 2: A function to count how often each adjacent symbol pair occurs in a vocab
def get_pair_frequencies(vocab):
    """
    Tally how often each pair of neighbouring symbols occurs.

    Args:
        vocab: mapping from a word (a sequence of symbols) to its frequency.

    Returns:
        collections.Counter mapping each (left, right) symbol pair to its
        total count. Every occurrence of the pair inside a word contributes
        that word's frequency.
    """
    counts = collections.Counter()
    for symbols, weight in vocab.items():
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            counts[left, right] += weight
    return counts

In [28]:
# Step 3: Merge the Most Frequent Pair
def merge_vocab(pair, vocab):
    """
    Apply one merge operation to every word in the vocabulary.

    Each word is scanned left to right; every non-overlapping occurrence of
    the two adjacent symbols in ``pair`` is replaced by their concatenation.
    The merge works directly on the symbol tuples, so symbols containing
    backslashes, regex metacharacters or whitespace are handled verbatim.

    Args:
        pair: tuple ``(left, right)`` of the two symbols to merge.
        vocab: mapping from a word (tuple of symbols) to its frequency.

    Returns:
        A new dict mapping the merged symbol tuples to frequencies, in the
        same order as ``vocab``. If several entries collapse to the same
        tuple their frequencies are summed. ``vocab`` itself is not modified.
    """
    left, right = pair
    merged_symbol = left + right
    merged_vocab = {}
    for word, freq in vocab.items():
        symbols = []
        i = 0
        last = len(word) - 1
        while i < len(word):
            if i < last and word[i] == left and word[i + 1] == right:
                symbols.append(merged_symbol)
                i += 2  # skip both halves of the merged pair
            else:
                symbols.append(word[i])
                i += 1
        key = tuple(symbols)
        # Accumulate rather than overwrite so colliding entries keep their counts.
        merged_vocab[key] = merged_vocab.get(key, 0) + freq
    return merged_vocab

In [29]:
# Step 4: Using the functions defined so far, build the learner

def learn_bpe(corpus, num_merges):
    """
    Learn an ordered list of BPE merge operations from a corpus.

    Starting from the character-level vocabulary, the most frequent adjacent
    symbol pair is selected and merged, up to ``num_merges`` times. Learning
    stops early once no adjacent pairs are left.

    Args:
        corpus: iterable of sentence strings.
        num_merges: maximum number of merge operations to learn.

    Returns:
        List of (left, right) symbol pairs in the order they were merged.
    """
    vocab = build_vocab(corpus)
    merges = []

    for _ in range(num_merges):
        pair_freqs = get_pair_frequencies(vocab)
        if len(pair_freqs) == 0:
            # Every word is already a single symbol; nothing left to merge.
            break
        best_pair = max(pair_freqs, key=pair_freqs.get)
        merges.append(best_pair)
        vocab = merge_vocab(best_pair, vocab)

    return merges


In [9]:
# Part 2: Let's make the tokenizer

def get_pairs(word):
    """
    Collect the distinct adjacent symbol pairs of a word.

    Args:
        word: sequence of symbols.

    Returns:
        Set of (left, right) tuples, one for each pair of neighbouring
        symbols. Empty when the word has fewer than two symbols.
    """
    return set(zip(word, word[1:]))

In [20]:
def tokenize(word, bpe_codes):
    """
    Split a single word into BPE tokens using learned merge operations.

    The word is broken into characters plus the end-of-word marker '</w>'.
    The adjacent pair with the lowest index in ``bpe_codes`` (the earliest
    learned merge) is then merged repeatedly, until no known pair remains or
    the word is a single symbol.

    Args:
        word: the word to tokenize.
        bpe_codes: ordered sequence of (left, right) merge pairs.

    Returns:
        Tuple of tokens. A trailing '</w>' is removed only when it is still a
        stand-alone symbol; tokens that absorbed the marker through a merge
        (such as 'e</w>') keep it.
    """
    symbols = tuple(word) + ('</w>',)
    candidates = get_pairs(symbols)
    ranks = {pair: rank for rank, pair in enumerate(bpe_codes)}

    while True:
        # Only pairs that appear in the merge table are eligible.
        known = [pair for pair in candidates if pair in ranks]
        if not known:
            break
        best = min(known, key=ranks.get)

        # Rebuild the word, fusing each non-overlapping occurrence of `best`.
        merged = []
        pos = 0
        last = len(symbols) - 1
        while pos < len(symbols):
            if pos < last and (symbols[pos], symbols[pos + 1]) == best:
                merged.append(''.join(best))
                pos += 2
            else:
                merged.append(symbols[pos])
                pos += 1
        symbols = tuple(merged)

        if len(symbols) == 1:
            break
        candidates = get_pairs(symbols)

    if symbols[-1] == '</w>':
        symbols = symbols[:-1]

    return symbols

In [13]:
def encode_text(text, bpe_codes):
    """
    Tokenize every whitespace-separated word of a text with the BPE codes.

    Args:
        text: input string; it is stripped and split on whitespace.
        bpe_codes: ordered sequence of (left, right) merge pairs.

    Returns:
        List with one entry per word, each entry being the result of
        ``tokenize`` for that word.
    """
    return [tokenize(piece, bpe_codes) for piece in text.strip().split()]

In [14]:
# Example runs

# Sentences from which the BPE merge operations are learned.
training_corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming industries worldwide.",
    "She enjoys reading books by the lake during weekends.",
    "The train to Tokyo departs at 10:30 AM sharp.",
    "A healthy diet includes fruits, vegetables, and plenty of water.",
    "Learning new languages can open doors to exciting opportunities.",
    "The stars were shining brightly in the clear night sky.",
    "Climate change is one of the biggest challenges of our time.",
    "He plays the guitar beautifully, especially classical pieces.",
    "The software update introduced several new features.",
    "An elephant is the largest land animal on Earth.",
    "They celebrated her birthday with a surprise party.",
    "Technology is advancing at an unprecedented pace.",
    "The team worked late to finish the project before the deadline.",
    "Many rural areas still lack access to clean drinking water.",
    "The museum was filled with ancient artifacts and rare paintings.",
    "Good communication skills are essential for any career.",
]

# Sentences that are tokenized later with the learned BPE codes.
test_corpus = [
    "The app provides real-time weather updates and forecasts.",
    "Hiking in the mountains is a great way to unwind and relax.",
    "He enjoys experimenting with new recipes in the kitchen."
]


In [15]:
# learn about the data to decide num_merges

# Count whitespace-delimited words (punctuation stays attached to the word).
total_words = 0
unique_words = set()
for sentence in training_corpus:
    words = sentence.split()
    total_words += len(words)
    unique_words.update(words)

print("Corpus Statistics")
print(f"1. Total Words : {total_words}")
print(f"2. Unique Words : {len(unique_words)}")

Corpus Statistics
1. Total Words : 151
2. Unique Words : 126


In [34]:
# Upper bound on the number of merge operations to learn; learn_bpe stops
# earlier if no adjacent symbol pairs remain.
num_merges = 25
# Ordered list of (left, right) merges; earlier entries are applied first.
bpe_codes = learn_bpe(training_corpus, num_merges)
print(f"Number of Learned BPE Codes: {len(bpe_codes)}")
print(bpe_codes)

Number of Learned BPE Codes: 25
[('e', '</w>'), ('s', '</w>'), ('i', 'n'), ('.', '</w>'), ('h', 'e</w>'), ('a', 'n'), ('a', 'r'), ('y', '</w>'), ('a', 't'), ('a', 'l'), ('e', 'n'), ('in', 'g'), ('e', 's'), ('d', '</w>'), ('t', 'he</w>'), ('ing', '</w>'), ('e', 'r'), ('t', 'i'), ('o', 'r'), ('t', '</w>'), ('T', 'he</w>'), ('al', '</w>'), ('l', 'a'), ('l', 'l'), ('e', 'a')]


In [36]:
# tokenize the test corpus

# Encode every test sentence with the learned merges and print the tokens
# next to the original text.
# NOTE(review): the header prints "#1th", "#2th", ... — consider rewording
# the ordinal if the output text is ever revised.
for i, sentence in enumerate(test_corpus):
    print(f"Tokenizing #{i+1}th sentence")
    tokens = encode_text(sentence, bpe_codes)  # one tuple of tokens per word
    print("Original: ", sentence)
    print("Tokens: ", tokens)

    print("\n\n")  # blank lines between sentences

Tokenizing #1th sentence
Original:  The app provides real-time weather updates and forecasts.
Tokens:  [('The</w>',), ('a', 'p', 'p'), ('p', 'r', 'o', 'v', 'i', 'd', 'e', 's</w>'), ('r', 'e', 'al', '-', 'ti', 'm', 'e</w>'), ('w', 'e', 'at', 'h', 'er'), ('u', 'p', 'd', 'at', 'e', 's</w>'), ('an', 'd</w>'), ('f', 'or', 'e', 'c', 'a', 's', 't', 's', '.</w>')]



Tokenizing #2th sentence
Original:  Hiking in the mountains is a great way to unwind and relax.
Tokens:  [('H', 'i', 'k', 'ing</w>'), ('in',), ('the</w>',), ('m', 'o', 'u', 'n', 't', 'a', 'in', 's</w>'), ('i', 's</w>'), ('a',), ('g', 'r', 'e', 'at'), ('w', 'a', 'y</w>'), ('t', 'o'), ('u', 'n', 'w', 'in', 'd</w>'), ('an', 'd</w>'), ('r', 'e', 'la', 'x', '.</w>')]



Tokenizing #3th sentence
Original:  He enjoys experimenting with new recipes in the kitchen.
Tokens:  [('H', 'e</w>'), ('en', 'j', 'o', 'y', 's</w>'), ('e', 'x', 'p', 'er', 'i', 'm', 'en', 't', 'ing</w>'), ('w', 'i', 't', 'h'), ('n', 'e', 'w'), ('r', 'e', 'c', 'i', 'p',