# **Byte-Pair Encoding**

In [None]:
# Import key modules
import numpy as np
from collections import Counter
from typing import Sequence, Tuple, Dict
import tiktoken # tokenizer for GPT-4
import requests
import re
import matplotlib.pyplot as plt

## Initialize Text

In [None]:
from collections import Counter

# Sample text containing repeated substrings, so some adjacent pairs occur more than once.
# A longer alternative sample: "like liker love lovely hug hugs hugging heart"
text = "banana"

# Unique characters in sorted order, which makes the id assignment deterministic
chars = sorted(set(text))

# Initial vocabulary: every character maps to its position in the sorted list
vocab = dict(zip(chars, range(len(chars))))
print(f"Vocabulary: \n\t {vocab}")

# Character-level token sequence of the text
origtext = [*text]

# Count each adjacent token pair, keyed by the concatenation of its two tokens
pairs = Counter(left + right for left, right in zip(origtext, origtext[1:]))
print(f"Text Pairs: \n\t {pairs}")


Vocabulary: 
	 {'a': 0, 'b': 1, 'n': 2}
Text Pairs: 
	 Counter({'an': 2, 'na': 2, 'ba': 1})


In [None]:
# Find the most frequent adjacent pair. Among equally frequent pairs,
# most_common() returns the one that was counted first.
most_frequent_pair = pairs.most_common(1)[0][0]
print(f"Most Frequent Pair: \t \'{most_frequent_pair}\'")

# Add the pair to the vocabulary under the next free id.
# Existing ids run from 0 to len(vocab) - 1, so the next free id is len(vocab);
# using len(vocab) + 1 would leave a gap in the id range.
# The membership check keeps the id stable if this cell is executed again.
if most_frequent_pair not in vocab:
    vocab[most_frequent_pair] = len(vocab)
print(vocab)

Most Frequent Pair: 	 'an'
{'a': 0, 'b': 1, 'n': 2, 'an': 4}


In [None]:
# Build a new token list in which every occurrence of the most frequent pair
# is replaced by a single merged token.

new_text = []

i = 0

# Walk over every token, including the last one: a trailing token that is not
# part of a merged pair must still be copied to the output. The look-ahead to
# i + 1 is only attempted when a next token exists.
while i < len(origtext):
    if i < len(origtext) - 1 and origtext[i] + origtext[i+1] == most_frequent_pair:
        # Both tokens of the pair are consumed by the merged token
        add_text = most_frequent_pair
        i += 2
    else:
        add_text = origtext[i]
        i += 1

    new_text.append(add_text)

print(f"New Text: \n\t: {new_text}")



New Text: 
	: ['b', 'an', 'an']


In [None]:
from collections import Counter
from typing import Tuple, List, Dict

def bpe(text: str, vocab_size: int) -> Tuple[List[str], Dict[str, int]]:
    """Tokenize ``text`` with a simple byte-pair-encoding merge loop.

    The vocabulary starts with the unique characters of ``text`` (sorted, ids
    0..k-1). Each round counts adjacent token pairs, adds the most frequent
    pair to the vocabulary under the next free id, and merges its occurrences
    left to right. Among equally frequent pairs, the one counted first wins.

    Args:
        text: The string to tokenize.
        vocab_size: Merging stops once the vocabulary holds this many entries.
            If the character vocabulary is already at least this large, no
            merges are performed.

    Returns:
        A ``(tokens, vocab)`` tuple: the final token sequence and the mapping
        from token string to integer id.
    """
    # Seed the vocabulary with single characters, numbered in sorted order
    vocab: Dict[str, int] = {}
    for symbol in sorted(set(text)):
        vocab[symbol] = len(vocab)

    # Begin from the character-level segmentation
    tokens: List[str] = list(text)

    # A round needs both free room in the vocabulary and at least one adjacent pair
    while len(vocab) < vocab_size and len(tokens) >= 2:
        # Count adjacent pairs, keyed by the concatenation of the two tokens
        pair_counts = Counter(left + right for left, right in zip(tokens, tokens[1:]))
        best_pair = pair_counts.most_common(1)[0][0]

        # Register the pair under the next free id unless it is already known
        vocab.setdefault(best_pair, len(vocab))

        # Left-to-right merge pass over the current tokens
        merged: List[str] = []
        absorbed = False  # True when the current token was consumed by the previous merge
        for idx, token in enumerate(tokens):
            if absorbed:
                absorbed = False
                continue
            if idx + 1 < len(tokens) and token + tokens[idx + 1] == best_pair:
                merged.append(best_pair)
                absorbed = True
            else:
                merged.append(token)

        tokens = merged

    return tokens, vocab


In [None]:
# Run the BPE routine on a longer sample until the vocabulary reaches 25 entries
vocab_size = 25
text = "like liker love lovely hug hugs hugging heart"

tokens, vocab = bpe(text=text, vocab_size=vocab_size)

# Show the resulting segmentation and the learned token-to-id mapping
print(f"Final token : {tokens}")
print(f"Final vocabulary: {vocab}")

Final token : ['l', 'ike', ' l', 'ike', 'r', ' love', ' love', 'l', 'y', ' hug', ' hug', 's', ' hug', 'g', 'i', 'n', 'g', ' h', 'e', 'a', 'r', 't']
Final vocabulary: {' ': 0, 'a': 1, 'e': 2, 'g': 3, 'h': 4, 'i': 5, 'k': 6, 'l': 7, 'n': 8, 'o': 9, 'r': 10, 's': 11, 't': 12, 'u': 13, 'v': 14, 'y': 15, ' h': 16, ' l': 17, ' hu': 18, ' hug': 19, 'ik': 20, 'ike': 21, ' lo': 22, ' lov': 23, ' love': 24}
