# Reconstructing GPT4 MinBPE Tokenizer

## Step 1
Write the BasicTokenizer class, with the following three core functions:

def train(self, text, vocab_size, verbose=False)
def encode(self, text)
def decode(self, ids)
Train your tokenizer on whatever text you like and visualize the merged tokens. Do they look reasonable? One default test you may wish to use is the text file tests/taylorswift.txt.

In [None]:
# Constructing a BPE Tokenizer class based on the functions defined above

class BPE_TokenizerV1:
    """Minimal byte-level Byte-Pair-Encoding (BPE) tokenizer.

    Training repeatedly replaces the most frequent adjacent pair of token ids
    with a newly minted id.  The learned merges are then replayed, in the
    order they were learned, to encode new text.

    Typical usage::

        tok = BPE_TokenizerV1(256, 20)
        tok.bpe_train(list(some_text.encode("utf-8")))
        tok.decode(tok.encode("hello world"))
    """

    def __init__(self, vocab_size, num_merges):
        """Store the configuration; the tokenizer is untrained until
        ``bpe_train`` is called.

        Args:
            vocab_size: id given to the first merged token; the i-th merge
                gets id ``vocab_size + i``.  The base vocabulary built by
                ``bpe_train`` always covers the 256 byte values.
            num_merges: maximum number of merges to learn.
        """
        self.vocab_size = vocab_size
        self.num_merges = num_merges
        self.merges = None  # {(id_a, id_b): merged_id}, set by bpe_train
        self.vocab = None   # {id: bytes}, set by bpe_train
        self.ids = None     # training sequence after all merges, set by bpe_train

    def stats(self, ids):
        """Return ``{(a, b): count}`` for every adjacent pair in ``ids``.

        Pairs appear in the dict in order of first occurrence, which is what
        breaks ties when ``bpe_train`` picks the most frequent pair.
        """
        count = {}
        for pair in zip(ids, ids[1:]):
            count[pair] = count.get(pair, 0) + 1
        return count

    def replace_pair(self, ids, pair, new_id):
        """Return a copy of ``ids`` with each non-overlapping, left-to-right
        occurrence of ``pair`` replaced by ``new_id``."""
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(new_id)
                i += 2  # skip both members of the merged pair
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def bpe_train(self, ids, verbose=True):
        """Learn up to ``num_merges`` merges from the token sequence ``ids``.

        Args:
            ids: list of integer token ids (byte values for raw text).
            verbose: when true (the default), print each merge as it is
                learned.

        Returns:
            ``(ids, merges)``: the fully merged training sequence and the
            ``{pair: new_id}`` mapping.  Both are also stored on the
            instance, together with the derived ``vocab``.
        """
        merges = {}
        for i in range(self.num_merges):
            stat = self.stats(ids)
            if not stat:
                # Fewer than two tokens left: nothing more can be merged.
                # Without this guard max() raises ValueError on an empty dict.
                break
            top_pair = max(stat, key=stat.get)
            idx = self.vocab_size + i
            if verbose:
                print(f"merging {top_pair} to {idx}")
            ids = self.replace_pair(ids, top_pair, idx)
            merges[top_pair] = idx
        self.merges = merges
        self.ids = ids
        vocab = {idx: bytes([idx]) for idx in range(256)}
        # merges preserves insertion order, so both halves of every pair are
        # already in vocab by the time the pair itself is expanded.
        for pair, idx in merges.items():
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
        self.vocab = vocab
        return ids, merges

    def encode(self, text):
        """Encode ``text`` into a list of token ids.

        The text is converted to UTF-8 bytes and the learned merges are then
        applied, earliest-learned first.  ``bpe_train`` must have been called
        beforehand so that ``self.merges`` is populated.
        """
        tokens = list(text.encode('utf-8'))
        while len(tokens) >= 2:
            stat = self.stats(tokens)
            # Pick the present pair with the lowest merge id, i.e. the one
            # learned earliest; pairs that were never merged rank as +inf.
            pair = min(stat, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # no mergeable pair remains
            idx = self.merges[pair]
            tokens = self.replace_pair(tokens, pair, idx)
        return tokens

    def decode(self, tokens):
        """Decode a list of token ids back into a string.

        ``bpe_train`` must have been called beforehand so that ``self.vocab``
        is populated.
        """
        # b"" is the (empty) separator: concatenate each token's bytes.
        text = b"".join([self.vocab[t] for t in tokens])
        # Token boundaries may split a multi-byte character, so invalid UTF-8
        # is replaced with U+FFFD instead of raising.
        return text.decode('utf-8', errors='replace')
    




In [None]:
# Smoke test: learn 20 merges on the byte sequence `tokens2`, report how much
# shorter the sequence became, and check that encode/decode round-trips.
tokenizer = BPE_TokenizerV1(vocab_size=256, num_merges=20)
ids, merges = tokenizer.bpe_train(list(tokens2))

len_before = len(tokens2)
len_after = len(ids)
print("tokens length before BPE:", len_before)
print("tokens length after BPE:", len_after)
print(f"compression ratio: {len_before/len_after:.2f}")

# Round-trip a short sample, then the full training text.
hello_ids = tokenizer.encode("hello world")
print(tokenizer.decode(hello_ids))
text2 = tokenizer.decode(tokenizer.encode(text))
print(text2 == text)


## Step 2
Convert your BasicTokenizer into a RegexTokenizer, which takes a regex pattern and splits the text exactly as GPT-4 would. Process the parts separately as before, then concatenate the results. Retrain your tokenizer and compare the results before and after. You should see that you will now have no tokens that go across categories (numbers, letters, punctuation, more than one whitespace). Use the GPT-4 pattern:

GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
