<a href="https://colab.research.google.com/github/ZHAOTransparentAI/MaschinellesLernen/blob/main/Byte_level_BPE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re
from collections import defaultdict, Counter

def get_vocab(text):
    """Build the initial BPE vocabulary from whitespace-separated words.

    Every word is spelled out as its individual characters joined by single
    spaces and terminated with the end-of-word marker ``</w>``.

    Args:
        text: Input string; it is split on runs of whitespace.

    Returns:
        A ``defaultdict(int)`` mapping each spelled-out word to the number
        of times the word occurs in ``text``, in order of first appearance.
    """
    counts = defaultdict(int)
    for token in text.split():
        # Unpacking the token yields its characters; the marker is appended
        # as one extra symbol so it is separated by a space like the rest.
        spelled = ' '.join((*token, '</w>'))
        counts[spelled] = counts[spelled] + 1
    return counts

def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples.
        Each occurrence of a pair contributes the frequency of the entry
        it appears in. Pairs are inserted in the order they are first seen.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Zipping the sequence with itself shifted by one walks all
        # neighbouring pairs; it is empty for entries with < 2 symbols.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, vocab):
    """Merge every occurrence of an adjacent symbol pair in the vocabulary.

    Args:
        pair: Tuple of two symbols ``(left, right)`` to fuse into one.
        vocab: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A new plain ``dict`` in which each ``left right`` occurrence that
        is delimited by whitespace (or the string boundary) on both sides
        has been replaced by the concatenated symbol. If several entries
        become identical after the merge, their frequencies are added.
    """
    merged = ''.join(pair)
    # The look-around assertions make sure only whole symbols match, so
    # merging ('a', 'b') does not touch 'xa b' or 'a bc'.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    new_vocab = {}
    for word, freq in vocab.items():
        # A callable replacement inserts `merged` verbatim. Passing it as a
        # string would treat it as a replacement template, so a symbol
        # containing a backslash would be rewritten or raise re.error.
        new_word = pattern.sub(lambda _match: merged, word)
        # Accumulate instead of assigning so colliding keys keep their
        # combined frequency rather than only the last one seen.
        new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
    return new_vocab

def byte_pair_encoding(text, num_merges):
    """Run up to ``num_merges`` BPE merge steps over ``text``.

    Starting from the vocabulary produced by ``get_vocab``, each step
    counts adjacent symbol pairs, picks the most frequent one and merges it
    everywhere. Each performed merge is reported on stdout.

    Args:
        text: Whitespace-separated input text.
        num_merges: Maximum number of merge steps to perform.

    Returns:
        The vocabulary after the last performed merge.
    """
    vocab = get_vocab(text)
    for step in range(1, num_merges + 1):
        pair_counts = get_stats(vocab)
        if not pair_counts:
            # No adjacent pairs remain, so there is nothing left to merge.
            break
        # On equal counts, max() keeps the pair that was encountered first.
        top_pair = max(pair_counts, key=pair_counts.get)
        vocab = merge_vocab(top_pair, vocab)
        print(f'Merge {step}: {top_pair}')
    return vocab

# Example text
text = "low lower lowest"

# Re-run BPE from scratch with 0, 1, ..., 9 merge steps and show the
# resulting vocabulary each time. byte_pair_encoding reports its merges
# while it runs, so the "Merge" lines of a run appear before its heading.
for merge_count in range(10):
    vocab = byte_pair_encoding(text, merge_count)
    print("\n")
    print(f"The {merge_count+1}th Vocabulary List ")
    for entry in vocab:
        print(entry)



The 1th Vocabulary List 
l o w </w>
l o w e r </w>
l o w e s t </w>
Merge 1: ('l', 'o')


The 2th Vocabulary List 
lo w </w>
lo w e r </w>
lo w e s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')


The 3th Vocabulary List 
low </w>
low e r </w>
low e s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')
Merge 3: ('low', 'e')


The 4th Vocabulary List 
low </w>
lowe r </w>
lowe s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')
Merge 3: ('low', 'e')
Merge 4: ('low', '</w>')


The 5th Vocabulary List 
low</w>
lowe r </w>
lowe s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')
Merge 3: ('low', 'e')
Merge 4: ('low', '</w>')
Merge 5: ('lowe', 'r')


The 6th Vocabulary List 
low</w>
lower </w>
lowe s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')
Merge 3: ('low', 'e')
Merge 4: ('low', '</w>')
Merge 5: ('lowe', 'r')
Merge 6: ('lower', '</w>')


The 7th Vocabulary List 
low</w>
lower</w>
lowe s t </w>
Merge 1: ('l', 'o')
Merge 2: ('lo', 'w')
Merge 3: ('low', 'e')
Merge 4: ('low', '</w>')
Merge 5: