In [4]:
from collections import defaultdict, Counter

# Function to get pair frequencies
def get_pair_frequencies(corpus):
    pairs = defaultdict(int)
    for word, freq in corpus.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

# Function to merge the most frequent pair
def merge_pair(corpus, pair):
    new_corpus = {}
    bigram = ' '.join(pair)
    for word in corpus:
        new_word = word.replace(bigram, ''.join(pair))
        new_corpus[new_word] = corpus[word]
    return new_corpus

# Function to train the BPE tokenizer
def train_bpe(corpus, num_merges):
    corpus = Counter(corpus)
    for _ in range(num_merges):
        pairs = get_pair_frequencies(corpus)
        if not pairs:
            break
        best_pair = max(pairs, key=pairs.get)
        corpus = merge_pair(corpus, best_pair)
    return corpus

# Function to tokenize a sentence using the trained BPE tokenizer
def tokenize_bpe(sentence, bpe_vocab):
    sentence = sentence.split()
    tokenized = []
    for word in sentence:
        subwords = [word]
        for token in bpe_vocab:
            new_subwords = []
            for subword in subwords:
                if subword in bpe_vocab:
                    new_subwords.append(subword)
                else:
                    split_subword = subword.replace(token, ' '.join(list(token)))
                    new_subwords.extend(split_subword.split())
            subwords = new_subwords
        tokenized.extend(subwords)
    return tokenized

# Input corpus
corpus = ["low low low low lowest newer newer newer newer newer newer wider wider wider new new"]
corpus = [' '.join(word) for word in corpus[0].split()]
corpus = {corpus[i]: 1 for i in range(len(corpus))}

# Merges count
num_merges = 2

# Train the BPE tokenizer
bpe_vocab = train_bpe(corpus, num_merges)

# Tokenize the given sentence
sentence = "newer lower"
tokenized_sentence = tokenize_bpe(sentence, bpe_vocab)
print("Tokenized Sentence:", tokenized_sentence)


Tokenized Sentence: ['newer', 'l', 'o', 'wer']
