In [1]:
from tokenizers import (
    normalizers,
    pre_tokenizers
)
from collections import defaultdict

In [2]:
# Toy training corpus: four short English sentences from which the vocabulary is learned.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [3]:
# Normalizer and pre-tokenizer shared by vocabulary training and by `tokenize`.
normalizer = normalizers.BertNormalizer(
    clean_text = True, # remove control characters and replace whitespace characters with a plain space (per the tokenizers docs - TODO confirm)
    handle_chinese_chars=True, # place spaces around Chinese characters
    strip_accents=None, # whether to strip accents; NOTE(review): None presumably defers to the `lowercase` setting as in original BERT - confirm
    lowercase=True # lowercase the text
)
# Splits normalized text into words; `pre_tokenize_str` yields (word, offset) tuples.
pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [4]:
def initialize(corpus_words):
    """Build the starting training state from a list of pre-tokenized words.

    Every distinct word is split into single-character tokens: the first
    character is kept as is, every later character gets the ``##``
    continuation prefix. Each distinct word contributes exactly once to the
    counts, no matter how often it occurs in ``corpus_words``.

    Args:
        corpus_words: Iterable of word strings (duplicates allowed).

    Returns:
        A 4-tuple ``(vocabulary, char_frequency, pair_frequency, corpus_tokens)``:

        * ``vocabulary``: set holding the five special tokens plus every
          character token seen.
        * ``char_frequency``: ``defaultdict(int)`` mapping token -> count.
        * ``pair_frequency``: ``defaultdict(int)`` mapping an adjacent
          ``(left, right)`` token pair -> count.
        * ``corpus_tokens``: list of token lists, one per distinct word, in
          order of first appearance.
    """
    vocabulary = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}
    char_frequency = defaultdict(int)
    pair_frequency = defaultdict(int)
    corpus_tokens = []

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    # Iterating a set of strings instead gives an order that changes between
    # interpreter runs (string hash randomization); that order leaks into
    # corpus_tokens and into the insertion order of pair_frequency, so any
    # order-dependent tie-breaking downstream would not be reproducible.
    for word in dict.fromkeys(corpus_words):
        word_tokens = [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
        vocabulary.update(word_tokens)
        for token in word_tokens:
            char_frequency[token] += 1
        for pair in zip(word_tokens, word_tokens[1:]):
            pair_frequency[pair] += 1
        corpus_tokens.append(word_tokens)

    return vocabulary, char_frequency, pair_frequency, corpus_tokens

In [7]:
def pair_to_token(pair):
    """Return the token produced by merging an adjacent token pair.

    The left token is kept verbatim; the right token loses its ``##``
    continuation marker (when it has one) before being appended.

    Args:
        pair: Sequence whose first two items are the left and right token
            strings.

    Returns:
        The merged token string, e.g. ``("##f", "##u")`` -> ``"##fu"``.
    """
    left, right = pair[0], pair[1]
    # Remove exactly one "##" marker. str.lstrip("#") would also swallow a
    # literal '#' character that belongs to the token itself ("###" -> "").
    if right.startswith("##"):
        right = right[2:]
    # Plain concatenation avoids reusing double quotes inside a double-quoted
    # f-string, which is a SyntaxError on Python versions before 3.12.
    return left + right


def update(new_pair, corpus_tokens):
    """Apply one merge to every word and recount token and pair statistics.

    Each word is scanned left to right; wherever two neighbouring tokens
    equal ``new_pair`` they are replaced by the single merged token.

    Args:
        new_pair: ``(left, right)`` token pair to merge.
        corpus_tokens: List of token lists, one per word.

    Returns:
        ``(char_frequency, pair_frequency, new_corpus_tokens)``; both
        frequency maps are fresh ``defaultdict(int)`` objects computed from
        the merged token lists.
    """
    token_counts = defaultdict(int)
    adjacent_counts = defaultdict(int)
    merged_corpus = []

    for word_tokens in corpus_tokens:
        merged = []
        size = len(word_tokens)
        pos = 0
        while pos < size:
            has_next = pos + 1 < size
            if has_next and new_pair == (word_tokens[pos], word_tokens[pos + 1]):
                # Consume both halves of the pair and emit the merged token.
                merged.append(pair_to_token(new_pair))
                pos += 2
            else:
                merged.append(word_tokens[pos])
                pos += 1

        # Recount per word so insertion order follows word order, then position.
        for token in merged:
            token_counts[token] += 1
        for left, right in zip(merged, merged[1:]):
            adjacent_counts[(left, right)] += 1
        merged_corpus.append(merged)

    return token_counts, adjacent_counts, merged_corpus


def train_vocabulary(corpus, vocab_size = 70):
    """Learn a WordPiece-style vocabulary from a list of sentences.

    Sentences are normalized and pre-tokenized into words, the words are
    split into character tokens, and the best-scoring adjacent pair is merged
    repeatedly. A pair's score is
    ``pair_count / (count(left) * count(right))``.

    Args:
        corpus: Iterable of sentence strings.
        vocab_size: Target vocabulary size. Training stops as soon as the
            vocabulary holds at least this many tokens, or earlier when no
            adjacent pair is left to merge.

    Returns:
        Set of token strings (special tokens, characters and merged pieces).
    """
    corpus_words = []
    for sentence in corpus:
        tuples = pre_tokenizer.pre_tokenize_str(normalizer.normalize_str(sentence))
        corpus_words.extend(word for word, _offset in tuples)

    vocabulary, char_frequency, pair_frequency, corpus_tokens = initialize(corpus_words)
    # Also stop once every word has collapsed into a single token: with no
    # pairs left, max() over an empty mapping would raise ValueError. Every
    # merge removes at least one token, so the loop always terminates.
    while len(vocabulary) < vocab_size and pair_frequency:
        pair_scores = {
            pair: count / (char_frequency[pair[0]] * char_frequency[pair[1]])
            for pair, count in pair_frequency.items()
        }
        # Ties go to the pair inserted first (dict order is insertion order).
        max_pair = max(pair_scores, key=pair_scores.get)
        vocabulary.add(pair_to_token(max_pair))
        char_frequency, pair_frequency, corpus_tokens = update(max_pair, corpus_tokens)

    return vocabulary

In [8]:
# Train on the toy corpus until the vocabulary holds 70 entries.
vocabulary = train_vocabulary(corpus, vocab_size=70)

In [11]:
# Display the learned vocabulary: special tokens, single characters and merged pieces.
vocabulary

{'##a',
 '##al',
 '##b',
 '##c',
 '##ct',
 '##cti',
 '##d',
 '##e',
 '##f',
 '##fu',
 '##ful',
 '##full',
 '##fully',
 '##g',
 '##gg',
 '##h',
 '##hm',
 '##i',
 '##ithms',
 '##iz',
 '##k',
 '##l',
 '##m',
 '##n',
 '##o',
 '##p',
 '##r',
 '##ral',
 '##rithms',
 '##s',
 '##t',
 '##thm',
 '##thms',
 '##u',
 '##ugg',
 '##v',
 '##w',
 '##y',
 '##z',
 ',',
 '.',
 '[CLS]',
 '[MASK]',
 '[PAD]',
 '[SEP]',
 '[UNK]',
 'a',
 'ab',
 'abl',
 'al',
 'alg',
 'b',
 'c',
 'f',
 'fa',
 'fac',
 'g',
 'h',
 'hugg',
 'huggi',
 'i',
 'is',
 's',
 't',
 'u',
 'w',
 'wi',
 'wil',
 'will',
 'y'}

In [15]:
def encode_word(word, vocab):
    """Split one word into vocabulary tokens by greedy longest-prefix matching.

    At each position the longest piece of the remaining characters that is
    in ``vocab`` is taken. Pieces after the first are looked up (and
    returned) with the ``##`` continuation prefix.

    Args:
        word: The word to encode.
        vocab: Container of known token strings (membership-tested with ``in``).

    Returns:
        List of tokens covering the whole word, ``["[UNK]"]`` if some
        position cannot be matched, or ``[]`` for an empty word.
    """
    tokens = []
    start = 0
    while start < len(word):
        marker = "##" if start > 0 else ""
        end = len(word)
        # Shrink the candidate from the right, but never below one real
        # character. Searching prefixes of the string "##" + remainder would
        # let a vocabulary entry "#" or "##" match the marker itself, so the
        # loop would consume no input and never finish.
        while end > start and marker + word[start:end] not in vocab:
            end -= 1
        if end == start:
            return ["[UNK]"]
        tokens.append(marker + word[start:end])
        start = end
    return tokens

def tokenize(text, vocab):
    """Normalize, pre-tokenize and encode a text into a flat list of tokens.

    Args:
        text: Raw input string.
        vocab: Container of known token strings, passed on to ``encode_word``.

    Returns:
        One list with the tokens of every word, in reading order.
    """
    normalized_text = normalizer.normalize_str(text)
    words = [word for word, _offset in pre_tokenizer.pre_tokenize_str(normalized_text)]
    flat_tokens = []
    for word in words:
        flat_tokens.extend(encode_word(word, vocab))
    return flat_tokens

In [16]:
# Encode a sample sentence; "!" never occurs in the training corpus, so it has no vocabulary entry and comes back as [UNK].
tokenize("This is the Hugging Face course!", vocabulary)

['t',
 '##h',
 '##i',
 '##s',
 'is',
 't',
 '##h',
 '##e',
 'huggi',
 '##n',
 '##g',
 'fac',
 '##e',
 'c',
 '##o',
 '##u',
 '##r',
 '##s',
 '##e',
 '[UNK]']