In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace

In [4]:
# Placeholder token emitted for input the vocabulary does not cover.
unk_token = "<UNK>"
# Special tokens passed to every trainer; the unknown token is listed first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

def prepare_tokenizer_trainer(alg):
    """
    Build an untrained tokenizer and its matching trainer for one algorithm.

    Args:
        alg: algorithm code. 'BPE', 'UNI' and 'WPC' select byte-pair
            encoding, Unigram and WordPiece respectively; any other value
            falls back to the WordLevel model.

    Returns:
        A (tokenizer, trainer) tuple. The tokenizer uses a Whitespace
        pre-tokenizer, and the trainer is configured with the module-level
        special tokens.
    """
    if alg == 'BPE':
        model = BPE(unk_token=unk_token)
        trainer = BpeTrainer(special_tokens=spl_tokens)
    elif alg == 'UNI':
        # For Unigram the unknown token is given to the trainer, not the model.
        model = Unigram()
        trainer = UnigramTrainer(unk_token=unk_token, special_tokens=spl_tokens)
    elif alg == 'WPC':
        model = WordPiece(unk_token=unk_token)
        trainer = WordPieceTrainer(special_tokens=spl_tokens)
    else:
        model = WordLevel(unk_token=unk_token)
        trainer = WordLevelTrainer(special_tokens=spl_tokens)

    tokenizer = Tokenizer(model)
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


def train_tokenizer(files, alg='WLV', save_path="./tokenizer-trained.json"):
    """
    Train a tokenizer on the given files and return it reloaded from disk.

    Args:
        files: list of paths to the text files used for training.
        alg: algorithm code forwarded to prepare_tokenizer_trainer
            (defaults to 'WLV').
        save_path: file the trained tokenizer is serialized to. Defaults to
            the previously hard-coded "./tokenizer-trained.json"; pass a
            different path to keep several trained tokenizers side by side
            instead of overwriting the same file on every call.

    Returns:
        The tokenizer loaded back from save_path.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer)  # training the tokenizer
    # Round-trip through disk so the returned object is the persisted one.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

def tokenize(input_string, tokenizer):
    """
    Encode a string with the supplied tokenizer.

    Args:
        input_string: text to encode.
        tokenizer: object exposing an encode(text) method.

    Returns:
        Whatever tokenizer.encode returns for input_string.
    """
    return tokenizer.encode(input_string)


In [5]:
## Train every algorithm on a "small" and a "large" corpus and print the tokens.
small_file = ['veri/alice.txt']
# NOTE(review): presumably the WikiText-103 files below are the intended large
# corpus; the active `large_files` just repeats the small file three times.
# large_files = [f"./wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
large_files = ['veri/alice.txt', 'veri/alice.txt', 'veri/alice.txt']

# Sample text to tokenize. It never changes, so it is defined once here rather
# than being re-assigned on every loop iteration.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

tokens_dict = {}

for files in [small_file, large_files]:
    print(f"========Using vocabulary from {files}=======")
    for alg in ['WLV', 'BPE', 'UNI', 'WPC']:
        trained_tokenizer = train_tokenizer(files, alg)
        output = tokenize(input_string, trained_tokenizer)
        # Keyed by algorithm only: the entry written for the first corpus is
        # overwritten when the second corpus is processed.
        tokens_dict[alg] = output.tokens
        print("----", alg, "----")
        print(output.tokens, "->", len(output.tokens))


---- WLV ----
['<UNK>', '<UNK>', 'a', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '.', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', 'in', 'a', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '.', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '.', '<UNK>', '<UNK>', '<UNK>'] -> 35



---- BPE ----
['T', 'his', 'is', 'a', 'de', 'ep', 'le', 'ar', 'nin', 'g', 't', 'o', 'ken', 'iz', 'a', 'ti', 'on', 'tut', 'or', 'i', 'al', '.', 'To', 'ken', 'iz', 'a', 'ti', 'on', 'is', 't', 'he', 'f', 'ir', 'st', 'ste', 'p', 'in', 'a', 'de', 'ep', 'le', 'ar', 'nin', 'g', 'N', 'L', 'P', 'p', 'ip', 'eline', '.', 'W', 'e', 'w', 'il', 'l', 'be', 'c', 'om', 'par', 'ing', 't', 'he', 't', 'o', 'ken', 's', 'gen', 'er', 'a', 'te', 'd', 'b', 'y', 'e', 'ac', 'h', 't', 'o', 'ken', 'iz', 'a', 'ti', 'on', 'mo', 'de', 'l', '.', 'E', '<UNK>', 'ci', 'te', 'd', 'mu', 'c', 'h', '?!', '<UNK>'] -> 98


---- UNI ----
['T', 'h', 'is', 'is', 'a', 'de', 'e', 'p', 'le', 'ar', 'nin', 'g', 't', 'o', 'ke

In [8]:

# Collect the tokens produced by the three sub-word algorithms when trained on
# `large_files`, keyed by algorithm code.
# The sample text is loop-invariant, so it is defined once instead of being
# re-assigned on every iteration.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

tokens_dict = {}

for alg in ['BPE', 'UNI', 'WPC']:
    trained_tokenizer = train_tokenizer(large_files, alg)
    output = tokenize(input_string, trained_tokenizer)
    tokens_dict[alg] = output.tokens











In [10]:
import pandas as pd

# Pad every algorithm's token list to a common length so the lists can be used
# as equal-length DataFrame columns. All entries are padded, not just BPE and
# WPC: padding only those two made the DataFrame construction fail whenever
# UNI was not the longest list.
max_len = max((len(toks) for toks in tokens_dict.values()), default=0)

# The comprehension is fully built before update() runs, and only existing
# keys are re-assigned, so the column order of tokens_dict is preserved.
tokens_dict.update({
    name: toks + ['<PAD>'] * (max_len - len(toks))
    for name, toks in tokens_dict.items()
})

df = pd.DataFrame(tokens_dict)

In [11]:
# Preview the first 10 rows of the per-algorithm token table.
df.head(10)


Unnamed: 0,BPE,UNI,WPC
0,T,T,T
1,his,h,##h
2,is,is,##is
3,a,is,is
4,de,a,a
5,ep,de,de
6,le,e,##ep
7,ar,p,le
8,nin,le,##ar
9,g,ar,##ni


In [12]:
# Summarize every column of the token table; '<PAD>' filler entries are part
# of the columns and therefore counted too.
df.describe(include= 'all')

Unnamed: 0,BPE,UNI,WPC
count,108,108,108
unique,60,52,65
top,<PAD>,e,<PAD>
freq,10,7,13
