## Import statements

In [1]:
from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import AutoTokenizer, PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


## Data loading

In [2]:
dataset = load_dataset("rotten_tomatoes", split="train")


def get_training_corpus():
    for i in range(0, len(dataset), 1000):
        yield dataset[i: i + 1000]["text"]

## Initialization/training

In [3]:
tokenizer = Tokenizer(models.Unigram())
from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)





## Testing

In [4]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)
print(encoding.ids)
print(tokenizer.decode(encoding.ids))
print(len(tokenizer.get_vocab()))
print(tokenizer.get_vocab_size())

['▁', 'L', 'e', "t's", '▁test', '▁this', '▁to', 'ke', 'niz', 'er', '.']
[7, 2, 20, 2036, 1331, 35, 15, 3026, 10835, 99, 8]
▁ e t's ▁test ▁this ▁to ke niz er .
12344
12344


## Saving

In [5]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)

wrapped_tokenizer.save_pretrained("../../saved_models/tokenizers/rotten_tomatoes_unigram_style")

('../../saved_models/tokenizers/rotten_tomatoes_unigram_style/tokenizer_config.json',
 '../../saved_models/tokenizers/rotten_tomatoes_unigram_style/special_tokens_map.json',
 '../../saved_models/tokenizers/rotten_tomatoes_unigram_style/tokenizer.json')

In [6]:
tok = AutoTokenizer.from_pretrained("../../saved_models/tokenizers/rotten_tomatoes_unigram_style")

tokens = tok.tokenize("Test number 2.!@#$%^&*()")

print(tokens)
ids = tok.convert_tokens_to_ids(tokens)
print(ids)
decoded_string = tok.decode(ids)
print(decoded_string)
print(len(tok.get_vocab()))

['▁', 'T', 'est', '▁number', '▁2', '.', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')']
[7, 2, 536, 1684, 916, 8, 298, 2, 6832, 12339, 10445, 2, 1589, 1103, 153, 152]
▁ <unk> est ▁number ▁2 . ! <unk> # $ % <unk> & * ( )
12344
