## Import statements

In [58]:
from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import AutoTokenizer, PreTrainedTokenizerFast

## Data loading

In [59]:
dataset = load_dataset("rotten_tomatoes", split="train")


def get_training_corpus():
    for i in range(0, len(dataset), 1000):
        yield dataset[i: i + 1000]["text"]

## Initialization/training

In [60]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
tokenizer.decoder = decoders.WordPiece(prefix="##")






## Testing

In [61]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)
print(encoding.ids)
print(tokenizer.decode(encoding.ids))

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']
[1226, 11, 55, 2097, 183, 11558, 2487, 11435, 18]
let ' s test this tokenizer.


## Saving

In [62]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

wrapped_tokenizer.save_pretrained("../../saved_models/tokenizers/rotten_tomatoes_bert_style")

('../../saved_models/tokenizers/rotten_tomatoes_bert_style/tokenizer_config.json',
 '../../saved_models/tokenizers/rotten_tomatoes_bert_style/special_tokens_map.json',
 '../../saved_models/tokenizers/rotten_tomatoes_bert_style/tokenizer.json')

In [63]:
tok = AutoTokenizer.from_pretrained("../../saved_models/tokenizers/rotten_tomatoes_bert_style")

tokens = tok.tokenize("Test number 2.!@#$%^&*()")

print(tokens)
ids = tok.convert_tokens_to_ids(tokens)
print(ids)
decoded_string = tok.decode(ids)
print(decoded_string)

['test', 'number', '2', '.', '!', '[UNK]', '#', '$', '%', '[UNK]', '&', '*', '(', ')']
[2097, 3216, 22, 18, 5, 0, 7, 8, 9, 0, 10, 14, 12, 13]
test number 2.! [UNK] # $ % [UNK] & * ( )
