# Convert SentencePiece to FastTokenizer

### Load SPM

In [1]:
import sentencepiece as spm

In [2]:
tokenizer_model = "tokenizer.model"
sealion_spm = spm.SentencePieceProcessor(model_file=tokenizer_model)

In [3]:
# Sanity check
sealion_spm.encode("There can be only one")

[1173, 493, 384, 851, 613]

### Extract vocab and merge from SPM

In [6]:
%run sentencepiece_extractor --provider sentencepiece --model tokenizer/tokenizer.model --vocab-output-path tokenizer/vocab.json --merges-output-path tokenizer/merges.txt

100%|██████████| 256000/256000 [00:00<00:00, 1398410.88it/s]
100%|██████████| 256000/256000 [2:09:21<00:00, 32.98it/s]  


### Load extracted vocab and merges into Tokenizers

In [7]:
from tokenizers import SentencePieceBPETokenizer

In [8]:
sealion_spBPE = SentencePieceBPETokenizer.from_file(merges_filename="tokenizer/merges.txt", vocab_filename="tokenizer/vocab.json")
sealion_spBPE

Tokenizer(vocabulary_size=256000, model=SentencePieceBPE, unk_token=<unk>, replacement=▁, add_prefix_space=True, dropout=None)

In [9]:
sealion_spBPE.save("tokenizer/sealion_spBPETokenizer.json")

### Cast SentencePieceBPETokenizer to PreTrainedTokenizerFast

In [7]:
from transformers import PreTrainedTokenizerFast

In [11]:
sealion_tokenizer_fast = PreTrainedTokenizerFast(tokenizer_file="tokenizer/sealion_spBPETokenizer.json")
sealion_tokenizer_fast

PreTrainedTokenizerFast(name_or_path='', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)

In [12]:
# Sanity check
sealion_tokenizer_fast("There can be only one")

{'input_ids': [1173, 493, 384, 851, 613], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [13]:
sealion_tokenizer_fast.save_pretrained("SEAlion_FastTokenizer_TMP")

('SEABert_FastTokenizer_TMP/tokenizer_config.json',
 'SEABert_FastTokenizer_TMP/special_tokens_map.json',
 'SEABert_FastTokenizer_TMP/tokenizer.json')

### Load PretrainedTokenizerFast via AutoTokenizer and assign Special Tokens

In [6]:
from transformers import AutoTokenizer

In [15]:
sealion_fasttokenizer_tmp = AutoTokenizer.from_pretrained("SEAlion_FastTokenizer_TMP")
sealion_fasttokenizer_tmp

PreTrainedTokenizerFast(name_or_path='SEABert_FastTokenizer_TMP', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)

In [16]:
# Sanity check
sealion_fasttokenizer_tmp("There can be only one")

{'input_ids': [1173, 493, 384, 851, 613], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [36]:
# Assign PAD, BOS, EOS, MASK, UNK tokens
sealion_fasttokenizer_tmp.pad_token_id = 0
sealion_fasttokenizer_tmp.bos_token_id = 1
sealion_fasttokenizer_tmp.eos_token_id = 2
sealion_fasttokenizer_tmp.mask_token_id = 3
sealion_fasttokenizer_tmp.unk_token_id = 8

In [37]:
# Sanity check
print(sealion_fasttokenizer_tmp.pad_token)
print(sealion_fasttokenizer_tmp.bos_token)
print(sealion_fasttokenizer_tmp.eos_token)
print(sealion_fasttokenizer_tmp.mask_token)
print(sealion_fasttokenizer_tmp.unk_token)

<|pad|>
<|bos|>
<|eos|>
<|mask|>
<|unk|>


In [38]:
# Assign all other Special Tokens
add_special_tokens = [
    "<|endofline|>", "\n", "\t", "\r", "\b",]
langs = [
    "<|en|>", "<|zh|>", "<|id|>", "<|ms|>", "<|tl|>", "<|my|>",
    "<|th|>", "<|lo|>", "<|km|>", "<|ta|>", "<|vi|>",
    "<|python|>", "<|javascript|>", "<|shell|>", "<|sql|>"]
for n in range(24, 1, -1):
    add_special_tokens.append(" "*n)
add_special_tokens.extend(langs)

In [39]:
sealion_fasttokenizer_tmp.add_special_tokens({"additional_special_tokens": add_special_tokens})
sealion_fasttokenizer_tmp.special_tokens_map

{'bos_token': '<|bos|>',
 'eos_token': '<|eos|>',
 'unk_token': '<|unk|>',
 'pad_token': '<|pad|>',
 'mask_token': '<|mask|>',
 'additional_special_tokens': ['<|endofline|>',
  '\n',
  '\t',
  '\r',
  '\x08',
  '                        ',
  '                       ',
  '                      ',
  '                     ',
  '                    ',
  '                   ',
  '                  ',
  '                 ',
  '                ',
  '               ',
  '              ',
  '             ',
  '            ',
  '           ',
  '          ',
  '         ',
  '        ',
  '       ',
  '      ',
  '     ',
  '    ',
  '   ',
  '  ',
  '<|en|>',
  '<|zh|>',
  '<|id|>',
  '<|ms|>',
  '<|tl|>',
  '<|my|>',
  '<|th|>',
  '<|lo|>',
  '<|km|>',
  '<|ta|>',
  '<|vi|>',
  '<|python|>',
  '<|javascript|>',
  '<|shell|>',
  '<|sql|>']}

In [40]:
# Save final version of tokenizer
sealion_fasttokenizer_tmp.save_pretrained("SEAlion_FastTokenizer_Final")

('SEABert_FastTokenizer_Final/tokenizer_config.json',
 'SEABert_FastTokenizer_Final/special_tokens_map.json',
 'SEABert_FastTokenizer_Final/tokenizer.json')

### Verification

In [8]:
sealion_tokenizer = AutoTokenizer.from_pretrained("SEAlion_FastTokenizer_Final")

In [9]:
# Check tokenizer type
isinstance(sealion_tokenizer, PreTrainedTokenizerFast)

True

In [44]:
# Check vocab size
sealion_tokenizer.vocab_size

256000

In [43]:
# Check special tokens
sealion_tokenizer("<|sql|> is a language written in <|en|>")

{'input_ids': [45, 371, 303, 4610, 4247, 337, 249923, 31], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [45]:
sealion_tokenizer.convert_tokens_to_ids("<|sql|>")

45

In [46]:
sealion_tokenizer.convert_tokens_to_ids("<|en|>")

31

In [47]:
sealion_tokenizer("                        There can be only one")

{'input_ids': [256001, 1173, 493, 384, 851, 613], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [48]:
sealion_tokenizer.convert_tokens_to_ids("                        ")

256001

In [4]:
sealion_tokenizer("Sea Lion is awesome")

{'input_ids': [9975, 19723, 371, 6032], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [5]:
sealion_tokenizer.decode([9975, 19723, 371, 6032])

'Sea Lion is awesome'