## Hugging Face Tokenizers Tutorial and Usage

Reference: https://www.kaggle.com/funtowiczmo/hugging-face-tutorials-training-tokenizer

In [1]:
# Optional setup: uncomment the line below to install the pinned
# tokenizers release (0.5.2) before running the rest of the notebook.
# !pip install tokenizers==0.5.2

In [12]:
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Lowercase, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# Start from an empty Byte Pair Encoding (BPE) model wrapped in a Tokenizer;
# its vocabulary and merges are filled in later by training.
tokenizer = Tokenizer(BPE.empty())

# Normalization: apply NFKC unicode normalization, then lower-case the text.
# NOTE: because of the lower-casing, decoded text will not preserve the
# original casing of the input.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Pre-tokenization: use a pre-tokenizer to convert the input to a
# byte-level representation.
tokenizer.pre_tokenizer = ByteLevel()

# Decoding: attach the matching byte-level decoder so that token ids can be
# turned back into readable text.
tokenizer.decoder = ByteLevelDecoder()


### Train the pipeline

In [13]:
from tokenizers.trainers import BpeTrainer

# Seed the trainer with the byte-level alphabet and cap the vocabulary size.
byte_level_alphabet = ByteLevel.alphabet()
trainer = BpeTrainer(
    vocab_size=25000,
    show_progress=True,
    initial_alphabet=byte_level_alphabet,
)

# Train the tokenizer on the listed text file(s).
training_files = ["./big.txt"]
tokenizer.train(trainer, training_files)

# Report the vocabulary size the tokenizer ended up with.
print(f"Trained vocab size {tokenizer.get_vocab_size()}")

Trained vocab size 25000


In [15]:
# Write the trained model's files into the current directory and show
# the paths that the save call reports back.
saved_model_files = tokenizer.model.save(".")
saved_model_files

['./vocab.json', './merges.txt']

In [17]:
# Replace the tokenizer's model with one rebuilt from the vocabulary and
# merges files on disk.
vocab_path, merges_path = "./vocab.json", "./merges.txt"
tokenizer.model = BPE.from_files(vocab_path, merges_path)

# Tokenize a sample sentence and show the resulting token strings.
sample_text = "This is a simple input to be tokenized"
encoding = tokenizer.encode(sample_text)
print(f"Encoded String {encoding.tokens}")

# Turn the token ids back into text. In the recorded run the result is
# lower-cased and starts with a space, so it is not identical to the input.
decoded = tokenizer.decode(encoding.ids)
print(f"Decoded String {decoded}")

Encoded String ['Ġthis', 'Ġis', 'Ġa', 'Ġsimple', 'Ġin', 'put', 'Ġto', 'Ġbe', 'Ġtoken', 'ized']
Decoded String  this is a simple input to be tokenized
