In [2]:
from datasets import load_from_disk

# Load the dataset stored in the local 'train-ds' directory.
dataset = load_from_disk('train-ds')

# Peek at the first Tibetan sentence. Index the row first and the column
# second: this reads one row, whereas dataset['tibetan'][0] would pull the
# entire 'tibetan' column into memory just to take its first element.
dataset[0]['tibetan']

'བྱིན་རླབས་བདུད་རྩིའི་སྣང་བས་འཇིག་རྟེན་ན།།'

In [3]:
from itertools import chain

from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Train a single BPE vocabulary shared by both languages.
# The raw tokenizer gets its own name so that `tokenizer` refers to exactly one
# kind of object (the PreTrainedTokenizerFast wrapper) in the rest of the notebook.
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(
    # Stream both columns back to back instead of concatenating them with `+`:
    # no second full copy of the corpus is built, and this does not rely on the
    # column object supporting list concatenation.
    chain(dataset['tibetan'], dataset['english']),
    vocab_size=32_000,
    min_frequency=5,
    show_progress=True,
    # NOTE(review): "[UNK]" is reserved here, but the wrapper below declares
    # "<unk>" as its unk_token, so "[UNK]" looks unused. It is kept so token ids
    # stay stable; confirm before removing it, since that would shift every id after it.
    special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]", "<unk>"],
)

# Wrap the trained tokenizer so it exposes the transformers tokenizer API
# (encode/decode/save_pretrained) and knows which tokens play which role.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    unk_token="<unk>",
    pad_token="[PAD]",  # Set padding token
    bos_token="[BOS]",
    eos_token="[EOS]",
)

# Round-trip sanity check: encode the first Tibetan sentence to ids and decode
# it back; the printed text should read the same as the input sentence.
enc = tokenizer.encode(dataset[0]['tibetan'])
dec = tokenizer.decode(enc)
print(dec)




བྱིན་རླབས་བདུད་རྩིའི་སྣང་བས་འཇིག་རྟེན་ན།།


In [7]:
# Persist the wrapped tokenizer to the 'custom-tokenizer' directory.
# The call is the cell's last expression, so the paths it returns are displayed.
tokenizer.save_pretrained(save_directory='custom-tokenizer')

('custom-tokenizer/tokenizer_config.json',
 'custom-tokenizer/special_tokens_map.json',
 'custom-tokenizer/tokenizer.json')