## Loading the Data

In [27]:
from datasets import load_dataset, concatenate_datasets

# First parallel corpus; used exactly as published.
mont = load_dataset('billingsmoore/montano-bo-es', split='train')

# Second parallel corpus: rename its language columns to the short 'bo'/'es'
# codes and drop the 'phonetic' column so only the two text columns remain.
lh = (
    load_dataset('billingsmoore/tibetan-to-spanish-translation-dataset', split='train')
    .rename_columns({'tibetan': 'bo', 'spanish': 'es'})
    .remove_columns(['phonetic'])
)

# Stack both corpora row-wise into the single dataset used by later cells.
dataset = concatenate_datasets([mont, lh])

In [28]:
# Sanity check: show the combined dataset's feature names and row count.
dataset

Dataset({
    features: ['bo', 'es'],
    num_rows: 21894
})

In [30]:
# Peek at the first example to confirm it holds a 'bo'/'es' text pair.
dataset[0]

{'bo': 'སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ།',
 'es': '¡Homenaje a todos los Budas y Bodhisatvas!'}

## Train Tokenizer

In [31]:
from itertools import chain

from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Train a single BPE vocabulary shared by both sides of the parallel corpus
# (the 'bo' and 'es' columns).
#
# chain() streams the two columns back to back. Unlike `+`, it does not
# require dataset['col'] to be a plain Python list (newer `datasets` releases
# return a lazy column object there), and it avoids materialising a second,
# concatenated copy of the whole corpus in memory.
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(
    chain(dataset['bo'], dataset['es']),
    vocab_size=32_000,
    min_frequency=5,
    show_progress=True,
    special_tokens=["[PAD]", "[BOS]", "[EOS]", "<unk>"]
)

# Wrap the trained tokenizer in PreTrainedTokenizerFast and declare which of
# the special tokens registered above plays which role. The raw trainer keeps
# its own name (`bpe_tokenizer`) so that `tokenizer` only ever refers to the
# wrapped object.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    unk_token="<unk>",
    pad_token="[PAD]",  # Set padding token
    bos_token="[BOS]",
    eos_token="[EOS]"
)

# Save tokenizer for re-use; as the cell's last expression, the returned
# tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained('bo-es-tokenizer')









('bo-es-tokenizer/tokenizer_config.json',
 'bo-es-tokenizer/special_tokens_map.json',
 'bo-es-tokenizer/tokenizer.json')