# Continue Pretraining on T5

In [None]:
from datasets import load_dataset, concatenate_datasets

# First parallel corpus, train split.
agg = load_dataset("billingsmoore/Aggregated-bo-en", split='train')

# Second parallel corpus, train split. Its 'Source'/'Target' columns are
# renamed to 'bo'/'en' (presumably to match `agg`'s column names - verify)
# so the two datasets can be concatenated row-wise.
op = (
    load_dataset("openpecha/cleaned_MT_v1.0.3", split='train')
    .rename_column('Source', 'bo')
    .rename_column('Target', 'en')
)

# Single combined corpus used by the rest of the notebook.
dataset = concatenate_datasets([agg, op])

In [None]:
# Number of rows in the combined corpus.
dataset.num_rows

NameError: name 'dataset' is not defined

## Train Tokenizer

T5 does not accommodate Tibetan in its tokenizer by default. To fix this, I've trained a custom tokenizer on the uncorrupted training data.

In [None]:
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Initialize and train a SentencePiece-style BPE tokenizer on both sides of
# the parallel corpus (the 'bo' and 'en' columns of `dataset`).
tokenizer = SentencePieceBPETokenizer()

# Stream the texts lazily instead of building `dataset['bo'] + dataset['en']`:
# that expression materialises a third, full-corpus list in memory and relies
# on column access returning a plain `list`, which is not guaranteed across
# `datasets` versions. Rows that are None or empty are skipped because they
# carry no text to learn from (and None would make the trainer raise).
corpus_texts = (
    text
    for column in ('bo', 'en')
    for text in dataset[column]
    if text
)

tokenizer.train_from_iterator(
    corpus_texts,
    vocab_size=32_000,
    min_frequency=5,  # ignore pairs seen fewer than 5 times
    show_progress=True,
    # "<unk>" is listed alongside the bracketed tokens because it is the
    # unknown token declared when this tokenizer is wrapped later on.
    special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]", "<unk>"]
)




In [None]:
# Wrap the trained tokenizer with PreTrainedTokenizerFast so it exposes the
# `transformers` tokenizer interface and knows which tokens play the
# unknown / padding / beginning-of-sequence / end-of-sequence roles.
#
# This cell rebinds `tokenizer` to the wrapper. The isinstance guard keeps the
# cell idempotent: re-running it would otherwise pass the wrapper itself back
# in as `tokenizer_object` instead of the trained tokenizer.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="<unk>",
        pad_token="[PAD]",  # Set padding token
        bos_token="[BOS]",
        eos_token="[EOS]"
    )

In [None]:
# Write the wrapped tokenizer's files to a local directory for later reuse.
tokenizer.save_pretrained(save_directory='./my_tokenizer')