# Continue Pretraining on T5

In [1]:
from datasets import load_from_disk

# On-disk location of the raw dataset, relative to the notebook's working directory.
RAW_DATASET_PATH = '../RawData/raw-ds'

# Load all splits at once; the bare name on the last line makes the notebook
# display a summary of the splits, their features and row counts.
dataset = load_from_disk(RAW_DATASET_PATH)
dataset

DatasetDict({
    train: Dataset({
        features: ['bo', 'en'],
        num_rows: 861417
    })
    dev: Dataset({
        features: ['bo', 'en'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['bo', 'en'],
        num_rows: 100000
    })
})

In [2]:
# Keep only the training split for the rest of the notebook.
# `dataset` is rebound here from the dict of splits to a single split, so the
# guard keeps the cell safe to re-run: once narrowed, the name no longer holds a
# mapping of splits and indexing it with 'train' a second time would fail.
if isinstance(dataset, dict):  # DatasetDict is a dict subclass
    dataset = dataset['train']

## Train Tokenizer

T5 does not accommodate Tibetan in its tokenizer by default. To fix this, I've trained a custom tokenizer on the uncorrupted training data.

In [3]:
from itertools import chain

from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Train a SentencePiece-style BPE tokenizer on both text columns ('bo' and 'en')
# of the training split, so a single vocabulary covers both languages.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(
    # Stream the two columns back to back instead of concatenating them with `+`:
    # this avoids building a second full-size list in memory and does not rely on
    # the column objects supporting list concatenation.
    chain(dataset['bo'], dataset['en']),
    vocab_size=32_000,
    min_frequency=5,  # merges seen fewer than 5 times are not added to the vocabulary
    show_progress=True,
    # NOTE(review): both "[UNK]" and "<unk>" are reserved here, but only "<unk>" is
    # used as the unknown token when the tokenizer is wrapped later in the notebook.
    # "[UNK]" looks redundant; it is kept so that token ids stay unchanged.
    special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]", "<unk>"],
    # A chained iterator has no len(); pass the total (one entry per row per
    # column) so the progress bar can still show completion.
    length=2 * len(dataset),
)






In [4]:
# Expose the trained tokenizer through the transformers fast-tokenizer interface.
# Each role below is mapped to one of the special tokens reserved during training.
special_token_roles = {
    "unk_token": "<unk>",
    "pad_token": "[PAD]",  # token used for padding
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
}

# Note: `tokenizer` is rebound here from the trained SentencePieceBPETokenizer
# to the PreTrainedTokenizerFast wrapper built around it.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_token_roles)

In [5]:
# Directory (relative to the notebook) that receives the tokenizer files.
TOKENIZER_DIR = './my_tokenizer'

# Left as the last expression so the notebook displays the paths that were written.
tokenizer.save_pretrained(TOKENIZER_DIR)

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/tokenizer.json')