## Loading the Data

In [1]:
from datasets import load_from_disk, Dataset

# Directory holding the combined dataset (indexed below by 'train' / 'test').
combined_ds_dir = 'combined-ds'
dataset = load_from_disk(combined_ds_dir)

In [2]:
from datasets import Dataset

# First shard: rows [0, n // 4) of the training split.
train_split = dataset['train']
first = train_split.select(range(0, len(train_split) // 4))

In [2]:
# Second shard: rows [n // 4, n // 2) of the training split.
train_split = dataset['train']
second_start = len(train_split) // 4
second_stop = len(train_split) // 2
second = train_split.select(range(second_start, second_stop))

In [2]:
# Third shard: rows [n // 2, n - n // 4) of the training split.
train_split = dataset['train']
third_start = len(train_split) // 2
third_stop = len(train_split) - len(train_split) // 4
third = train_split.select(range(third_start, third_stop))

In [7]:
# Fourth shard: rows [n - n // 4, n) of the training split, i.e. the remainder.
train_split = dataset['train']
fourth_start = len(train_split) - len(train_split) // 4
fourth = train_split.select(range(fourth_start, len(train_split)))

## Load Tokenizer

In [2]:
from transformers import AutoTokenizer

# Name/path handed to AutoTokenizer; the resulting `tokenizer` is used by the
# preprocessing function defined further down.
tokenizer_source = 'my_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

## Preprocess Data

The dataset can now be tokenized for training.

In [3]:
def translation_preprocess_function(examples, *, prefix='Translate Tibetan to English: ',
                                    max_length=256):
    """Tokenize one batch of Tibetan -> English pairs for translation training.

    Relies on the module-level ``tokenizer`` being defined before the call.

    Args:
        examples: Batch mapping with a 'bo' column of source strings and an
            'en' column of target strings (the form ``Dataset.map`` passes
            when ``batched=True``).
        prefix: Task instruction prepended to every source string.
        max_length: ``max_length`` forwarded to the tokenizer, which is called
            with ``truncation=True`` and ``padding="max_length"``.

    Returns:
        Whatever the tokenizer returns for the batch; the targets are supplied
        through its ``text_target`` argument.
    """
    # Prepare translation inputs and targets
    translation_inputs = [prefix + source for source in examples['bo']]
    translation_targets = list(examples['en'])

    # Tokenize translation inputs and targets in a single call
    return tokenizer(
        translation_inputs,
        text_target=translation_targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )


In [6]:
# Apply the batched preprocessing to the first shard, then write it to disk.
first_out_dir = 'tokenized-shards/tokenized-first-ds'
first_tokenized = first.map(function=translation_preprocess_function, batched=True)
first_tokenized.save_to_disk(first_out_dir)

Map:   0%|          | 0/382801 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/382801 [00:00<?, ? examples/s]

In [8]:
# Apply the batched preprocessing to the second shard, then write it to disk.
second_out_dir = 'tokenized-shards/tokenized-second-ds'
second_tokenized = second.map(function=translation_preprocess_function, batched=True)
second_tokenized.save_to_disk(second_out_dir)

Map:   0%|          | 0/382802 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/382802 [00:00<?, ? examples/s]

In [6]:
# Tokenize the third training shard. It is saved under 'tokenized-shards/'
# like the first and second shards, because the concatenation step reloads it
# from 'tokenized-shards/tokenized-third-ds'.
third_tokenized = third.map(translation_preprocess_function, batched=True)
third_tokenized.save_to_disk('tokenized-shards/tokenized-third-ds')

Saving the dataset (0/4 shards):   0%|          | 0/382803 [00:00<?, ? examples/s]

In [8]:
# Tokenize the fourth training shard. It is saved under 'tokenized-shards/'
# like the first and second shards, because the concatenation step reloads it
# from 'tokenized-shards/tokenized-fourth-ds'.
fourth_tokenized = fourth.map(translation_preprocess_function, batched=True)
fourth_tokenized.save_to_disk('tokenized-shards/tokenized-fourth-ds')

Map:   0%|          | 0/382801 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/382801 [00:00<?, ? examples/s]

In [4]:
# Tokenize the test split. It is saved under 'tokenized-shards/' because the
# final assembly step reloads it from 'tokenized-shards/tokenized-test-ds'.
test_dataset = dataset['test'].map(translation_preprocess_function, batched=True)
test_dataset.save_to_disk('tokenized-shards/tokenized-test-ds')

Map:   0%|          | 0/170135 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/170135 [00:00<?, ? examples/s]

## Concatenate Tokenized Shards

In [3]:
from datasets import load_from_disk, concatenate_datasets

# Reload the four tokenized training shards in their original order.
tokenized_shard_dirs = (
    'tokenized-shards/tokenized-first-ds',
    'tokenized-shards/tokenized-second-ds',
    'tokenized-shards/tokenized-third-ds',
    'tokenized-shards/tokenized-fourth-ds',
)
first, second, third, fourth = map(load_from_disk, tokenized_shard_dirs)

# Stitch the shards back together into one training set.
train = concatenate_datasets([first, second, third, fourth])

In [4]:
from datasets import DatasetDict

# Reload the tokenized test split and bundle it with the concatenated
# training set, keeping the 'train' / 'test' key order.
test = load_from_disk('tokenized-shards/tokenized-test-ds')
ds = DatasetDict({'train': train, 'test': test})

# Persist the combined, tokenized dataset.
ds.save_to_disk('tokenized-ds')

Saving the dataset (0/14 shards):   0%|          | 0/1531207 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/170135 [00:00<?, ? examples/s]