## Load Tokenizer

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer from a local directory (path is relative to the notebook's
# working directory); the preprocessing function below reads this global.
tokenizer = AutoTokenizer.from_pretrained(
    '../Models/my_tokenizer'
)

## Preprocess Data

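Each Tibetan (`bo`) sentence is prefixed with a task instruction and tokenized together with its English (`en`) translation, so the dataset is ready for training.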

In [2]:
def translation_preprocess_function(examples, ignore_pad_token_for_loss=True):
    """Tokenize a batch of Tibetan -> English translation pairs.

    Intended to be used with ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch mapping with a ``'bo'`` column (source sentences) and
            an ``'en'`` column (target sentences), each a list of strings.
        ignore_pad_token_for_loss: When True (default), padding positions in
            ``labels`` are replaced by ``-100`` so that loss functions using
            ``ignore_index=-100`` do not train on padding. Pass False to keep
            the raw pad token ids in the labels.

    Returns:
        The tokenizer output for the prefixed sources, with the tokenized
        targets stored under ``'labels'``. Every sequence is truncated/padded
        to 256 tokens.

    Note:
        Code that decodes ``labels`` back to text must first map ``-100`` back
        to ``tokenizer.pad_token_id``.
    """
    # Prepare translation inputs and targets
    translation_inputs = ['Translate Tibetan to English: ' + example for example in examples['bo']]
    translation_targets = list(examples['en'])

    # Tokenize translation inputs and targets
    translation_model_inputs = tokenizer(translation_inputs, text_target=translation_targets,
                                         max_length=256, truncation=True, padding="max_length")

    if ignore_pad_token_for_loss:
        # Labels were padded to max_length with the pad token id. Without masking,
        # the model would be trained to predict padding after every target.
        # NOTE(review): assumes the pad token is distinct from the EOS token;
        # if they share an id, EOS would be masked as well — confirm for this tokenizer.
        pad_token_id = tokenizer.pad_token_id
        translation_model_inputs['labels'] = [
            [(-100 if token_id == pad_token_id else token_id) for token_id in label_ids]
            for label_ids in translation_model_inputs['labels']
        ]

    return translation_model_inputs


### Train Split Data

In [4]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the train split in contiguous shards so that only one tokenized
# shard has to be held at a time; each shard is written to disk immediately.
n_shards = 10
dataset = load_from_disk('../RawData/combined-ds')
train_data = dataset['train']
total_len = len(train_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    if i == n_shards - 1:
        # The last shard absorbs the remainder of the integer division.
        end = total_len
    else:
        end = start + shard_size

    tokenized_shard = train_data.select(range(start, end)).map(
        translation_preprocess_function, batched=True
    )
    tokenized_shard.save_to_disk(f'tokenized-shards/train/tokenized-shard_{i}')

    # Drop the reference before the next iteration and force a collection pass.
    del tokenized_shard
    gc.collect()


Tokenizing shards:   0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  10%|█         | 1/10 [00:51<07:47, 51.93s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  20%|██        | 2/10 [01:43<06:52, 51.55s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  30%|███       | 3/10 [02:35<06:04, 52.05s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  40%|████      | 4/10 [03:29<05:15, 52.51s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 5/10 [04:21<04:22, 52.52s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  60%|██████    | 6/10 [05:13<03:29, 52.43s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  70%|███████   | 7/10 [06:05<02:36, 52.11s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  80%|████████  | 8/10 [06:59<01:45, 52.77s/it]

Map:   0%|          | 0/153120 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153120 [00:00<?, ? examples/s]

Tokenizing shards:  90%|█████████ | 9/10 [07:52<00:52, 52.73s/it]

Map:   0%|          | 0/153127 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/153127 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 10/10 [08:43<00:00, 52.36s/it]


### Test Split Data

In [3]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the test split in contiguous shards, writing each shard to disk
# as soon as it is tokenized.
n_shards = 4
dataset = load_from_disk('../RawData/combined-ds')
# Keep the test split under its own name: binding it to `train_data` would
# silently replace the train split in the kernel namespace when the cells
# are run top to bottom.
test_data = dataset['test']
total_len = len(test_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len  # include remainder in last shard

    shard = test_data.select(range(start, end))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'tokenized-shards/test/tokenized-shard_{i}')

    # Drop references before the next iteration and force a collection pass.
    del shard
    del tokenized_shard
    gc.collect()



Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/42533 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42533 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:15<00:46, 15.48s/it]

Map:   0%|          | 0/42533 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42533 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:31<00:31, 15.59s/it]

Map:   0%|          | 0/42533 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42533 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:46<00:15, 15.41s/it]

Map:   0%|          | 0/42536 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42536 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [01:01<00:00, 15.40s/it]


## Concatenate Tokenized Shards

In [5]:
from datasets import load_from_disk, concatenate_datasets
import os

shard_dir = 'tokenized-shards/train'
shard_prefix = 'tokenized-shard_'

# Order shards by their integer suffix. A plain string sort is lexicographic
# and would place 'tokenized-shard_10' before 'tokenized-shard_2' as soon as
# more than ten shards exist, silently permuting the rows. Entries whose
# suffix is not an integer are skipped.
shard_names = sorted(
    (d for d in os.listdir(shard_dir)
     if d.startswith(shard_prefix) and d[len(shard_prefix):].isdecimal()),
    key=lambda d: int(d[len(shard_prefix):]),
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
train = concatenate_datasets(datasets)

In [6]:
shard_dir = 'tokenized-shards/test'
shard_prefix = 'tokenized-shard_'

# Order shards by their integer suffix. A plain string sort is lexicographic
# and would place 'tokenized-shard_10' before 'tokenized-shard_2' as soon as
# more than ten shards exist, silently permuting the rows. Entries whose
# suffix is not an integer are skipped.
shard_names = sorted(
    (d for d in os.listdir(shard_dir)
     if d.startswith(shard_prefix) and d[len(shard_prefix):].isdecimal()),
    key=lambda d: int(d[len(shard_prefix):]),
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
test = concatenate_datasets(datasets)

In [7]:
from datasets import DatasetDict

# Bundle the two tokenized splits under their split names and persist them
# as a single dataset directory.
ds = DatasetDict({
    'train': train,
    'test': test,
})

ds.save_to_disk('tokenized-finetuning-ds')

Saving the dataset (0/14 shards):   0%|          | 0/1531207 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/170135 [00:00<?, ? examples/s]