## Load Tokenizer

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer saved under the sibling Models directory; the
# preprocessing function below reads it as a module-level global.
tokenizer_dir = '../Models/my_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

## Preprocess Data

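The dataset can now be tokenized for training: each English sentence is prefixed with a task instruction and tokenized together with its Tibetan target, with both sides truncated or padded to 256 tokens.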

In [2]:
def translation_preprocess_function(examples):
    """Tokenize a batch of English-to-Tibetan pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with parallel lists under ``'en'`` (source
            sentences) and ``'bo'`` (target sentences), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch. Inputs and targets are truncated
        or padded to 256 tokens, and padding positions in ``'labels'`` are
        replaced by ``-100``.
    """
    # Prepare translation inputs and targets; the task prefix is prepended
    # to every source sentence.
    translation_inputs = ['Translate English to Tibetan: ' + example for example in examples['en']]
    translation_targets = list(examples['bo'])

    # Tokenize translation inputs and targets in a single call
    translation_model_inputs = tokenizer(translation_inputs, text_target=translation_targets,
                                         max_length=256, truncation=True, padding="max_length")

    # With padding="max_length" the labels are filled with the pad token id.
    # Left as-is, those positions would contribute to the training loss, so
    # they are replaced by -100, the label value the loss ignores.
    # NOTE(review): if the tokenizer reuses its EOS token as the pad token this
    # also masks EOS in the labels - confirm the two ids are distinct.
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
        translation_model_inputs['labels'] = [
            [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
            for label_ids in translation_model_inputs['labels']
        ]

    return translation_model_inputs


### Train Split Data

In [3]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

n_shards = 10
dataset = load_from_disk('../RawData/raw-ds')
train_data = dataset['train']
total_len = len(train_data)
shard_size = total_len // n_shards

# Tokenize the train split in contiguous slices, writing each slice to disk
# and releasing it before the next one is processed.
for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    is_last_shard = i == n_shards - 1
    start = shard_size * i
    # The final slice runs to the end of the split so the division
    # remainder is not dropped.
    end = total_len if is_last_shard else start + shard_size

    tokenized_shard = train_data.select(range(start, end)).map(
        translation_preprocess_function, batched=True
    )
    tokenized_shard.save_to_disk(f'Data/en-bo/tokenized-shards/train/tokenized-shard_{i}')

    # Drop the reference and collect garbage before the next iteration.
    del tokenized_shard
    gc.collect()


Tokenizing shards:   0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  10%|█         | 1/10 [00:36<05:32, 36.94s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  20%|██        | 2/10 [01:12<04:51, 36.40s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  30%|███       | 3/10 [01:49<04:13, 36.24s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  40%|████      | 4/10 [02:24<03:36, 36.14s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 5/10 [03:01<03:00, 36.14s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  60%|██████    | 6/10 [03:39<02:27, 36.79s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  70%|███████   | 7/10 [04:16<01:51, 37.01s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  80%|████████  | 8/10 [04:52<01:13, 36.70s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  90%|█████████ | 9/10 [05:30<00:37, 37.01s/it]

Map:   0%|          | 0/86148 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86148 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 10/10 [06:07<00:00, 36.71s/it]


### Dev Split Data

In [4]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

n_shards = 4
dataset = load_from_disk('../RawData/raw-ds')
# Hold the dev split under its own name so it does not overwrite any
# existing `train_data` variable in the kernel.
dev_data = dataset['dev']
total_len = len(dev_data)
shard_size = total_len // n_shards

# Tokenize the dev split in contiguous slices, saving each slice to disk and
# freeing it before moving on.
for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len  # include remainder in last shard

    shard = dev_data.select(range(start, end))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'Data/en-bo/tokenized-shards/dev/tokenized-shard_{i}')

    # Release both references and collect garbage before the next slice.
    del shard
    del tokenized_shard
    gc.collect()

Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:04<00:14,  4.77s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:09<00:09,  4.75s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:14<00:04,  4.72s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [00:18<00:00,  4.72s/it]


### Test Split Data

In [5]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

n_shards = 4
dataset = load_from_disk('../RawData/raw-ds')
# Hold the test split under its own name so it does not overwrite any
# existing `train_data` variable in the kernel.
test_data = dataset['test']
total_len = len(test_data)
shard_size = total_len // n_shards

# Tokenize the test split in contiguous slices, saving each slice to disk and
# freeing it before moving on.
for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len  # include remainder in last shard

    shard = test_data.select(range(start, end))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'Data/en-bo/tokenized-shards/test/tokenized-shard_{i}')

    # Release both references and collect garbage before the next slice.
    del shard
    del tokenized_shard
    gc.collect()



Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:09<00:28,  9.57s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:18<00:18,  9.47s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:28<00:09,  9.50s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [00:38<00:00,  9.50s/it]


## Concatenate Tokenized Shards

In [6]:
from datasets import load_from_disk, concatenate_datasets
import os

shard_dir = 'Data/en-bo/tokenized-shards/train'
shard_prefix = 'tokenized-shard_'

# Order shards by their numeric index rather than by name: a plain string
# sort would place "tokenized-shard_10" before "tokenized-shard_2" and
# scramble the row order as soon as there are more than ten shards.
# Entries without a numeric suffix are kept and sorted by name after the
# numbered ones.
shard_names = [d for d in os.listdir(shard_dir) if d.startswith(shard_prefix)]
shard_names.sort(
    key=lambda d: (0, int(d[len(shard_prefix):]), d)
    if d[len(shard_prefix):].isdecimal()
    else (1, 0, d)
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

# Load every shard and stitch them back into a single train split.
datasets = [load_from_disk(path) for path in shard_paths]
train = concatenate_datasets(datasets)

In [7]:
shard_dir = 'Data/en-bo/tokenized-shards/dev'
shard_prefix = 'tokenized-shard_'

# Order shards by their numeric index rather than by name: a plain string
# sort would place "tokenized-shard_10" before "tokenized-shard_2" and
# scramble the row order as soon as there are more than ten shards.
# Entries without a numeric suffix are kept and sorted by name after the
# numbered ones.
shard_names = [d for d in os.listdir(shard_dir) if d.startswith(shard_prefix)]
shard_names.sort(
    key=lambda d: (0, int(d[len(shard_prefix):]), d)
    if d[len(shard_prefix):].isdecimal()
    else (1, 0, d)
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

# Load every shard and stitch them back into a single dev split.
datasets = [load_from_disk(path) for path in shard_paths]
dev = concatenate_datasets(datasets)

In [8]:
shard_dir = 'Data/en-bo/tokenized-shards/test'
shard_prefix = 'tokenized-shard_'

# Order shards by their numeric index rather than by name: a plain string
# sort would place "tokenized-shard_10" before "tokenized-shard_2" and
# scramble the row order as soon as there are more than ten shards.
# Entries without a numeric suffix are kept and sorted by name after the
# numbered ones.
shard_names = [d for d in os.listdir(shard_dir) if d.startswith(shard_prefix)]
shard_names.sort(
    key=lambda d: (0, int(d[len(shard_prefix):]), d)
    if d[len(shard_prefix):].isdecimal()
    else (1, 0, d)
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

# Load every shard and stitch them back into a single test split.
datasets = [load_from_disk(path) for path in shard_paths]
test = concatenate_datasets(datasets)

In [9]:
from datasets import DatasetDict

# Bundle the three concatenated splits under their split names and persist
# the combined dataset to disk.
ds = DatasetDict({'train': train, 'dev': dev, 'test': test})
ds.save_to_disk('Data/en-bo/tokenized-finetuning-ds')

# Trailing expression so the notebook renders the split summary.
ds

Saving the dataset (0/8 shards):   0%|          | 0/861417 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 861417
    })
    dev: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
})