## Load Tokenizer

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer from the path below; the preprocessing functions in this
# notebook read it through the module-level name `tokenizer`.
TOKENIZER_DIR = '../Models/my_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

## Preprocess Data

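The dataset can now be tokenized for training. Each sentence pair is tokenized in both directions (Tibetan → English and English → Tibetan), with a task prefix prepended to the source text, and every sequence is truncated and padded to 256 tokens.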

In [2]:
def translation_preprocess_function_bo_en(examples):
    """Tokenize a batch for the Tibetan -> English direction.

    Args:
        examples: Batch mapping holding parallel sequences of strings under
            the keys 'bo' (source text) and 'en' (target text).

    Returns:
        The result of calling the module-level `tokenizer` on the prefixed
        sources, with the targets passed as `text_target`, using
        max_length=256, truncation and padding to max length.
    """
    task_prefix = 'Translate Tibetan to English: '

    # Sources carry the task prefix; targets are passed through unchanged.
    sources = [task_prefix + sentence for sentence in examples['bo']]
    targets = list(examples['en'])

    # NOTE(review): padding="max_length" is applied to the targets as well;
    # confirm the training code masks pad positions in the labels (not visible here).
    return tokenizer(
        sources,
        text_target=targets,
        max_length=256,
        truncation=True,
        padding="max_length",
    )

def translation_preprocess_function_en_bo(examples):
    """Tokenize a batch for the English -> Tibetan direction.

    Args:
        examples: Batch mapping holding parallel sequences of strings under
            the keys 'en' (source text) and 'bo' (target text).

    Returns:
        The result of calling the module-level `tokenizer` on the prefixed
        sources, with the targets passed as `text_target`, using
        max_length=256, truncation and padding to max length.
    """
    task_prefix = 'Translate English to Tibetan: '

    # Sources carry the task prefix; targets are passed through unchanged.
    sources = [task_prefix + sentence for sentence in examples['en']]
    targets = list(examples['bo'])

    # NOTE(review): padding="max_length" is applied to the targets as well;
    # confirm the training code masks pad positions in the labels (not visible here).
    return tokenizer(
        sources,
        text_target=targets,
        max_length=256,
        truncation=True,
        padding="max_length",
    )

### Train Split Data

In [7]:
from datasets import load_from_disk, concatenate_datasets
import gc
from tqdm import tqdm

# Tokenize the train split in `n_shards` contiguous slices; each slice is
# tokenized in both translation directions and saved to its own directory.
n_shards = 10
dataset = load_from_disk('../RawData/raw-ds')
train_data = dataset['train']
total_len = len(train_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    # The last shard absorbs the remainder left by the integer division.
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len

    shard = train_data.select(range(start, end))

    # Tokenize the same rows once per translation direction.
    tokenized_shard_bo_en = shard.map(translation_preprocess_function_bo_en, batched=True)
    tokenized_shard_en_bo = shard.map(translation_preprocess_function_en_bo, batched=True)

    tokenized_shard = concatenate_datasets([tokenized_shard_bo_en, tokenized_shard_en_bo])

    # The directory must match the one the concatenation cells read from
    # ('Data/bidirectional/...'); it was previously misspelled 'bidirectinal'.
    tokenized_shard.save_to_disk(f'Data/bidirectional/tokenized-shards/train/tokenized-shard_{i}')

    # Drop the references to this shard before starting the next one.
    del shard
    del tokenized_shard
    gc.collect()


Tokenizing shards:   0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  10%|█         | 1/10 [01:17<11:39, 77.70s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  20%|██        | 2/10 [02:32<10:07, 75.98s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  30%|███       | 3/10 [03:47<08:48, 75.53s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  40%|████      | 4/10 [05:02<07:32, 75.45s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 5/10 [06:17<06:16, 75.32s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  60%|██████    | 6/10 [07:34<05:03, 75.83s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  70%|███████   | 7/10 [08:50<03:47, 75.74s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  80%|████████  | 8/10 [10:07<02:32, 76.25s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172282 [00:00<?, ? examples/s]

Tokenizing shards:  90%|█████████ | 9/10 [11:24<01:16, 76.48s/it]

Map:   0%|          | 0/86148 [00:00<?, ? examples/s]

Map:   0%|          | 0/86148 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172296 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 10/10 [12:40<00:00, 76.08s/it]


### Dev Split Data

In [5]:
from datasets import load_from_disk, concatenate_datasets
import gc
from tqdm import tqdm

# Tokenize the dev split in `n_shards` contiguous slices; each slice is
# tokenized in both translation directions and saved to its own directory.
n_shards = 4
dataset = load_from_disk('../RawData/raw-ds')
# NOTE: despite its name, `train_data` holds the dev split in this cell; the
# name is kept so the notebook's global names stay unchanged.
train_data = dataset['dev']
total_len = len(train_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    # The last shard absorbs the remainder left by the integer division.
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len

    shard = train_data.select(range(start, end))

    # Tokenize the same rows once per translation direction.
    tokenized_shard_bo_en = shard.map(translation_preprocess_function_bo_en, batched=True)
    tokenized_shard_en_bo = shard.map(translation_preprocess_function_en_bo, batched=True)

    tokenized_shard = concatenate_datasets([tokenized_shard_bo_en, tokenized_shard_en_bo])

    # The directory must match the one the concatenation cells read from
    # ('Data/bidirectional/...'); it was previously misspelled 'bidirectinal'.
    tokenized_shard.save_to_disk(f'Data/bidirectional/tokenized-shards/dev/tokenized-shard_{i}')

    # Drop the references to this shard before starting the next one.
    del shard
    del tokenized_shard
    gc.collect()

Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:00<00:01,  2.22it/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:11<00:13,  6.80s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:22<00:08,  8.42s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [00:31<00:00,  7.94s/it]


### Test Split Data

In [6]:
from datasets import load_from_disk, concatenate_datasets
import gc
from tqdm import tqdm

# Tokenize the test split in `n_shards` contiguous slices; each slice is
# tokenized in both translation directions and saved to its own directory.
n_shards = 4
dataset = load_from_disk('../RawData/raw-ds')
# NOTE: despite its name, `train_data` holds the test split in this cell; the
# name is kept so the notebook's global names stay unchanged.
train_data = dataset['test']
total_len = len(train_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    # The last shard absorbs the remainder left by the integer division.
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len

    shard = train_data.select(range(start, end))

    # Tokenize the same rows once per translation direction.
    tokenized_shard_bo_en = shard.map(translation_preprocess_function_bo_en, batched=True)
    tokenized_shard_en_bo = shard.map(translation_preprocess_function_en_bo, batched=True)

    tokenized_shard = concatenate_datasets([tokenized_shard_bo_en, tokenized_shard_en_bo])

    # The directory must match the one the concatenation cells read from
    # ('Data/bidirectional/...'); it was previously misspelled 'bidirectinal'.
    tokenized_shard.save_to_disk(f'Data/bidirectional/tokenized-shards/test/tokenized-shard_{i}')

    # Drop the references to this shard before starting the next one.
    del shard
    del tokenized_shard
    gc.collect()



Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:20<01:01, 20.57s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:40<00:40, 20.35s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [01:00<00:20, 20.24s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [01:20<00:00, 20.17s/it]


## Concatenate Tokenized Shards

In [9]:
from datasets import load_from_disk, concatenate_datasets
import os

# Gather the saved train shards and concatenate them in shard-index order.
shard_dir = 'Data/bidirectional/tokenized-shards/train'
shard_prefix = 'tokenized-shard_'

# Keep only entries named '<prefix><integer>'; anything else in the directory
# is skipped.
shard_names = [
    d for d in os.listdir(shard_dir)
    if d.startswith(shard_prefix) and d[len(shard_prefix):].isdecimal()
]
# Sort by the numeric index: a plain string sort would place
# 'tokenized-shard_10' before 'tokenized-shard_2'.
shard_names.sort(key=lambda d: int(d[len(shard_prefix):]))
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
train = concatenate_datasets(datasets)

In [10]:
# Gather the saved dev shards and concatenate them in shard-index order.
shard_dir = 'Data/bidirectional/tokenized-shards/dev'
shard_prefix = 'tokenized-shard_'

# Keep only entries named '<prefix><integer>'; anything else in the directory
# is skipped.
shard_names = [
    d for d in os.listdir(shard_dir)
    if d.startswith(shard_prefix) and d[len(shard_prefix):].isdecimal()
]
# Sort by the numeric index: a plain string sort would place
# 'tokenized-shard_10' before 'tokenized-shard_2'.
shard_names.sort(key=lambda d: int(d[len(shard_prefix):]))
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
dev = concatenate_datasets(datasets)

In [11]:
# Gather the saved test shards and concatenate them in shard-index order.
shard_dir = 'Data/bidirectional/tokenized-shards/test'
shard_prefix = 'tokenized-shard_'

# Keep only entries named '<prefix><integer>'; anything else in the directory
# is skipped.
shard_names = [
    d for d in os.listdir(shard_dir)
    if d.startswith(shard_prefix) and d[len(shard_prefix):].isdecimal()
]
# Sort by the numeric index: a plain string sort would place
# 'tokenized-shard_10' before 'tokenized-shard_2'.
shard_names.sort(key=lambda d: int(d[len(shard_prefix):]))
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
test = concatenate_datasets(datasets)

In [12]:
from datasets import DatasetDict

# Bundle the three tokenized splits under their split names, write the
# combined dataset to disk, then display its summary as the cell output.
ds = DatasetDict({'train': train, 'dev': dev, 'test': test})

ds.save_to_disk('Data/bidirectional/tokenized-finetuning-ds')

ds

Saving the dataset (0/16 shards):   0%|          | 0/1722834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/200000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1722834
    })
    dev: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 200000
    })
})