## Load Tokenizer

In [1]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here and used by the cells below.
base_checkpoint = 'google-t5/t5-small'
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [2]:
# Enumerate code points from the Tibetan Unicode block. `range` excludes its
# upper bound, so this yields U+0F00 through U+0FFE (U+0FFF is not generated).
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Fetch the vocabulary once; calling get_vocab() inside the comprehension
# would invoke it again for every candidate character.
existing_vocab = tokenizer.get_vocab()

# Keep only the characters the tokenizer does not already have an entry for
new_tokens = [char for char in tibetan_chars if char not in existing_vocab]

# Add new tokens to the tokenizer (the return value is displayed as the cell output)
tokenizer.add_tokens(new_tokens)

255

## Preprocess Data

The dataset can now be tokenized for training.

In [3]:
def translation_preprocess_function(examples):
    """Tokenize one batch of Tibetan/English sentence pairs.

    Args:
        examples: Batch mapping with a 'bo' column (source strings) and an
            'en' column (target strings).

    Returns:
        The result of calling the module-level `tokenizer` on the prefixed
        sources with the targets passed as `text_target`, truncated and
        padded to 256 tokens.
    """
    # Every source sentence gets the same task prefix.
    task_prefix = 'Translate Tibetan to English: '
    sources = [task_prefix + bo_text for bo_text in examples['bo']]
    targets = list(examples['en'])

    # NOTE(review): padding="max_length" is applied to the targets as well;
    # confirm the training code masks padded label positions.
    return tokenizer(
        sources,
        text_target=targets,
        max_length=256,
        truncation=True,
        padding="max_length",
    )


### Train Split Data

In [4]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the train split in slices and write each slice to its own directory,
# so only one slice's tokenized output is referenced at a time.
n_shards = 10
# NOTE(review): absolute, machine-specific path.
dataset = load_from_disk('/home/j/Desktop/MLotsawa/Notebooks/Models/TibetanToEnglishTranslation/TibetanToEnglishTranslationv2/BuddhistOnly/RawData/raw-ds')
train_data = dataset['train']
total_len = len(train_data)
shard_size = total_len // n_shards

# Precompute each shard's [start, end) row range; the final range is stretched
# to total_len so rows left over from the integer division are kept.
shard_bounds = [(k * shard_size, (k + 1) * shard_size) for k in range(n_shards)]
shard_bounds[-1] = (shard_bounds[-1][0], total_len)

for shard_idx, (lo, hi) in enumerate(tqdm(shard_bounds, desc="Tokenizing shards")):
    shard = train_data.select(range(lo, hi))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'Data/tokenized-shards/train/tokenized-shard_{shard_idx}')

    # Drop the references to this shard and collect before starting the next one.
    del shard, tokenized_shard
    gc.collect()


Tokenizing shards:   0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  10%|█         | 1/10 [00:42<06:23, 42.64s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  20%|██        | 2/10 [01:23<05:34, 41.86s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  30%|███       | 3/10 [02:05<04:51, 41.69s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  40%|████      | 4/10 [02:47<04:10, 41.74s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 5/10 [03:29<03:28, 41.77s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  60%|██████    | 6/10 [04:13<02:50, 42.54s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  70%|███████   | 7/10 [04:59<02:11, 43.87s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  80%|████████  | 8/10 [05:43<01:27, 43.88s/it]

Map:   0%|          | 0/86141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86141 [00:00<?, ? examples/s]

Tokenizing shards:  90%|█████████ | 9/10 [06:26<00:43, 43.43s/it]

Map:   0%|          | 0/86148 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/86148 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 10/10 [07:08<00:00, 42.87s/it]


### Dev Split Data

In [6]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the dev split in slices, writing each slice to its own directory.
n_shards = 4
# NOTE(review): absolute, machine-specific path.
dataset = load_from_disk('/home/j/Desktop/MLotsawa/Notebooks/Models/TibetanToEnglishTranslation/TibetanToEnglishTranslationv2/BuddhistOnly/RawData/raw-ds')
# Named after the split it holds, so the `train_data` variable assigned by the
# train-split cell is not overwritten with dev rows.
dev_data = dataset['dev']
total_len = len(dev_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len  # include remainder in last shard

    shard = dev_data.select(range(start, end))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'Data/tokenized-shards/dev/tokenized-shard_{i}')

    # Drop the references to this shard and collect before starting the next one.
    del shard
    del tokenized_shard
    gc.collect()

Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:05<00:16,  5.51s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:10<00:10,  5.35s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:16<00:05,  5.32s/it]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12500 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [00:21<00:00,  5.31s/it]


### Test Split Data

In [7]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the test split in slices, writing each slice to its own directory.
n_shards = 4
# NOTE(review): absolute, machine-specific path.
dataset = load_from_disk('/home/j/Desktop/MLotsawa/Notebooks/Models/TibetanToEnglishTranslation/TibetanToEnglishTranslationv2/BuddhistOnly/RawData/raw-ds')
# Named after the split it holds, so the `train_data` variable assigned by the
# train-split cell is not overwritten with test rows.
test_data = dataset['test']
total_len = len(test_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    start = i * shard_size
    end = (i + 1) * shard_size if i < n_shards - 1 else total_len  # include remainder in last shard

    shard = test_data.select(range(start, end))
    tokenized_shard = shard.map(translation_preprocess_function, batched=True)
    tokenized_shard.save_to_disk(f'Data/tokenized-shards/test/tokenized-shard_{i}')

    # Drop the references to this shard and collect before starting the next one.
    del shard
    del tokenized_shard
    gc.collect()



Tokenizing shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  25%|██▌       | 1/4 [00:12<00:36, 12.31s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 2/4 [00:23<00:23, 11.86s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards:  75%|███████▌  | 3/4 [00:35<00:11, 11.69s/it]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 4/4 [00:46<00:00, 11.57s/it]


## Concatenate Tokenized Shards

In [8]:
from datasets import load_from_disk, concatenate_datasets
import os

# Reassemble the train split from its tokenized shards.
shard_dir = 'Data/tokenized-shards/train'
# Order shards by the integer after the last underscore. A plain string sort
# would put 'tokenized-shard_10' ahead of 'tokenized-shard_2' once there are
# more than ten shards, silently changing the row order.
shard_paths = sorted(
    (os.path.join(shard_dir, d) for d in os.listdir(shard_dir) if d.startswith('tokenized-shard_')),
    key=lambda path: int(path.rsplit('_', 1)[1]),
)

datasets = [load_from_disk(path) for path in shard_paths]
train = concatenate_datasets(datasets)

In [9]:
# Reassemble the dev split from its tokenized shards.
shard_dir = 'Data/tokenized-shards/dev'
# Order shards by the integer after the last underscore. A plain string sort
# would put 'tokenized-shard_10' ahead of 'tokenized-shard_2' once there are
# more than ten shards, silently changing the row order.
shard_paths = sorted(
    (os.path.join(shard_dir, d) for d in os.listdir(shard_dir) if d.startswith('tokenized-shard_')),
    key=lambda path: int(path.rsplit('_', 1)[1]),
)

datasets = [load_from_disk(path) for path in shard_paths]
dev = concatenate_datasets(datasets)

In [10]:
# Reassemble the test split from its tokenized shards.
shard_dir = 'Data/tokenized-shards/test'
# Order shards by the integer after the last underscore. A plain string sort
# would put 'tokenized-shard_10' ahead of 'tokenized-shard_2' once there are
# more than ten shards, silently changing the row order.
shard_paths = sorted(
    (os.path.join(shard_dir, d) for d in os.listdir(shard_dir) if d.startswith('tokenized-shard_')),
    key=lambda path: int(path.rsplit('_', 1)[1]),
)

datasets = [load_from_disk(path) for path in shard_paths]
test = concatenate_datasets(datasets)

In [11]:
from datasets import DatasetDict

# Bundle the three tokenized splits under their split names, persist them
# together, and display the resulting structure.
ds = DatasetDict({'train': train, 'dev': dev, 'test': test})
ds.save_to_disk('Data/tokenized-finetuning-ds')

ds

Saving the dataset (0/8 shards):   0%|          | 0/861417 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bo', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 861417
    })
    dev: Dataset({
        features: ['bo', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['bo', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
})