## Tokenize the Data for Pretraining

In [1]:
from datasets import load_from_disk

# Load the dataset stored on disk under 'pretraining-ds'; the cells below split and tokenize it.
dataset_dir = 'pretraining-ds'
ds = load_from_disk(dataset_dir)

In [2]:
# First half of the dataset: rows [0, len(ds) // 2).
midpoint = len(ds) // 2
first = ds.select(range(midpoint))

In [2]:
# Second half of the dataset: rows [len(ds) // 2, len(ds)).
# With an odd row count the extra row lands in this half.
total_rows = len(ds)
second = ds.select(range(total_rows // 2, total_rows))

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer identified by 'my_tokenizer'; tokenize_data() reads it as a global.
tokenizer_name = 'my_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [4]:
def tokenize_data(example, max_length=256, label_pad_id=-100):
    """Tokenize source/target text into fixed-length model inputs and labels.

    Both ``example["input_text"]`` and ``example["target_text"]`` are run
    through the module-level ``tokenizer`` with truncation and padding to
    ``max_length``. Padding positions in the labels are replaced with
    ``label_pad_id`` so that a loss using that value as its ignore index
    (``-100`` is the PyTorch cross-entropy default, which Hugging Face
    seq2seq models rely on) does not score the model on padding.

    Args:
        example: A mapping with ``"input_text"`` and ``"target_text"``.
            Each value may be a list of strings (as passed by
            ``Dataset.map(..., batched=True)``) or a single string.
        max_length: Fixed sequence length for both inputs and labels.
        label_pad_id: Value written into label positions that are padding.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        shaped like the tokenizer output (nested lists for a batch, flat
        lists for a single example).
    """
    inputs = tokenizer(example["input_text"], max_length=max_length, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=max_length, truncation=True, padding="max_length")

    def _mask_padding(ids, mask):
        # Use the attention mask rather than comparing against the pad id:
        # if the tokenizer reuses one id for pad and EOS, the real EOS token
        # must still be kept as a training target.
        return [tok if keep else label_pad_id for tok, keep in zip(ids, mask)]

    target_ids = targets.input_ids
    target_mask = targets.attention_mask
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(target_ids, target_mask)]
    else:
        # Single example (flat id list) or an empty batch.
        labels = _mask_padding(target_ids, target_mask)

    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": labels
    }

In [5]:
# Tokenize the first half in batches, dropping the listed columns so only
# the fields returned by tokenize_data remain, then persist the result.
raw_columns = ["bo", 'en', "input_text", "target_text"]
tokenized_first = first.map(
    tokenize_data,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_first.save_to_disk('tokenized-pretraining-first-ds')

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/500000 [00:00<?, ? examples/s]

In [5]:
# Tokenize the second half in batches, dropping the listed columns so only
# the fields returned by tokenize_data remain, then persist the result.
raw_columns = ["bo", 'en', "input_text", "target_text"]
tokenized_second = second.map(
    tokenize_data,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_second.save_to_disk('tokenized-pretraining-second-ds')

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/500000 [00:00<?, ? examples/s]

## Concatenate Shards

In [1]:
from datasets import load_from_disk, concatenate_datasets

# Reload the two tokenized halves from disk, first half then second half.
shard_dirs = ('tokenized-pretraining-first-ds', 'tokenized-pretraining-second-ds')
first, second = map(load_from_disk, shard_dirs)

# Join the halves in their original order and save the combined dataset.
tokenized_dataset = concatenate_datasets([first, second])
tokenized_dataset.save_to_disk('tokenized-pretraining-ds')

Saving the dataset (0/7 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]