## Tokenize the Data for Pretraining

In [1]:
from transformers import AutoTokenizer

# Load the previously saved tokenizer from its directory (path is relative to
# the notebook's working directory).
TOKENIZER_DIR = '../Tokenizer/my_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

In [2]:
def tokenize_data(example):
    """Tokenize the source and target text of an example (or batch of examples).

    Both ``example["input_text"]`` and ``example["target_text"]`` are encoded
    with the module-level ``tokenizer``, truncated to at most 256 tokens and
    padded up to exactly 256 tokens.

    Args:
        example: Mapping with ``"input_text"`` and ``"target_text"`` entries.
            When used with ``Dataset.map(..., batched=True)`` each entry is a
            list of strings.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` taken from the
        encoded input text, and ``"labels"`` holding the encoded target ids.
    """
    # Same encoding options for both sides so inputs and labels share a length.
    encode_opts = dict(max_length=256, truncation=True, padding="max_length")

    source_enc = tokenizer(example["input_text"], **encode_opts)
    target_enc = tokenizer(example["target_text"], **encode_opts)

    # NOTE(review): labels keep the padding token ids produced by the
    # tokenizer; if the training loss is meant to ignore padding, confirm
    # they are masked (e.g. set to -100) before training.
    features = {}
    features["input_ids"] = source_enc["input_ids"]
    features["attention_mask"] = source_enc["attention_mask"]
    features["labels"] = target_enc["input_ids"]
    return features

In [4]:
from datasets import load_from_disk
import gc
from tqdm import tqdm

# Tokenize the dataset in n_shards contiguous slices and save each slice on
# its own, so only one tokenized slice is held at a time.
n_shards = 10
train_data = load_from_disk('Data/pretraining-ds')
total_len = len(train_data)
shard_size = total_len // n_shards

for i in tqdm(range(n_shards), desc="Tokenizing shards"):
    first_row = i * shard_size
    if i == n_shards - 1:
        # The final shard also takes the rows left over by the integer division.
        stop_row = total_len
    else:
        stop_row = first_row + shard_size

    shard = train_data.select(range(first_row, stop_row))
    tokenized_shard = shard.map(tokenize_data, batched=True)
    tokenized_shard.save_to_disk(f'Data/tokenized-shards/tokenized-shard_{i}')

    # Drop the references to this shard and force a collection before the next one.
    del shard, tokenized_shard
    gc.collect()

Tokenizing shards:   0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  10%|█         | 1/10 [01:13<11:01, 73.50s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  20%|██        | 2/10 [02:36<10:32, 79.07s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  30%|███       | 3/10 [04:03<09:40, 82.88s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  40%|████      | 4/10 [05:39<08:47, 87.91s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  50%|█████     | 5/10 [07:01<07:09, 85.83s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  60%|██████    | 6/10 [08:25<05:40, 85.17s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  70%|███████   | 7/10 [09:50<04:15, 85.20s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  80%|████████  | 8/10 [11:15<02:50, 85.14s/it]

Map:   0%|          | 0/172283 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172283 [00:00<?, ? examples/s]

Tokenizing shards:  90%|█████████ | 9/10 [12:38<01:24, 84.22s/it]

Map:   0%|          | 0/172287 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/172287 [00:00<?, ? examples/s]

Tokenizing shards: 100%|██████████| 10/10 [13:59<00:00, 83.92s/it]


## Concatenate Shards

In [1]:
from datasets import load_from_disk, concatenate_datasets
import os

# Reassemble the per-shard tokenized datasets (saved as
# 'tokenized-shard_<index>' directories) into one dataset, in shard order.
shard_dir = 'Data/tokenized-shards'
shard_prefix = 'tokenized-shard_'


def _shard_order(dir_name):
    """Sort key that orders shard directories by their numeric index.

    A plain string sort places 'tokenized-shard_10' before 'tokenized-shard_2',
    which would scramble the row order once there are more than 10 shards.
    """
    suffix = dir_name[len(shard_prefix):]
    if suffix.isdecimal():
        return (0, int(suffix), dir_name)
    # Directories with a non-numeric suffix are still included (as before),
    # but sort after all numbered shards.
    return (1, 0, dir_name)


shard_names = sorted(
    (d for d in os.listdir(shard_dir) if d.startswith(shard_prefix)),
    key=_shard_order,
)
shard_paths = [os.path.join(shard_dir, d) for d in shard_names]

datasets = [load_from_disk(path) for path in shard_paths]
ds = concatenate_datasets(datasets)
# NOTE(review): this path is relative to the working directory, not under
# 'Data/' like the other datasets in this notebook — confirm that is intended.
ds.save_to_disk('tokenized-pretraining-ds')

Saving the dataset (0/17 shards):   0%|          | 0/1722834 [00:00<?, ? examples/s]