In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting httpx<1.0.0 (from datasets)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.10.0,>=2023.1.0 (f

In [None]:
# Fetch the TinyStories dataset by its repository id
from datasets import load_dataset

TINYSTORIES_REPO = "roneneldan/TinyStories"
ds = load_dataset(path=TINYSTORIES_REPO)

Train split ≈ 2.1M stories.
Validation split ≈ 22K stories (~1% of the train split).

Step 2: Tokenization

Tokenize the dataset into token IDs
Create train.bin / validation.bin files (one per split) where the token IDs for the whole split are stored
The token IDs are written to disk through a memory-mapped array rather than held in RAM (larger machines could probably keep the files on a local SSD)

In [None]:
import tiktoken # different tokenizers
import os
import numpy as np
from tqdm.auto import tqdm

# GPT-2 tokenizer (byte pair encoding)
enc = tiktoken.get_encoding("gpt2")


def process(example):
  """Tokenize a single dataset record.

  Args:
    example: mapping with a 'text' entry holding the raw string.

  Returns:
    dict with 'ids' (the token ids produced by ``enc.encode_ordinary``)
    and 'len' (how many ids there are).
  """
  token_ids = enc.encode_ordinary(example['text'])
  return {'ids': token_ids, 'len': len(token_ids)}


# Smoke test: a sample sentence should come back as a list of token ids
sample = {'text': 'The quick brown fox jumps over the lazy dog.'}
print(process(sample))

In [None]:
# Tokenize every split with `process`, dropping the raw 'text' column so only
# 'ids' and 'len' remain.
#
# This runs unconditionally. Previously the map was skipped when train.bin already
# existed, which left `tokenized` undefined: the print below and the cell that writes
# the .bin files then failed with NameError on a fresh kernel.
# NOTE(review): `datasets` normally reuses its on-disk cache for an unchanged map(),
# so repeat runs should be much cheaper than the first one - confirm in your environment.
tokenized = ds.map(process, remove_columns=['text'], desc="tokenization process", num_proc=8)

print(tokenized)

Runtime of the tokenization step above on a T4 GPU runtime ≈ 5 min.
It produces a DatasetDict like so:

DatasetDict({
    train: Dataset({
        features: ['ids', 'len'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['ids', 'len'],
        num_rows: 21990
    })
})

In [None]:
# Write one flat binary file per split ('<split>.bin') holding all of that split's
# token ids back to back, using a disk-backed np.memmap so the full array never has
# to sit in RAM.

dtype = np.uint16 # enc.max_token_value in GPT2 (Byte Pair Encoding) < 2^16, so all can fit
# Assigning an integer array into a uint16 array wraps out-of-range values silently,
# so verify the claim above instead of trusting it.
assert enc.max_token_value <= np.iinfo(dtype).max, "token ids do not fit in uint16"

max_batches = 1024 # upper bound on write chunks per split; arbitrary, can be modified

for split, dset in tokenized.items():
  arr_len = int(np.sum(dset['len'], dtype=np.uint64))
  filename = f'{split}.bin'
  if arr_len == 0:
    # A zero-length file cannot be memory-mapped; there is nothing to write anyway.
    print(f"skipping {filename}: split has no tokens")
    continue

  arr = np.memmap(filename, dtype, mode='w+', shape=(arr_len,))
  # Never request more shards than rows: an empty shard makes np.concatenate raise.
  total_batches = min(max_batches, len(dset))

  # For each split, we combine the ids in batches, and concatenate all batched arrays
  idx = 0
  for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
    batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True)
    arr_batch = np.concatenate(batch['ids'])
    arr[idx: idx + len(arr_batch)] = arr_batch
    idx += len(arr_batch)

  arr.flush()
  # Every token counted in 'len' must have been written exactly once.
  assert idx == arr_len, f"{filename}: wrote {idx} tokens, expected {arr_len}"
  print(split, arr)

Runtime of batch processing = ~15min