In [16]:
from datasets import load_dataset

# Load the TinyStories dataset by its Hugging Face Hub identifier.
# Later cells read its "text" column and iterate over its splits.
ds = load_dataset("roneneldan/TinyStories")

In [17]:
import tiktoken
import os
import numpy as np
from tqdm.auto import tqdm

In [18]:
# Tokenizer Initialization
# GPT-2 encoding from tiktoken, kept at notebook scope.
# NOTE: `process` below builds its own encoder instead of using this `enc`,
# so this instance is not the one used when tokenizing the dataset.

enc = tiktoken.get_encoding("gpt2")

In [19]:
# Tokenization Function

def process(example):
    """Tokenize one dataset row with the GPT-2 encoding.

    Args:
        example: Mapping with a "text" entry holding the raw string.

    Returns:
        dict with 'ids' (the token ids produced by `encode_ordinary`) and
        'len' (how many ids there are).
    """
    # Imported and constructed inside the function so that the tokenizer is
    # available when this runs under multiprocessing.
    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")
    token_ids = tokenizer.encode_ordinary(example["text"])
    return {'ids': token_ids, 'len': len(token_ids)}

In [20]:
# Tokenizing the Dataset

if not os.path.exists("train.bin"):
    # Tokenize every split in parallel worker processes. The raw "text" column
    # is dropped, leaving only the 'ids' / 'len' columns returned by `process`.
    tokenized = ds.map(
        process,
        remove_columns=["text"],
        desc="tokenizing the splits",
        num_proc=8
    )
else:
    # train.bin is already on disk, so tokenization is skipped. Bind an empty
    # mapping so the write-out loop over `tokenized.items()` is a no-op
    # instead of raising NameError when the notebook is run on a fresh kernel.
    tokenized = {}

In [22]:
# Write each tokenized split to "<split>.bin" as one flat array of uint16 token ids.
for split, dset in tokenized.items():
    # Total token count of the split; this is the length of the output array.
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    filename = f"{split}.bin"
    if arr_len == 0:
        # Nothing to write. Skipping also avoids concatenating zero shards and
        # requesting a zero-length memmap, which NumPy may reject.
        print(f"Skipping {filename}: split contains no tokens")
        continue
    dtype = np.uint16
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never request more shards than there are rows: an empty shard would make
    # np.concatenate fail with "need at least one array to concatenate".
    total_batches = min(1024, len(dset))

    idx = 0  # write cursor into `arr`
    for batch_idx in tqdm(range(total_batches), desc=f"Writing {filename}"):
        # Contiguous shards keep the rows in their original order in the file.
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format("numpy")
        arr_batch = np.concatenate(batch['ids'])
        arr[idx: idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()
    # Sanity check: every expected token must have been written.
    assert idx == arr_len, f"{filename}: wrote {idx} tokens, expected {arr_len}"

Writing train.bin: 100%|██████████| 1024/1024 [12:36<00:00,  1.35it/s]
Writing validation.bin: 100%|██████████| 1024/1024 [00:09<00:00, 107.25it/s]


Summary & What This Is For

Purpose: Converts a text dataset into GPT-style token ids and stores them sequentially in fast binary files, optimized for training neural LMs like GPT.

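Why Batches: Each split is written in up to 1,024 contiguous shards, so only one shard's tokens are held in memory at a time. This keeps I/O efficient and memory usage bounded on large datasets.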

Why memmap: `np.memmap` maps the file into virtual memory and loads pages on demand, so the token array can be larger than the available RAM.

Why np.uint16: GPT-2 token IDs (max 50256) fit in 16 bits (max 65535), so each token is stored in 2 bytes, which saves space compared with wider integer types.

In [None]:
# create I/O batches for the dataset

# Some functions from https://github.com/karpathy/nanoGPT/blob/master/train.py with slight modifications
def get_batch(split):
    """Sample one random batch of (input, target) token windows.

    Reads `block_size`, `batch_size`, `device_type` and `device` from the
    notebook's global scope.

    Args:
        split: 'train' reads train.bin; any other value reads validation.bin.

    Returns:
        Tuple (x, y) of int64 tensors, each of shape (batch_size, block_size),
        moved to `device`. `y` holds the same windows as `x` shifted forward
        by one token.
    """
    # The memmap is re-opened on every call to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    bin_path = 'train.bin' if split == 'train' else 'validation.bin'
    tokens = np.memmap(bin_path, dtype=np.uint16, mode='r')

    # One random start offset per batch row; the upper bound leaves room for
    # the one-token-shifted target window.
    starts = torch.randint(len(tokens) - block_size, (batch_size,)).tolist()

    def _window(begin):
        # Copy block_size tokens starting at `begin` into an int64 tensor.
        return torch.from_numpy(tokens[begin:begin + block_size].astype(np.int64))

    x = torch.stack([_window(start) for start in starts])
    y = torch.stack([_window(start + 1) for start in starts])

    if device_type != 'cuda':
        return x.to(device), y.to(device)

    # Pinned host memory allows the transfer to the GPU to be asynchronous
    # (non_blocking=True).
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y