In [20]:
import json
import os
from tqdm import tqdm

# Download and Prepare Different Sizes

In [21]:
!wget https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0/resolve/main/global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst

--2025-07-02 08:55:48--  https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0/resolve/main/global-shard_01_of_10/local-shard_0_of_10/shard_00000000_processed.jsonl.zst
Resolving huggingface.co (huggingface.co)... 52.222.136.89, 52.222.136.38, 52.222.136.117, ...
Connecting to huggingface.co (huggingface.co)|52.222.136.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/6d/96/6d960e289759d6a146125cb84c6134b9c0bc4344e4c59e0b3e902698997e5fe7/c006fce0b12ea2366ed94503344c7bc539e17c0aa10159a632272dd6d2929bdb?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27shard_00000000_processed.jsonl.zst%3B+filename%3D%22shard_00000000_processed.jsonl.zst%22%3B&Expires=1751442948&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MTQ0Mjk0OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzZkLzk2LzZkOTYwZTI4OTc1OWQ2YTE0NjEyNWNiODRjNjEzNGI5YzBiYzQzNDRlNGM1OWUwYjNlOTAyN

In [22]:
# Extract the downloaded file
!zstd -d shard_00000000_processed.jsonl.zst

zstd: shard_00000000_processed.jsonl already exists; overwrite (y/n) ? 

In [23]:
# Location of the decompressed shard (one JSON document per line), relative to
# the notebook's working directory.
jsonl_data_path = "shard_00000000_processed.jsonl"

In [24]:
# Parse the decompressed shard into a list holding one parsed JSON value per line.
# The encoding is set explicitly so decoding does not depend on the platform's
# default locale, and blank lines (e.g. a trailing newline at the end of the
# file) are skipped instead of making json.loads raise.
data = []
with open(jsonl_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue
        data.append(json.loads(line))

In [25]:
# Number of records parsed from the shard; the size-generation cell compares
# each target size against this value.
len(data)

61000

In [27]:
# Create JSONL files with different document counts.
# - Sizes smaller than len(data) keep only the first `size` documents.
# - Larger sizes contain all of `data` followed by copies of its documents taken
#   from the start (wrapping around as often as needed) until `size` is reached.

sizes = [1, 10, 100, 1000, 10000, 100000, 1000000]

# Create the output directory up front so a fresh run does not fail with
# FileNotFoundError when the first file is opened for writing.
os.makedirs('benchmarking_data', exist_ok=True)

for size in tqdm(sizes):

    out_path = f'benchmarking_data/data_{size}.jsonl'

    if os.path.exists(out_path):
        print(f"File {out_path} already exists, skipping...")
        continue

    # Padding indexes into `data` modulo its length, so an empty list would
    # otherwise surface as an unexplained ZeroDivisionError.
    if not data:
        raise ValueError("No documents loaded; cannot build benchmark files from empty data")

    if size < len(data):
        subset = data[:size]
    else:
        subset = data + [data[i % len(data)] for i in range(size - len(data))]

    assert len(subset) == size, f"Subset size mismatch: expected {size}, got {len(subset)}"

    # Write to a temporary file and rename it into place once complete. An
    # interrupted run then never leaves a truncated data_{size}.jsonl behind
    # that the existence check above would silently accept on the next run.
    tmp_path = out_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for item in subset:
            f.write(json.dumps(item) + '\n')
    os.replace(tmp_path, out_path)

100%|██████████| 7/7 [00:00<00:00, 27988.68it/s]

File benchmarking_data/data_1.jsonl already exists, skipping...
File benchmarking_data/data_10.jsonl already exists, skipping...
File benchmarking_data/data_100.jsonl already exists, skipping...
File benchmarking_data/data_1000.jsonl already exists, skipping...
File benchmarking_data/data_10000.jsonl already exists, skipping...
File benchmarking_data/data_100000.jsonl already exists, skipping...
File benchmarking_data/data_1000000.jsonl already exists, skipping...





# Tokenize

In [28]:
import sys

# Put the local tokensmith and gpt-neox checkouts at the front of the import
# search path. Any existing occurrence is removed first, so re-running this
# cell does not pile up duplicate entries; the resulting priority is the same
# as two plain insert(0, ...) calls (gpt-neox first, then tokensmith).
for _extra_path in (
    "/NS/llm-pretraining/work/afkhan/tokensmith",
    "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox",
):
    while _extra_path in sys.path:
        sys.path.remove(_extra_path)
    sys.path.insert(0, _extra_path)

In [29]:
from transformers import AutoTokenizer
# Hugging Face model identifier (or local path) passed to from_pretrained below.
TOKENIZER_NAME_OR_PATH = "EleutherAI/gpt-neox-20b"
# add_eos_token=True is forwarded to the tokenizer constructor.
# NOTE(review): presumably this appends an EOS token to every encoded sequence —
# confirm that the loaded tokenizer class honours the flag.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH, add_eos_token=True)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
# NOTE(review): tokensmith is presumably resolved through the sys.path entries
# added earlier in this notebook rather than an installed package — confirm.
from tokensmith import DatasetManager

# Manager instance used by the ingestion loop via `dataset_manager.ingest`.
dataset_manager = DatasetManager()

In [None]:
# Options that are identical for every benchmark size, hoisted out of the loop.
ingest_options = dict(
    vocab_path='/NS/llm-pretraining/work/afkhan/tokensmith/artifacts/tokenizer.json',
    neox_dir='/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox',
    workers=8,
    append_eod=True,
    dataset_impl='mmap',
    tokenizer_type='HFTokenizer',
)

# Ingest each benchmark JSONL file; a size is skipped when both of its
# *_text_document.bin and *_text_document.idx outputs are already on disk.
for size in tqdm(sizes):
    path = f'benchmarking_data/data_{size}.jsonl'
    output_prefix = f'benchmarking_data/data_{size}'
    log_file = f'benchmarking_data/log_{size}.txt'

    expected_outputs = (
        f'{output_prefix}_text_document.bin',
        f'{output_prefix}_text_document.idx',
    )

    if all(os.path.exists(produced) for produced in expected_outputs):
        print(f"Files {output_prefix}_text_document.bin and {output_prefix}_text_document.idx already exist, skipping...")
    else:
        dataset_manager.ingest.ingest_from_jsonl(
            input_jsonl_path=path,
            output_prefix=output_prefix,
            **ingest_options,
            log_file=log_file,
        )

100%|██████████| 7/7 [00:00<00:00, 19303.17it/s]

Files benchmarking_data/data_1_text_document.bin and benchmarking_data/data_1_text_document.idx already exist, skipping...
Files benchmarking_data/data_10_text_document.bin and benchmarking_data/data_10_text_document.idx already exist, skipping...
Files benchmarking_data/data_100_text_document.bin and benchmarking_data/data_100_text_document.idx already exist, skipping...
Files benchmarking_data/data_1000_text_document.bin and benchmarking_data/data_1000_text_document.idx already exist, skipping...
Files benchmarking_data/data_10000_text_document.bin and benchmarking_data/data_10000_text_document.idx already exist, skipping...
Files benchmarking_data/data_100000_text_document.bin and benchmarking_data/data_100000_text_document.idx already exist, skipping...
Files benchmarking_data/data_1000000_text_document.bin and benchmarking_data/data_1000000_text_document.idx already exist, skipping...



