In [3]:
import struct

with open("/NS/llm-pretraining/work/afkhan/tokensmith/artifacts/data_tokenized_text_document.bin", "rb") as f:
    header = f.read(16)  # Try the first 16 or 32 bytes
    print(header)

b'r\xb5\x0f\x00Nn\xe7\x0b\xfd\x006uY\x04\x1d\x01'


In [5]:
decoded = struct.unpack('<QQ', header) 

In [6]:
decoded

(857775535324902770, 80225150627610877)

In [7]:
import numpy as np
import struct

def convert_neox_to_tokengrams(bin_path, idx_path, output_bin_path, drop_eod=True, eod_token=50256):
    """
    Converts a Megatron-GPT-NeoX .bin/.idx dataset into a raw .bin format usable by Tokengrams.
    
    Parameters:
    - bin_path: str, path to the original .bin file
    - idx_path: str, path to the original .idx file
    - output_bin_path: str, where to write the flat token stream
    - drop_eod: bool, if True, drops the last token of every sequence if it equals the EOD token
    - eod_token: int, token ID that marks end-of-document
    """
    with open(idx_path, 'rb') as f:
        magic = f.read(8)
        if magic != b'MMIDIDX\x00':
            raise ValueError("Only mmap-style indexed dataset supported.")

        version = struct.unpack('<Q', f.read(8))[0]
        dtype_code = struct.unpack('<B', f.read(1))[0]
        dtype_map = {
            1: np.uint8, 2: np.int8, 3: np.int16,
            4: np.int32, 5: np.int64, 6: np.float16,
            7: np.float32, 8: np.uint16
        }
        if dtype_code not in dtype_map:
            raise ValueError(f"Unsupported dtype code: {dtype_code}")
        dtype = dtype_map[dtype_code]

        num_sequences = struct.unpack('<Q', f.read(8))[0]
        num_docs = struct.unpack('<Q', f.read(8))[0]

        # Read sequence sizes (uint32) and pointers (uint64)
        seq_sizes = np.frombuffer(f.read(num_sequences * 4), dtype=np.uint32)
        seq_pointers = np.frombuffer(f.read(num_sequences * 8), dtype=np.uint64)

        # Skip document index info (not needed)
        doc_index_size = num_docs * 2 * 4  # two uint32s per doc
        f.seek(doc_index_size, 1)

    # Read full token stream from .bin file
    token_data = np.memmap(bin_path, dtype=dtype, mode='r')

    # Collect sequences into a flat list
    flat_tokens = []
    for i in range(num_sequences):
        start = seq_pointers[i]
        end = start + seq_sizes[i]
        seq = token_data[start:end]

        if drop_eod and len(seq) > 0 and seq[-1] == eod_token:
            seq = seq[:-1]  # Drop final EOD token
        flat_tokens.append(seq)

    # Concatenate and write to file
    flat_array = np.concatenate(flat_tokens)
    flat_array.astype(dtype).tofile(output_bin_path)

    print(f"✅ Wrote {len(flat_array)} tokens to {output_bin_path}")


In [8]:
convert_neox_to_tokengrams(
    "/NS/llm-pretraining/work/afkhan/tokensmith/artifacts/data_tokenized_text_document.bin",
    "/NS/llm-pretraining/work/afkhan/tokensmith/artifacts/data_tokenized_text_document.idx",
    "/NS/llm-pretraining/work/afkhan/tokensmith/artifacts/data_tokenized_text_document_flat.bin"
)

ValueError: Unsupported dtype code: 0