# Dataset Inspection

Character counts, token counts, and pad% estimates for all training datasets.
Includes a "model view" section showing exactly what token sequences the model receives,
with special tokens visible and loss-mask annotations.

In [30]:
import sys, os
sys.path.insert(0, os.path.join(os.getcwd(), '..'))

import random
import numpy as np
import pyarrow.parquet as pq
from transformers import AutoTokenizer

# Seed Python's global RNG; the sampling helpers in later cells draw from it.
random.seed(42)

# GPT-2 tokenizer extended with the special tokens used by the conversation format.
# The ids printed below depend on the order in which these tokens are registered,
# so keep the key order stable.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({
    'bos_token': '<|beginoftext|>',
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|user|>', '<|assistant|>', '<|system|>'],  # role markers
})
# Sanity check: vocabulary size after the additions and the BOS/EOS/PAD ids.
print(f'Vocab: {len(tokenizer)}')
print(f'BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}')

Vocab: 50262
BOS=50257, EOS=50256, PAD=50258


## Config

In [31]:
import glob as _glob_mod

# Sampling / packing knobs: rows sampled per dataset for stats, and the packing chunk length.
N_SAMPLES, SEQ_LEN = 500, 2048

# Every dataset lives in its own sub-directory of <cwd>/../data.
BASE_DIR = os.path.join(os.getcwd(), '..', 'data')
fineweb_dir, conv_dir, lima_dir, mmlu_dir, gsm8k_dir = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ('base_data', 'conversation_data', 'lima_data', 'mmlu_data', 'gsm8k_data')
)

def _glob(d, pattern='*.parquet'):
    """Return the sorted paths inside directory `d` that match `pattern`.

    When `d` does not exist or is not a directory, an empty list is returned
    instead of raising.
    """
    matches = []
    if os.path.isdir(d):
        matches = _glob_mod.glob(os.path.join(d, pattern))
        matches.sort()
    return matches

# Resolve the parquet files available for each dataset (missing directories yield []).
fineweb_files, conv_files, lima_files = map(_glob, (fineweb_dir, conv_dir, lima_dir))

# MMLU and GSM8K are stored as one parquet file per split.
mmlu_files = {split: _glob(mmlu_dir, f'{split}.parquet') for split in ('train', 'test')}
gsm8k_files = {split: _glob(gsm8k_dir, f'{split}.parquet') for split in ('train', 'test')}

# Report how many files were found per dataset.
print(f'fineweb  : {len(fineweb_files)} shards')
print(f'conv     : {len(conv_files)} shards')
print(f'lima     : {len(lima_files)} files')
print(f'mmlu     : train={len(mmlu_files["train"])}, test={len(mmlu_files["test"])}')
print(f'gsm8k    : train={len(gsm8k_files["train"])}, test={len(gsm8k_files["test"])}')

fineweb  : 1706 shards
conv     : 31 shards
lima     : 1 files
mmlu     : train=1, test=1
gsm8k    : train=1, test=1


## Helpers

In [32]:
# Cache the special-token ids that the helpers below read as globals.
bos_id, eos_id, pad_id = (
    tokenizer.bos_token_id,
    tokenizer.eos_token_id,
    tokenizer.pad_token_id,
)

# Role name -> id of the marker token that opens a message with that role.
role_ids = dict(
    user=tokenizer.convert_tokens_to_ids('<|user|>'),
    assistant=tokenizer.convert_tokens_to_ids('<|assistant|>'),
    system=tokenizer.convert_tokens_to_ids('<|system|>'),
)

def tokenize_conv(messages):
    """Tokenize one conversation into parallel token-id and loss-mask lists.

    Intended to mirror _tokenize_conversation with mask_policy='assistant_only'.

    Layout: BOS, then for each message its role marker (only for roles present
    in `role_ids`) followed by the encoded content, then a single EOS.

    Mask is 1 on: the assistant role marker and assistant content, a user role
    marker that directly follows an assistant message (it closes the assistant
    turn), and the final EOS when the last message is from the assistant.
    Everything else, including BOS, is 0.

    Returns (token_ids, loss_mask), both of the same length.
    """
    token_ids = [bos_id]
    loss_mask = [0]
    last_role = None
    for message in messages:
        role = message['role']
        content = message['content']
        from_assistant = role == 'assistant'
        role_token = role_ids.get(role)
        if role_token is not None:
            # A user marker right after an assistant message ends that assistant turn.
            closes_assistant_turn = role == 'user' and last_role == 'assistant'
            token_ids.append(role_token)
            loss_mask.append(int(from_assistant or closes_assistant_turn))
        content_ids = tokenizer.encode(content, add_special_tokens=False)
        token_ids += content_ids
        loss_mask += [int(from_assistant)] * len(content_ids)
        last_role = role
    token_ids.append(eos_id)
    loss_mask.append(int(last_role == 'assistant'))
    return token_ids, loss_mask

def make_packed_chunk(msgs_iter, seq_len=SEQ_LEN):
    """Build one BOS-aligned packed chunk from a stream of conversations.

    msgs_iter: iterable of message lists (each item is one conversation).

    Conversations are tokenized with tokenize_conv, truncated to `seq_len`, and
    appended whole. The first conversation that would not fit ends the chunk
    (it is consumed from the iterator but not included), and the remainder is
    filled with pad tokens whose mask is 0.

    Returns (token_ids, loss_mask). Both have length `seq_len` whenever at
    least one conversation was consumed; an empty `msgs_iter` yields ([], []).
    """
    chunk_tokens, chunk_mask = [], []
    for conversation in msgs_iter:
        conv_tokens, conv_mask = tokenize_conv(conversation)
        conv_tokens, conv_mask = conv_tokens[:seq_len], conv_mask[:seq_len]
        would_overflow = len(chunk_tokens) + len(conv_tokens) > seq_len
        if chunk_tokens and would_overflow:
            break
        chunk_tokens += conv_tokens
        chunk_mask += conv_mask

    # Pad a started-but-unfilled chunk up to seq_len; padding never contributes to the loss.
    shortfall = seq_len - len(chunk_tokens)
    if chunk_tokens and shortfall > 0:
        chunk_tokens += [pad_id] * shortfall
        chunk_mask += [0] * shortfall
    return chunk_tokens, chunk_mask

def model_view(tokens, mask=None, label='', show_n=200):
    """Print a summary header and a decoded preview of one token chunk.

    tokens: list of token ids (must support .count).
    mask:   optional loss mask; its sum is reported as the loss-token count,
            or '—' when no mask is given.
    label:  free-text title printed above the stats line.
    show_n: number of leading tokens to decode (special tokens kept visible).

    The "conv(s)" figure is the number of BOS tokens in the chunk.
    """
    total = len(tokens)
    pad_count = tokens.count(pad_id)
    bos_count = tokens.count(bos_id)
    loss_count = '—' if mask is None else sum(mask)
    preview = tokenizer.decode(tokens[:show_n], skip_special_tokens=False)
    tail = ''
    if total > show_n:
        tail = f'  …+{total - show_n} more tokens'
    rule = f'{"─"*72}'
    report = [
        f'\n{rule}',
        f'  {label}',
        f'  {total:,} tokens | {bos_count} conv(s) | {pad_count} pad | {loss_count} loss tokens',
        f'{rule}',
        f'  {preview}{tail}',
    ]
    print('\n'.join(report))

## Dataset Statistics

Simulates the actual packing seen during training to estimate total chunks and steps per epoch.

- **fineweb**: linear stream — `total_chunks = total_docs × mean_toks / SEQ_LEN` (no padding)
- **conv datasets**: BOS-aligned packing simulated on N_SAMPLES, scaled to full dataset row count
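- Row counts are estimated from parquet metadata only (mean rows per file over up to 50 randomly chosen files × number of files; no row data is loaded for the count); for mixed-source conv files, each source's share of rows is estimated from the sampled shards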
- **Steps** = total_chunks ÷ effective_batch (batch_size × world_size × grad_accum)

In [33]:
import numpy as np

# Training batch configuration, used only to convert chunk counts into step counts.
BATCH_SIZE = 18  # chunks per GPU (reported below as "/GPU")
WORLD_SIZE = 4  # number of GPUs (reported below as "GPUs")
# NOTE(review): the literal 4 looks like a reference GPU count that grad accumulation
# compensates for when WORLD_SIZE is smaller — confirm against the training config.
GRAD_ACCUM = max(1, 4 // WORLD_SIZE)
EFF_BATCH  = BATCH_SIZE * WORLD_SIZE * GRAD_ACCUM  # chunks per training step

def get_total_rows(files, meta_sample=50):
    """Estimate the total row count of `files` from parquet metadata only.

    Reads the footer metadata of up to `meta_sample` randomly chosen files
    (no row data is loaded) and extrapolates to all files.

    files:       list of parquet file paths; an empty list returns 0.
    meta_sample: maximum number of files whose metadata is read; values
                 below 1 are treated as 1.

    Returns an int. The count is exact when every file was sampled, and is
    otherwise the sampled mean rows-per-file scaled to len(files), rounded
    to the nearest integer.
    """
    if not files:
        return 0
    # Clamp to at least one file so a non-positive meta_sample cannot yield an empty sample.
    k = min(max(1, meta_sample), len(files))
    # Always draw through random.sample so the global RNG stream advances the same
    # way regardless of which branch below is taken.
    sample = random.sample(files, k)
    sampled_rows = sum(pq.read_metadata(f).num_rows for f in sample)
    if k == len(files):
        # Every file was inspected: return the exact integer sum. Going through
        # a float mean (sum / k * k) can land just below the true value and be
        # truncated to one row too few.
        return int(sampled_rows)
    # Extrapolate; round rather than truncate so the estimate is not biased low.
    return int(round(sampled_rows * len(files) / k))

def sample_rows_with_count(files, n, source_filter=None):
    """Draw up to `n` rows from `files` and estimate the dataset's total row count.

    files:         parquet file paths; an empty list returns ([], 0).
    n:             maximum number of rows to return.
    source_filter: when set, only rows whose 'source' field equals this value
                   are kept, and the total is scaled by the fraction of such
                   rows observed in the shards that were read.

    At most 8 randomly chosen files are loaded, stopping early once the
    candidate pool holds at least 3 * n rows.

    Returns (sampled_rows, estimated_total_rows).
    """
    if not files:
        return [], 0
    total_file_rows = get_total_rows(files)

    candidates = []
    rows_seen = 0  # rows read across loaded shards, before any source filtering
    shards = random.sample(files, min(len(files), 8))
    for shard in shards:
        shard_rows = pq.read_table(shard).to_pylist()
        rows_seen += len(shard_rows)
        if source_filter:
            candidates += [row for row in shard_rows if row.get('source') == source_filter]
        else:
            candidates += shard_rows
        if len(candidates) >= n * 3:
            break

    sampled = random.sample(candidates, min(n, len(candidates)))

    if not (source_filter and rows_seen > 0):
        return sampled, total_file_rows
    # Mixed-source files: scale the file-level total by the observed share of this source.
    share = len(candidates) / rows_seen
    return sampled, int(total_file_rows * share)

def sim_pack(conv_lengths, seq_len=SEQ_LEN):
    """Simulate BOS-aligned packing of conversations into fixed-size chunks.

    conv_lengths: iterable of per-conversation token counts; each is capped
                  at `seq_len`.
    seq_len:      chunk capacity in tokens.

    Conversations are placed whole. When the next one does not fit into a
    non-empty chunk, that chunk is closed (its free space counts as padding)
    and the conversation starts a new chunk.

    Returns (n_chunks, pad_pct), where pad_pct is padding as a percentage of
    all chunk capacity (0 when no chunk was produced).
    """
    closed_chunks = 0
    wasted = 0  # padding tokens accumulated over closed chunks
    fill = 0    # tokens in the chunk currently being filled
    for length in conv_lengths:
        length = min(length, seq_len)
        fits = not (fill > 0 and fill + length > seq_len)
        if fits:
            fill += length
            continue
        wasted += seq_len - fill
        closed_chunks += 1
        fill = length
    # Flush the trailing partially filled chunk, if any.
    if fill > 0:
        wasted += seq_len - fill
        closed_chunks += 1
    capacity = closed_chunks * seq_len
    return closed_chunks, (wasted / max(capacity, 1)) * 100

# One entry per dataset: (name, kind, parquet files, source filter).
# kind 'text' rows are read through r['text'], kind 'conv' rows through r['messages'].
# The three conv_files entries share the same shards and are told apart by 'source'.
datasets = [
    ('fineweb',       'text', fineweb_files,         None),
    ('smoltalk',      'conv', conv_files,             'smoltalk'),
    ('ultrachat_gen', 'conv', conv_files,             'ultrachat_gen'),
    ('ultrachat_sft', 'conv', conv_files,             'ultrachat_sft'),
    ('lima',          'conv', lima_files,             None),
    ('mmlu_train',    'conv', mmlu_files['train'],    None),
    ('mmlu_test',     'conv', mmlu_files['test'],     None),
    ('gsm8k_train',   'conv', gsm8k_files['train'],  None),
    ('gsm8k_test',    'conv', gsm8k_files['test'],   None),
]

# name -> dict(total_rows, mean_toks, pad_pct, total_chunks, steps)
results = {}
for name, dtype, files, src_filter in datasets:
    # Sampling draws from the global RNG, so the dataset order above affects which rows are picked.
    rows, total_rows = sample_rows_with_count(files, N_SAMPLES, source_filter=src_filter)
    if not rows:
        print(f'  {name}: NOT FOUND — skipping')
        continue

    if dtype == 'text':
        # +1 per document — presumably the EOS separator placed between docs in the
        # linear stream (TODO confirm against the training data loader).
        tok_counts   = [len(tokenizer.encode(r['text'], add_special_tokens=False)) + 1 for r in rows]
        mean_toks    = np.mean(tok_counts)
        total_chunks = int(total_rows * mean_toks / SEQ_LEN)  # linear, no padding
        pad_pct      = None
    else:
        msgs_list    = [r['messages'] for r in rows]
        tok_counts   = [len(tokenize_conv(msgs)[0]) for msgs in msgs_list]
        mean_toks    = np.mean(tok_counts)
        # Pack the sampled conversations, then scale the chunk count to the full dataset.
        n_sample_chunks, pad_pct = sim_pack(tok_counts)
        total_chunks = int(n_sample_chunks * (total_rows / len(rows)))

    results[name] = dict(
        total_rows=total_rows, mean_toks=mean_toks,
        pad_pct=pad_pct, total_chunks=total_chunks,
        steps=total_chunks // EFF_BATCH,  # whole steps only; a trailing partial batch is dropped
    )
    print(f'  {name}: {len(rows)} samples, {total_rows:,} total rows')

# Summary table, one row per dataset that was found.
print(f'\nBatch config: {BATCH_SIZE}/GPU × {WORLD_SIZE} GPUs × grad_accum {GRAD_ACCUM} = {EFF_BATCH} chunks/step\n')
hdr = f'{"Dataset":<18}  {"Total rows":>12}  {"Tok μ":>7}  {"Pad%":>6}  {"Chunks (est)":>14}  {"Steps @"+str(EFF_BATCH):>12}'
print(hdr)
print('─' * len(hdr))
for name, s in results.items():
    # Text datasets have no padding estimate (pad_pct is None), shown as a dash.
    pad = f'{s["pad_pct"]:.1f}%' if s['pad_pct'] is not None else '    —'
    print(f'{name:<18}  {s["total_rows"]:>12,}  {s["mean_toks"]:>7.0f}  {pad:>6}  '
          f'{s["total_chunks"]:>14,}  {s["steps"]:>12,}')

Token indices sequence length is longer than the specified maximum sequence length for this model (2523 > 1024). Running this sequence through the model will result in indexing errors


  fineweb: 500 samples, 91,190,476 total rows
  smoltalk: 500 samples, 1,040,783 total rows
  ultrachat_gen: 500 samples, 255,393 total rows
  ultrachat_sft: 500 samples, 203,313 total rows
  lima: 500 samples, 1,030 total rows
  mmlu_train: 500 samples, 99,842 total rows
  mmlu_test: 500 samples, 115,700 total rows
  gsm8k_train: 500 samples, 7,473 total rows
  gsm8k_test: 500 samples, 1,319 total rows

Batch config: 18/GPU × 4 GPUs × grad_accum 1 = 72 chunks/step

Dataset               Total rows    Tok μ    Pad%    Chunks (est)     Steps @72
───────────────────────────────────────────────────────────────────────────────
fineweb               91,190,476     1127       —      50,193,856       697,136
smoltalk               1,040,783      906   22.0%         541,207         7,516
ultrachat_gen            255,393      931   24.1%         150,171         2,085
ultrachat_sft            203,313     1213   26.1%         156,144         2,168
lima                       1,030      704   22.7%

## Model View — what the model sees

Each block shows the actual decoded token stream the model receives, with special tokens visible.  
Header line shows total tokens, number of packed conversations, pad count, and loss-token count.

- **fineweb**: linear stream (no BOS, `<|endoftext|>` between docs, all tokens trained)
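- **conv datasets**: BOS-aligned, `<|user|>` / `<|assistant|>` / `<|system|>` role tokens; loss only on assistant spans (the `<|assistant|>` token and its content, plus the `<|user|>` token or final `<|endoftext|>` that closes an assistant turn)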
- **mmlu_train**: single packed chunk showing multiple MC questions back-to-back

In [34]:
# Re-seed so the examples shown below are the same on every run of this cell.
random.seed(0)

# ── FineWeb-Edu (linear packed stream — no BOS, EOS between docs) ─────────────
if fineweb_files:
    texts = pq.read_table(random.choice(fineweb_files))['text'].to_pylist()
    stream = []
    # Concatenate up to 20 random docs, each followed by EOS, until one chunk is filled.
    for t in random.sample(texts, min(20, len(texts))):
        stream.extend(tokenizer.encode(t, add_special_tokens=False))
        stream.append(eos_id)
        if len(stream) >= SEQ_LEN: break
    model_view(stream[:SEQ_LEN],
               label='fineweb_edu  [linear packed — no BOS, <|endoftext|> between docs]',
               show_n=200)
else:
    print('fineweb: not found')

# ── SmolTalk (multi-turn conv) ────────────────────────────────────────────────
if conv_files:
    pool = []
    # Conversation shards mix several sources; keep only smoltalk rows.
    # .get() matches sample_rows_with_count and tolerates rows without a 'source' field.
    for f in random.sample(conv_files, min(3, len(conv_files))):
        pool += [r for r in pq.read_table(f).to_pylist() if r.get('source') == 'smoltalk']
    if pool:
        ex = random.choice(pool)
        toks, mask = tokenize_conv(ex['messages'])
        model_view(toks, mask,
                   label=f'smoltalk  [{len(ex["messages"])} turns, {len(toks)} tokens]',
                   show_n=200)
    else:
        # Report the miss instead of silently printing nothing, like the other sections do.
        print('smoltalk: no rows found in the sampled conv shards')
else:
    print('smoltalk: not found')

# ── LIMA (conv, longer outputs) ───────────────────────────────────────────────
if lima_files:
    rows = pq.read_table(lima_files[0]).to_pylist()
    ex = random.choice(rows)
    toks, mask = tokenize_conv(ex['messages'])
    model_view(toks, mask,
               label=f'lima  [{len(ex["messages"])} turns, {len(toks)} tokens]',
               show_n=200)
else:
    print('lima: not found')

# ── MMLU train (packed chunk — multiple MC questions per 2048-token window) ───
if mmlu_files['train']:
    rows = pq.read_table(mmlu_files['train'][0]).to_pylist()
    random.shuffle(rows)
    # The generator is consumed lazily: packing stops at the first conversation that does not fit.
    chunk_toks, chunk_mask = make_packed_chunk(r['messages'] for r in rows)
    model_view(chunk_toks, chunk_mask,
               label=f'mmlu_train  [packed {SEQ_LEN}-token chunk, multiple MC questions]',
               show_n=300)
else:
    print('mmlu: not found — run: python -m core.dataset --mmlu')

# ── GSM8K train (single example, step-by-step reasoning) ─────────────────────
if gsm8k_files['train']:
    rows = pq.read_table(gsm8k_files['train'][0]).to_pylist()
    ex = random.choice(rows)
    toks, mask = tokenize_conv(ex['messages'])
    model_view(toks, mask,
               label=f'gsm8k_train  [{len(toks)} tokens]',
               show_n=200)
else:
    print('gsm8k: not found — run: python -m core.dataset --gsm8k')


────────────────────────────────────────────────────────────────────────
  fineweb_edu  [linear packed — no BOS, <|endoftext|> between docs]
  2,048 tokens | 0 conv(s) | 0 pad | — loss tokens
────────────────────────────────────────────────────────────────────────
  Five foods that can stain your teeth
- 6 Months ago
Determined to keep your teeth shining bright? You already know how important it is to brush and floss daily, and to avoid smoking or chewing tobacco—and see a dentist periodically. However, dentists also suggest you to be mindful of certain foods and beverages to prevent your teeth from staining. Here is a list of 5 such foods and beverages that will stain your teeth.
- Black Coffee-The darker the beverage, more the staining, this is because the outer layer of the tooth is very porous and absorbs the dark colored beverage causing the teeth to stain. By that, I don't mean to say one must stop drinking coffee altogether, but you can make a lighter coffee, add some milk to l