# Packing Strategy Demo — Real Conversation Data

Loads real SmolTalk conversations from `data/conversation_data/` and shows what `_yield_packed` vs `_yield_bos_aligned` actually yield.

Run from the project root or ensure `data/conversation_data/` is available.

In [1]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

from pathlib import Path
from transformers import AutoTokenizer
from run_11_sft import PackedStreamingDataset

# Tokenizer setup — identical to run_11_sft.py main
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({
    'bos_token': '<|beginoftext|>',
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|user|>', '<|assistant|>', '<|system|>']
})

bos_id  = tokenizer.bos_token_id
eos_id  = tokenizer.eos_token_id
pad_id  = tokenizer.pad_token_id
usr_id  = tokenizer.convert_tokens_to_ids('<|user|>')
asst_id = tokenizer.convert_tokens_to_ids('<|assistant|>')
sys_id  = tokenizer.convert_tokens_to_ids('<|system|>')
ROLE_NAMES = {bos_id: 'BOS', eos_id: 'EOS', pad_id: 'PAD',
              usr_id: 'USR', asst_id: 'ASST', sys_id: 'SYS'}
BOUNDARY_IDS = set(ROLE_NAMES)

# Point to first conversation shard (validation shard)
conv_dir   = Path('../data/conversation_data')
conv_files = sorted(conv_dir.glob('*.parquet'))
demo_files = conv_files[:1]
print(f"Tokenizer vocab : {len(tokenizer)}")
print(f"Conversation shards found : {len(conv_files)}")
print(f"Using for demo  : {[f.name for f in demo_files]}")

  from .autonotebook import tqdm as notebook_tqdm


Tokenizer vocab : 50262
Conversation shards found : 31
Using for demo  : ['shard_00000.parquet']


In [2]:
N_CHUNKS   = 4
SEQ_LENGTH = 2048
SOURCE     = 'smoltalk'

# mask_policy='all'            → _yield_packed  (linear, no boundary alignment)
# mask_policy='assistant_only' → _yield_bos_aligned (each chunk starts at a conversation boundary)

ds_packed = PackedStreamingDataset(
    demo_files, tokenizer, seq_length=SEQ_LENGTH,
    rank=0, world_size=1, shuffle=False, max_sequences=N_CHUNKS,
    data_format='conversation', source_filter=SOURCE,
    mask_policy='all',
)
ds_aligned = PackedStreamingDataset(
    demo_files, tokenizer, seq_length=SEQ_LENGTH,
    rank=0, world_size=1, shuffle=False, max_sequences=N_CHUNKS,
    data_format='conversation', source_filter=SOURCE,
    mask_policy='assistant_only',
)

packed_chunks  = list(iter(ds_packed))
aligned_chunks = list(iter(ds_aligned))

print(f"_yield_packed  : {len(packed_chunks)} chunks")
print(f"_yield_aligned : {len(aligned_chunks)} chunks")

_yield_packed  : 4 chunks
_yield_aligned : 4 chunks


In [3]:
def show_chunk(chunk, idx=0, max_chars=90):
    """Show conversation structure of a chunk with train/masked annotations."""
    ids  = list(chunk['input_ids'])
    mask = list(chunk['loss_mask'])

    n_trained    = int(sum(mask))
    first_is_bos = (ids[0] == bos_id)

    # Walk tokens and group into role-delimited sections
    sections = []
    i = 0
    while i < len(ids):
        tok = ids[i]
        if tok in BOUNDARY_IDS:
            role = ROLE_NAMES[tok]
            role_trained = (mask[i] == 1)
            j = i + 1
            while j < len(ids) and ids[j] not in BOUNDARY_IDS:
                j += 1
            content_ids  = ids[i+1:j]
            content_mask = mask[i+1:j]
            trained = role_trained or any(m == 1 for m in content_mask)
            text = ''
            if content_ids:
                text = tokenizer.decode(content_ids, skip_special_tokens=True).strip()
                text = text[:max_chars] + ('...' if len(text) > max_chars else '')
            sections.append((role, text, trained))
            i = j
        else:
            # Tokens with no preceding role boundary — only happens in _yield_packed
            # when a chunk starts mid-conversation
            j = i
            while j < len(ids) and ids[j] not in BOUNDARY_IDS:
                j += 1
            text = tokenizer.decode(ids[i:j], skip_special_tokens=True).strip()
            text = text[:max_chars] + ('...' if len(text) > max_chars else '')
            trained = any(m == 1 for m in mask[i:j])
            sections.append(('[MID-CONV]', text, trained))
            i = j

    print(f"  ── Chunk {idx+1} {'─'*46}")
    ok = '✓' if first_is_bos else '✗'
    print(f"  {ok} starts at BOS : {first_is_bos}   "
          f"trained: {n_trained}/{len(mask)} tokens ({100*n_trained/len(mask):.1f}%)")
    for role, text, trained in sections:
        marker = '  ▶' if trained else '   '
        print(f"  {marker} <{role:<9}> {repr(text)}")
    print()


print('=' * 60)
print('  _yield_packed  (mask_policy="all", linear stream-packing)')
print('=' * 60)
for i, c in enumerate(packed_chunks):
    show_chunk(c, idx=i)

print('=' * 60)
print('  _yield_bos_aligned  (mask_policy="assistant_only")')
print('=' * 60)
for i, c in enumerate(aligned_chunks):
    show_chunk(c, idx=i)

  _yield_packed  (mask_policy="all", linear stream-packing)
  ── Chunk 1 ──────────────────────────────────────────────
  ✓ starts at BOS : True   trained: 2046/2048 tokens (99.9%)
    ▶ <[MID-CONV]> 'A tailor cut 0.75 inch off a skirt and some inches off a pair of pants. The tailor cut 0.2...'

  ── Chunk 2 ──────────────────────────────────────────────
  ✗ starts at BOS : False   trained: 2043/2048 tokens (99.8%)
    ▶ <[MID-CONV]> 'of the first corner. Defaults to 0."},"x2":{"type":"integer","description":"The x-coordina...'

  ── Chunk 3 ──────────────────────────────────────────────
  ✗ starts at BOS : False   trained: 2046/2048 tokens (99.9%)
    ▶ <[MID-CONV]> 'is to change "suggest" to "suggests" because "data" is a singular noun. However, I\'ve seen...'

  ── Chunk 4 ──────────────────────────────────────────────
  ✗ starts at BOS : False   trained: 2045/2048 tokens (99.9%)
    ▶ <[MID-CONV]> 'integer `10`. The memory is deleted when both `ptr1` and `ptr2` go out of scope.\n\n

In [6]:
# Summary: for each chunk, show what the model sees before its first trained label

print('CONTEXT BEFORE FIRST TRAINED LABEL')
print('─' * 60)
for label, chunks in [('packed ', packed_chunks), ('aligned', aligned_chunks)]:
    for i, c in enumerate(chunks):
        ids  = list(c['input_ids'])
        lbl  = list(c['labels'])
        mask = list(c['loss_mask'])
        first_pos = next((j for j, m in enumerate(mask) if m == 1), None)
        if first_pos is None:
            print(f"  {label}  Chunk {i+1}: no trained tokens")
            continue
        context_ids    = ids[:first_pos]
        first_label_id = lbl[first_pos]
        has_bos        = (bos_id in context_ids)
        has_usr        = (usr_id in context_ids)
        ok = '✓' if (has_bos and has_usr) else '✗'
        context_text = tokenizer.decode(
            [t for t in context_ids if t not in BOUNDARY_IDS],
            skip_special_tokens=True
        ).strip()[:80]
        first_label_text = tokenizer.decode([first_label_id], skip_special_tokens=False)
        role_sequence = [ROLE_NAMES[t] for t in context_ids if t in BOUNDARY_IDS]
        print(f"\n  {ok} {label}  Chunk {i+1}:")
        print(f"       role tokens seen : {role_sequence}")
        print(f"       user context     : {repr(context_text)}")
        print(f"       first prediction : {repr(first_label_text)}")
        if not has_bos:
            print(f"       ⚠ no BOS — chunk starts mid-conversation, user question is not in context")
        elif not has_usr:
            print(f"       ⚠ no USR  — assistant response has no preceding user turn in context")
        else:
            print(f"       ✓ full user turn visible before first trained token")

CONTEXT BEFORE FIRST TRAINED LABEL
────────────────────────────────────────────────────────────

  ✗ packed   Chunk 1:
       role tokens seen : []
       user context     : ''
       first prediction : '<|user|>'
       ⚠ no BOS — chunk starts mid-conversation, user question is not in context

  ✗ packed   Chunk 2:
       role tokens seen : []
       user context     : ''
       first prediction : ' the'
       ⚠ no BOS — chunk starts mid-conversation, user question is not in context

  ✗ packed   Chunk 3:
       role tokens seen : []
       user context     : ''
       first prediction : ' to'
       ⚠ no BOS — chunk starts mid-conversation, user question is not in context

  ✗ packed   Chunk 4:
       role tokens seen : []
       user context     : ''
       first prediction : ' `'
       ⚠ no BOS — chunk starts mid-conversation, user question is not in context
