In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os
import json
from dataclasses import dataclass
from typing import Any
import random
import torch
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, TrainingArguments
from any2json.training.augment import Augmentor
from any2json.utils import configure_loggers, logger, try_minify_json_string
from any2json.training.train import prepare_model_and_tokenizer, PipelineConfig, create_trainer, prepare_dataset
from any2json.training.utils import (
    build_tokenized_length_filter_fn,
    load_hf_dataset,
    apply_debug_limit,
    build_tokenize_fn,
    CausalLMDataCollator,
    prepare_splits,
    process_raw_to_tokenized,
)
from any2json.training.dataset import AugmentTokenizeDataset

# Configure logging. The cell output confirms the any2json logger ends up at
# INFO ("Configured any2json logger with level INFO"); basic_level is
# presumably the level applied to other loggers — confirm against
# any2json.utils.configure_loggers.
configure_loggers(level="INFO", basic_level="WARNING")

# Model identifier used as the default for DebugConfig.model_name.
DEFAULT_MODEL = "google/gemma-3-270m"


@dataclass
class DebugConfig(PipelineConfig):
    """PipelineConfig with defaults sized for quick interactive debugging.

    Only default values are set here; what each field actually controls is
    defined by ``PipelineConfig`` and the helpers in ``any2json.training``.
    The per-field notes below are hedged where they are inferred from names
    or from the recorded cell output rather than from visible code.
    """

    dataset_path: str = "btseytlin/any2json"  # dataset identifier handed to the pipeline
    model_name: str = DEFAULT_MODEL
    max_sequence_length: int = 2560  # also forwarded to create_trainer by setup_debug_environment
    debug_limit: int = 100  # Small for debugging (log shows "Applied debug limit: 100")
    val_size: int = 20  # presumably the validation split size (log shows "Target test size: 20")
    dataloader_num_proc: int = 4  # presumably worker count for dataset mapping — TODO confirm
    augment: bool = True  # presumably toggles training-time augmentation — TODO confirm
    seed: int = 42
    pad_to_multiple_of: int = 8  # forwarded to create_trainer by setup_debug_environment
    debug_tokens: bool = True  # forwarded to create_trainer by setup_debug_environment


def setup_debug_environment():
    """Assemble config, tokenizer, datasets and collator via the training helpers.

    A default ``DebugConfig`` and a default ``TrainingArguments`` are pushed
    through ``prepare_model_and_tokenizer``, ``prepare_dataset`` and
    ``create_trainer`` so the objects inspected in this notebook come from the
    same code path as training.

    Returns:
        dict with the keys ``"config"``, ``"tokenizer"``, ``"train_dataset"``,
        ``"val_dataset"`` and ``"collator"`` (the trainer's ``data_collator``).
        The model and the trainer object are built but not returned.
    """
    debug_cfg = DebugConfig()
    training_args = TrainingArguments()

    # The pipeline helper returns the model and the tokenizer together.
    model, tokenizer = prepare_model_and_tokenizer(debug_cfg, training_args)

    train_split, eval_split = prepare_dataset(debug_cfg, training_args, tokenizer)

    # The trainer is constructed only so its data collator can be exposed below.
    trainer_kwargs = {
        "train_dataset": train_split,
        "eval_dataset": eval_split,
        "tokenizer": tokenizer,
        "model": model,
        "args": training_args,
        "pad_to_multiple_of": debug_cfg.pad_to_multiple_of,
        "debug_tokens": debug_cfg.debug_tokens,
        "max_sequence_length": debug_cfg.max_sequence_length,
    }
    trainer = create_trainer(**trainer_kwargs)

    environment = {"config": debug_cfg, "tokenizer": tokenizer}
    environment["train_dataset"] = train_split
    environment["val_dataset"] = eval_split
    environment["collator"] = trainer.data_collator
    return environment


# Helper functions for debugging
def _to_id_list(ids):
    """Return ``ids`` as a plain Python list.

    Accepts ``None``, lists/tuples, and tensor- or array-likes that expose
    ``tolist()``, so truthiness checks and slicing behave the same for all.
    """
    if ids is None:
        return []
    if hasattr(ids, "tolist"):
        return ids.tolist()
    return list(ids)


def _preview_ids(ids, head=10, tail=5):
    """Format ``ids`` as ``<first head>...<last tail>``; short lists are shown whole."""
    if len(ids) > head + tail:
        return f"{ids[:head]}...{ids[-tail:]}"
    return f"{ids}"


def print_tokenized_example(tokenizer, example, title="Example"):
    """Print a tokenized example in detail.

    Shows the sequence lengths, the fully decoded text, the prompt/target
    split derived from the labels, and a short preview of the raw ids.

    Args:
        tokenizer: object with ``decode(ids, skip_special_tokens=...)``.
        example: mapping with optional ``"input_ids"`` and ``"labels"``
            entries (lists, or tensor/array-likes exposing ``tolist()``).
        title: heading printed above the example.

    Label positions equal to ``-100`` are treated as masked: the prompt is
    everything before the first unmasked label, the target is every token
    whose label is unmasked.
    """
    print(f"\n=== {title} ===")

    input_ids = _to_id_list(example.get("input_ids", []))
    labels = _to_id_list(example.get("labels", []))

    print(f"Input IDs length: {len(input_ids)}")
    print(f"Labels length: {len(labels)}")

    if input_ids:
        # Decode the full sequence
        full_text = tokenizer.decode(input_ids, skip_special_tokens=False)
        print(f"Full decoded text:\n{repr(full_text)}")

        # Find the boundary between prompt and target
        if labels:
            start_idx = next(
                (i for i, label in enumerate(labels) if label != -100), None
            )
            if start_idx is None:
                print("❌ NO NON-MASKED LABELS FOUND!")
                print(f"Labels sample: {labels[:20]}...")
            else:
                prompt_ids = input_ids[:start_idx]
                # zip stops at the shorter sequence, so a length mismatch
                # between ids and labels cannot raise here.
                target_ids = [
                    token
                    for token, label in zip(input_ids, labels)
                    if label != -100
                ]

                prompt_text = tokenizer.decode(prompt_ids, skip_special_tokens=False)
                target_text = tokenizer.decode(target_ids, skip_special_tokens=False)

                print(f"\nPrompt ({len(prompt_ids)} tokens):\n{repr(prompt_text)}")
                print(f"\nTarget ({len(target_ids)} tokens):\n{repr(target_text)}")
                print(f"\nBoundary index: {start_idx}")

    # Sequences of up to 15 ids are printed in full; previously ids 11-15 were
    # silently hidden behind a trailing "...".
    print(f"Input IDs: {_preview_ids(input_ids)}")
    print(f"Labels:    {_preview_ids(labels)}")


def debug_collator_batch(collator, examples, tokenizer):
    """Run ``collator`` on ``examples`` and print padding/masking stats per row.

    Args:
        collator: callable turning a list of examples into a batch mapping
            with ``"input_ids"``, ``"labels"`` and ``"attention_mask"``.
        examples: items carrying ``"input_ids"`` and ``"labels"`` sequences.
        tokenizer: provides ``pad_token_id`` and ``decode``.

    Raises:
        AssertionError: when, for some row, the attention-mask sum differs
            from the number of tokens that are not ``pad_token_id``.
    """
    print("\n=== COLLATOR DEBUG ===")
    print(f"Input batch size: {len(examples)}")

    # Before collation: raw length and number of labels that are not -100.
    for idx, item in enumerate(examples):
        n_tokens = len(item["input_ids"])
        n_targets = len([lab for lab in item["labels"] if lab != -100])
        print(f"Example {idx}: {n_tokens} tokens, {n_targets} target tokens")

    # Apply collator
    batch = collator(examples)

    print("\nAfter collation:")
    ids_shape = batch["input_ids"].shape
    labels_shape = batch["labels"].shape
    print(f"Batch shapes: input_ids={ids_shape}, labels={labels_shape}")

    # After collation: inspect every row of the padded batch.
    for row in range(batch["input_ids"].shape[0]):
        row_ids = batch["input_ids"][row]
        row_labels = batch["labels"][row]
        row_mask = batch["attention_mask"][row]

        real_tokens = len(
            [tok for tok in row_ids.tolist() if tok != tokenizer.pad_token_id]
        )
        supervised = len([lab for lab in row_labels.tolist() if lab != -100])
        attended = sum(row_mask.tolist())

        # The attention mask is expected to cover exactly the non-pad tokens.
        assert attended == real_tokens

        print(
            f"  Example {row}: {real_tokens} non-pad, {supervised} non-masked, {attended} attention"
        )

        if real_tokens == 0:
            print("    ❌ ALL PADDING! This is the bug!")
            continue  # nothing to decode for a row made only of padding

        if supervised == 0:
            print("    ❌ ALL LABELS MASKED! This causes eval issues!")

        # Show the actual tokens for debugging (padding included).
        decoded = tokenizer.decode(row_ids, skip_special_tokens=False)
        print(f"    Decoded : {repr(decoded)}")


# Build the shared debug state once; the cells below read the config,
# tokenizer, datasets and collator from this dict.
debug_data = setup_debug_environment()


  from .autonotebook import tqdm as notebook_tqdm
INFO:any2json:Configured any2json logger with level INFO
INFO:any2json:Applied debug limit: 100, now 100 train samples
INFO:any2json:Loaded 100 train samples
INFO:any2json:Minifying JSON schemas and outputs on load
Map (num_proc=4): 100%|██████████| 100/100 [00:00<00:00, 727.09 examples/s]
INFO:any2json:Target test size: 20
INFO:any2json:Train group sizes (top 10): [(0, 7), (3697, 1), (375, 1), (179, 1), (8045, 1), (11951, 1), (11952, 1), (1578, 1), (11954, 1), (161, 1)]
INFO:any2json:Test group sizes (top 10): [(11950, 1), (3338, 1), (11956, 1), (4444, 1), (6985, 1), (8046, 1), (11967, 1), (3339, 1), (1929, 1), (22228, 1)]
INFO:any2json:Prepared splits: DatasetDict({
    train: Dataset({
        features: ['input_data', 'schema', 'output', 'meta'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['input_data', 'schema', 'output', 'meta'],
        num_rows: 20
    })
})
INFO:any2json:Preparing train dataset
Flatt

In [69]:
# Show which objects setup_debug_environment() exposed.
debug_data.keys()

dict_keys(['config', 'tokenizer', 'train_dataset', 'val_dataset', 'collator'])

In [70]:
# Report the final dataset sizes. The recorded output (78 / 16) is smaller
# than the 80 / 20 split logged during setup — presumably some examples were
# filtered out during preparation; TODO confirm where.
print(f"Train dataset size: {len(debug_data['train_dataset'])}")
print(f"Val dataset size: {len(debug_data['val_dataset'])}")

Train dataset size: 78
Val dataset size: 16


In [71]:
# Smoke test: walk every item of both datasets so that any error raised
# while producing an example surfaces here rather than during training.
for _ in tqdm(debug_data["train_dataset"]):
    pass

for _ in tqdm(debug_data["val_dataset"]):
    pass


100%|██████████| 78/78 [00:00<00:00, 808.14it/s]
100%|██████████| 16/16 [00:00<00:00, 2259.25it/s]


In [151]:
print("\n--- Testing train examples ---")

# A random index is drawn and then immediately overridden: the fixed index
# below pins the example being inspected. Remove the `i = 39` line to sample
# a random example instead.
i = random.randint(0, len(debug_data["train_dataset"]) - 1)
i = 39
example = debug_data["train_dataset"][i]
print_tokenized_example(debug_data["tokenizer"], example, f"Train Example {i}")


--- Testing train examples ---

=== Train Example 39 ===
Input IDs length: 391
Labels length: 391
Full decoded text:
'<bos>Convert input data to json according to JSONSchema\n[SCHEMA]{"items":{"properties":{"also_interesting_story_office_town":{"type":["integer","null"]},"build_four_late_song_while":{"type":["string","null"]},"explain_join_within":{"type":["string","null"]}},"type":["object","null"]},"type":["array","null"]}[INPUT]\n    INSERT INTO `leader` (\n        explain_join_within,\n        also_int\'eresting_story_office_town,\n        build_four_lat e_son_g_while)\n     VALUES\n(\'104.181.251.246\', -23378, \'9.8\'),\n(\'182.120.82.44\', 849006, \'56.9\'),\n(\'204.240.33.9-4\', 893876, \'722.23\');[OUTPUT][{"also_interesting_story_office_town":-23378,"build_four_late_song_while":"9.8","explain_join_within":"104.181.251.246"},{"also_interesting_story_office_town":849006,"build_four_late_song_while":"56.9","explain_join_within":"182.120.82.44"},{"also_interesting_story_office_t

In [73]:
print("\n--- Testing val examples ---")

# A random index is drawn and then immediately overridden: the fixed index
# below pins the example being inspected. Remove the `i = 1` line to sample
# a random example instead.
i = random.randint(0, len(debug_data["val_dataset"]) - 1)
i = 1
example = debug_data["val_dataset"][i]
print_tokenized_example(debug_data["tokenizer"], example, f"Val Example {i}")


--- Testing val examples ---

=== Val Example 1 ===
Input IDs length: 159
Labels length: 159
Full decoded text:
'<bos>Convert input data to json according to JSONSchema\n[SCHEMA]{"properties":{"HelpThose":{"type":["string","null"]},"SaveCourseVariousTechnology":{"type":["string","null"]},"company_former_dark_peace_account":{"type":["integer","null"]}},"type":["object","null"]}[INPUT]\n    INSERT INTO `we` (\n        HelpThose,\n        company_former_dark_peace_account,\n        SaveCourseVariousTechnology)\n     VALUES\n(\'-80\', 182716, \'60.4\');[OUTPUT]{"HelpThose":"-80","SaveCourseVariousTechnology":"60.4","company_former_dark_peace_account":182716}<eos>'

Prompt (123 tokens):
'<bos>Convert input data to json according to JSONSchema\n[SCHEMA]{"properties":{"HelpThose":{"type":["string","null"]},"SaveCourseVariousTechnology":{"type":["string","null"]},"company_former_dark_peace_account":{"type":["integer","null"]}},"type":["object","null"]}[INPUT]\n    INSERT INTO `we` (\n        

In [152]:
print("\n--- Testing collation ---")
# Collate the first three validation examples and print per-row
# padding / label-masking statistics.
val_examples = [debug_data["val_dataset"][i] for i in range(3)]
debug_collator_batch(debug_data["collator"], val_examples, debug_data["tokenizer"])



--- Testing collation ---

=== COLLATOR DEBUG ===
Input batch size: 3
Example 0: 656 tokens, 248 target tokens
Example 1: 159 tokens, 36 target tokens
Example 2: 1998 tokens, 923 target tokens

After collation:
Batch shapes: input_ids=torch.Size([3, 2000]), labels=torch.Size([3, 2000])
  Example 0: 656 non-pad, 248 non-masked, 656 attention
    Decoded : '<bos>Convert input data to json according to JSONSchema\n[SCHEMA]{"items":{"properties":{"AgreeMyWould":{"type":["string","null"]},"GroupLaterWhether":{"type":["string","null"]},"HitUnit":{"type":["integer","null"]},"LikeMiddleThis":{"type":["string","null"]},"PositionFormTogetherMakeYet":{"type":["string","null"]},"RemainDevelopmentSomebody":{"type":["string","null"]},"TalkAHit":{"type":["number","null"]},"add_such_allow_able":{"type":["string","null"]},"old_door_become_wear_spring":{"type":["string","null"]},"respond":{"type":["string","null"]},"several_whatever_talk_fly":{"type":["string","null"]}},"type":["object","null"]},"type"

In [153]:
# Same collation check on the first three training examples.
train_examples = [debug_data["train_dataset"][i] for i in range(3)]
debug_collator_batch(debug_data["collator"], train_examples, debug_data["tokenizer"])



=== COLLATOR DEBUG ===
Input batch size: 3
Example 0: 459 tokens, 155 target tokens
Example 1: 1038 tokens, 365 target tokens
Example 2: 208 tokens, 88 target tokens

After collation:
Batch shapes: input_ids=torch.Size([3, 1040]), labels=torch.Size([3, 1040])
  Example 0: 459 non-pad, 155 non-masked, 459 attention
    Decoded : '<bos>Convert input data to json according to JSONSchema\n[SCHEMA]{"items":{"properties":{"1":{"type":["string","null"]},"2":{"type":["number","null"]},"3":{"type":["string","null"]},"4":{"type":["string","null"]},"5":{"type":["string","null"]},"6":{"type":["string","null"]},"7":{"type":["integer","null"]},"8":{"type":["number","null"]},"9":{"type":["string","null"]}},"type":["object","null"]},"type":["array","null"]}[INPUT]\\begin{tabular}{lrllllrll}\n\\toprule\n1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\\\n\\midrule\nhttp://www.johnston.info/ & -14.950000 & 124.238.178.178 & LightYellow & -1.0 & Lance Goodman & 1 & NaN & http://evans-gordon.biz/ \\\\\nhttps://www.m

In [None]:
from pydantic import BaseModel
from typing import Literal, Optional
import outlines
import openai

# Example structured-output schema. Documented with comments rather than a
# class docstring, because a docstring would be emitted into the JSON schema
# printed below.
class Customer(BaseModel):
    name: str
    urgency: Literal["high", "medium", "low"]  # rendered as an enum in the schema
    issue: str
    reporter: Optional[str]  # no default: listed under "required" but may be null

# dump the schema
print(Customer.model_json_schema())

# Wrap an OpenAI client as an outlines model. Note that this binds the
# notebook-global name `model`.
client = openai.OpenAI()
model = outlines.from_openai(client, "gpt-4o")

# Ask for output constrained to the Customer schema.
customer = model(
    "Alice needs help with login issues ASAP",
    Customer
)
# ✓ Always returns valid Customer object
# ✓ No parsing, no errors, no retries
# NOTE(review): the two claims above are not verified in this notebook (the
# cell has no execution count). Depending on the outlines version the call may
# return a JSON string rather than a Customer instance — confirm, and validate
# with Customer.model_validate_json(...) if so.

{'properties': {'name': {'title': 'Name', 'type': 'string'}, 'urgency': {'enum': ['high', 'medium', 'low'], 'title': 'Urgency', 'type': 'string'}, 'issue': {'title': 'Issue', 'type': 'string'}, 'reporter': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Reporter'}}, 'required': ['name', 'urgency', 'issue', 'reporter'], 'title': 'Customer', 'type': 'object'}


In [31]:
# Serialize the schema dict to a JSON string (the dict repr printed above uses
# single quotes and is not valid JSON).
json.dumps(Customer.model_json_schema())

'{"properties": {"name": {"title": "Name", "type": "string"}, "urgency": {"enum": ["high", "medium", "low"], "title": "Urgency", "type": "string"}, "issue": {"title": "Issue", "type": "string"}, "reporter": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Reporter"}}, "required": ["name", "urgency", "issue", "reporter"], "title": "Customer", "type": "object"}'

In [41]:
import xgrammar as xgr

# Minimal JSON Schema with one nullable string property, using the same
# ["string", "null"] type-union form as the dataset schemas decoded above.
schema = """
{"properties": {"name": {"title": "Name", "type": ["string", "null"]}}, "title": "Customer", "type": "object"}
"""

# Build the grammar against the debug tokenizer and compile the schema; the
# displayed result is an xgrammar CompiledGrammar object.
tokenizer_info = xgr.TokenizerInfo.from_huggingface(debug_data['tokenizer'])
grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
compiled_grammar = grammar_compiler.compile_json_schema(schema)
compiled_grammar

<xgrammar.compiler.CompiledGrammar at 0x332ec8dd0>

In [None]:
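# Disabled sketch of grammar-constrained generation with the compiled grammar.
# It cannot run as-is: `model_inputs` is not defined in the cells shown (see the
# NameError below), and setup_debug_environment() does not return the model, so
# `model` here would not be the Hugging Face model either.
# xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
# generated_ids = model.generate(
#     **model_inputs, max_new_tokens=512, logits_processor=[xgr_logits_processor]
# )
# generated_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
# print(tokenizer.decode(generated_ids, skip_special_tokens=True))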

NameError: name 'model_inputs' is not defined