# 01 - Data Generation for D&D Query Understanding

This notebook generates synthetic training data for two transformer models:
1. **Tool & Intent Classifier** - Multi-label tool selection + intent classification
2. **Entity Extractor** - NER for D&D-specific entities

## Dataset Composition Target
| Type | Count | % |
|------|-------|---------|
| 2-tool queries | 5,000 | 50% |
| 3-tool queries | 2,000 | 20% |
| 1-tool queries | 3,000 | 30% |
| **Total** | **10,000** | 100% |

## Setup & Mount Google Drive

In [2]:
# Mount Google Drive for data persistence (Colab), or fall back to a local
# folder when the notebook runs outside Colab.
import os

try:
    from google.colab import drive
except ImportError:
    # `google.colab` only exists inside Colab; on any other kernel there is
    # no Drive to mount, so artifacts are kept under the working directory.
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    # Project root on Google Drive
    PROJECT_ROOT = '/content/drive/MyDrive/574-assignment'
else:
    # Local project root (same folder name as on Drive)
    PROJECT_ROOT = os.path.join(os.getcwd(), '574-assignment')

# Create the output folders up front; exist_ok=True keeps the cell re-runnable.
for _subdir in ('data/generated', 'models', 'results'):
    os.makedirs(f'{PROJECT_ROOT}/{_subdir}', exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")

KeyboardInterrupt: 

In [None]:
# Install dependencies
!pip install transformers datasets seqeval -q

In [None]:
import json
import random
import re
from collections import defaultdict
from typing import Dict, List, Tuple, Any

import numpy as np

# Seed both RNGs imported above (stdlib `random`, then NumPy) with one shared
# value so that repeated runs draw the same random sequence.
SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

## Load Template Files

Copy the template files from your local machine to Google Drive, or define them inline.

In [None]:
# If running locally or templates are on Drive, adjust path accordingly
# For now, we'll define the essential data structures inline

# ===========================================================================
# ENTITY GAZETTEERS (D&D 5e SRD entities for slot filling)
# ===========================================================================
# Each list below is a flat list of surface strings. SLOT_FILLERS (defined
# after these lists) maps template slot names onto them.
#
# NOTE(review): some strings occur in more than one gazetteer, so the same
# surface form can stand for different entity types depending on the slot:
#   - "Shield" is in SPELL_NAMES and ARMOR_NAMES (and therefore ITEM_NAMES)
#   - "Spider Climb" and "Web" are in SPELL_NAMES and CREATURE_ABILITY_NAMES
#   - "Javelin of Lightning" (magic item) contains the weapon name "Javelin"

# Spell names (fills the spell / spell1 / spell2 slots)
SPELL_NAMES = [
    "Acid Splash", "Chill Touch", "Dancing Lights", "Eldritch Blast", "Fire Bolt",
    "Guidance", "Light", "Mage Hand", "Minor Illusion", "Prestidigitation",
    "Ray of Frost", "Sacred Flame", "Shocking Grasp", "Spare the Dying", "True Strike",
    "Alarm", "Bless", "Burning Hands", "Charm Person", "Cure Wounds",
    "Detect Magic", "Disguise Self", "Faerie Fire", "Feather Fall", "Fog Cloud",
    "Guiding Bolt", "Healing Word", "Heroism", "Hex", "Hunter's Mark",
    "Identify", "Mage Armor", "Magic Missile", "Shield", "Sleep",
    "Thunderwave", "Aid", "Blur", "Darkness", "Darkvision",
    "Hold Person", "Invisibility", "Knock", "Lesser Restoration", "Levitate",
    "Mirror Image", "Misty Step", "Moonbeam", "Pass without Trace", "Scorching Ray",
    "Shatter", "Silence", "Spider Climb", "Spiritual Weapon", "Suggestion",
    "Web", "Animate Dead", "Counterspell", "Dispel Magic", "Fear",
    "Fireball", "Fly", "Haste", "Hypnotic Pattern", "Lightning Bolt",
    "Major Image", "Revivify", "Slow", "Spirit Guardians", "Vampiric Touch",
    "Banishment", "Blight", "Confusion", "Dimension Door", "Greater Invisibility",
    "Ice Storm", "Polymorph", "Stoneskin", "Wall of Fire", "Animate Objects",
    "Cloudkill", "Cone of Cold", "Dominate Person", "Hold Monster", "Mass Cure Wounds",
    "Raise Dead", "Telekinesis", "Wall of Force", "Wall of Stone", "Chain Lightning",
    "Disintegrate", "Globe of Invulnerability", "Harm", "Heal", "Sunbeam",
    "True Seeing", "Delayed Blast Fireball", "Etherealness", "Finger of Death", "Forcecage",
    "Plane Shift", "Prismatic Spray", "Regenerate", "Resurrection", "Teleport",
    "Antimagic Field", "Clone", "Dominate Monster", "Earthquake", "Feeblemind",
    "Mind Blank", "Power Word Stun", "Sunburst", "Astral Projection", "Foresight",
    "Gate", "Imprisonment", "Mass Heal", "Meteor Swarm", "Power Word Kill",
    "Prismatic Wall", "Shapechange", "Time Stop", "True Polymorph", "True Resurrection", "Wish"
]

# Creature / monster names (fills the creature / creature1 / creature2 slots)
CREATURE_NAMES = [
    "Aboleth", "Ankheg", "Basilisk", "Behir", "Beholder", "Bugbear", "Bulette",
    "Carrion Crawler", "Centaur", "Chimera", "Cloaker", "Cockatrice", "Couatl",
    "Cyclops", "Darkmantle", "Deva", "Displacer Beast", "Doppelganger", "Dragon Turtle",
    "Drider", "Dryad", "Duergar", "Ettercap", "Ettin", "Fire Giant",
    "Frost Giant", "Gargoyle", "Gelatinous Cube", "Ghast", "Ghost", "Ghoul",
    "Giant Ape", "Giant Spider", "Gnoll", "Goblin", "Gorgon", "Grick",
    "Griffon", "Hag", "Harpy", "Hell Hound", "Hill Giant", "Hippogriff",
    "Hobgoblin", "Homunculus", "Hook Horror", "Hydra", "Imp", "Invisible Stalker",
    "Kobold", "Kraken", "Lamia", "Lich", "Lizardfolk", "Manticore",
    "Medusa", "Merfolk", "Merrow", "Mimic", "Mind Flayer", "Minotaur",
    "Mummy", "Naga", "Nightmare", "Ogre", "Oni", "Orc",
    "Otyugh", "Owlbear", "Pegasus", "Phase Spider", "Pit Fiend", "Planetar",
    "Purple Worm", "Rakshasa", "Remorhaz", "Roc", "Roper", "Rust Monster",
    "Sahuagin", "Salamander", "Satyr", "Shadow", "Shambling Mound", "Shield Guardian",
    "Skeleton", "Solar", "Specter", "Sphinx", "Stone Giant", "Storm Giant",
    "Succubus", "Tarrasque", "Treant", "Troll", "Umber Hulk", "Unicorn",
    "Vampire", "Wight", "Will-o'-Wisp", "Wraith", "Wyvern", "Xorn",
    "Yeti", "Zombie", "Adult Red Dragon", "Adult Black Dragon", "Adult Blue Dragon",
    "Ancient Red Dragon", "Ancient Black Dragon", "Young Dragon", "Dragon Wyrmling"
]

# Weapon names (fills the weapon / weapon1 / weapon2 slots; also part of ITEM_NAMES)
WEAPON_NAMES = [
    "Club", "Dagger", "Greatclub", "Handaxe", "Javelin", "Light Hammer",
    "Mace", "Quarterstaff", "Sickle", "Spear", "Crossbow", "Dart",
    "Shortbow", "Sling", "Battleaxe", "Flail", "Glaive", "Greataxe",
    "Greatsword", "Halberd", "Lance", "Longsword", "Maul", "Morningstar",
    "Pike", "Rapier", "Scimitar", "Shortsword", "Trident", "War Pick",
    "Warhammer", "Whip", "Blowgun", "Hand Crossbow", "Heavy Crossbow", "Longbow", "Net"
]

# Armor names, including "Shield" (fills the armor slots; also part of ITEM_NAMES)
ARMOR_NAMES = [
    "Padded Armor", "Leather Armor", "Studded Leather", "Hide Armor", "Chain Shirt",
    "Scale Mail", "Breastplate", "Half Plate", "Ring Mail", "Chain Mail",
    "Splint Armor", "Plate Armor", "Shield"
]

# Magic item names (fills the magic_item slot; also part of ITEM_NAMES)
MAGIC_ITEM_NAMES = [
    "Bag of Holding", "Boots of Elvenkind", "Bracers of Defense", "Cloak of Elvenkind",
    "Cloak of Protection", "Deck of Many Things", "Flame Tongue", "Frost Brand",
    "Gauntlets of Ogre Power", "Headband of Intellect", "Helm of Telepathy",
    "Ioun Stone", "Javelin of Lightning", "Necklace of Fireballs", "Pearl of Power",
    "Periapt of Wound Closure", "Potion of Healing", "Potion of Greater Healing",
    "Ring of Protection", "Ring of Spell Storing", "Ring of Invisibility",
    "Robe of the Archmagi", "Rod of Absorption", "Staff of Fire", "Staff of Power",
    "Staff of the Magi", "Sun Blade", "Sword of Sharpness", "Vorpal Sword",
    "Wand of Fireballs", "Wand of Lightning Bolts", "Wand of Magic Missiles",
    "Wings of Flying", "Amulet of Health", "Belt of Giant Strength", "Boots of Speed"
]

# Generic item pool: every weapon, armor and magic item above, followed by a
# few extra gear entries. `+` builds a new list, so the source lists are untouched.
ITEM_NAMES = WEAPON_NAMES + ARMOR_NAMES + MAGIC_ITEM_NAMES + [
    "Backpack", "Bedroll", "Rope", "Torch", "Rations", "Waterskin",
    "Thieves' Tools", "Holy Symbol", "Spellbook", "Component Pouch"
]

# Class feature names (fills the feature / feature2 slots)
CLASS_FEATURE_NAMES = [
    "Rage", "Unarmored Defense", "Reckless Attack", "Danger Sense", "Extra Attack",
    "Brutal Critical", "Relentless Rage", "Bardic Inspiration", "Jack of All Trades",
    "Song of Rest", "Expertise", "Font of Inspiration", "Countercharm", "Magical Secrets",
    "Channel Divinity", "Turn Undead", "Divine Intervention", "Wild Shape",
    "Beast Spells", "Archdruid", "Fighting Style", "Second Wind", "Action Surge",
    "Indomitable", "Martial Arts", "Ki", "Unarmored Movement", "Deflect Missiles",
    "Slow Fall", "Stunning Strike", "Evasion", "Diamond Soul", "Divine Sense",
    "Lay on Hands", "Divine Smite", "Aura of Protection", "Improved Divine Smite",
    "Favored Enemy", "Natural Explorer", "Primeval Awareness", "Vanish", "Foe Slayer",
    "Sneak Attack", "Thieves' Cant", "Cunning Action", "Uncanny Dodge", "Reliable Talent",
    "Blindsense", "Slippery Mind", "Elusive", "Stroke of Luck", "Sorcerous Origin",
    "Font of Magic", "Metamagic", "Sorcerous Restoration", "Otherworldly Patron",
    "Pact Magic", "Eldritch Invocations", "Pact Boon", "Mystic Arcanum",
    "Arcane Recovery", "Arcane Tradition", "Spell Mastery", "Signature Spells"
]

# Character class names (fills the class_name / class / class1 / class2 slots)
CLASS_NAMES = [
    "Barbarian", "Bard", "Cleric", "Druid", "Fighter", "Monk",
    "Paladin", "Ranger", "Rogue", "Sorcerer", "Warlock", "Wizard"
]

# Playable race names (fills the race slots)
RACE_NAMES = [
    "Dragonborn", "Dwarf", "Elf", "Gnome", "Half-Elf", "Half-Orc",
    "Halfling", "Human", "Tiefling", "Aasimar", "Goliath", "Tabaxi"
]

# Subclass names (fills the subclass slots)
SUBCLASS_NAMES = [
    "Path of the Berserker", "Path of the Totem Warrior", "College of Lore",
    "College of Valor", "Life Domain", "Light Domain", "War Domain",
    "Circle of the Land", "Circle of the Moon", "Champion", "Battle Master",
    "Eldritch Knight", "Way of the Open Hand", "Way of Shadow",
    "Oath of Devotion", "Oath of Vengeance", "Hunter", "Beast Master",
    "Thief", "Assassin", "Arcane Trickster", "Draconic Bloodline", "Wild Magic",
    "The Fiend", "The Archfey", "The Great Old One", "School of Evocation",
    "School of Abjuration", "School of Necromancy", "School of Divination"
]

# Condition names (fills the condition slots)
CONDITION_NAMES = [
    "Blinded", "Charmed", "Deafened", "Exhaustion", "Frightened",
    "Grappled", "Incapacitated", "Invisible", "Paralyzed", "Petrified",
    "Poisoned", "Prone", "Restrained", "Stunned", "Unconscious"
]

# Plane names (fills the plane slots)
PLANE_NAMES = [
    "Material Plane", "Feywild", "Shadowfell", "Ethereal Plane", "Astral Plane",
    "Elemental Plane of Fire", "Elemental Plane of Water", "Elemental Plane of Air",
    "Elemental Plane of Earth", "Mount Celestia", "Elysium", "The Abyss",
    "Nine Hells", "Limbo", "Mechanus", "Outlands", "Sigil"
]

# Damage type names (fills the damage_type slots)
DAMAGE_TYPES = [
    "Acid", "Bludgeoning", "Cold", "Fire", "Force", "Lightning",
    "Necrotic", "Piercing", "Poison", "Psychic", "Radiant", "Slashing", "Thunder"
]

# Skill names (fills the skill slots)
SKILL_NAMES = [
    "Acrobatics", "Animal Handling", "Arcana", "Athletics", "Deception",
    "History", "Insight", "Intimidation", "Investigation", "Medicine",
    "Nature", "Perception", "Performance", "Persuasion", "Religion",
    "Sleight of Hand", "Stealth", "Survival"
]

# Ability score names (fills the ability slot)
ABILITY_NAMES = [
    "Strength", "Dexterity", "Constitution", "Intelligence", "Wisdom", "Charisma"
]

# Levels as strings "1" .. "20" (range's upper bound 21 is exclusive)
LEVEL_VALUES = [str(i) for i in range(1, 21)]
# Spell levels as ordinal strings "1st" .. "9th"
SPELL_LEVEL_VALUES = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th"]

# Rule mechanic names (fills the mechanic slot); lowercase, unlike most lists above
MECHANIC_NAMES = [
    "grappling", "shoving", "flanking", "cover", "hiding", "surprise",
    "mounted combat", "underwater combat", "two-weapon fighting", "opportunity attacks",
    "concentration", "ritual casting", "upcasting", "multiclassing", "attunement"
]

# Feat names (fills the feat slot)
FEAT_NAMES = [
    "Alert", "Athlete", "Actor", "Charger", "Crossbow Expert", "Defensive Duelist",
    "Dual Wielder", "Dungeon Delver", "Durable", "Elemental Adept", "Grappler",
    "Great Weapon Master", "Healer", "Heavy Armor Master", "Inspiring Leader",
    "Keen Mind", "Lightly Armored", "Linguist", "Lucky", "Mage Slayer",
    "Magic Initiate", "Martial Adept", "Medium Armor Master", "Mobile", "Moderately Armored",
    "Mounted Combatant", "Observant", "Polearm Master", "Resilient", "Ritual Caster",
    "Savage Attacker", "Sentinel", "Sharpshooter", "Shield Master", "Skilled",
    "Skulker", "Spell Sniper", "Tavern Brawler", "Tough", "War Caster", "Weapon Master"
]

# Weapon property names (fills the weapon_property / property slots); lowercase
WEAPON_PROPERTY_NAMES = [
    "finesse", "heavy", "light", "loading", "range", "reach",
    "thrown", "two-handed", "versatile", "ammunition", "special"
]

# Creature ability / trait names (fills the creature_ability slot)
CREATURE_ABILITY_NAMES = [
    "Multiattack", "Breath Weapon", "Frightful Presence", "Magic Resistance",
    "Pack Tactics", "Regeneration", "Spider Climb", "Web", "Shapechanger",
    "Innate Spellcasting", "Legendary Resistance", "Legendary Actions"
]

# Slot fillers mapping: slot name -> list of candidate strings.
# Each entry pairs a gazetteer with every slot name that draws from it; the
# numbered aliases (e.g. "spell1"/"spell2") share the very same list object as
# the base name. Group order and alias order fix the dict's insertion order.
_SLOT_GROUPS = [
    (("spell", "spell1", "spell2"), SPELL_NAMES),
    (("creature", "creature1", "creature2"), CREATURE_NAMES),
    (("item",), ITEM_NAMES),
    (("weapon", "weapon1", "weapon2"), WEAPON_NAMES),
    (("armor", "armor1", "armor2"), ARMOR_NAMES),
    (("magic_item",), MAGIC_ITEM_NAMES),
    (("feature", "feature2"), CLASS_FEATURE_NAMES),
    (("class_name", "class", "class1", "class2"), CLASS_NAMES),
    (("race", "race1", "race2"), RACE_NAMES),
    (("subclass", "subclass1", "subclass2"), SUBCLASS_NAMES),
    (("condition", "condition1", "condition2"), CONDITION_NAMES),
    (("plane", "plane1", "plane2"), PLANE_NAMES),
    (("damage_type", "damage_type1", "damage_type2"), DAMAGE_TYPES),
    (("skill", "skill1", "skill2"), SKILL_NAMES),
    (("ability",), ABILITY_NAMES),
    (("level",), LEVEL_VALUES),
    (("spell_level",), SPELL_LEVEL_VALUES),
    (("mechanic",), MECHANIC_NAMES),
    (("feat",), FEAT_NAMES),
    (("weapon_property", "property"), WEAPON_PROPERTY_NAMES),
    (("creature_ability",), CREATURE_ABILITY_NAMES),
]

SLOT_FILLERS = {
    slot_name: fillers
    for slot_names, fillers in _SLOT_GROUPS
    for slot_name in slot_names
}

# Quick size summary of what was loaded
print(f"Loaded {len(SPELL_NAMES)} spells, {len(CREATURE_NAMES)} creatures, {len(ITEM_NAMES)} items")
print(f"Total slot types: {len(SLOT_FILLERS)}")

## Define Intent Lists

In [None]:
def _label_index(labels):
    """Return a dict mapping each label to its position in `labels`."""
    return dict(zip(labels, range(len(labels))))


# Tool definitions
TOOLS = [
    "character_data",
    "session_notes",
    "rulebook",
]

# Intent definitions per tool (list order defines the index of each intent)
CHARACTER_INTENTS = [
    "character_basics", "combat_info",
    "abilities_info", "inventory_info",
    "magic_info", "story_info",
    "social_info", "progress_info",
    "full_character", "character_summary",
]

SESSION_INTENTS = [
    "character_status", "event_sequence", "npc_info", "location_details", "item_tracking",
    "combat_recap", "spell_ability_usage", "character_decisions", "party_dynamics", "quest_tracking",
    "puzzle_solutions", "loot_rewards", "death_revival", "divine_religious", "memory_vision",
    "rules_mechanics", "humor_moments", "unresolved_mysteries", "future_implications", "cross_session",
]

RULEBOOK_INTENTS = [
    "describe_entity", "compare_entities", "level_progression",
    "action_options", "rule_mechanics", "calculate_values",
    "spell_details", "class_spell_access", "monster_stats",
    "condition_effects", "character_creation", "multiclass_rules",
    "equipment_properties", "damage_types", "rest_mechanics",
    "skill_usage", "find_by_criteria", "prerequisite_check",
    "interaction_rules", "tactical_usage", "environmental_rules",
    "creature_abilities", "saving_throws", "magic_item_usage",
    "planar_properties", "downtime_activities", "subclass_features",
    "cost_lookup", "legendary_mechanics", "optimization_advice",
]

# Create mappings (label -> integer index, one map per label set)
TOOL_TO_INDEX = _label_index(TOOLS)
CHARACTER_INTENT_TO_INDEX = _label_index(CHARACTER_INTENTS)
SESSION_INTENT_TO_INDEX = _label_index(SESSION_INTENTS)
RULEBOOK_INTENT_TO_INDEX = _label_index(RULEBOOK_INTENTS)

# Report the size of each label set
print(f"Tools: {len(TOOLS)}")
print(f"Character intents: {len(CHARACTER_INTENTS)}")
print(f"Session intents: {len(SESSION_INTENTS)}")
print(f"Rulebook intents: {len(RULEBOOK_INTENTS)}")
print(f"Total intents: {len(CHARACTER_INTENTS) + len(SESSION_INTENTS) + len(RULEBOOK_INTENTS)}")

## Define Templates

Due to notebook size constraints, we define only a representative subset of templates here. In practice, load the complete set from the full template files.

In [None]:
def _query(text, **slots):
    """Build one template record.

    `text` is the query string, optionally containing `{placeholder}` fields.
    `slots` maps each placeholder name to a filler key (the values used below
    are all keys of SLOT_FILLERS). Every call returns a fresh `slots` dict.
    """
    return {"template": text, "slots": slots}


# Single-tool templates (character_data), keyed by character intent
CHARACTER_TEMPLATES = {
    "character_basics": [
        _query("What is my character's race?"),
        _query("What class am I?"),
        _query("What level am I?"),
        _query("What is my alignment?"),
        _query("What's my {ability} score?", ability="ability"),
        _query("What's my character's name?"),
        _query("What's my proficiency bonus?"),
        _query("What subclass am I?"),
        _query("What's my character's background?"),
    ],
    "combat_info": [
        _query("What is my AC?"),
        _query("What's my armor class?"),
        _query("How many hit points do I have?"),
        _query("What's my HP?"),
        _query("What's my initiative bonus?"),
        _query("What is my speed?"),
        _query("What's my {ability} saving throw?", ability="ability"),
        _query("What attacks can I make?"),
        _query("How many attacks can I make per turn?"),
    ],
    "abilities_info": [
        _query("What skill proficiencies do I have?"),
        _query("Am I proficient in {skill}?", skill="skill"),
        _query("What's my {skill} bonus?", skill="skill"),
        _query("What languages can I speak?"),
        _query("Do I have darkvision?"),
        _query("What class features do I have?"),
        _query("What feats do I have?"),
        _query("What's my passive perception?"),
    ],
    "inventory_info": [
        _query("What's in my inventory?"),
        _query("What weapons am I carrying?"),
        _query("What armor am I wearing?"),
        _query("Do I have a {weapon}?", weapon="weapon"),
        _query("What magic items do I have?"),
        _query("How much gold do I have?"),
        _query("What's my carrying capacity?"),
    ],
    "magic_info": [
        _query("What spells can I cast?"),
        _query("What spells do I have prepared?"),
        _query("How many spell slots do I have?"),
        _query("What cantrips do I know?"),
        _query("What's my spell save DC?"),
        _query("What's my spell attack bonus?"),
        _query("Do I know {spell}?", spell="spell"),
    ],
    "story_info": [
        _query("What's my character's backstory?"),
        _query("What are my personality traits?"),
        _query("What are my character's ideals?"),
        _query("What are my bonds?"),
        _query("What are my flaws?"),
        _query("What motivates my character?"),
    ],
    "social_info": [
        _query("Who are my allies?"),
        _query("Who are my enemies?"),
        _query("What factions am I affiliated with?"),
        _query("Tell me about my companions"),
    ],
    "progress_info": [
        _query("What are my current objectives?"),
        _query("What quests am I on?"),
        _query("What long-term goals do I have?"),
        _query("What have I accomplished?"),
    ],
    "full_character": [
        _query("Give me a complete character sheet"),
        _query("Tell me everything about my character"),
        _query("Show me my full character information"),
    ],
    "character_summary": [
        _query("Give me a quick summary of my character"),
        _query("Quick overview of my character"),
        _query("Summarize my character's key stats"),
    ],
}

# Per-intent template counts, as a quick coverage check
print(f"Character templates per intent:")
for intent, templates in CHARACTER_TEMPLATES.items():
    print(f"  {intent}: {len(templates)} templates")

In [None]:
# Single-tool templates (session_notes) - representative subset
# None of these queries contains a placeholder, so they are listed as plain
# strings per intent and expanded below into records with an empty slot map.
_SESSION_QUERIES = {
    "character_status": [
        "What's my current condition?",
        "Am I injured?",
        "What's my status right now?",
    ],
    "event_sequence": [
        "What happened last session?",
        "Recap the events so far",
        "What happened before this?",
    ],
    "npc_info": [
        "Who did we meet?",
        "Tell me about the NPCs we've encountered",
        "What did the merchant say?",
    ],
    "location_details": [
        "Where are we?",
        "Describe our current location",
        "What places have we visited?",
    ],
    "item_tracking": [
        "What items did we find?",
        "When did I get my sword?",
        "What happened to the artifact?",
    ],
    "combat_recap": [
        "What happened in our last fight?",
        "How did the battle go?",
        "Who did we fight?",
    ],
    "spell_ability_usage": [
        "What spells did I cast?",
        "When did I use my abilities?",
        "What powers have I used?",
    ],
    "character_decisions": [
        "What choices did I make?",
        "What decisions affected the story?",
    ],
    "party_dynamics": [
        "How is the party doing?",
        "What's my relationship with my companions?",
    ],
    "quest_tracking": [
        "What quests are we on?",
        "What's our current mission?",
        "What tasks have we completed?",
    ],
    "puzzle_solutions": [
        "How did we solve the puzzle?",
        "What riddles have we encountered?",
    ],
    "loot_rewards": [
        "What loot did we get?",
        "What rewards have we received?",
        "What treasure did we find?",
    ],
    "death_revival": [
        "Has anyone died?",
        "Have I been revived?",
    ],
    "divine_religious": [
        "Have we encountered any gods?",
        "What divine events occurred?",
    ],
    "memory_vision": [
        "What visions have I had?",
        "What dreams did my character experience?",
    ],
    "rules_mechanics": [
        "What rule questions came up?",
        "How did we handle that mechanic?",
    ],
    "humor_moments": [
        "What funny moments happened?",
        "What was the funniest thing?",
    ],
    "unresolved_mysteries": [
        "What mysteries remain unsolved?",
        "What questions are unanswered?",
    ],
    "future_implications": [
        "What consequences might come?",
        "What should we watch out for?",
    ],
    "cross_session": [
        "What happened between sessions?",
        "What downtime activities did we do?",
    ],
}

# intent -> list of {"template", "slots"} records; each record gets its own
# empty slots dict, and intent/query order follows _SESSION_QUERIES.
SESSION_TEMPLATES = {
    intent_name: [{"template": text, "slots": {}} for text in texts]
    for intent_name, texts in _SESSION_QUERIES.items()
}

In [None]:
# Single-tool templates (rulebook) - representative subset
# Each entry is a (query text, slot map) pair. The slot map goes from a
# `{placeholder}` name in the text to a filler key (the values used here are
# all keys of SLOT_FILLERS); queries without placeholders carry an empty map.
_RULEBOOK_QUERIES = {
    "describe_entity": [
        ("What is {spell}?", {"spell": "spell"}),
        ("Tell me about {creature}", {"creature": "creature"}),
        ("Describe {item}", {"item": "item"}),
        ("What does {spell} do?", {"spell": "spell"}),
        ("Explain the {feature} feature", {"feature": "feature"}),
    ],
    "compare_entities": [
        ("What's the difference between {spell1} and {spell2}?", {"spell1": "spell", "spell2": "spell"}),
        ("Compare {weapon1} vs {weapon2}", {"weapon1": "weapon", "weapon2": "weapon"}),
        ("Which is better, {class1} or {class2}?", {"class1": "class_name", "class2": "class_name"}),
    ],
    "level_progression": [
        ("What do {class_name}s get at level {level}?", {"class_name": "class_name", "level": "level"}),
        ("How much XP do I need to reach level {level}?", {"level": "level"}),
        ("When do {class_name}s get Extra Attack?", {"class_name": "class_name"}),
    ],
    "action_options": [
        ("What actions can I take in combat?", {}),
        ("What bonus actions are available?", {}),
        ("How does the Ready action work?", {}),
    ],
    "rule_mechanics": [
        ("How does {mechanic} work?", {"mechanic": "mechanic"}),
        ("What are the rules for {mechanic}?", {"mechanic": "mechanic"}),
        ("How does advantage work?", {}),
        ("How do critical hits work?", {}),
    ],
    "calculate_values": [
        ("How do I calculate my AC?", {}),
        ("How is spell save DC calculated?", {}),
        ("What's the formula for attack bonus?", {}),
    ],
    "spell_details": [
        ("What are the components for {spell}?", {"spell": "spell"}),
        ("What's the range of {spell}?", {"spell": "spell"}),
        ("What level is {spell}?", {"spell": "spell"}),
        ("Is {spell} concentration?", {"spell": "spell"}),
    ],
    "class_spell_access": [
        ("Can {class_name}s cast {spell}?", {"class_name": "class_name", "spell": "spell"}),
        ("What classes have access to {spell}?", {"spell": "spell"}),
        ("What spells can a {class_name} learn?", {"class_name": "class_name"}),
    ],
    "monster_stats": [
        ("What's the CR of a {creature}?", {"creature": "creature"}),
        ("How many hit points does a {creature} have?", {"creature": "creature"}),
        ("What's the AC of a {creature}?", {"creature": "creature"}),
    ],
    "condition_effects": [
        ("What does {condition} do?", {"condition": "condition"}),
        ("What are the effects of being {condition}?", {"condition": "condition"}),
        ("How do I remove {condition}?", {"condition": "condition"}),
    ],
    "character_creation": [
        ("How do I create a {class_name}?", {"class_name": "class_name"}),
        ("What are the steps for character creation?", {}),
        ("How do I determine ability scores?", {}),
    ],
    "multiclass_rules": [
        ("How does multiclassing work?", {}),
        ("What are the requirements to multiclass into {class_name}?", {"class_name": "class_name"}),
        ("How do spell slots work when multiclassing?", {}),
    ],
    "equipment_properties": [
        ("What does {property} mean on a weapon?", {"property": "weapon_property"}),
        ("What's the damage of a {weapon}?", {"weapon": "weapon"}),
        ("Is {weapon} finesse?", {"weapon": "weapon"}),
    ],
    "damage_types": [
        ("What creatures are resistant to {damage_type} damage?", {"damage_type": "damage_type"}),
        ("What spells deal {damage_type} damage?", {"damage_type": "damage_type"}),
    ],
    "rest_mechanics": [
        ("What do I recover on a short rest?", {}),
        ("What do I recover on a long rest?", {}),
        ("How do hit dice work?", {}),
    ],
    "skill_usage": [
        ("What can I use {skill} for?", {"skill": "skill"}),
        ("When do I roll {skill}?", {"skill": "skill"}),
        ("What ability is {skill} based on?", {"skill": "skill"}),
    ],
    "find_by_criteria": [
        ("What spells deal fire damage?", {}),
        ("List all healing spells", {}),
        ("What finesse weapons are there?", {}),
    ],
    "prerequisite_check": [
        ("What are the prerequisites for {feat}?", {"feat": "feat"}),
        ("What level do I need to be for {spell}?", {"spell": "spell"}),
    ],
    "interaction_rules": [
        ("Can I cast {spell} while concentrating on another spell?", {"spell": "spell"}),
        ("Does advantage cancel out disadvantage?", {}),
    ],
    "tactical_usage": [
        ("When should I use {spell}?", {"spell": "spell"}),
        ("What's the best way to use {feature}?", {"feature": "feature"}),
    ],
    "environmental_rules": [
        ("How does difficult terrain work?", {}),
        ("How does underwater combat work?", {}),
        ("How does falling damage work?", {}),
    ],
    "creature_abilities": [
        ("What is {creature_ability}?", {"creature_ability": "creature_ability"}),
        ("How does breath weapon work?", {}),
    ],
    "saving_throws": [
        ("How do saving throws work?", {}),
        ("What's a {ability} saving throw used for?", {"ability": "ability"}),
        ("How do death saving throws work?", {}),
    ],
    "magic_item_usage": [
        ("How does {magic_item} work?", {"magic_item": "magic_item"}),
        ("Does {magic_item} require attunement?", {"magic_item": "magic_item"}),
        ("How many items can I attune to?", {}),
    ],
    "planar_properties": [
        ("What are the properties of the {plane}?", {"plane": "plane"}),
        ("What creatures live in the {plane}?", {"plane": "plane"}),
    ],
    "downtime_activities": [
        ("What downtime activities are there?", {}),
        ("How does crafting work?", {}),
    ],
    "subclass_features": [
        ("What are the {subclass} features?", {"subclass": "subclass"}),
        ("What subclasses does {class_name} have?", {"class_name": "class_name"}),
    ],
    "cost_lookup": [
        ("How much does {item} cost?", {"item": "item"}),
        ("What's the price of {weapon}?", {"weapon": "weapon"}),
    ],
    "legendary_mechanics": [
        ("What are legendary actions?", {}),
        ("How do legendary resistances work?", {}),
    ],
    "optimization_advice": [
        ("What's the best race for a {class_name}?", {"class_name": "class_name"}),
        ("Best feats for a {class_name}?", {"class_name": "class_name"}),
    ],
}

# intent -> list of {"template", "slots"} records. dict(...) copies each slot
# map so records never share a dict with the pair table above.
RULEBOOK_TEMPLATES = {
    intent_name: [
        {"template": text, "slots": dict(slot_map)}
        for text, slot_map in pairs
    ]
    for intent_name, pairs in _RULEBOOK_QUERIES.items()
}

In [None]:
# Multi-tool templates (2-tool combinations)
#
# Each entry is a dict with four keys, consumed by generate_multi_tool_samples:
#   "template": query text with optional {slot_name} placeholders
#   "tools":    the tools the query should route to
#   "intents":  tool name -> intent name for each selected tool
#   "slots":    placeholder name -> slot type (looked up in SLOT_FILLERS by
#               fill_template_slots)
#
# The top-level keys ("character_rulebook", ...) only group entries by tool
# pair; the generation cell flattens all groups into one list and ignores the
# group name.
#
# NOTE(review): slot types such as "skill", "level" and "mechanic" have no
# entry in fill_template_slots' NER type map, so values filled for them are
# not recorded as entities. Whether they are present in SLOT_FILLERS is not
# visible here - confirm, otherwise the placeholder stays in the query text.
MULTI_TOOL_TEMPLATES = {
    # character_data + rulebook
    "character_rulebook": [
        {"template": "What's my AC and how does {spell} affect it?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "combat_info", "rulebook": "spell_details"}, "slots": {"spell": "spell"}},
        {"template": "What's my {skill} bonus and when do I use it?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "abilities_info", "rulebook": "skill_usage"}, "slots": {"skill": "skill"}},
        {"template": "Can I cast {spell} and what does it do?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "magic_info", "rulebook": "spell_details"}, "slots": {"spell": "spell"}},
        {"template": "What's my spell save DC and how is it calculated?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "magic_info", "rulebook": "calculate_values"}, "slots": {}},
        {"template": "What weapons do I have and what's the damage for {weapon}?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "inventory_info", "rulebook": "equipment_properties"}, "slots": {"weapon": "weapon"}},
        {"template": "What class am I and what features do I get at level {level}?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "character_basics", "rulebook": "level_progression"}, "slots": {"level": "level"}},
        {"template": "Do I have {feature} and how does it work?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "abilities_info", "rulebook": "describe_entity"}, "slots": {"feature": "feature"}},
        {"template": "What's my attack bonus and how do critical hits work?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "combat_info", "rulebook": "rule_mechanics"}, "slots": {}},
        {"template": "What armor am I wearing and how does AC calculation work?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "inventory_info", "rulebook": "calculate_values"}, "slots": {}},
        {"template": "Do I have darkvision and how does it work?", "tools": ["character_data", "rulebook"], "intents": {"character_data": "abilities_info", "rulebook": "rule_mechanics"}, "slots": {}},
    ],
    # character_data + session_notes
    "character_session": [
        {"template": "What magic items do I have and how did I get them?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "inventory_info", "session_notes": "item_tracking"}, "slots": {}},
        {"template": "What's my current HP and what happened in our last fight?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "combat_info", "session_notes": "combat_recap"}, "slots": {}},
        {"template": "What spells do I have and which ones did I cast last session?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "magic_info", "session_notes": "spell_ability_usage"}, "slots": {}},
        {"template": "Who are my allies and who did we meet last session?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "social_info", "session_notes": "npc_info"}, "slots": {}},
        {"template": "What are my goals and what progress have I made?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "progress_info", "session_notes": "quest_tracking"}, "slots": {}},
        {"template": "What class features do I have and when did I use them?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "abilities_info", "session_notes": "spell_ability_usage"}, "slots": {}},
        {"template": "What's my backstory and how has it come up in the campaign?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "story_info", "session_notes": "character_decisions"}, "slots": {}},
        {"template": "What weapons do I have and what fights used them?", "tools": ["character_data", "session_notes"], "intents": {"character_data": "inventory_info", "session_notes": "combat_recap"}, "slots": {}},
    ],
    # session_notes + rulebook
    "session_rulebook": [
        {"template": "What happened in our last fight and how does {mechanic} work?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "combat_recap", "rulebook": "rule_mechanics"}, "slots": {"mechanic": "mechanic"}},
        {"template": "Who did we fight and what's a {creature}'s stat block?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "combat_recap", "rulebook": "monster_stats"}, "slots": {"creature": "creature"}},
        {"template": "What spells were cast and how does {spell} work exactly?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "spell_ability_usage", "rulebook": "spell_details"}, "slots": {"spell": "spell"}},
        {"template": "What magic items did we find and how does {magic_item} work?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "item_tracking", "rulebook": "magic_item_usage"}, "slots": {"magic_item": "magic_item"}},
        {"template": "What conditions affected us and how does {condition} work?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "combat_recap", "rulebook": "condition_effects"}, "slots": {"condition": "condition"}},
        {"template": "What loot did we get and what's the value of {item}?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "loot_rewards", "rulebook": "cost_lookup"}, "slots": {"item": "item"}},
        {"template": "Who went down in combat and how do death saves work?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "death_revival", "rulebook": "saving_throws"}, "slots": {}},
        {"template": "Where did we go and what are the rules for the {plane}?", "tools": ["session_notes", "rulebook"], "intents": {"session_notes": "location_details", "rulebook": "planar_properties"}, "slots": {"plane": "plane"}},
    ],
}

# 3-tool templates
#
# Flat list of entries that route to all three tools at once. Each entry has
# the same four keys as the 2-tool entries: "template", "tools", "intents"
# (tool name -> intent name) and "slots" (placeholder name -> slot type).
THREE_TOOL_TEMPLATES = [
    {"template": "What spells do I know, when did I cast {spell}, and what are its full rules?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "magic_info", "session_notes": "spell_ability_usage", "rulebook": "spell_details"}, "slots": {"spell": "spell"}},
    {"template": "What's my AC, what did we fight, and how does cover affect AC?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "combat_info", "session_notes": "combat_recap", "rulebook": "rule_mechanics"}, "slots": {}},
    {"template": "What magic items do I have, when did I get them, and how does {magic_item} work?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "inventory_info", "session_notes": "item_tracking", "rulebook": "magic_item_usage"}, "slots": {"magic_item": "magic_item"}},
    {"template": "What's my attack bonus, what monsters did we fight, and what's a {creature}'s AC?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "combat_info", "session_notes": "combat_recap", "rulebook": "monster_stats"}, "slots": {"creature": "creature"}},
    {"template": "What level am I, what progress have I made, and when do I get my next feature?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "character_basics", "session_notes": "quest_tracking", "rulebook": "level_progression"}, "slots": {}},
    # NOTE(review): the session intent below is "rules_mechanics" (plural) while
    # the rulebook intent used elsewhere is "rule_mechanics". If this name is not
    # a key of SESSION_INTENT_TO_INDEX, create_label maps it to -1 without any
    # error - confirm against the session intent list.
    {"template": "What's my {skill} bonus, when did I use it, and what are typical DCs?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "abilities_info", "session_notes": "rules_mechanics", "rulebook": "skill_usage"}, "slots": {"skill": "skill"}},
    {"template": "What class features do I have, when did I use them, and how does {feature} work?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "abilities_info", "session_notes": "spell_ability_usage", "rulebook": "describe_entity"}, "slots": {"feature": "feature"}},
    {"template": "What's my HP, what damage did I take last session, and how does resistance work?", "tools": ["character_data", "session_notes", "rulebook"], "intents": {"character_data": "combat_info", "session_notes": "combat_recap", "rulebook": "damage_types"}, "slots": {}},
]

## Data Generation Functions

In [None]:
def fill_template_slots(template: str, slots: Dict[str, str]) -> Tuple[str, Dict[str, str]]:
    """
    Fill template slots with random values from gazetteers.

    Args:
        template: Query text containing ``{slot_name}`` placeholders.
        slots: Mapping of placeholder name -> slot type. The slot type selects
            the candidate list in ``SLOT_FILLERS``.

    Returns:
        A ``(filled, entities)`` tuple: the query with every fillable
        placeholder replaced, and a dict mapping each inserted value to its
        NER entity type. Slot types without an NER mapping are still filled
        but contribute no entity.

    Slot types missing from ``SLOT_FILLERS`` are left as literal placeholders
    (as before) but now trigger a warning, because such queries would leak
    ``{...}`` text into the generated dataset.
    """
    import warnings

    # Slot type -> NER entity type. Built once per call instead of once per slot.
    ner_type_map = {
        "spell": "SPELL",
        "creature": "CREATURE",
        "item": "ITEM",
        "weapon": "ITEM",
        "armor": "ITEM",
        "magic_item": "ITEM",
        "feature": "FEATURE",
        "class_name": "CLASS",
        "race": "RACE",
        "subclass": "CLASS",
        "condition": "CONDITION",
        "plane": "LOCATION",
    }

    filled = template
    entities = {}  # entity_text -> entity_type

    for slot_name, slot_type in slots.items():
        if slot_type not in SLOT_FILLERS:
            # Keep the historical best-effort behaviour, but make it visible.
            warnings.warn(
                f"No SLOT_FILLERS entry for slot type {slot_type!r}; "
                f"placeholder {{{slot_name}}} left unfilled",
                stacklevel=2,
            )
            continue

        value = random.choice(SLOT_FILLERS[slot_type])
        filled = filled.replace("{" + slot_name + "}", value)

        if slot_type in ner_type_map:
            entities[value] = ner_type_map[slot_type]

    return filled, entities


def generate_bio_tags(query: str, entities: Dict[str, str]) -> List[str]:
    """
    Generate BIO tags for a query given entity annotations.

    Args:
        query: The filled query; it is tokenized on whitespace.
        entities: Mapping of entity text -> entity type (e.g. "SPELL").

    Returns:
        One tag per whitespace token: ``B-<TYPE>`` on the first token of an
        entity, ``I-<TYPE>`` on its remaining tokens, ``O`` elsewhere. Only
        the first occurrence of each entity is tagged.

    Matching is case-insensitive and ignores trailing ``?.,!`` on query
    tokens. The last token of an entity may also carry a possessive suffix
    ("Goblin's"), which templates such as "what's a {creature}'s stat block?"
    produce; without this those entities were left untagged.
    """
    # Simple word tokenization
    words = query.split()
    tags = ["O"] * len(words)

    def _normalize(token: str) -> str:
        # Same normalization as before: drop trailing sentence punctuation.
        return token.rstrip('?.,!').lower()

    def _strip_possessive(token: str) -> str:
        # Handle both the ASCII and the typographic apostrophe.
        for suffix in ("'s", "\u2019s"):
            if token.endswith(suffix):
                return token[:-len(suffix)]
        return token

    normalized = [_normalize(word) for word in words]

    for entity_text, entity_type in entities.items():
        entity_words = [part.lower() for part in entity_text.split()]
        entity_len = len(entity_words)

        # An empty entity would "match" at position 0 and corrupt the tags.
        if entity_len == 0:
            continue

        for start in range(len(words) - entity_len + 1):
            window = normalized[start:start + entity_len]

            if window[:-1] != entity_words[:-1]:
                continue

            # Exact match first, so entity names that themselves end in "'s"
            # keep working; then fall back to the possessive form.
            last_token = window[-1]
            if last_token != entity_words[-1] and _strip_possessive(last_token) != entity_words[-1]:
                continue

            tags[start] = f"B-{entity_type}"
            for offset in range(1, entity_len):
                tags[start + offset] = f"I-{entity_type}"
            break

    return tags


def create_label(tools: List[str], intents: Dict[str, str]) -> Dict:
    """
    Build the label dict for one sample.

    The result holds a binary vector over TOOLS under "tools", plus one intent
    index per tool. An intent index is -1 when the tool is not selected or
    when its intent name is missing from the corresponding lookup table.
    """
    # Multi-hot tool vector, in TOOLS order
    label = {"tools": [int(name in tools) for name in TOOLS]}

    # (output key, tool name, intent-name -> index table)
    intent_fields = (
        ("character_intent", "character_data", CHARACTER_INTENT_TO_INDEX),
        ("session_intent", "session_notes", SESSION_INTENT_TO_INDEX),
        ("rulebook_intent", "rulebook", RULEBOOK_INTENT_TO_INDEX),
    )
    for field, tool_name, index_table in intent_fields:
        if tool_name in tools:
            label[field] = index_table.get(intents.get(tool_name), -1)
        else:
            label[field] = -1

    return label


# Smoke-test the helpers on a single 3-tool template
test_slots = {"spell": "spell"}
test_template = "What spells do I know, when did I cast {spell}, and what are its full rules?"

filled, entities = fill_template_slots(test_template, test_slots)
tags = generate_bio_tags(filled, entities)

# One print call, one line per item
print(
    f"Template: {test_template}",
    f"Filled: {filled}",
    f"Entities: {entities}",
    f"Words: {filled.split()}",
    f"BIO tags: {tags}",
    sep="\n",
)

In [None]:
def generate_single_tool_samples(templates: Dict, tool: str, n_samples: int) -> List[Dict]:
    """
    Generate samples for single-tool queries.

    Args:
        templates: Mapping of intent name -> list of template dicts, each with
            "template" and "slots" keys.
        tool: Name of the single tool every generated sample routes to.
        n_samples: Total number of samples to produce.

    Returns:
        Exactly ``n_samples`` sample dicts (or an empty list when ``templates``
        is empty or ``n_samples`` is not positive), grouped by intent in
        ``templates`` order.

    Samples are spread as evenly as possible across intents. Plain integer
    division used to drop the remainder, so the function returned fewer
    samples than requested whenever ``n_samples`` was not a multiple of the
    number of intents. The remainder now goes to the first intents, one each.
    """
    samples = []
    intent_list = list(templates.keys())

    # No intents would divide by zero; a non-positive request yields nothing.
    if not intent_list or n_samples <= 0:
        return samples

    base_quota, remainder = divmod(n_samples, len(intent_list))

    for position, intent in enumerate(intent_list):
        intent_templates = templates[intent]
        quota = base_quota + (1 if position < remainder else 0)

        for _ in range(quota):
            template_data = random.choice(intent_templates)
            filled, entities = fill_template_slots(template_data["template"], template_data["slots"])
            bio_tags = generate_bio_tags(filled, entities)

            label = create_label([tool], {tool: intent})

            samples.append({
                "query": filled,
                "tokens": filled.split(),
                "bio_tags": bio_tags,
                "entities": entities,
                **label,
            })

    return samples


def generate_multi_tool_samples(templates: List[Dict], n_samples: int) -> List[Dict]:
    """
    Build ``n_samples`` multi-tool samples.

    Each sample picks one template dict at random from ``templates``, fills
    its slots, tags the result, and merges in the label derived from the
    template's own "tools" and "intents".
    """
    def _make_sample() -> Dict:
        chosen = random.choice(templates)
        query, found = fill_template_slots(chosen["template"], chosen.get("slots", {}))
        word_tags = generate_bio_tags(query, found)
        label_fields = create_label(chosen["tools"], chosen["intents"])

        record = {
            "query": query,
            "tokens": query.split(),
            "bio_tags": word_tags,
            "entities": found,
        }
        record.update(label_fields)
        return record

    return [_make_sample() for _ in range(n_samples)]

## Generate Dataset

In [None]:
# Target dataset composition
TARGET_TOTAL = 10000
TARGET_1_TOOL = 3000  # 30%
TARGET_2_TOOL = 5000  # 50%
TARGET_3_TOOL = 2000  # 20%

# 1-tool samples: the budget is shared equally by the three tools
n_per_tool = TARGET_1_TOOL // 3

print("Generating 1-tool samples...")
char_samples = generate_single_tool_samples(CHARACTER_TEMPLATES, "character_data", n_per_tool)
session_samples = generate_single_tool_samples(SESSION_TEMPLATES, "session_notes", n_per_tool)
rulebook_samples = generate_single_tool_samples(RULEBOOK_TEMPLATES, "rulebook", n_per_tool)

single_tool_samples = [*char_samples, *session_samples, *rulebook_samples]
print(f"Generated {len(single_tool_samples)} single-tool samples")

# 2-tool samples: flatten every tool-pair group into one pool of templates
print("\nGenerating 2-tool samples...")
all_2_tool_templates = [
    entry
    for group in MULTI_TOOL_TEMPLATES.values()
    for entry in group
]

two_tool_samples = generate_multi_tool_samples(all_2_tool_templates, TARGET_2_TOOL)
print(f"Generated {len(two_tool_samples)} 2-tool samples")

# 3-tool samples
print("\nGenerating 3-tool samples...")
three_tool_samples = generate_multi_tool_samples(THREE_TOOL_TEMPLATES, TARGET_3_TOOL)
print(f"Generated {len(three_tool_samples)} 3-tool samples")

# Merge everything and shuffle so the later positional split mixes all kinds
all_samples = [*single_tool_samples, *two_tool_samples, *three_tool_samples]
random.shuffle(all_samples)

print(f"\nTotal samples: {len(all_samples)}")

In [None]:
# Analyze dataset composition: tally tools-per-query and entity types in one pass
tool_counts = defaultdict(int)
entity_counts = defaultdict(int)
for sample in all_samples:
    tool_counts[sum(sample["tools"])] += 1
    for entity_type in sample["entities"].values():
        entity_counts[entity_type] += 1

print("Dataset composition:")
for n_tools, count in sorted(tool_counts.items()):
    pct = count / len(all_samples) * 100
    print(f"  {n_tools}-tool queries: {count} ({pct:.1f}%)")

# Entity types, most frequent first
print("\nEntity type distribution:")
for entity_type, count in sorted(entity_counts.items(), key=lambda x: -x[1]):
    print(f"  {entity_type}: {count}")

In [None]:
# Split into train/val/test (80/10/10)
# all_samples is sliced by position; the test split takes whatever remains.
n_total = len(all_samples)
n_train = int(n_total * 0.8)
n_val = int(n_total * 0.1)
n_test = n_total - n_train - n_val

val_end = n_train + n_val
train_samples, val_samples, test_samples = (
    all_samples[:n_train],
    all_samples[n_train:val_end],
    all_samples[val_end:],
)

print(
    f"Train: {len(train_samples)}",
    f"Val: {len(val_samples)}",
    f"Test: {len(test_samples)}",
    sep="\n",
)

In [None]:
# Show sample examples
banner = "=" * 80
print(banner)
print("SAMPLE EXAMPLES")
print(banner)

for i, sample in enumerate(train_samples[:5]):
    # Decode the multi-hot tool vector back into tool names
    selected_tools = [name for name, flag in zip(TOOLS, sample['tools']) if flag == 1]
    char_idx = sample['character_intent']
    session_idx = sample['session_intent']
    rulebook_idx = sample['rulebook_intent']

    print(f"\n--- Example {i+1} ---")
    print(f"Query: {sample['query']}")
    print(f"Tools: {selected_tools}")
    # A negative index means no intent was recorded for that tool
    if char_idx >= 0:
        print(f"Character intent: {CHARACTER_INTENTS[char_idx]}")
    if session_idx >= 0:
        print(f"Session intent: {SESSION_INTENTS[session_idx]}")
    if rulebook_idx >= 0:
        print(f"Rulebook intent: {RULEBOOK_INTENTS[rulebook_idx]}")
    if sample['entities']:
        print(f"Entities: {sample['entities']}")
    print(f"BIO tags: {list(zip(sample['tokens'], sample['bio_tags']))}")

## Save Dataset to Google Drive

In [None]:
# Save to Google Drive
output_dir = f"{PROJECT_ROOT}/data/generated"
os.makedirs(output_dir, exist_ok=True)

# Write each split to its own pretty-printed JSON file
for file_name, split_samples in (
    ("train.json", train_samples),
    ("val.json", val_samples),
    ("test.json", test_samples),
):
    with open(f"{output_dir}/{file_name}", "w") as f:
        json.dump(split_samples, f, indent=2)

print(
    f"Saved to {output_dir}",
    f"  train.json: {len(train_samples)} samples",
    f"  val.json: {len(val_samples)} samples",
    f"  test.json: {len(test_samples)} samples",
    sep="\n",
)

In [None]:
# Save label mappings for model training
# BIO label list: "O" first, then a B-/I- pair per entity type in this order.
ner_entity_types = [
    "SPELL", "ITEM", "CREATURE", "LOCATION",
    "FEATURE", "CLASS", "RACE", "CONDITION",
]
bio_labels = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in ner_entity_types
    for prefix in ("B", "I")
]

label_config = dict(
    tools=TOOLS,
    character_intents=CHARACTER_INTENTS,
    session_intents=SESSION_INTENTS,
    rulebook_intents=RULEBOOK_INTENTS,
    bio_labels=bio_labels,
)

with open(f"{output_dir}/label_config.json", "w") as f:
    json.dump(label_config, f, indent=2)

print(f"Saved label_config.json")

## Summary Statistics

In [None]:
# Final summary
rule = "=" * 80
print(rule)
print("DATASET GENERATION COMPLETE")
print(rule)

# Split sizes (the percentages are the nominal 80/10/10 targets)
print(f"\nTotal samples: {len(all_samples)}")
print(f"  Train: {len(train_samples)} (80%)")
print(f"  Val: {len(val_samples)} (10%)")
print(f"  Test: {len(test_samples)} (10%)")

# Share of queries by number of tools
print(f"\nTool distribution:")
for n_tools, count in sorted(tool_counts.items()):
    pct = count / len(all_samples) * 100
    print(f"  {n_tools}-tool: {count} ({pct:.1f}%)")

# Size of each intent label space
print(f"\nIntent counts:")
print(f"  Character: {len(CHARACTER_INTENTS)}")
print(f"  Session: {len(SESSION_INTENTS)}")
print(f"  Rulebook: {len(RULEBOOK_INTENTS)}")

# Dict keys are already unique, so the dict length is the type count
print(f"\nNER entity types: {len(entity_counts)}")
print(f"\nFiles saved to: {output_dir}")