# ToolFlood Attack Demo

This notebook demonstrates the **ToolFlood attack** on tool-using agents. It:
1. Loads a set of target queries (from a task or custom list)
2. Runs the ToolFlood attack to generate adversarial tools
3. Merges attacker tools with benign tools and builds a vector store
4. Evaluates the victim agent on the queries
5. Reports metrics: **ASR** (Attack Success Rate), **TDR** (Top-k Domination Rate), and **Mean Domination**

**Requirements:** Set `OPENAI_API_KEY` in your environment (or configure it in `config/models.yaml`).

In [None]:
# Setup: add project root to path and load config
import sys
from pathlib import Path

# Find the project root: the nearest directory, starting at the current working
# directory and walking up through all of its ancestors, that contains
# config/config.yaml. Falls back to the current working directory if none does.
cwd = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "config" / "config.yaml").exists()
    ),
    cwd,
)
# Put the root at the front of sys.path so project-local packages are importable.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.utils import (
    get_base_path,
    resolve_path,
    load_config,
    load_models,
    load_experiment_config,
    load_toolflood_config,
    load_agent_config,
    load_tools,
    load_queries_from_tasks,
    init_embedding_model,
    init_llm,
)
from src.attacks.toolflood_attack import ToolFloodAttack, AttackConfig as ToolFloodAttackConfig  # AttackConfig from toolflood
from src.experiments.common import merge_tools, evaluate_queries
from src.agent import VictimAgent
from src.scripts.build_vectorstore import init_vector_store
from src.utils import load_vector_store
from src.metrics import calculate_asr, calculate_tdr, calculate_mean_domination

import asyncio
import json

In [None]:
# Configuration: locate the two YAML files, load each config section, and
# prepare the data/output directories used by the later cells.
config_dir = project_root / "config"
config_path = config_dir / "config.yaml"
models_path = config_dir / "models.yaml"

models_cfg = load_models(models_path)
exp_cfg = load_experiment_config(config_path)
attack_cfg = load_toolflood_config(config_path)
agent_cfg = load_agent_config(config_path)

# Both directories are resolved through resolve_path against the base path.
base_path = get_base_path(config_path)
benign_data_dir = resolve_path(base_path, exp_cfg.benign_data_directory)
out_dir = resolve_path(base_path, "outputs/toolflood/demo")
out_dir.mkdir(parents=True, exist_ok=True)  # safe to re-run: existing dir is fine

print(f"Benign data: {benign_data_dir}")
print(f"Output dir:  {out_dir}")

## 1. Load Queries

Load target queries from a task file. You can also use a custom list by setting `train_queries` and `test_queries` directly.

In [None]:
# Choose the task(s) whose queries are targeted; task_names=None loads all tasks.
# Other tasks to try: "Games for fun/relaxation", "Social media content creation".
task_names = ["Space images"]

tasks_dir = benign_data_dir / "tasks"
train_queries, test_queries = load_queries_from_tasks(tasks_dir, task_names=task_names)

# Cap both splits so the demo stays quick (raise or drop the caps for a full run)
max_train, max_test = 15, 10
train_queries = train_queries[:max_train]
test_queries = test_queries[:max_test]

print(f"Train queries: {len(train_queries)}")
print(f"Test queries:  {len(test_queries)}")
print("\nSample train queries:")
for sample_query in train_queries[:3]:
    # Truncate long queries to 70 characters for display only
    if len(sample_query) > 70:
        preview = sample_query[:70] + "..."
    else:
        preview = sample_query
    print(f"  - {preview}")

## 2. Run ToolFlood Attack

Phase 1: Generate tool candidates from sampled queries.  
Phase 2: Greedily select tools that maximize query coverage.

*Using reduced params for a quick demo (~2-5 min). Increase `max_generation_iterations` for better coverage.*

In [None]:
# Demo-sized attack settings: few iterations and a small tool budget for speed
# (increase for production). The sample size never exceeds the number of
# available train queries.
demo_sample_size = min(8, len(train_queries))
attack_config = ToolFloodAttackConfig(
    num_tools_per_query=agent_cfg.top_k,
    query_sample_size=demo_sample_size,
    num_tools_per_sample=5,
    max_generation_iterations=5,
    max_embedding_distance=attack_cfg.max_embedding_distance,
    total_tool_budget=20,
    max_concurrent_tasks=3,
)

# Attacker-side models; a default name is used when the config value is falsy
attack_embedding_name = attack_cfg.embedding_model or "text-embedding-3-small"
attack_embedding_model = init_embedding_model(models_cfg, model_name=attack_embedding_name)

optimizer_name = attack_cfg.llm_optimizer_model or "gpt-4o-mini"
llm_optimizer = init_llm(models_cfg, model_name=optimizer_name)

# Build the attack over the train queries and execute it
attack = ToolFloodAttack(
    train_queries,
    attack_embedding_model,
    llm_optimizer,
    attack_config=attack_config,
)
attacker_tools, attack_results = attack.attack()

print(f"\nGenerated {len(attacker_tools)} attacker tools")
print(f"Phase 2 coverage: {attack_results['phase2']['queries_covered']}/{len(train_queries)} queries fully covered")

## 3. Merge Tools and Build Vector Store

Combine benign and attacker tools, then build a FAISS vector store for retrieval.

In [None]:
# Combine the benign tools with the generated attacker tools
benign_tools = load_tools(benign_data_dir / "tools.json")
merged_tools_path = out_dir / "merged_tools.json"
merged_tools, attacker_tool_names = merge_tools(benign_tools, attacker_tools, merged_tools_path)

# Victim retrieval embedding model: first configured entry, else a default name
if exp_cfg.victim_embedding_models:
    victim_embedding_name = exp_cfg.victim_embedding_models[0]
else:
    victim_embedding_name = "text-embedding-3-small"
victim_embedding_model = init_embedding_model(models_cfg, model_name=victim_embedding_name)

# Always rebuild the store so it reflects the tools merged in this run
vectorstore_path = out_dir / "vectorstore"
init_vector_store(merged_tools, victim_embedding_model, vectorstore_path, force_rebuild=True)

print(f"Total tools: {len(merged_tools)} ({len(benign_tools)} benign + {len(attacker_tool_names)} attacker)")

## 4. Evaluate Victim Agent

Run the victim agent on train and test queries and compute ASR, TDR, and Mean Domination.

In [None]:
# Load vector store and initialize victim agent
vectorstore = load_vector_store(vectorstore_path, victim_embedding_model)
llm = init_llm(models_cfg, model_name=exp_cfg.victim_models[0] if exp_cfg.victim_models else "gpt-4o-mini")
agent = VictimAgent(
    tools=merged_tools,
    vectorstore=vectorstore,
    embedding_model=victim_embedding_model,
    llm=llm,
    top_k=agent_cfg.top_k,
    verbose=False,
)


def run_coroutine(coro):
    """Run ``coro`` to completion and return its result.

    ``asyncio.run`` raises ``RuntimeError`` when an event loop is already
    running in the current thread, which is the case inside a Jupyter kernel.
    When a loop is detected, the coroutine is instead executed with
    ``asyncio.run`` on a short-lived worker thread, which has no running loop.

    Args:
        coro: The coroutine object to execute.

    Returns:
        Whatever the coroutine returns. Exceptions raised by the coroutine
        propagate to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread (e.g. plain script): run directly.
        return asyncio.run(coro)

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


# Evaluate on train and test queries
train_results = run_coroutine(evaluate_queries(agent, train_queries, attacker_tool_names))
test_results = run_coroutine(evaluate_queries(agent, test_queries, attacker_tool_names))

## 5. Results

- **ASR** (Attack Success Rate): fraction of queries where the agent selected an attacker tool  
- **TDR** (Top-k Domination Rate): fraction of queries where attacker tools dominate all top-k slots  
- **Mean Domination**: average number of attacker tools in top-k per query

In [None]:
def compute_metrics(results, attacker_tool_names, k):
    """Compute ASR, TDR and mean domination for one evaluated split.

    Args:
        results: Mapping from which the keys "successful_selections",
            "attacker_selections" and "top_k_tools" are read.
        attacker_tool_names: Forwarded to the TDR and mean-domination
            calculations.
        k: Top-k cutoff forwarded to the TDR and mean-domination calculations.

    Returns:
        Tuple ``(asr, tdr, mean_domination)``.
    """
    attack_success_rate = calculate_asr(
        results["successful_selections"],
        results["attacker_selections"],
    )
    top_k_tools = results["top_k_tools"]
    return (
        attack_success_rate,
        calculate_tdr(top_k_tools, attacker_tool_names, k),
        calculate_mean_domination(top_k_tools, attacker_tool_names, k),
    )

# Score both splits using the victim agent's top-k cutoff
k = agent_cfg.top_k
train_asr, train_tdr, train_md = compute_metrics(train_results, attacker_tool_names, k)
test_asr, test_tdr, test_md = compute_metrics(test_results, attacker_tool_names, k)

# One table row per split: (label, ASR, TDR, mean domination, query count)
metric_rows = (
    ("Train", train_asr, train_tdr, train_md, len(train_queries)),
    ("Test", test_asr, test_tdr, test_md, len(test_queries)),
)

print("=" * 60)
print("METRICS")
print("=" * 60)
print(f"{'Split':<10} {'ASR':<10} {'TDR':<10} {'Mean Dom':<12} {'Queries'}")
print("-" * 60)
for split_name, asr, tdr, mean_dom, num_queries in metric_rows:
    print(f"{split_name:<10} {asr:<10.4f} {tdr:<10.4f} {mean_dom:<12.4f} {num_queries}")
print("=" * 60)
print(f"\nAttacker tool selections: train={train_results['attacker_selections']}, test={test_results['attacker_selections']}")

In [None]:
# Show which tool the agent selected for the first five test queries
print("Sample query results (test set):\n")
for position, record in enumerate(test_results["query_results"][:5], start=1):
    query_text = record["query"]
    selected = record["selected_tool"]
    chose_attacker = record["is_attacker"]

    # A falsy selected_tool is reported as "(none)"
    if selected:
        tool_name = selected["name"]
    else:
        tool_name = "(none)"

    if chose_attacker:
        label = "ATTACKER"
    else:
        label = "benign"

    # Truncate long queries to 60 characters for display only
    if len(query_text) > 60:
        preview = query_text[:60] + "..."
    else:
        preview = query_text

    print(f"{position}. Query: {preview}")
    print(f"   Selected: {tool_name} [{label}]")
    print()