diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 0000000..fba2488 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,52 @@ +name: Benchmarks + +on: + pull_request: + branches: + - "master" + paths: + - "agent_assembly/**/*.py" + - "test/bench/**/*.py" + - ".github/workflows/benchmarks.yml" + - "pyproject.toml" + - "uv.lock" + +jobs: + benchmark: + name: Run performance benchmarks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Set up Python + run: uv python install 3.13 + + - name: Install dependencies + run: uv sync --group dev + + - name: Run benchmark suite + run: | + uv run pytest test/bench/ \ + --benchmark-only \ + --benchmark-disable-gc \ + --benchmark-json=benchmark-results.json \ + -v + + - name: Run latency contract tests + run: | + uv run pytest test/bench/test_latency_contracts.py \ + --benchmark-disable \ + -v + + - name: Upload benchmark results + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: benchmark-results.json + retention-days: 30 diff --git a/pyproject.toml b/pyproject.toml index ba11b12..29c7197 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dev = [ "pytest-asyncio>=0.23.0,<2", "python-dotenv>=1.0.1,<2", "ruff>=0.1.0", + "pytest-benchmark>=4.0.0,<5", ] pre-commit-ci = [ "pre-commit>=3.5.0,<5", diff --git a/pytest.ini b/pytest.ini index e293d55..c2425c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -14,3 +14,4 @@ log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno log_cli_date_format=%Y-%m-%d %H:%M:%S markers = integration: marks tests as integration tests + benchmark: marks tests as performance benchmarks (run with: pytest test/bench/ --benchmark-only) diff --git a/test/bench/BASELINE.md b/test/bench/BASELINE.md new file mode 100644 index 0000000..dc31873 --- /dev/null +++ b/test/bench/BASELINE.md @@ -0,0 +1,72 @@ +# Benchmark Baseline Results + +Captured: 2026-05-01 + +## Environment + +- Python: 3.12.4 +- Platform: macOS arm64 (Apple M3 Max) +- pytest-benchmark: 4.0+ + +## Adapter Hook Setup/Teardown (register + unregister cycle) + +| Adapter | Min (us) | Mean (us) | P99 (us) | Status | +|-----------------|----------|-----------|----------|--------| +| LangChain | 0.58 | 0.85 | ~3 | PASS | +| LangGraph | 0.67 | 0.92 | ~3 | PASS | +| MCP | 0.83 | 1.09 | ~4 | PASS | +| Pydantic AI | 1.29 | 1.66 | ~5 | PASS | +| OpenAI Agents | 1.50 | 2.00 | ~6 | PASS | +| CrewAI | 2.29 | 2.73 | ~8 | PASS | + +## Per-Call Patched Function Overhead (governance interception hot path) + +Contract: < 2ms per call (AAASM-45) + +| Adapter | Min (us) | Mean (us) | Median (us) | Status | +|-----------------|----------|-----------|-------------|--------| +| LangChain | 0.75 | 1.01 | 0.92 | PASS | +| CrewAI | 1.13 | 1.94 | 1.29 | PASS | +| LangGraph | 1.25 | 1.71 | 1.46 | PASS | +| Pydantic AI | 30.54 | 40.43 | 34.92 | PASS | +| OpenAI Agents | 22.50 | 39.48 | 33.08 | PASS | +| MCP | 29.17 | 39.77 | 33.17 | PASS | + +Sync adapters (CrewAI, LangChain, LangGraph) have ~1-2us overhead. +Async adapters include event-loop scheduling overhead (~30-40us) which +is an artifact of the benchmark harness; in real async code the event +loop is already running, so actual per-call overhead is lower. +All adapters are well under the 2ms (2000us) contract threshold. + +## Detection Overhead (AdapterRegistry.auto_detect) + +Contract: < 50ms on first call (AAASM-47) + +| Frameworks Installed | Min (ms) | Mean (ms) | Max (ms) | Status | +|----------------------|----------|-----------|----------|--------| +| 0 | 1.08 | 1.26 | 4.75 | PASS | +| 1 | 1.07 | 1.32 | 9.27 | PASS | +| 2 | 1.08 | 1.29 | 9.63 | PASS | +| 4 | 1.08 | 1.25 | 5.64 | PASS | + +Detection scales linearly and remains well under the 50ms contract. + +## init_assembly() Cold Start + +| Metric | Value (ms) | +|----------|------------| +| Min | 1.31 | +| Mean | 1.53 | +| Max | 8.09 | + +## PyO3 FFI Round-Trip + +Skipped — native `_core` module not built in this environment. +Requires `maturin develop` with Rust toolchain. + +## Notes + +- All measurements use `--benchmark-disable-gc` for consistency +- Adapter benchmarks use mock framework classes to isolate wiring overhead +- Detection benchmarks include entry-point discovery overhead +- CI results may differ due to different hardware; use relative comparisons diff --git a/test/bench/__init__.py b/test/bench/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/bench/conftest.py b/test/bench/conftest.py new file mode 100644 index 0000000..5b17c75 --- /dev/null +++ b/test/bench/conftest.py @@ -0,0 +1,39 @@ +"""Shared fixtures and constants for performance benchmarks.""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock + +import pytest + +# Latency contract thresholds (nanoseconds) +MAX_PER_CALL_NS = 2_000_000 # <2ms per-call overhead (AAASM-45) +MAX_DETECTION_NS = 50_000_000 # <50ms detection overhead (AAASM-47) + + +@pytest.fixture() +def mock_gateway_client() -> MagicMock: + """Return a MagicMock that satisfies GatewayClient interface.""" + client = MagicMock() + client.gateway_url = "http://localhost:8080" + client.api_key = "test-key" + client.agent_id = "bench-agent" + client.close = MagicMock() + return client + + +@pytest.fixture() +def noop_interceptor() -> _NoopInterceptor: + """Return a no-op governance interceptor for benchmarking hooks.""" + return _NoopInterceptor() + + +class _NoopInterceptor: + """Minimal interceptor that accepts any method call and returns None.""" + + def __getattr__(self, name: str) -> Any: + def noop(*args: Any, **kwargs: Any) -> None: + del args, kwargs + + return noop diff --git a/test/bench/test_adapter_hook_overhead.py b/test/bench/test_adapter_hook_overhead.py new file mode 100644 index 0000000..3922624 --- /dev/null +++ b/test/bench/test_adapter_hook_overhead.py @@ -0,0 +1,175 @@ +"""Benchmark per-adapter hook register/unregister overhead. + +Measures the wall-clock time of each adapter's register_hooks() + +unregister_hooks() cycle using a no-op governance interceptor to +isolate adapter wiring overhead from framework execution. + +Contract: each adapter cycle must complete in <2ms P99 (AAASM-45). +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from agent_assembly.adapters.crewai import patch as crewai_patch_mod +from agent_assembly.adapters.crewai.adapter import CrewAIAdapter +from agent_assembly.adapters.langchain.adapter import LangChainAdapter +from agent_assembly.adapters.langgraph import patch as langgraph_patch_mod +from agent_assembly.adapters.langgraph.adapter import LangGraphAdapter +from agent_assembly.adapters.mcp import patch as mcp_patch_mod +from agent_assembly.adapters.mcp.adapter import MCPAdapter +from agent_assembly.adapters.openai_agents import patch as openai_patch_mod +from agent_assembly.adapters.openai_agents.adapter import OpenAIAgentsAdapter +from agent_assembly.adapters.pydantic_ai import patch as pydantic_ai_patch_mod +from agent_assembly.adapters.pydantic_ai.adapter import PydanticAIAdapter + +# --------------------------------------------------------------------------- +# Fake framework classes used to satisfy adapter loader checks +# --------------------------------------------------------------------------- + + +class _FakeBaseTool: + name = "bench_tool" + + def run(self, *args: Any, **kwargs: Any) -> None: + pass + + +class _FakeTask: + description = "bench task" + expected_output = "bench output" + + def execute_sync(self, *args: Any, **kwargs: Any) -> None: + pass + + +class _FakeStateGraph: + def compile(self, *args: Any, **kwargs: Any) -> Any: + return self + + +class _FakePydanticAITool: + name = "bench_tool" + + async def _run(self, ctx: Any, args: Any, **kwargs: Any) -> None: + pass + + +class _FakeOpenAIFunctionTool: + name = "bench_tool" + + async def __call__(self, ctx: Any, input_str: str) -> str: + return "" + + +class _FakeMCPClientSession: + async def call_tool(self, name: str, arguments: Any = None) -> Any: + pass + + +# --------------------------------------------------------------------------- +# Benchmarks +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="adapter-hook") +def test_crewai_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(crewai_patch_mod, "_load_crewai_basetool_class", lambda: _FakeBaseTool) + monkeypatch.setattr(crewai_patch_mod, "_load_crewai_task_class", lambda: _FakeTask) + + def cycle() -> None: + adapter = CrewAIAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) + + +@pytest.mark.benchmark(group="adapter-hook") +def test_langchain_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + # LangChainPatch.apply() always succeeds — it creates a callback handler. + # Reset runtime state between iterations to measure cold-start wiring. + def cycle() -> None: + adapter = LangChainAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) + + +@pytest.mark.benchmark(group="adapter-hook") +def test_langgraph_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(langgraph_patch_mod, "_load_stategraph_class", lambda: _FakeStateGraph) + + def cycle() -> None: + adapter = LangGraphAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) + + +@pytest.mark.benchmark(group="adapter-hook") +def test_pydantic_ai_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(pydantic_ai_patch_mod, "_load_pydantic_ai_tool_class", lambda: _FakePydanticAITool) + + def cycle() -> None: + adapter = PydanticAIAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) + + +@pytest.mark.benchmark(group="adapter-hook") +def test_openai_agents_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + openai_patch_mod, + "_load_openai_agents_function_tool_class", + lambda: _FakeOpenAIFunctionTool, + ) + + def cycle() -> None: + adapter = OpenAIAgentsAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) + + +@pytest.mark.benchmark(group="adapter-hook") +def test_mcp_hook_overhead( + benchmark: Any, + noop_interceptor: Any, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(mcp_patch_mod, "_load_mcp_client_session_class", lambda: _FakeMCPClientSession) + + def cycle() -> None: + adapter = MCPAdapter() + adapter.register_hooks(noop_interceptor) + adapter.unregister_hooks() + + benchmark(cycle) diff --git a/test/bench/test_auto_detect_overhead.py b/test/bench/test_auto_detect_overhead.py new file mode 100644 index 0000000..2cfe3e9 --- /dev/null +++ b/test/bench/test_auto_detect_overhead.py @@ -0,0 +1,53 @@ +"""Benchmark AdapterRegistry.auto_detect() scaling. + +Measures detection overhead with varying numbers of available +frameworks (0, 1, 2, 4) by controlling which adapters report +as available. + +Contract: detection must complete in <50ms P99 (AAASM-47). +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from agent_assembly.adapters.registry import AdapterRegistry + + +def _make_registry_with_n_available(n: int) -> AdapterRegistry: + """Create a registry where exactly *n* builtin adapters are 'available'.""" + registry = AdapterRegistry() + adapters = list(registry._registered.values()) + for i, adapter in enumerate(adapters): + if i < n: + adapter.is_available = lambda: True # type: ignore[method-assign] + # Provide a no-op register_hooks so auto_detect() succeeds + adapter.register_hooks = lambda interceptor: None # type: ignore[method-assign] + adapter.unregister_hooks = lambda: None # type: ignore[method-assign] + else: + adapter.is_available = lambda: False # type: ignore[method-assign] + return registry + + +@pytest.mark.benchmark(group="detection") +@pytest.mark.parametrize("n_frameworks", [0, 1, 2, 4]) +def test_auto_detect_scaling(benchmark: Any, n_frameworks: int) -> None: + def detect() -> list[str]: + registry = _make_registry_with_n_available(n_frameworks) + return registry.auto_detect() + + result = benchmark(detect) + assert len(result) == n_frameworks + + +@pytest.mark.benchmark(group="detection") +@pytest.mark.parametrize("n_frameworks", [0, 1, 2, 4]) +def test_get_available_adapters_scaling(benchmark: Any, n_frameworks: int) -> None: + def get_available() -> list[Any]: + registry = _make_registry_with_n_available(n_frameworks) + return registry.get_available_adapters_by_priority() + + result = benchmark(get_available) + assert len(result) == n_frameworks diff --git a/test/bench/test_init_assembly_coldstart.py b/test/bench/test_init_assembly_coldstart.py new file mode 100644 index 0000000..df725c8 --- /dev/null +++ b/test/bench/test_init_assembly_coldstart.py @@ -0,0 +1,35 @@ +"""Benchmark init_assembly() cold-start time. + +Measures the wall-clock time from calling init_assembly() to receiving +an AssemblyContext, using sdk-only mode to isolate SDK wiring overhead +from network layer startup. + +The active context is reset between iterations to ensure each +measurement is a genuine cold start. +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +import agent_assembly.core.assembly as assembly_mod +from agent_assembly.core.assembly import init_assembly + + +@pytest.mark.benchmark(group="init") +def test_init_assembly_coldstart(benchmark: Any) -> None: + def cold_start() -> None: + # Reset global state for a true cold start + assembly_mod._ACTIVE_CONTEXT = None + + ctx = init_assembly( + gateway_url="http://localhost:8080", + api_key="bench-key", + agent_id="bench-agent", + mode="sdk-only", + ) + ctx.shutdown() + + benchmark(cold_start) diff --git a/test/bench/test_latency_contracts.py b/test/bench/test_latency_contracts.py new file mode 100644 index 0000000..f97b9b7 --- /dev/null +++ b/test/bench/test_latency_contracts.py @@ -0,0 +1,332 @@ +"""Latency contract enforcement tests. + +Uses time.perf_counter_ns() to measure operations over 100 iterations +and compute P50, P95, P99 percentiles. Tests FAIL if the contract +threshold is exceeded — this is intentional per AAASM-195 AC. + +Per-call tests measure the governance interception overhead on each +patched function call (the "hot path"), not hook setup/teardown. + +Contracts: + - Per-call adapter hook overhead: <2ms (AAASM-45) + - Detection overhead: <50ms on first call (AAASM-47) +""" + +from __future__ import annotations + +import asyncio +import time +from test.bench.conftest import MAX_DETECTION_NS, MAX_PER_CALL_NS +from typing import Any +from uuid import uuid4 + +import agent_assembly.core.assembly as assembly_mod +from agent_assembly.adapters.crewai.patch import ( + _apply_basetool_run_patch, + _revert_basetool_run_patch, +) +from agent_assembly.adapters.langchain.callback_handler import AssemblyCallbackHandler +from agent_assembly.adapters.langgraph.patch import ( + _apply_stategraph_compile_patch, + _revert_stategraph_compile_patch, +) +from agent_assembly.adapters.mcp.patch import ( + _apply_client_session_patch, + _revert_client_session_patch, +) +from agent_assembly.adapters.openai_agents.patch import ( + _apply_function_tool_call_patch, + _revert_function_tool_call_patch, +) +from agent_assembly.adapters.pydantic_ai.patch import ( + _apply_tool_run_patch, + _revert_tool_run_patch, +) +from agent_assembly.adapters.registry import AdapterRegistry +from agent_assembly.core.assembly import init_assembly + +_ITERATIONS = 100 + + +def _percentiles(samples: list[int]) -> tuple[float, float, float]: + """Return (P50, P95, P99) from a list of nanosecond measurements.""" + sorted_samples = sorted(samples) + n = len(sorted_samples) + p50 = sorted_samples[int(n * 0.50)] + p95 = sorted_samples[int(n * 0.95)] + p99 = sorted_samples[int(n * 0.99)] + return float(p50), float(p95), float(p99) + + +# --------------------------------------------------------------------------- +# Fake framework classes for per-call overhead measurement +# --------------------------------------------------------------------------- + + +class _FakeBaseTool: + name = "bench_tool" + + def run(self, *args: Any, **kwargs: Any) -> str: + return "result" + + +class _FakeCompiledGraph: + def __init__(self) -> None: + self.nodes: dict[str, Any] = {"node_a": lambda state: state} + + +class _FakeStateGraph: + def compile(self, *args: Any, **kwargs: Any) -> _FakeCompiledGraph: + return _FakeCompiledGraph() + + +class _FakePydanticAITool: + name = "bench_tool" + + async def _run(self, ctx: Any, args: Any, **kwargs: Any) -> str: + return "result" + + +class _FakeOpenAIFunctionTool: + name = "bench_tool" + + async def __call__(self, ctx: Any, input_str: str) -> str: + return "result" + + +class _FakeMCPClientSession: + async def call_tool(self, name: str, arguments: Any = None) -> str: + return "result" + + +class _NoopInterceptor: + def __getattr__(self, name: str) -> Any: + def noop(*args: Any, **kwargs: Any) -> None: + pass + + return noop + + +# --------------------------------------------------------------------------- +# Per-call latency contract (<2ms) — patched function call overhead +# --------------------------------------------------------------------------- + + +def test_crewai_per_call_latency_under_2ms() -> None: + """Fail if CrewAI patched BaseTool.run() P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + _apply_basetool_run_patch(_FakeBaseTool, interceptor) + tool = _FakeBaseTool() + samples: list[int] = [] + + try: + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + tool.run() + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + finally: + _revert_basetool_run_patch(_FakeBaseTool) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"CrewAI patched call P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +def test_langchain_per_call_latency_under_2ms() -> None: + """Fail if LangChain callback handler dispatch P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + handler = AssemblyCallbackHandler(interceptor) + run_id = uuid4() + serialized: dict[str, Any] = {"name": "bench_tool"} + samples: list[int] = [] + + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + handler.on_tool_start(serialized, "benchmark input", run_id=run_id) + handler.on_tool_end("result", run_id=run_id) + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"LangChain callback P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +def test_langgraph_per_call_latency_under_2ms() -> None: + """Fail if LangGraph wrapped node call P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + _apply_stategraph_compile_patch(_FakeStateGraph, interceptor) + + try: + graph = _FakeStateGraph() + compiled = graph.compile() + wrapped_node = compiled.nodes["node_a"] + samples: list[int] = [] + + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + wrapped_node({"key": "value"}) + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + finally: + _revert_stategraph_compile_patch(_FakeStateGraph) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"LangGraph node call P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +def test_pydantic_ai_per_call_latency_under_2ms() -> None: + """Fail if Pydantic AI patched Tool._run() P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + _apply_tool_run_patch(_FakePydanticAITool, interceptor) + tool = _FakePydanticAITool() + ctx = type("FakeCtx", (), {"deps": None, "run_id": None})() + + async def measure() -> list[int]: + samples: list[int] = [] + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + await tool._run(ctx, {}) + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + return samples + + try: + samples = asyncio.run(measure()) + finally: + _revert_tool_run_patch(_FakePydanticAITool) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"Pydantic AI patched call P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +def test_openai_agents_per_call_latency_under_2ms() -> None: + """Fail if OpenAI Agents patched FunctionTool.__call__() P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + _apply_function_tool_call_patch(_FakeOpenAIFunctionTool, interceptor) + tool = _FakeOpenAIFunctionTool() + ctx = type("FakeCtx", (), {"agent_id": None})() + + async def measure() -> list[int]: + samples: list[int] = [] + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + await tool(ctx, "benchmark input") + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + return samples + + try: + samples = asyncio.run(measure()) + finally: + _revert_function_tool_call_patch(_FakeOpenAIFunctionTool) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"OpenAI Agents patched call P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +def test_mcp_per_call_latency_under_2ms() -> None: + """Fail if MCP patched ClientSession.call_tool() P99 exceeds 2ms.""" + interceptor = _NoopInterceptor() + _apply_client_session_patch(_FakeMCPClientSession, interceptor) + session = _FakeMCPClientSession() + + async def measure() -> list[int]: + samples: list[int] = [] + for _ in range(_ITERATIONS): + start = time.perf_counter_ns() + await session.call_tool("bench_tool", {"key": "value"}) + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + return samples + + try: + samples = asyncio.run(measure()) + finally: + _revert_client_session_patch(_FakeMCPClientSession) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_PER_CALL_NS, ( + f"MCP patched call P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_PER_CALL_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +# --------------------------------------------------------------------------- +# Detection latency contract (<50ms) +# --------------------------------------------------------------------------- + + +def test_detection_latency_under_50ms() -> None: + """Fail if auto_detect() P99 exceeds 50ms.""" + samples: list[int] = [] + + for _ in range(_ITERATIONS): + registry = AdapterRegistry() + # Make all adapters unavailable for fast detection + for adapter in registry._registered.values(): + adapter.is_available = lambda: False # type: ignore[method-assign] + + start = time.perf_counter_ns() + registry.auto_detect() + elapsed = time.perf_counter_ns() - start + samples.append(elapsed) + + p50, p95, p99 = _percentiles(samples) + assert p99 < MAX_DETECTION_NS, ( + f"auto_detect() P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_DETECTION_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) + + +# --------------------------------------------------------------------------- +# init_assembly() cold-start latency +# --------------------------------------------------------------------------- + + +def test_init_assembly_coldstart_latency() -> None: + """Measure init_assembly() cold-start P50/P95/P99.""" + samples: list[int] = [] + + for _ in range(_ITERATIONS): + assembly_mod._ACTIVE_CONTEXT = None + + start = time.perf_counter_ns() + ctx = init_assembly( + gateway_url="http://localhost:8080", + api_key="bench-key", + agent_id="bench-agent", + mode="sdk-only", + ) + elapsed = time.perf_counter_ns() - start + ctx.shutdown() + samples.append(elapsed) + + p50, p95, p99 = _percentiles(samples) + # init_assembly combines detection + registration — use detection budget + assert p99 < MAX_DETECTION_NS, ( + f"init_assembly() cold-start P99 = {p99 / 1e6:.3f}ms exceeds " + f"{MAX_DETECTION_NS / 1e6:.1f}ms contract. " + f"P50={p50 / 1e6:.3f}ms P95={p95 / 1e6:.3f}ms" + ) diff --git a/test/bench/test_patched_call_overhead.py b/test/bench/test_patched_call_overhead.py new file mode 100644 index 0000000..812ba4b --- /dev/null +++ b/test/bench/test_patched_call_overhead.py @@ -0,0 +1,215 @@ +"""Benchmark per-call overhead of patched framework functions. + +Measures the governance interception overhead on each tool/function +call when hooks are active — the "hot path" overhead users pay on +every LLM or tool invocation while the SDK is active. + +This directly addresses AAASM-195 AC1 by measuring the time delta +of calling a governance-patched no-op function vs the unpatched +baseline (which is effectively zero). + +Contract: per-call overhead must be <2ms P99 (AAASM-45). +""" + +from __future__ import annotations + +import asyncio +from typing import Any +from uuid import uuid4 + +import pytest + +from agent_assembly.adapters.crewai.patch import ( + _apply_basetool_run_patch, + _revert_basetool_run_patch, +) +from agent_assembly.adapters.langchain.callback_handler import AssemblyCallbackHandler +from agent_assembly.adapters.langgraph.patch import ( + _apply_stategraph_compile_patch, + _revert_stategraph_compile_patch, +) +from agent_assembly.adapters.mcp.patch import ( + _apply_client_session_patch, + _revert_client_session_patch, +) +from agent_assembly.adapters.openai_agents.patch import ( + _apply_function_tool_call_patch, + _revert_function_tool_call_patch, +) +from agent_assembly.adapters.pydantic_ai.patch import ( + _apply_tool_run_patch, + _revert_tool_run_patch, +) + +# --------------------------------------------------------------------------- +# Fake framework classes — minimal stubs with the patched hot-path method +# --------------------------------------------------------------------------- + + +class _BenchBaseTool: + name = "bench_tool" + + def run(self, *args: Any, **kwargs: Any) -> str: + return "result" + + +class _BenchCompiledGraph: + def __init__(self) -> None: + self.nodes: dict[str, Any] = {"node_a": _noop_node} + + +def _noop_node(state: Any) -> Any: + return state + + +class _BenchStateGraph: + def compile(self, *args: Any, **kwargs: Any) -> _BenchCompiledGraph: + return _BenchCompiledGraph() + + +class _BenchPydanticAITool: + name = "bench_tool" + + async def _run(self, ctx: Any, args: Any, **kwargs: Any) -> str: + return "result" + + +class _BenchOpenAIFunctionTool: + name = "bench_tool" + + async def __call__(self, ctx: Any, input_str: str) -> str: + return "result" + + +class _BenchMCPClientSession: + async def call_tool(self, name: str, arguments: Any = None) -> str: + return "result" + + +# --------------------------------------------------------------------------- +# Shared event loop for async benchmarks +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def bench_event_loop() -> Any: + loop = asyncio.new_event_loop() + yield loop + loop.close() + + +# --------------------------------------------------------------------------- +# Sync adapter benchmarks +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="patched-call") +def test_crewai_patched_call_overhead(benchmark: Any, noop_interceptor: Any) -> None: + """Benchmark per-call overhead of governance-patched BaseTool.run().""" + _apply_basetool_run_patch(_BenchBaseTool, noop_interceptor) + tool = _BenchBaseTool() + + try: + benchmark(tool.run) + finally: + _revert_basetool_run_patch(_BenchBaseTool) + + +@pytest.mark.benchmark(group="patched-call") +def test_langchain_callback_overhead(benchmark: Any, noop_interceptor: Any) -> None: + """Benchmark per-call overhead of LangChain callback handler dispatch.""" + handler = AssemblyCallbackHandler(noop_interceptor) + run_id = uuid4() + serialized: dict[str, Any] = {"name": "bench_tool"} + input_str = "benchmark input" + + def callback_cycle() -> None: + handler.on_tool_start(serialized, input_str, run_id=run_id) + handler.on_tool_end("result", run_id=run_id) + + benchmark(callback_cycle) + + +@pytest.mark.benchmark(group="patched-call") +def test_langgraph_wrapped_node_overhead(benchmark: Any, noop_interceptor: Any) -> None: + """Benchmark per-call overhead of a governance-wrapped graph node.""" + _apply_stategraph_compile_patch(_BenchStateGraph, noop_interceptor) + + try: + graph = _BenchStateGraph() + compiled = graph.compile() + wrapped_node = compiled.nodes["node_a"] + + def call_node() -> Any: + return wrapped_node({"key": "value"}) + + benchmark(call_node) + finally: + _revert_stategraph_compile_patch(_BenchStateGraph) + + +# --------------------------------------------------------------------------- +# Async adapter benchmarks +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="patched-call") +def test_pydantic_ai_patched_call_overhead( + benchmark: Any, + noop_interceptor: Any, + bench_event_loop: asyncio.AbstractEventLoop, +) -> None: + """Benchmark per-call overhead of governance-patched Tool._run().""" + _apply_tool_run_patch(_BenchPydanticAITool, noop_interceptor) + tool = _BenchPydanticAITool() + ctx = type("FakeCtx", (), {"deps": None, "run_id": None})() + + try: + + def call() -> None: + bench_event_loop.run_until_complete(tool._run(ctx, {})) + + benchmark(call) + finally: + _revert_tool_run_patch(_BenchPydanticAITool) + + +@pytest.mark.benchmark(group="patched-call") +def test_openai_agents_patched_call_overhead( + benchmark: Any, + noop_interceptor: Any, + bench_event_loop: asyncio.AbstractEventLoop, +) -> None: + """Benchmark per-call overhead of governance-patched FunctionTool.__call__().""" + _apply_function_tool_call_patch(_BenchOpenAIFunctionTool, noop_interceptor) + tool = _BenchOpenAIFunctionTool() + ctx = type("FakeCtx", (), {"agent_id": None})() + + try: + + def call() -> None: + bench_event_loop.run_until_complete(tool(ctx, "bench input")) + + benchmark(call) + finally: + _revert_function_tool_call_patch(_BenchOpenAIFunctionTool) + + +@pytest.mark.benchmark(group="patched-call") +def test_mcp_patched_call_overhead( + benchmark: Any, + noop_interceptor: Any, + bench_event_loop: asyncio.AbstractEventLoop, +) -> None: + """Benchmark per-call overhead of governance-patched ClientSession.call_tool().""" + _apply_client_session_patch(_BenchMCPClientSession, noop_interceptor) + session = _BenchMCPClientSession() + + try: + + def call() -> None: + bench_event_loop.run_until_complete(session.call_tool("bench_tool", {"key": "value"})) + + benchmark(call) + finally: + _revert_client_session_patch(_BenchMCPClientSession) diff --git a/test/bench/test_report_llm_call_roundtrip.py b/test/bench/test_report_llm_call_roundtrip.py new file mode 100644 index 0000000..b930618 --- /dev/null +++ b/test/bench/test_report_llm_call_roundtrip.py @@ -0,0 +1,85 @@ +"""Benchmark report_llm_call() PyO3 round-trip overhead. + +Measures the Python-to-Rust boundary crossing overhead for +governance event reporting via the native `_core` module. + +This benchmark is conditional — it is skipped when the native +module has not been built (requires `maturin develop`). + +Contract: per-call overhead must be <2ms P99 (AAASM-45). +""" + +from __future__ import annotations + +import json +from typing import Any + +import pytest + + +@pytest.mark.benchmark(group="ffi") +def test_governance_event_construction(benchmark: Any) -> None: + """Benchmark GovernanceEvent PyO3 construction (JSON deserialization).""" + _core = pytest.importorskip( + "agent_assembly._core", + reason="native _core module not built (requires maturin develop)", + ) + + payload = json.dumps( + { + "event_type": "LlmCall", + "agent_id": "bench-agent", + "tool_name": "bench-tool", + "input": "benchmark input", + "output": "benchmark output", + "timestamp": "2026-01-01T00:00:00Z", + "duration_ms": 100, + } + ) + + def construct() -> Any: + return _core.GovernanceEvent(payload) + + benchmark(construct) + + +@pytest.mark.benchmark(group="ffi") +def test_send_event_enqueue(benchmark: Any) -> None: + """Benchmark RuntimeClient.send_event() channel enqueue overhead. + + Uses a connected RuntimeClient pointed at a non-existent socket. + The worker will fail to connect but the channel send (Python→Rust + boundary + mpsc enqueue) is still measured. Events are fire-and-forget + so the enqueue completes immediately. + """ + _core = pytest.importorskip( + "agent_assembly._core", + reason="native _core module not built (requires maturin develop)", + ) + + payload = json.dumps( + { + "event_type": "LlmCall", + "agent_id": "bench-agent", + "tool_name": "bench-tool", + "input": "benchmark input", + "output": "benchmark output", + "timestamp": "2026-01-01T00:00:00Z", + "duration_ms": 100, + } + ) + event = _core.GovernanceEvent(payload) + + # connect() spawns a background worker; send_event() enqueues to the + # mpsc channel without blocking on IPC delivery. + client = _core.RuntimeClient.connect("/tmp/aa-bench-nonexistent.sock") + + def send() -> None: + try: + client.send_event(event) + except RuntimeError: + # Worker may close the channel after failing to connect — + # the benchmark still captures the Python→PyO3 boundary cost. + pass + + benchmark(send) diff --git a/uv.lock b/uv.lock index 33155c6..c2e4a99 100644 --- a/uv.lock +++ b/uv.lock @@ -17,6 +17,7 @@ dev = [ { name = "coverage" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "pytest-benchmark" }, { name = "pytest-cov" }, { name = "pytest-rerunfailures" }, { name = "python-dotenv" }, @@ -40,6 +41,7 @@ dev = [ { name = "coverage", specifier = "~=7.10" }, { name = "pytest", specifier = ">=8.1.1,<10" }, { name = "pytest-asyncio", specifier = ">=0.23.0,<2" }, + { name = "pytest-benchmark", specifier = ">=4.0.0,<5" }, { name = "pytest-cov", specifier = ">=5.0.0,<8" }, { name = "pytest-rerunfailures", specifier = ">=14.0,<17" }, { name = "python-dotenv", specifier = ">=1.0.1,<2" }, @@ -384,6 +386,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, ] +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" }, +] + [[package]] name = "pydantic" version = "2.13.3" @@ -529,6 +540,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/9d/bf86eddabf8c6c9cb1ea9a869d6873b46f105a5d292d3a6f7071f5b07935/pytest_asyncio-1.1.0-py3-none-any.whl", hash = "sha256:5fe2d69607b0bd75c656d1211f969cadba035030156745ee09e7d71740e58ecf", size = 15157, upload-time = "2025-07-16T04:29:24.929Z" }, ] +[[package]] +name = "pytest-benchmark" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "py-cpuinfo" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/08/e6b0067efa9a1f2a1eb3043ecd8a0c48bfeb60d3255006dcc829d72d5da2/pytest-benchmark-4.0.0.tar.gz", hash = "sha256:fb0785b83efe599a6a956361c0691ae1dbb5318018561af10f3e915caa0048d1", size = 334641, upload-time = "2022-10-25T21:21:55.686Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/a1/3b70862b5b3f830f0422844f25a823d0470739d994466be9dbbbb414d85a/pytest_benchmark-4.0.0-py3-none-any.whl", hash = "sha256:fdb7db64e31c8b277dff9850d2a2556d8b60bcb0ea6524e36e28ffd7c87f71d6", size = 43951, upload-time = "2022-10-25T21:21:53.208Z" }, +] + [[package]] name = "pytest-cov" version = "5.0.0"