ai-agent-assembly · Chisanan232 · May 1, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
@@ -0,0 +1,52 @@
+name: Benchmarks
+
+# Runs on pull requests against master that touch the library sources, the
+# benchmark suite, this workflow, or the dependency definitions.
+on:
+  pull_request:
+    branches:
+      - "master"
+    paths:
+      - "agent_assembly/**/*.py"
+      - "test/bench/**/*.py"
+      - ".github/workflows/benchmarks.yml"
+      - "pyproject.toml"
+      - "uv.lock"
+
+# Least privilege: the job only checks out the repository; artifact upload
+# does not require any additional GITHUB_TOKEN scope.
+permissions:
+  contents: read
+
+jobs:
+  benchmark:
+    name: Run performance benchmarks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      - name: Set up Python
+        run: uv python install 3.13
+
+      - name: Install dependencies
+        run: uv sync --group dev
+
+      # --benchmark-only skips tests that do not use the benchmark fixture;
+      # the JSON report is what the upload step below publishes.
+      - name: Run benchmark suite
+        run: |
+          uv run pytest test/bench/ \
+            --benchmark-only \
+            --benchmark-disable-gc \
+            --benchmark-json=benchmark-results.json \
+            -v
+
+      # --benchmark-disable turns off benchmark timing for this run of the
+      # latency contract tests.
+      - name: Run latency contract tests
+        run: |
+          uv run pytest test/bench/test_latency_contracts.py \
+            --benchmark-disable \
+            -v
+
+      # Publish results even when an earlier step failed, but not when the
+      # run was cancelled.
+      - name: Upload benchmark results
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: benchmark-results.json
+          retention-days: 30
@@ -42,6 +42,7 @@ dev = [
     "pytest-asyncio>=0.23.0,<2",
     "python-dotenv>=1.0.1,<2",
     "ruff>=0.1.0",
+    "pytest-benchmark>=4.0.0,<5",
 ]
 pre-commit-ci = [
     "pre-commit>=3.5.0,<5",

@@ -14,3 +14,4 @@ log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno
 log_cli_date_format=%Y-%m-%d %H:%M:%S
 markers =
     integration: marks tests as integration tests
+    benchmark: marks tests as performance benchmarks (run with: pytest test/bench/ --benchmark-only)
@@ -0,0 +1,72 @@
+# Benchmark Baseline Results
+
+Captured: 2026-05-01
+
+## Environment
+
+- Python: 3.12.4
+- Platform: macOS arm64 (Apple M3 Max)
+- pytest-benchmark: 4.0+
+
+## Adapter Hook Setup/Teardown (register + unregister cycle)
+
+| Adapter         | Min (us) | Mean (us) | P99 (us) | Status |
+|-----------------|----------|-----------|----------|--------|
+| LangChain       | 0.58     | 0.85      | ~3       | PASS   |
+| LangGraph       | 0.67     | 0.92      | ~3       | PASS   |
+| MCP             | 0.83     | 1.09      | ~4       | PASS   |
+| Pydantic AI     | 1.29     | 1.66      | ~5       | PASS   |
+| OpenAI Agents   | 1.50     | 2.00      | ~6       | PASS   |
+| CrewAI          | 2.29     | 2.73      | ~8       | PASS   |
+
+## Per-Call Patched Function Overhead (governance interception hot path)
+
+Contract: < 2ms per call (AAASM-45)
+
+| Adapter         | Min (us) | Mean (us) | Median (us) | Status |
+|-----------------|----------|-----------|-------------|--------|
+| LangChain       | 0.75     | 1.01      | 0.92        | PASS   |
+| CrewAI          | 1.13     | 1.94      | 1.29        | PASS   |
+| LangGraph       | 1.25     | 1.71      | 1.46        | PASS   |
+| Pydantic AI     | 30.54    | 40.43     | 34.92       | PASS   |
+| OpenAI Agents   | 22.50    | 39.48     | 33.08       | PASS   |
+| MCP             | 29.17    | 39.77     | 33.17       | PASS   |
+
+Sync adapters (CrewAI, LangChain, LangGraph) have ~1-2us overhead.
+Async adapters include event-loop scheduling overhead (~30-40us) which
+is an artifact of the benchmark harness; in real async code the event
+loop is already running, so actual per-call overhead is lower.
+All adapters are well under the 2ms (2000us) contract threshold.
+
+## Detection Overhead (AdapterRegistry.auto_detect)
+
+Contract: < 50ms on first call (AAASM-47)
+
+| Frameworks Installed | Min (ms) | Mean (ms) | Max (ms) | Status |
+|----------------------|----------|-----------|----------|--------|
+| 0                    | 1.08     | 1.26      | 4.75     | PASS   |
+| 1                    | 1.07     | 1.32      | 9.27     | PASS   |
+| 2                    | 1.08     | 1.29      | 9.63     | PASS   |
+| 4                    | 1.08     | 1.25      | 5.64     | PASS   |
+
+Detection time is essentially flat (mean 1.25-1.32 ms) across 0-4 installed frameworks and remains well under the 50ms contract.
+
+## init_assembly() Cold Start
+
+| Metric   | Value (ms) |
+|----------|------------|
+| Min      | 1.31       |
+| Mean     | 1.53       |
+| Max      | 8.09       |
+
+## PyO3 FFI Round-Trip
+
+Skipped — native `_core` module not built in this environment.
+Requires `maturin develop` with Rust toolchain.
+
+## Notes
+
+- All measurements use `--benchmark-disable-gc` for consistency
+- Adapter benchmarks use mock framework classes to isolate wiring overhead
+- Detection benchmarks include entry-point discovery overhead
+- CI results may differ due to different hardware; use relative comparisons
@@ -0,0 +1,39 @@
+"""Shared fixtures and constants for performance benchmarks."""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+# Latency contract thresholds (nanoseconds)
+MAX_PER_CALL_NS = 2_000_000  # <2ms per-call overhead (AAASM-45)
+MAX_DETECTION_NS = 50_000_000  # <50ms detection overhead (AAASM-47)
+
+
+@pytest.fixture()
+def mock_gateway_client() -> MagicMock:
+    """Return a MagicMock that satisfies GatewayClient interface."""
+    client = MagicMock()
+    client.gateway_url = "http://localhost:8080"
+    client.api_key = "test-key"
+    client.agent_id = "bench-agent"
+    client.close = MagicMock()
+    return client
+
+
+@pytest.fixture()
+def noop_interceptor() -> _NoopInterceptor:
+    """Return a no-op governance interceptor for benchmarking hooks."""
+    return _NoopInterceptor()
+
+
+class _NoopInterceptor:
+    """Minimal interceptor that accepts any method call and returns None."""
+
+    def __getattr__(self, name: str) -> Any:
+        def noop(*args: Any, **kwargs: Any) -> None:
+            del args, kwargs
+
+        return noop
@@ -0,0 +1,175 @@
+"""Benchmark per-adapter hook register/unregister overhead.
+
+Measures the wall-clock time of each adapter's register_hooks() +
+unregister_hooks() cycle using a no-op governance interceptor to
+isolate adapter wiring overhead from framework execution.
+
+Contract: each adapter cycle must complete in <2ms P99 (AAASM-45).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from agent_assembly.adapters.crewai import patch as crewai_patch_mod
+from agent_assembly.adapters.crewai.adapter import CrewAIAdapter
+from agent_assembly.adapters.langchain.adapter import LangChainAdapter
+from agent_assembly.adapters.langgraph import patch as langgraph_patch_mod
+from agent_assembly.adapters.langgraph.adapter import LangGraphAdapter
+from agent_assembly.adapters.mcp import patch as mcp_patch_mod
+from agent_assembly.adapters.mcp.adapter import MCPAdapter
+from agent_assembly.adapters.openai_agents import patch as openai_patch_mod
+from agent_assembly.adapters.openai_agents.adapter import OpenAIAgentsAdapter
+from agent_assembly.adapters.pydantic_ai import patch as pydantic_ai_patch_mod
+from agent_assembly.adapters.pydantic_ai.adapter import PydanticAIAdapter
+
+# ---------------------------------------------------------------------------
+# Fake framework classes used to satisfy adapter loader checks
+# ---------------------------------------------------------------------------
+
+
+class _FakeBaseTool:
+    name = "bench_tool"
+
+    def run(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+
+class _FakeTask:
+    description = "bench task"
+    expected_output = "bench output"
+
+    def execute_sync(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+
+class _FakeStateGraph:
+    def compile(self, *args: Any, **kwargs: Any) -> Any:
+        return self
+
+
+class _FakePydanticAITool:
+    name = "bench_tool"
+
+    async def _run(self, ctx: Any, args: Any, **kwargs: Any) -> None:
+        pass
+
+
+class _FakeOpenAIFunctionTool:
+    name = "bench_tool"
+
+    async def __call__(self, ctx: Any, input_str: str) -> str:
+        return ""
+
+
+class _FakeMCPClientSession:
+    async def call_tool(self, name: str, arguments: Any = None) -> Any:
+        pass
+
+
+# ---------------------------------------------------------------------------
+# Benchmarks
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_crewai_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(crewai_patch_mod, "_load_crewai_basetool_class", lambda: _FakeBaseTool)
+    monkeypatch.setattr(crewai_patch_mod, "_load_crewai_task_class", lambda: _FakeTask)
+
+    def cycle() -> None:
+        adapter = CrewAIAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_langchain_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    # LangChainPatch.apply() always succeeds — it creates a callback handler.
+    # Reset runtime state between iterations to measure cold-start wiring.
+    def cycle() -> None:
+        adapter = LangChainAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_langgraph_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(langgraph_patch_mod, "_load_stategraph_class", lambda: _FakeStateGraph)
+
+    def cycle() -> None:
+        adapter = LangGraphAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_pydantic_ai_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(pydantic_ai_patch_mod, "_load_pydantic_ai_tool_class", lambda: _FakePydanticAITool)
+
+    def cycle() -> None:
+        adapter = PydanticAIAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_openai_agents_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(
+        openai_patch_mod,
+        "_load_openai_agents_function_tool_class",
+        lambda: _FakeOpenAIFunctionTool,
+    )
+
+    def cycle() -> None:
+        adapter = OpenAIAgentsAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)
+
+
+@pytest.mark.benchmark(group="adapter-hook")
+def test_mcp_hook_overhead(
+    benchmark: Any,
+    noop_interceptor: Any,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(mcp_patch_mod, "_load_mcp_client_session_class", lambda: _FakeMCPClientSession)
+
+    def cycle() -> None:
+        adapter = MCPAdapter()
+        adapter.register_hooks(noop_interceptor)
+        adapter.unregister_hooks()
+
+    benchmark(cycle)