diff --git a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py
index 79dd13dc..7a6db120 100644
--- a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py
+++ b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py
@@ -125,6 +125,12 @@ async def test_openai_agents_integration_setup_creates_spans(memory_logger):
 
     llm_spans = [span for span in spans if span.get("span_attributes", {}).get("type") == "llm"]
     assert llm_spans
+    llm_metrics = [span.get("metrics", {}) for span in llm_spans]
+    assert any(metrics.get("prompt_tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("completion_tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("prompt_cached_tokens") == 0 for metrics in llm_metrics)
+    assert any(metrics.get("completion_reasoning_tokens") == 0 for metrics in llm_metrics)
 
 
 @pytest.mark.asyncio
diff --git a/py/src/braintrust/integrations/openai_agents/tracing.py b/py/src/braintrust/integrations/openai_agents/tracing.py
index 6fde395f..bf1a7274 100644
--- a/py/src/braintrust/integrations/openai_agents/tracing.py
+++ b/py/src/braintrust/integrations/openai_agents/tracing.py
@@ -69,6 +69,14 @@ def _maybe_timestamp_elapsed(end: str | None, start: str | None) -> float | None:
     return (datetime.datetime.fromisoformat(end) - datetime.datetime.fromisoformat(start)).total_seconds()
 
 
+# Maps the prefix of an OpenAI usage `*_tokens_details` field to the Braintrust
+# metric prefix (e.g. `input_tokens_details.cached_tokens` → `prompt_cached_tokens`).
+_TOKEN_PREFIX_MAP = {
+    "input": "prompt",
+    "output": "completion",
+}
+
+
 def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]:
     """Convert an OpenAI-style usage dict to Braintrust metrics."""
     metrics: dict[str, Any] = {}
@@ -86,6 +94,19 @@ def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]:
         metrics["tokens"] = usage["total_tokens"]
     elif "input_tokens" in usage and "output_tokens" in usage:
         metrics["tokens"] = usage["input_tokens"] + usage["output_tokens"]
+
+    # Walk *_tokens_details sub-objects so we capture cached / reasoning / audio
+    # token counts (e.g. input_tokens_details.cached_tokens → prompt_cached_tokens).
+    for key, value in usage.items():
+        if not key.endswith("_tokens_details") or not isinstance(value, dict):
+            continue
+        raw_prefix = key[: -len("_tokens_details")]
+        prefix = _TOKEN_PREFIX_MAP.get(raw_prefix, raw_prefix)
+        for sub_key, sub_value in value.items():
+            if isinstance(sub_value, bool) or not isinstance(sub_value, (int, float)):
+                continue
+            metrics[f"{prefix}_{sub_key}"] = sub_value
+
     return metrics
 
 
@@ -166,9 +187,8 @@ def _response_log_data(self, span: tracing.Span[tracing.ResponseSpanData]) -> dict[str, Any]:
         if ttft is not None:
            data["metrics"]["time_to_first_token"] = ttft
         if span.span_data.response is not None and span.span_data.response.usage is not None:
-            data["metrics"]["tokens"] = span.span_data.response.usage.total_tokens
-            data["metrics"]["prompt_tokens"] = span.span_data.response.usage.input_tokens
-            data["metrics"]["completion_tokens"] = span.span_data.response.usage.output_tokens
+            usage_dict = span.span_data.response.usage.model_dump()
+            data["metrics"].update(_usage_to_metrics(usage_dict))
         return data
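
For reference, a minimal sketch of the mapping this change produces. The sample payload below is illustrative (not taken from the test fixtures), and it assumes the unchanged head of `_usage_to_metrics` already maps `input_tokens`/`output_tokens` to `prompt_tokens`/`completion_tokens`, as the new test assertions imply:

    # Hypothetical `response.usage.model_dump()` payload from the OpenAI
    # Responses API; the numbers are made up for illustration.
    usage = {
        "input_tokens": 120,
        "output_tokens": 48,
        "total_tokens": 168,
        "input_tokens_details": {"cached_tokens": 64},
        "output_tokens_details": {"reasoning_tokens": 32},
    }

    # _usage_to_metrics(usage) would then return:
    # {
    #     "prompt_tokens": 120,
    #     "completion_tokens": 48,
    #     "tokens": 168,
    #     "prompt_cached_tokens": 64,         # via _TOKEN_PREFIX_MAP["input"]
    #     "completion_reasoning_tokens": 32,  # via _TOKEN_PREFIX_MAP["output"]
    # }

Details prefixes outside the map (e.g. a hypothetical `foo_tokens_details`) fall back to the raw prefix, so new provider fields still surface as metrics without a code change.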