@@ -125,6 +125,12 @@ async def test_openai_agents_integration_setup_creates_spans(memory_logger):
 
     llm_spans = [span for span in spans if span.get("span_attributes", {}).get("type") == "llm"]
     assert llm_spans
+    llm_metrics = [span.get("metrics", {}) for span in llm_spans]
+    assert any(metrics.get("prompt_tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("completion_tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("tokens") is not None for metrics in llm_metrics)
+    assert any(metrics.get("prompt_cached_tokens") == 0 for metrics in llm_metrics)
+    assert any(metrics.get("completion_reasoning_tokens") == 0 for metrics in llm_metrics)
 
 
 @pytest.mark.asyncio
py/src/braintrust/integrations/openai_agents/tracing.py (26 changes: 23 additions & 3 deletions)
@@ -69,6 +69,14 @@ def _maybe_timestamp_elapsed(end: str | None, start: str | None) -> float | None
     return (datetime.datetime.fromisoformat(end) - datetime.datetime.fromisoformat(start)).total_seconds()
 
 
+# Maps the prefix of an OpenAI usage `*_tokens_details` field to the Braintrust
+# metric prefix (e.g. `input_tokens_details.cached_tokens` → `prompt_cached_tokens`).
+_TOKEN_PREFIX_MAP = {
+    "input": "prompt",
+    "output": "completion",
+}
+
+
 def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]:
     """Convert an OpenAI-style usage dict to Braintrust metrics."""
     metrics: dict[str, Any] = {}
@@ -86,6 +94,19 @@ def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]:
         metrics["tokens"] = usage["total_tokens"]
     elif "input_tokens" in usage and "output_tokens" in usage:
         metrics["tokens"] = usage["input_tokens"] + usage["output_tokens"]
+
+    # Walk *_tokens_details sub-objects so we capture cached / reasoning / audio
+    # token counts (e.g. input_tokens_details.cached_tokens → prompt_cached_tokens).
+    for key, value in usage.items():
+        if not key.endswith("_tokens_details") or not isinstance(value, dict):
+            continue
+        raw_prefix = key[: -len("_tokens_details")]
+        prefix = _TOKEN_PREFIX_MAP.get(raw_prefix, raw_prefix)
+        for sub_key, sub_value in value.items():
+            if isinstance(sub_value, bool) or not isinstance(sub_value, (int, float)):
+                continue
+            metrics[f"{prefix}_{sub_key}"] = sub_value
+
     return metrics
 
 
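For reference, a quick sketch of what the new `*_tokens_details` walk produces for a Responses-API-style usage payload. The field values are invented, and it assumes the unchanged top of `_usage_to_metrics` (elided in the hunk above) maps `input_tokens`/`output_tokens` to `prompt_tokens`/`completion_tokens`, as `_TOKEN_PREFIX_MAP` and the new test assertions suggest:

```python
# Hypothetical payload for illustration; values are made up.
usage = {
    "input_tokens": 120,
    "output_tokens": 30,
    "total_tokens": 150,
    "input_tokens_details": {"cached_tokens": 0},
    "output_tokens_details": {"reasoning_tokens": 0},
}

# _usage_to_metrics(usage) would then include:
# {
#     "prompt_tokens": 120,
#     "completion_tokens": 30,
#     "tokens": 150,
#     "prompt_cached_tokens": 0,          # from input_tokens_details.cached_tokens
#     "completion_reasoning_tokens": 0,   # from output_tokens_details.reasoning_tokens
# }
```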
@@ -166,9 +187,8 @@ def _response_log_data(self, span: tracing.Span[tracing.ResponseSpanData]) -> di
         if ttft is not None:
             data["metrics"]["time_to_first_token"] = ttft
         if span.span_data.response is not None and span.span_data.response.usage is not None:
-            data["metrics"]["tokens"] = span.span_data.response.usage.total_tokens
-            data["metrics"]["prompt_tokens"] = span.span_data.response.usage.input_tokens
-            data["metrics"]["completion_tokens"] = span.span_data.response.usage.output_tokens
+            usage_dict = span.span_data.response.usage.model_dump()
+            data["metrics"].update(_usage_to_metrics(usage_dict))
 
         return data
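The switch to `model_dump()` assumes `response.usage` is a Pydantic model (as in the OpenAI SDK), so nested `*_tokens_details` objects serialize to plain dicts that `_usage_to_metrics` can walk. A minimal, self-contained sketch of that behavior, using stand-in model classes rather than the SDK's real ones:

```python
# Stand-in models for illustration; the real classes come from the OpenAI SDK.
from pydantic import BaseModel


class InputTokensDetails(BaseModel):
    cached_tokens: int = 0


class Usage(BaseModel):
    input_tokens: int
    output_tokens: int
    total_tokens: int
    input_tokens_details: InputTokensDetails = InputTokensDetails()


usage_dict = Usage(input_tokens=120, output_tokens=30, total_tokens=150).model_dump()
# {'input_tokens': 120, 'output_tokens': 30, 'total_tokens': 150,
#  'input_tokens_details': {'cached_tokens': 0}}
```

Routing the dict through `_usage_to_metrics` keeps `_response_log_data` agnostic to which detail fields the API adds in the future, instead of hard-coding each metric assignment as the removed lines did.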