38 changes: 38 additions & 0 deletions py/benchmarks/benches/bench_id_gen.py
@@ -0,0 +1,38 @@
"""Benchmarks for ID generation.

get_span_id and get_trace_id are called on every span creation, so their
cost accumulates in high-throughput tracing workloads. This module
compares the two generators: UUIDGenerator (default) and OTELIDGenerator
(enabled via BRAINTRUST_OTEL_COMPAT=true).
"""

import pathlib
import sys

import pyperf


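# When run as a plain script (no package context), put the repo's py/
# directory on sys.path so `braintrust` and `benchmarks` import without
# an install.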
if __package__ in (None, ""):
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))

from braintrust.id_gen import OTELIDGenerator, UUIDGenerator

from benchmarks._utils import disable_pyperf_psutil


def main(runner: pyperf.Runner | None = None) -> None:
if runner is None:
disable_pyperf_psutil()
runner = pyperf.Runner()

uuid_gen = UUIDGenerator()
otel_gen = OTELIDGenerator()

runner.bench_func("id_gen.uuid.span_id", uuid_gen.get_span_id)
runner.bench_func("id_gen.uuid.trace_id", uuid_gen.get_trace_id)
runner.bench_func("id_gen.otel.span_id", otel_gen.get_span_id)
runner.bench_func("id_gen.otel.trace_id", otel_gen.get_trace_id)


if __name__ == "__main__":
main()
104 changes: 104 additions & 0 deletions py/benchmarks/benches/bench_merge_dicts.py
@@ -0,0 +1,104 @@
"""Benchmarks for merge_dicts and merge_dicts_with_paths.

merge_dicts is called on every span log update and during row merging,
making it one of the most frequently executed SDK functions.

Note: merge_dicts mutates merge_into, so each benchmark wrapper creates a
fresh copy of the target dict before calling. This means each bench_func
measures a shallow/deep copy plus the merge itself — the copy cost is
intentionally kept proportional to the input size so relative comparisons
remain valid.
"""

import copy
import pathlib
import sys
from typing import Any

import pyperf


if __package__ in (None, ""):
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))

from braintrust.util import merge_dicts

from benchmarks._utils import disable_pyperf_psutil
from benchmarks.fixtures import make_large_payload, make_medium_payload, make_small_payload


# Updates are pre-built once; only merge_into is copied per iteration.
_SMALL_UPDATE: dict[str, Any] = {
"metadata": {"extra_key": "extra_value"},
"scores": {"relevance": 0.8},
"tags": ["new_tag"],
}

_MEDIUM_UPDATE: dict[str, Any] = {
"metadata": {"workspace_id": "workspace-789", "new_flag": True},
"metrics": {"cached_tokens": 64},
"tags": ["updated", "benchmark"],
}

_LARGE_UPDATE: dict[str, Any] = {
"metadata": {"routing": {"tier": "standard"}, "extra": "value"},
"metrics": {"cached_tokens": 512},
"tags": ["updated"],
"output": {"summary": "revised"},
}

# Pre-built base payloads. Only the outer dict is copied per iteration, so
# nested dicts may be shared with these module-level objects; the updates
# are idempotent, so steady-state timings are unaffected.
_SMALL_BASE = make_small_payload()
_MEDIUM_BASE = make_medium_payload()
_LARGE_BASE = make_large_payload()

_NESTED_BASE: dict[str, Any] = {
"a": {"b": {"c": {"d": 1, "e": 2}, "f": 3}, "g": 4},
"h": {"i": {"j": {"k": 5}}},
}
_NESTED_UPDATE: dict[str, Any] = {
"a": {"b": {"c": {"d": 99}, "new": "value"}, "g": 99},
"h": {"i": {"j": {"new_key": "hello"}}},
}
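# With the assumed deep-merge semantics this overwrites "d", inserts "new"
# and "new_key", and leaves siblings "e" and "f" untouched.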

# Tags set-union: top-level "tags" field uses set-union semantics in merge_dicts.
_TAGS_UPDATE: dict[str, Any] = {"tags": ["c", "d", "e"]}


def _bench_small() -> None:
merge_dicts(dict(_SMALL_BASE), _SMALL_UPDATE)


def _bench_medium() -> None:
    # Shallow copy of the outer dict is enough here: the update recurses into
    # nested dicts shared with _MEDIUM_BASE, but it is idempotent, so every
    # iteration measures the same merge.
merge_dicts(dict(_MEDIUM_BASE), _MEDIUM_UPDATE)


def _bench_large() -> None:
merge_dicts(dict(_LARGE_BASE), _LARGE_UPDATE)


def _bench_nested() -> None:
# Deep copy required because the update recurses into nested dicts.
merge_dicts(copy.deepcopy(_NESTED_BASE), _NESTED_UPDATE)


def _bench_tags_union() -> None:
    # Build the target dict fresh on each call so the tags union always
    # starts from the same two-element list.
merge_dicts({"tags": ["a", "b"], "value": 1}, _TAGS_UPDATE)


def main(runner: pyperf.Runner | None = None) -> None:
if runner is None:
disable_pyperf_psutil()
runner = pyperf.Runner()

runner.bench_func("merge_dicts[small]", _bench_small)
runner.bench_func("merge_dicts[medium]", _bench_medium)
runner.bench_func("merge_dicts[large]", _bench_large)
runner.bench_func("merge_dicts[nested-deep]", _bench_nested)
runner.bench_func("merge_dicts[tags-union]", _bench_tags_union)


if __name__ == "__main__":
main()
148 changes: 148 additions & 0 deletions py/benchmarks/benches/bench_merge_row_batch.py
@@ -0,0 +1,148 @@
"""Benchmarks for merge_row_batch and batch_items.

merge_row_batch is called before every flush to the Braintrust API to
de-duplicate and merge rows in a pending batch. batch_items is used to
split the resulting rows into API-request-sized chunks.

Both functions mutate their inputs, so each benchmark wrapper builds fresh
row lists per iteration.
"""

import pathlib
import sys

import pyperf


if __package__ in (None, ""):
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))

from braintrust.db_fields import IS_MERGE_FIELD
from braintrust.merge_row_batch import batch_items, merge_row_batch

from benchmarks._utils import disable_pyperf_psutil


# ---------------------------------------------------------------------------
# Row factories — called inside each benchmark wrapper to get fresh dicts.
# ---------------------------------------------------------------------------


def _unique_rows(n: int) -> list[dict]:
"""n rows, all distinct IDs — no merging needed."""
return [{"id": f"row-{i}", "project_id": "proj-1", "value": i} for i in range(n)]


def _merge_rows(n: int) -> list[dict]:
"""n rows forming n//2 pairs: first is a base, second is an IS_MERGE update."""
rows = []
for i in range(n // 2):
rows.append({"id": f"row-{i}", "project_id": "proj-1", "payload": {"a": i}})
rows.append(
{
"id": f"row-{i}",
"project_id": "proj-1",
"payload": {"b": i + 100},
IS_MERGE_FIELD: True,
}
)
return rows


def _mixed_rows(n: int) -> list[dict]:
"""Mix of unique rows and merge pairs (roughly half each)."""
rows = []
for i in range(n // 4):
# pair that will be merged
rows.append({"id": f"merge-{i}", "project_id": "proj-1", "payload": {"a": i}})
rows.append(
{
"id": f"merge-{i}",
"project_id": "proj-1",
"payload": {"b": i + 100},
IS_MERGE_FIELD: True,
}
)
for i in range(n // 2):
rows.append({"id": f"unique-{i}", "project_id": "proj-1", "value": i})
return rows


# ---------------------------------------------------------------------------
# Benchmark wrappers
# ---------------------------------------------------------------------------

_SMALL_N = 10
_MEDIUM_N = 50
_LARGE_N = 200


def _bench_no_conflict_small() -> None:
merge_row_batch(_unique_rows(_SMALL_N))


def _bench_no_conflict_medium() -> None:
merge_row_batch(_unique_rows(_MEDIUM_N))


def _bench_no_conflict_large() -> None:
merge_row_batch(_unique_rows(_LARGE_N))


def _bench_all_merge_small() -> None:
merge_row_batch(_merge_rows(_SMALL_N))


def _bench_all_merge_medium() -> None:
merge_row_batch(_merge_rows(_MEDIUM_N))


def _bench_mixed_medium() -> None:
merge_row_batch(_mixed_rows(_MEDIUM_N))


# batch_items: split a list of strings by item-count and byte-count limits.
_BATCH_STRINGS = [f"item-payload-{i:04d}" * 4 for i in range(200)]
_ITEM_SIZE = len(_BATCH_STRINGS[0].encode())
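# Each item is 68 bytes ("item-payload-0000" is 17 chars, repeated 4x), so
# assuming greedy packing, the byte limit used below (15 * item size) yields
# roughly 15-item batches while the count limit yields 20-item batches.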


def _bench_batch_items_count_limit() -> None:
batch_items(_BATCH_STRINGS, batch_max_num_items=20)


def _bench_batch_items_byte_limit() -> None:
batch_items(
_BATCH_STRINGS,
batch_max_num_bytes=_ITEM_SIZE * 15,
get_byte_size=lambda s: len(s.encode()),
    )

Contributor comment (on the get_byte_size lambda above):
Move the function definition outside the function body so function creation time isn't counted by pyperf?


def _bench_batch_items_both_limits() -> None:
batch_items(
_BATCH_STRINGS,
batch_max_num_items=20,
batch_max_num_bytes=_ITEM_SIZE * 15,
get_byte_size=lambda s: len(s.encode()),
)


def main(runner: pyperf.Runner | None = None) -> None:
if runner is None:
disable_pyperf_psutil()
runner = pyperf.Runner()

runner.bench_func("merge_row_batch[no-conflict-small]", _bench_no_conflict_small)
runner.bench_func("merge_row_batch[no-conflict-medium]", _bench_no_conflict_medium)
runner.bench_func("merge_row_batch[no-conflict-large]", _bench_no_conflict_large)
runner.bench_func("merge_row_batch[all-merge-small]", _bench_all_merge_small)
runner.bench_func("merge_row_batch[all-merge-medium]", _bench_all_merge_medium)
runner.bench_func("merge_row_batch[mixed-medium]", _bench_mixed_medium)

runner.bench_func("batch_items[count-limit]", _bench_batch_items_count_limit)
runner.bench_func("batch_items[byte-limit]", _bench_batch_items_byte_limit)
runner.bench_func("batch_items[both-limits]", _bench_batch_items_both_limits)


if __name__ == "__main__":
main()
73 changes: 73 additions & 0 deletions py/benchmarks/benches/bench_span_components.py
@@ -0,0 +1,73 @@
"""Benchmarks for SpanComponentsV3 and SpanComponentsV4 encode/decode.

These are on the hot path: every span serializes/deserializes parent context.
"""

import pathlib
import secrets
import sys
import uuid

import pyperf


if __package__ in (None, ""):
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))

from braintrust.span_identifier_v3 import SpanComponentsV3, SpanObjectTypeV3
from braintrust.span_identifier_v4 import SpanComponentsV4

from benchmarks._utils import disable_pyperf_psutil


def main(runner: pyperf.Runner | None = None) -> None:
if runner is None:
disable_pyperf_psutil()
runner = pyperf.Runner()

# V3 — UUID-based IDs
v3_obj_only = SpanComponentsV3(
object_type=SpanObjectTypeV3.PROJECT_LOGS,
object_id=str(uuid.uuid4()),
)
v3_full = SpanComponentsV3(
object_type=SpanObjectTypeV3.EXPERIMENT,
object_id=str(uuid.uuid4()),
row_id=str(uuid.uuid4()),
span_id=str(uuid.uuid4()),
root_span_id=str(uuid.uuid4()),
)
v3_obj_only_str = v3_obj_only.to_str()
v3_full_str = v3_full.to_str()

runner.bench_func("span_components.v3.to_str[object-only]", v3_obj_only.to_str)
runner.bench_func("span_components.v3.to_str[full-uuid]", v3_full.to_str)
runner.bench_func("span_components.v3.from_str[object-only]", SpanComponentsV3.from_str, v3_obj_only_str)
runner.bench_func("span_components.v3.from_str[full-uuid]", SpanComponentsV3.from_str, v3_full_str)

# V4 — OTEL hex IDs for span_id (8-byte) and root_span_id (16-byte)
v4_obj_only = SpanComponentsV4(
object_type=SpanObjectTypeV3.PROJECT_LOGS,
object_id=str(uuid.uuid4()),
)
v4_full_otel = SpanComponentsV4(
object_type=SpanObjectTypeV3.EXPERIMENT,
object_id=str(uuid.uuid4()),
row_id=str(uuid.uuid4()),
span_id=secrets.token_hex(8),
root_span_id=secrets.token_hex(16),
)
v4_obj_only_str = v4_obj_only.to_str()
v4_full_otel_str = v4_full_otel.to_str()

runner.bench_func("span_components.v4.to_str[object-only]", v4_obj_only.to_str)
runner.bench_func("span_components.v4.to_str[full-otel]", v4_full_otel.to_str)
runner.bench_func("span_components.v4.from_str[object-only]", SpanComponentsV4.from_str, v4_obj_only_str)
runner.bench_func("span_components.v4.from_str[full-otel]", SpanComponentsV4.from_str, v4_full_otel_str)

# Cross-version: V4 decoder reading a V3-encoded string (backwards-compat path)
runner.bench_func("span_components.v4.from_str[v3-encoded]", SpanComponentsV4.from_str, v3_full_str)


if __name__ == "__main__":
main()