diff --git a/py/benchmarks/benches/bench_id_gen.py b/py/benchmarks/benches/bench_id_gen.py
new file mode 100644
index 00000000..7b7c902a
--- /dev/null
+++ b/py/benchmarks/benches/bench_id_gen.py
@@ -0,0 +1,38 @@
+"""Benchmarks for ID generation.
+
+get_span_id and get_trace_id are called on every span creation, so their
+cost accumulates in high-throughput tracing workloads. This module
+compares the two generators: UUIDGenerator (default) and OTELIDGenerator
+(enabled via BRAINTRUST_OTEL_COMPAT=true).
+"""
+
+import pathlib
+import sys
+
+import pyperf
+
+
+if __package__ in (None, ""):
+    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
+
+from braintrust.id_gen import OTELIDGenerator, UUIDGenerator
+
+from benchmarks._utils import disable_pyperf_psutil
+
+
+def main(runner: pyperf.Runner | None = None) -> None:
+    if runner is None:
+        disable_pyperf_psutil()
+        runner = pyperf.Runner()
+
+    uuid_gen = UUIDGenerator()
+    otel_gen = OTELIDGenerator()
+
+    runner.bench_func("id_gen.uuid.span_id", uuid_gen.get_span_id)
+    runner.bench_func("id_gen.uuid.trace_id", uuid_gen.get_trace_id)
+    runner.bench_func("id_gen.otel.span_id", otel_gen.get_span_id)
+    runner.bench_func("id_gen.otel.trace_id", otel_gen.get_trace_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/py/benchmarks/benches/bench_merge_dicts.py b/py/benchmarks/benches/bench_merge_dicts.py
new file mode 100644
index 00000000..ab90a641
--- /dev/null
+++ b/py/benchmarks/benches/bench_merge_dicts.py
@@ -0,0 +1,104 @@
+"""Benchmarks for merge_dicts.
+
+merge_dicts is called on every span log update and during row merging,
+making it one of the most frequently executed SDK functions.
+
+Note: merge_dicts mutates merge_into, so each benchmark wrapper creates a
+fresh copy of the target dict before calling. This means each bench_func
+measures a shallow/deep copy plus the merge itself — the copy cost is
+intentionally kept proportional to the input size so relative comparisons
+remain valid.
+"""
+
+import copy
+import pathlib
+import sys
+from typing import Any
+
+import pyperf
+
+
+if __package__ in (None, ""):
+    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
+
+from braintrust.util import merge_dicts
+
+from benchmarks._utils import disable_pyperf_psutil
+from benchmarks.fixtures import make_large_payload, make_medium_payload, make_small_payload
+
+
+# Updates are pre-built once; only merge_into is copied per iteration.
+_SMALL_UPDATE: dict[str, Any] = {
+    "metadata": {"extra_key": "extra_value"},
+    "scores": {"relevance": 0.8},
+    "tags": ["new_tag"],
+}
+
+_MEDIUM_UPDATE: dict[str, Any] = {
+    "metadata": {"workspace_id": "workspace-789", "new_flag": True},
+    "metrics": {"cached_tokens": 64},
+    "tags": ["updated", "benchmark"],
+}
+
+_LARGE_UPDATE: dict[str, Any] = {
+    "metadata": {"routing": {"tier": "standard"}, "extra": "value"},
+    "metrics": {"cached_tokens": 512},
+    "tags": ["updated"],
+    "output": {"summary": "revised"},
+}
+
+# Pre-built base payloads; each wrapper copies them before merging.
+_SMALL_BASE = make_small_payload()
+_MEDIUM_BASE = make_medium_payload()
+_LARGE_BASE = make_large_payload()
+
+_NESTED_BASE: dict[str, Any] = {
+    "a": {"b": {"c": {"d": 1, "e": 2}, "f": 3}, "g": 4},
+    "h": {"i": {"j": {"k": 5}}},
+}
+_NESTED_UPDATE: dict[str, Any] = {
+    "a": {"b": {"c": {"d": 99}, "new": "value"}, "g": 99},
+    "h": {"i": {"j": {"new_key": "hello"}}},
+}
+
+# Tags set-union: the top-level "tags" field uses set-union semantics in merge_dicts.
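+# A rough expectation under those semantics: merging ["c", "d", "e"] into
+# ["a", "b"] should yield five unique tags, with element order unspecified.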
+_TAGS_UPDATE: dict[str, Any] = {"tags": ["c", "d", "e"]}
+
+
+def _bench_small() -> None:
+    merge_dicts(dict(_SMALL_BASE), _SMALL_UPDATE)
+
+
+def _bench_medium() -> None:
+    # Shallow copy only: the merge recurses into nested dicts shared with
+    # _MEDIUM_BASE, but the update writes identical values every iteration,
+    # so the base stabilizes after the first merge.
+    merge_dicts(dict(_MEDIUM_BASE), _MEDIUM_UPDATE)
+
+
+def _bench_large() -> None:
+    merge_dicts(dict(_LARGE_BASE), _LARGE_UPDATE)
+
+
+def _bench_nested() -> None:
+    # Deep copy required because the update recurses into nested dicts.
+    merge_dicts(copy.deepcopy(_NESTED_BASE), _NESTED_UPDATE)
+
+
+def _bench_tags_union() -> None:
+    # merge_dicts would mutate a reused target, so build a fresh one each call.
+    merge_dicts({"tags": ["a", "b"], "value": 1}, _TAGS_UPDATE)
+
+
+def main(runner: pyperf.Runner | None = None) -> None:
+    if runner is None:
+        disable_pyperf_psutil()
+        runner = pyperf.Runner()
+
+    runner.bench_func("merge_dicts[small]", _bench_small)
+    runner.bench_func("merge_dicts[medium]", _bench_medium)
+    runner.bench_func("merge_dicts[large]", _bench_large)
+    runner.bench_func("merge_dicts[nested-deep]", _bench_nested)
+    runner.bench_func("merge_dicts[tags-union]", _bench_tags_union)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/py/benchmarks/benches/bench_merge_row_batch.py b/py/benchmarks/benches/bench_merge_row_batch.py
new file mode 100644
index 00000000..5a14a010
--- /dev/null
+++ b/py/benchmarks/benches/bench_merge_row_batch.py
@@ -0,0 +1,148 @@
+"""Benchmarks for merge_row_batch and batch_items.
+
+merge_row_batch is called before every flush to the Braintrust API to
+de-duplicate and merge rows in a pending batch. batch_items is used to
+split the resulting rows into API-request-sized chunks.
+
+Both functions mutate their inputs, so each benchmark wrapper builds fresh
+row lists per iteration.
+"""
+
+import pathlib
+import sys
+
+import pyperf
+
+
+if __package__ in (None, ""):
+    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
+
+from braintrust.db_fields import IS_MERGE_FIELD
+from braintrust.merge_row_batch import batch_items, merge_row_batch
+
+from benchmarks._utils import disable_pyperf_psutil
+
+
+# ---------------------------------------------------------------------------
+# Row factories — called inside each benchmark wrapper to get fresh dicts.
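+# Row construction runs inside the measured wrapper, so each timing includes
+# an O(n) build cost; it scales with n, keeping relative comparisons valid.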
+# --------------------------------------------------------------------------- + + +def _unique_rows(n: int) -> list[dict]: + """n rows, all distinct IDs — no merging needed.""" + return [{"id": f"row-{i}", "project_id": "proj-1", "value": i} for i in range(n)] + + +def _merge_rows(n: int) -> list[dict]: + """n rows forming n//2 pairs: first is a base, second is an IS_MERGE update.""" + rows = [] + for i in range(n // 2): + rows.append({"id": f"row-{i}", "project_id": "proj-1", "payload": {"a": i}}) + rows.append( + { + "id": f"row-{i}", + "project_id": "proj-1", + "payload": {"b": i + 100}, + IS_MERGE_FIELD: True, + } + ) + return rows + + +def _mixed_rows(n: int) -> list[dict]: + """Mix of unique rows and merge pairs (roughly half each).""" + rows = [] + for i in range(n // 4): + # pair that will be merged + rows.append({"id": f"merge-{i}", "project_id": "proj-1", "payload": {"a": i}}) + rows.append( + { + "id": f"merge-{i}", + "project_id": "proj-1", + "payload": {"b": i + 100}, + IS_MERGE_FIELD: True, + } + ) + for i in range(n // 2): + rows.append({"id": f"unique-{i}", "project_id": "proj-1", "value": i}) + return rows + + +# --------------------------------------------------------------------------- +# Benchmark wrappers +# --------------------------------------------------------------------------- + +_SMALL_N = 10 +_MEDIUM_N = 50 +_LARGE_N = 200 + + +def _bench_no_conflict_small() -> None: + merge_row_batch(_unique_rows(_SMALL_N)) + + +def _bench_no_conflict_medium() -> None: + merge_row_batch(_unique_rows(_MEDIUM_N)) + + +def _bench_no_conflict_large() -> None: + merge_row_batch(_unique_rows(_LARGE_N)) + + +def _bench_all_merge_small() -> None: + merge_row_batch(_merge_rows(_SMALL_N)) + + +def _bench_all_merge_medium() -> None: + merge_row_batch(_merge_rows(_MEDIUM_N)) + + +def _bench_mixed_medium() -> None: + merge_row_batch(_mixed_rows(_MEDIUM_N)) + + +# batch_items: split a list of strings by item-count and byte-count limits. 
+_BATCH_STRINGS = [f"item-payload-{i:04d}" * 4 for i in range(200)]
+_ITEM_SIZE = len(_BATCH_STRINGS[0].encode())
+
+
+def _bench_batch_items_count_limit() -> None:
+    batch_items(_BATCH_STRINGS, batch_max_num_items=20)
+
+
+def _bench_batch_items_byte_limit() -> None:
+    batch_items(
+        _BATCH_STRINGS,
+        batch_max_num_bytes=_ITEM_SIZE * 15,
+        get_byte_size=lambda s: len(s.encode()),
+    )
+
+
+def _bench_batch_items_both_limits() -> None:
+    batch_items(
+        _BATCH_STRINGS,
+        batch_max_num_items=20,
+        batch_max_num_bytes=_ITEM_SIZE * 15,
+        get_byte_size=lambda s: len(s.encode()),
+    )
+
+
+def main(runner: pyperf.Runner | None = None) -> None:
+    if runner is None:
+        disable_pyperf_psutil()
+        runner = pyperf.Runner()
+
+    runner.bench_func("merge_row_batch[no-conflict-small]", _bench_no_conflict_small)
+    runner.bench_func("merge_row_batch[no-conflict-medium]", _bench_no_conflict_medium)
+    runner.bench_func("merge_row_batch[no-conflict-large]", _bench_no_conflict_large)
+    runner.bench_func("merge_row_batch[all-merge-small]", _bench_all_merge_small)
+    runner.bench_func("merge_row_batch[all-merge-medium]", _bench_all_merge_medium)
+    runner.bench_func("merge_row_batch[mixed-medium]", _bench_mixed_medium)
+
+    runner.bench_func("batch_items[count-limit]", _bench_batch_items_count_limit)
+    runner.bench_func("batch_items[byte-limit]", _bench_batch_items_byte_limit)
+    runner.bench_func("batch_items[both-limits]", _bench_batch_items_both_limits)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/py/benchmarks/benches/bench_span_components.py b/py/benchmarks/benches/bench_span_components.py
new file mode 100644
index 00000000..4d03a34e
--- /dev/null
+++ b/py/benchmarks/benches/bench_span_components.py
@@ -0,0 +1,73 @@
+"""Benchmarks for SpanComponentsV3 and SpanComponentsV4 encode/decode.
+
+These are on the hot path: every span serializes/deserializes parent context.
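+
+V4 keeps the V3 object fields but carries OTEL-style hex span IDs, and its
+decoder also accepts V3-encoded strings (see the cross-version benchmark).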
+""" + +import pathlib +import secrets +import sys +import uuid + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +from braintrust.span_identifier_v3 import SpanComponentsV3, SpanObjectTypeV3 +from braintrust.span_identifier_v4 import SpanComponentsV4 + +from benchmarks._utils import disable_pyperf_psutil + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + # V3 — UUID-based IDs + v3_obj_only = SpanComponentsV3( + object_type=SpanObjectTypeV3.PROJECT_LOGS, + object_id=str(uuid.uuid4()), + ) + v3_full = SpanComponentsV3( + object_type=SpanObjectTypeV3.EXPERIMENT, + object_id=str(uuid.uuid4()), + row_id=str(uuid.uuid4()), + span_id=str(uuid.uuid4()), + root_span_id=str(uuid.uuid4()), + ) + v3_obj_only_str = v3_obj_only.to_str() + v3_full_str = v3_full.to_str() + + runner.bench_func("span_components.v3.to_str[object-only]", v3_obj_only.to_str) + runner.bench_func("span_components.v3.to_str[full-uuid]", v3_full.to_str) + runner.bench_func("span_components.v3.from_str[object-only]", SpanComponentsV3.from_str, v3_obj_only_str) + runner.bench_func("span_components.v3.from_str[full-uuid]", SpanComponentsV3.from_str, v3_full_str) + + # V4 — OTEL hex IDs for span_id (8-byte) and root_span_id (16-byte) + v4_obj_only = SpanComponentsV4( + object_type=SpanObjectTypeV3.PROJECT_LOGS, + object_id=str(uuid.uuid4()), + ) + v4_full_otel = SpanComponentsV4( + object_type=SpanObjectTypeV3.EXPERIMENT, + object_id=str(uuid.uuid4()), + row_id=str(uuid.uuid4()), + span_id=secrets.token_hex(8), + root_span_id=secrets.token_hex(16), + ) + v4_obj_only_str = v4_obj_only.to_str() + v4_full_otel_str = v4_full_otel.to_str() + + runner.bench_func("span_components.v4.to_str[object-only]", v4_obj_only.to_str) + runner.bench_func("span_components.v4.to_str[full-otel]", v4_full_otel.to_str) + runner.bench_func("span_components.v4.from_str[object-only]", SpanComponentsV4.from_str, v4_obj_only_str) + runner.bench_func("span_components.v4.from_str[full-otel]", SpanComponentsV4.from_str, v4_full_otel_str) + + # Cross-version: V4 decoder reading a V3-encoded string (backwards-compat path) + runner.bench_func("span_components.v4.from_str[v3-encoded]", SpanComponentsV4.from_str, v3_full_str) + + +if __name__ == "__main__": + main() diff --git a/py/benchmarks/benches/bench_span_lifecycle.py b/py/benchmarks/benches/bench_span_lifecycle.py new file mode 100644 index 00000000..1a9748f7 --- /dev/null +++ b/py/benchmarks/benches/bench_span_lifecycle.py @@ -0,0 +1,144 @@ +"""Benchmarks for the span creation / log / end lifecycle. + +These measure pure SDK overhead per span — no network, no I/O. A discard +logger replaces the HTTP background logger so log() calls enqueue a +LazyValue that is immediately thrown away, isolating the in-process cost. + +Scenarios +--------- +* noop_span — NoopSpan.log() + end(): absolute zero-overhead floor. +* root_create_end — SpanImpl init + end, root span, no payload. +* root_log_small — SpanImpl.log() with a small flat payload dict. +* root_log_medium — SpanImpl.log() with a medium nested payload. +* child_create_end — start_span() + child.end() from an existing root span. +* export — span.export() (SpanComponentsV3 serialisation). 
+""" + +import pathlib +import sys +import uuid + +import pyperf + + +if __package__ in (None, ""): + sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2])) + +# Import internals used by the test suite for mocking. +from braintrust.logger import ( + NOOP_SPAN, + BraintrustState, + SpanImpl, + _MemoryBackgroundLogger, +) +from braintrust.span_identifier_v3 import SpanObjectTypeV3 +from braintrust.util import LazyValue + +from benchmarks._utils import disable_pyperf_psutil +from benchmarks.fixtures import make_medium_payload, make_small_payload + + +# --------------------------------------------------------------------------- +# One-time setup: a BraintrustState wired to a discard logger so span logs +# never touch the network or accumulate unboundedly in memory. +# --------------------------------------------------------------------------- + + +class _DiscardLogger(_MemoryBackgroundLogger): + """Drop all log items immediately; never accumulate memory.""" + + def log(self, *args: LazyValue) -> None: # type: ignore[override] + pass # intentionally discard — we measure enqueue cost, not storage + + +_state = BraintrustState() +_state._override_bg_logger.logger = _DiscardLogger() + +# Pre-resolved LazyValue for parent_object_id — avoids lazy resolution cost +# inside the hot loop (same UUID every iteration; fine for benchmarking). +_OBJECT_ID = str(uuid.uuid4()) +_PARENT_OBJECT_ID: LazyValue[str] = LazyValue(lambda: _OBJECT_ID, use_mutex=False) +_PARENT_OBJECT_ID.get() # resolve once so subsequent .get() calls are O(1) + +_PARENT_OBJECT_TYPE = SpanObjectTypeV3.EXPERIMENT + +# Payloads reused across log() benchmarks. +_SMALL_PAYLOAD = make_small_payload() +_MEDIUM_PAYLOAD = make_medium_payload() + + +# --------------------------------------------------------------------------- +# Benchmark helpers +# --------------------------------------------------------------------------- + + +def _make_root_span() -> SpanImpl: + return SpanImpl( + parent_object_type=_PARENT_OBJECT_TYPE, + parent_object_id=_PARENT_OBJECT_ID, + parent_compute_object_metadata_args=None, + parent_span_ids=None, + name="bench-root", + state=_state, + ) + + +def _make_child_span(root: SpanImpl) -> SpanImpl: + return root.start_span(name="bench-child") + + +# --------------------------------------------------------------------------- +# Benchmarks +# --------------------------------------------------------------------------- + + +def _bench_noop_span() -> None: + NOOP_SPAN.log(input="x", output="y") + NOOP_SPAN.end() + + +def _bench_root_create_end() -> None: + span = _make_root_span() + span.end() + + +def _bench_root_log_small() -> None: + span = _make_root_span() + span.log(**_SMALL_PAYLOAD) + span.end() + + +def _bench_root_log_medium() -> None: + span = _make_root_span() + span.log(**_MEDIUM_PAYLOAD) + span.end() + + +def _bench_child_create_end() -> None: + root = _make_root_span() + child = _make_child_span(root) + child.end() + root.end() + + +def _bench_export() -> None: + span = _make_root_span() + span.export() + span.end() + + +def main(runner: pyperf.Runner | None = None) -> None: + if runner is None: + disable_pyperf_psutil() + runner = pyperf.Runner() + + runner.bench_func("span_lifecycle.noop[log+end]", _bench_noop_span) + runner.bench_func("span_lifecycle.root[create+end]", _bench_root_create_end) + runner.bench_func("span_lifecycle.root[log-small+end]", _bench_root_log_small) + runner.bench_func("span_lifecycle.root[log-medium+end]", _bench_root_log_medium) + 
runner.bench_func("span_lifecycle.child[create+end]", _bench_child_create_end) + runner.bench_func("span_lifecycle.root[export]", _bench_export) + + +if __name__ == "__main__": + main()