From 4f7cee02dd39eb52ea5b743246ac8d1984db64d4 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 15 Apr 2026 12:09:00 +0200 Subject: [PATCH 1/2] feat: add per-conversation duration to Html reports --- .../callbacks/usage_tracking_callback.py | 24 +++++++++++- src/askui/reporting.py | 38 +++++++++++++++++-- .../model_providers/test_model_pricing.py | 5 +++ 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/askui/callbacks/usage_tracking_callback.py b/src/askui/callbacks/usage_tracking_callback.py index d4c536f7..d1dec3f3 100644 --- a/src/askui/callbacks/usage_tracking_callback.py +++ b/src/askui/callbacks/usage_tracking_callback.py @@ -2,6 +2,7 @@ from __future__ import annotations +from datetime import datetime, timezone from typing import TYPE_CHECKING from opentelemetry import trace @@ -172,11 +173,22 @@ class StepUsageSummary(UsageSummary): class ConversationUsageSummary(UsageSummary): - """Usage summary for one conversation including per-step breakdown.""" + """Usage summary for one conversation including per-step breakdown. + + Args: + conversation_index (int): 1-based index of the conversation within the + current agent lifecycle. + conversation_id (str): Unique identifier of the conversation. + step_summaries (list[StepUsageSummary]): Per-step usage summaries. + duration_seconds (float | None): Wall-clock duration of the conversation + in seconds, measured between `on_conversation_start` and + `on_conversation_end`. `None` if duration was not tracked. + """ conversation_index: int conversation_id: str step_summaries: list[StepUsageSummary] = Field(default_factory=list) + duration_seconds: float | None = None class UsageTrackingCallback(ConversationCallback): @@ -199,12 +211,14 @@ def __init__( self._per_conversation_summaries: list[ConversationUsageSummary] = [] self._per_step_summaries: list[StepUsageSummary] = [] self._conversation_index: int = 0 + self._conversation_start_time: datetime | None = None @override def on_conversation_start(self, conversation: Conversation) -> None: self._per_conversation_usage = UsageSummary.create_from(self._summary) self._per_step_summaries = [] self._conversation_index += 1 + self._conversation_start_time = datetime.now(tz=timezone.utc) @override def on_step_end( @@ -237,9 +251,15 @@ def on_conversation_end(self, conversation: Conversation) -> None: generated_steps: list[StepUsageSummary] = [ step_summary.generate() for step_summary in self._per_step_summaries ] + duration_seconds: float | None = None + if self._conversation_start_time is not None: + duration_seconds = ( + datetime.now(tz=timezone.utc) - self._conversation_start_time + ).total_seconds() conversation_summary = self._create_conversation_summary( conversation=conversation, generated_step_summaries=generated_steps, + duration_seconds=duration_seconds, ) self._per_conversation_summaries.append(conversation_summary) self._summary.per_conversation_summaries = list( @@ -275,11 +295,13 @@ def _create_conversation_summary( self, conversation: Conversation, generated_step_summaries: list[StepUsageSummary], + duration_seconds: float | None = None, ) -> ConversationUsageSummary: conversation_summary = ConversationUsageSummary( conversation_index=self._conversation_index, conversation_id=conversation.conversation_id, step_summaries=generated_step_summaries, + duration_seconds=duration_seconds, input_tokens=self._per_conversation_usage.input_tokens, output_tokens=self._per_conversation_usage.output_tokens, cache_creation_input_tokens=( diff --git a/src/askui/reporting.py b/src/askui/reporting.py index 50e640f6..3e35f01b 100644 --- a/src/askui/reporting.py +++ b/src/askui/reporting.py @@ -37,6 +37,27 @@ def normalize_to_pil_images( return [image] +def _format_duration(seconds: float) -> str: + """Format a duration given in seconds as ``HH:MM:SS`` or + ``HH:MM:SS.mmm`` for sub-second precision. + + Used by `SimpleHtmlReporter` to render both the overall execution time and + per-conversation durations consistently. + """ + total_seconds = max(float(seconds), 0.0) + whole_seconds = int(total_seconds) + millis = int(round((total_seconds - whole_seconds) * 1000)) + if millis == 1000: + whole_seconds += 1 + millis = 0 + hours, remainder = divmod(whole_seconds, 3600) + minutes, secs = divmod(remainder, 60) + base = f"{hours:02d}:{minutes:02d}:{secs:02d}" + if whole_seconds == 0 and millis > 0: + return f"{base}.{millis:03d}" + return base + + def truncate_base64_images(content: Any) -> Any: """Replace base64 image data with a placeholder to keep reports readable. @@ -1010,6 +1031,9 @@ def generate(self) -> None: {{ conversation_usage.step_summaries | length }} step(s), + {% if conversation_usage.duration_seconds is not none %} + Duration: {{ format_duration(conversation_usage.duration_seconds) }}, + {% endif %} Input {{ "{:,}".format(conversation_usage.input_tokens or 0) }}, Output {{ "{:,}".format(conversation_usage.output_tokens or 0) }}, Cache Create {{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }}, @@ -1026,6 +1050,9 @@ def generate(self) -> None: + {% if conversation_usage.duration_seconds is not none %} + + {% endif %} @@ -1036,6 +1063,9 @@ def generate(self) -> None: + {% if conversation_usage.duration_seconds is not none %} + + {% endif %} @@ -1141,10 +1171,9 @@ def generate(self) -> None: end_time = datetime.now(tz=timezone.utc) execution_time_formatted: str | None = None if self._start_time is not None: - total_secs = int((end_time - self._start_time).total_seconds()) - hours, remainder = divmod(total_secs, 3600) - minutes, secs = divmod(remainder, 60) - execution_time_formatted = f"{hours:02d}:{minutes:02d}:{secs:02d}" + execution_time_formatted = _format_duration( + (end_time - self._start_time).total_seconds() + ) html = template.render( timestamp=end_time, @@ -1153,6 +1182,7 @@ def generate(self) -> None: usage_summary=self.usage_summary, cache_original_usage=self.cache_original_usage, execution_time_formatted=execution_time_formatted, + format_duration=_format_duration, ) report_path = ( diff --git a/tests/unit/model_providers/test_model_pricing.py b/tests/unit/model_providers/test_model_pricing.py index 44a1e53c..bdf7a289 100644 --- a/tests/unit/model_providers/test_model_pricing.py +++ b/tests/unit/model_providers/test_model_pricing.py @@ -245,6 +245,8 @@ def test_tracks_per_step_per_conversation_and_total_usage(self) -> None: assert per_conversation_summary.output_tokens == 30 _assert_close(per_conversation_summary.total_cost, 0.0009) assert len(per_conversation_summary.step_summaries) == 2 + assert per_conversation_summary.duration_seconds is not None + assert per_conversation_summary.duration_seconds >= 0.0 first_step = per_conversation_summary.step_summaries[0] assert first_step.step_index == 0 @@ -301,6 +303,9 @@ def test_accumulates_multiple_conversations(self) -> None: assert len(summary.per_conversation_summaries) == 2 assert summary.per_conversation_summaries[0].conversation_id == "conversation-1" assert summary.per_conversation_summaries[1].conversation_id == "conversation-2" + for per_conversation_summary in summary.per_conversation_summaries: + assert per_conversation_summary.duration_seconds is not None + assert per_conversation_summary.duration_seconds >= 0.0 def test_includes_cache_costs_from_provider_pricing(self) -> None: pricing = ModelPricing( From e12eb14b25093833b46467064ccc8fcbe61ae994 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 15 Apr 2026 13:43:05 +0200 Subject: [PATCH 2/2] fix: address review comments --- src/askui/agent_base.py | 4 +- src/askui/callbacks/__init__.py | 4 +- ...py => conversation_statistics_callback.py} | 47 ++++++++++++------- src/askui/reporting.py | 37 ++++++++++++--- .../model_providers/test_model_pricing.py | 22 +++++---- 5 files changed, 77 insertions(+), 37 deletions(-) rename src/askui/callbacks/{usage_tracking_callback.py => conversation_statistics_callback.py} (89%) diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py index 8bdacb0c..e3554f28 100644 --- a/src/askui/agent_base.py +++ b/src/askui/agent_base.py @@ -10,7 +10,7 @@ from typing_extensions import Self from askui.agent_settings import AgentSettings -from askui.callbacks import ConversationCallback, UsageTrackingCallback +from askui.callbacks import ConversationCallback, ConversationStatisticsCallback from askui.container import telemetry from askui.locators.locators import Locator from askui.models.shared.agent_message_param import MessageParam @@ -78,7 +78,7 @@ def __init__( speakers = Speakers() _callbacks = list(callbacks or []) _callbacks.append( - UsageTrackingCallback( + ConversationStatisticsCallback( reporter=self._reporter, pricing=self._vlm_provider.pricing, ) diff --git a/src/askui/callbacks/__init__.py b/src/askui/callbacks/__init__.py index 29eb7029..c39dd663 100644 --- a/src/askui/callbacks/__init__.py +++ b/src/askui/callbacks/__init__.py @@ -1,7 +1,7 @@ from .conversation_callback import ConversationCallback -from .usage_tracking_callback import UsageTrackingCallback +from .conversation_statistics_callback import ConversationStatisticsCallback __all__ = [ "ConversationCallback", - "UsageTrackingCallback", + "ConversationStatisticsCallback", ] diff --git a/src/askui/callbacks/usage_tracking_callback.py b/src/askui/callbacks/conversation_statistics_callback.py similarity index 89% rename from src/askui/callbacks/usage_tracking_callback.py rename to src/askui/callbacks/conversation_statistics_callback.py index d1dec3f3..38637306 100644 --- a/src/askui/callbacks/usage_tracking_callback.py +++ b/src/askui/callbacks/conversation_statistics_callback.py @@ -1,4 +1,9 @@ -"""Callback for tracking token usage and reporting usage summaries.""" +"""Callback for tracking per-conversation statistics (token usage, timing). + +Emits a `UsageSummary` (with per-conversation and per-step breakdowns, +including start/end timestamps for each conversation) to a reporter when the +conversation ends. +""" from __future__ import annotations @@ -180,19 +185,28 @@ class ConversationUsageSummary(UsageSummary): current agent lifecycle. conversation_id (str): Unique identifier of the conversation. step_summaries (list[StepUsageSummary]): Per-step usage summaries. - duration_seconds (float | None): Wall-clock duration of the conversation - in seconds, measured between `on_conversation_start` and - `on_conversation_end`. `None` if duration was not tracked. + started_at (datetime | None): UTC timestamp captured at + `on_conversation_start`. `None` if timing was not tracked. + ended_at (datetime | None): UTC timestamp captured at + `on_conversation_end`. `None` if timing was not tracked. """ conversation_index: int conversation_id: str step_summaries: list[StepUsageSummary] = Field(default_factory=list) - duration_seconds: float | None = None + started_at: datetime | None = None + ended_at: datetime | None = None + +class ConversationStatisticsCallback(ConversationCallback): + """Tracks per-conversation statistics (token usage per step and wall-clock + timing) and reports a summary at conversation end. -class UsageTrackingCallback(ConversationCallback): - """Tracks token usage per step and reports a summary at conversation end. + The reported `UsageSummary` contains, for each conversation, the raw + ``started_at`` and ``ended_at`` UTC timestamps alongside token usage. + Downstream consumers (e.g. `SimpleHtmlReporter`) are responsible for + deriving human-readable durations from those timestamps so the raw values + remain available for other uses. Args: reporter: Reporter to write the final usage summary to. @@ -211,14 +225,14 @@ def __init__( self._per_conversation_summaries: list[ConversationUsageSummary] = [] self._per_step_summaries: list[StepUsageSummary] = [] self._conversation_index: int = 0 - self._conversation_start_time: datetime | None = None + self._conversation_started_at: datetime | None = None @override def on_conversation_start(self, conversation: Conversation) -> None: self._per_conversation_usage = UsageSummary.create_from(self._summary) self._per_step_summaries = [] self._conversation_index += 1 - self._conversation_start_time = datetime.now(tz=timezone.utc) + self._conversation_started_at = datetime.now(tz=timezone.utc) @override def on_step_end( @@ -251,15 +265,12 @@ def on_conversation_end(self, conversation: Conversation) -> None: generated_steps: list[StepUsageSummary] = [ step_summary.generate() for step_summary in self._per_step_summaries ] - duration_seconds: float | None = None - if self._conversation_start_time is not None: - duration_seconds = ( - datetime.now(tz=timezone.utc) - self._conversation_start_time - ).total_seconds() + ended_at = datetime.now(tz=timezone.utc) conversation_summary = self._create_conversation_summary( conversation=conversation, generated_step_summaries=generated_steps, - duration_seconds=duration_seconds, + started_at=self._conversation_started_at, + ended_at=ended_at, ) self._per_conversation_summaries.append(conversation_summary) self._summary.per_conversation_summaries = list( @@ -295,13 +306,15 @@ def _create_conversation_summary( self, conversation: Conversation, generated_step_summaries: list[StepUsageSummary], - duration_seconds: float | None = None, + started_at: datetime | None = None, + ended_at: datetime | None = None, ) -> ConversationUsageSummary: conversation_summary = ConversationUsageSummary( conversation_index=self._conversation_index, conversation_id=conversation.conversation_id, step_summaries=generated_step_summaries, - duration_seconds=duration_seconds, + started_at=started_at, + ended_at=ended_at, input_tokens=self._per_conversation_usage.input_tokens, output_tokens=self._per_conversation_usage.output_tokens, cache_creation_input_tokens=( diff --git a/src/askui/reporting.py b/src/askui/reporting.py index 3e35f01b..cc8be97a 100644 --- a/src/askui/reporting.py +++ b/src/askui/reporting.py @@ -21,7 +21,10 @@ if TYPE_CHECKING: from PIL import Image - from askui.callbacks.usage_tracking_callback import UsageSummary + from askui.callbacks.conversation_statistics_callback import ( + ConversationUsageSummary, + UsageSummary, + ) def normalize_to_pil_images( @@ -1024,6 +1027,7 @@ def generate(self) -> None:

{% for conversation_usage in usage_summary.per_conversation_summaries %} + {% set conversation_duration = format_conversation_duration(conversation_usage) %}
@@ -1031,8 +1035,8 @@ def generate(self) -> None: {{ conversation_usage.step_summaries | length }} step(s), - {% if conversation_usage.duration_seconds is not none %} - Duration: {{ format_duration(conversation_usage.duration_seconds) }}, + {% if conversation_duration is not none %} + Duration: {{ conversation_duration }}, {% endif %} Input {{ "{:,}".format(conversation_usage.input_tokens or 0) }}, Output {{ "{:,}".format(conversation_usage.output_tokens or 0) }}, @@ -1050,7 +1054,7 @@ def generate(self) -> None:
Conversation IDDurationInput Tokens Output Tokens Cache Create
{{ conversation_usage.conversation_id }}{{ format_duration(conversation_usage.duration_seconds) }}{{ "{:,}".format(conversation_usage.input_tokens or 0) }} {{ "{:,}".format(conversation_usage.output_tokens or 0) }} {{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }}
- {% if conversation_usage.duration_seconds is not none %} + {% if conversation_duration is not none %} {% endif %} @@ -1063,8 +1067,8 @@ def generate(self) -> None: - {% if conversation_usage.duration_seconds is not none %} - + {% if conversation_duration is not none %} + {% endif %} @@ -1175,6 +1179,25 @@ def generate(self) -> None: (end_time - self._start_time).total_seconds() ) + def _format_conversation_duration( + conversation_usage: "ConversationUsageSummary", + ) -> str | None: + """Derive the formatted conversation duration from stored timestamps. + + Returns ``None`` if either ``started_at`` or ``ended_at`` is missing + so the template can skip rendering. + """ + if ( + conversation_usage.started_at is None + or conversation_usage.ended_at is None + ): + return None + return _format_duration( + ( + conversation_usage.ended_at - conversation_usage.started_at + ).total_seconds() + ) + html = template.render( timestamp=end_time, messages=self.messages, @@ -1182,7 +1205,7 @@ def generate(self) -> None: usage_summary=self.usage_summary, cache_original_usage=self.cache_original_usage, execution_time_formatted=execution_time_formatted, - format_duration=_format_duration, + format_conversation_duration=_format_conversation_duration, ) report_path = ( diff --git a/tests/unit/model_providers/test_model_pricing.py b/tests/unit/model_providers/test_model_pricing.py index bdf7a289..f0c3ed85 100644 --- a/tests/unit/model_providers/test_model_pricing.py +++ b/tests/unit/model_providers/test_model_pricing.py @@ -4,9 +4,9 @@ import pytest -from askui.callbacks.usage_tracking_callback import ( +from askui.callbacks.conversation_statistics_callback import ( + ConversationStatisticsCallback, UsageSummary, - UsageTrackingCallback, ) from askui.models.shared.agent_message_param import UsageParam from askui.speaker.speaker import SpeakerResult @@ -98,12 +98,12 @@ def _assert_close( assert abs(actual - expected) <= tolerance -class TestUsageTrackingCallbackCost: +class TestConversationStatisticsCallbackCost: def _make_callback( self, pricing: ModelPricing | None = None - ) -> tuple[UsageTrackingCallback, MagicMock]: + ) -> tuple[ConversationStatisticsCallback, MagicMock]: reporter = MagicMock() - callback = UsageTrackingCallback(reporter=reporter, pricing=pricing) + callback = ConversationStatisticsCallback(reporter=reporter, pricing=pricing) return callback, reporter @pytest.mark.parametrize( @@ -245,8 +245,9 @@ def test_tracks_per_step_per_conversation_and_total_usage(self) -> None: assert per_conversation_summary.output_tokens == 30 _assert_close(per_conversation_summary.total_cost, 0.0009) assert len(per_conversation_summary.step_summaries) == 2 - assert per_conversation_summary.duration_seconds is not None - assert per_conversation_summary.duration_seconds >= 0.0 + assert per_conversation_summary.started_at is not None + assert per_conversation_summary.ended_at is not None + assert per_conversation_summary.ended_at >= per_conversation_summary.started_at first_step = per_conversation_summary.step_summaries[0] assert first_step.step_index == 0 @@ -304,8 +305,11 @@ def test_accumulates_multiple_conversations(self) -> None: assert summary.per_conversation_summaries[0].conversation_id == "conversation-1" assert summary.per_conversation_summaries[1].conversation_id == "conversation-2" for per_conversation_summary in summary.per_conversation_summaries: - assert per_conversation_summary.duration_seconds is not None - assert per_conversation_summary.duration_seconds >= 0.0 + assert per_conversation_summary.started_at is not None + assert per_conversation_summary.ended_at is not None + assert ( + per_conversation_summary.ended_at >= per_conversation_summary.started_at + ) def test_includes_cache_costs_from_provider_pricing(self) -> None: pricing = ModelPricing(
Conversation IDDurationInput Tokens
{{ conversation_usage.conversation_id }}{{ format_duration(conversation_usage.duration_seconds) }}{{ conversation_duration }}{{ "{:,}".format(conversation_usage.input_tokens or 0) }} {{ "{:,}".format(conversation_usage.output_tokens or 0) }}