Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/askui/agent_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing_extensions import Self

from askui.agent_settings import AgentSettings
from askui.callbacks import ConversationCallback, UsageTrackingCallback
from askui.callbacks import ConversationCallback, ConversationStatisticsCallback
from askui.container import telemetry
from askui.locators.locators import Locator
from askui.models.shared.agent_message_param import MessageParam
Expand Down Expand Up @@ -78,7 +78,7 @@ def __init__(
speakers = Speakers()
_callbacks = list(callbacks or [])
_callbacks.append(
UsageTrackingCallback(
ConversationStatisticsCallback(
reporter=self._reporter,
pricing=self._vlm_provider.pricing,
)
Expand Down
4 changes: 2 additions & 2 deletions src/askui/callbacks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .conversation_callback import ConversationCallback
from .usage_tracking_callback import UsageTrackingCallback
from .conversation_statistics_callback import ConversationStatisticsCallback

__all__ = [
"ConversationCallback",
"UsageTrackingCallback",
"ConversationStatisticsCallback",
]
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Callback for tracking token usage and reporting usage summaries."""
"""Callback for tracking per-conversation statistics (token usage, timing).

Emits a `UsageSummary` (with per-conversation and per-step breakdowns,
including start/end timestamps for each conversation) to a reporter when the
conversation ends.
"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING

from opentelemetry import trace
Expand Down Expand Up @@ -172,15 +178,35 @@ class StepUsageSummary(UsageSummary):


class ConversationUsageSummary(UsageSummary):
"""Usage summary for one conversation including per-step breakdown."""
"""Usage summary for one conversation including per-step breakdown.

Args:
conversation_index (int): 1-based index of the conversation within the
current agent lifecycle.
conversation_id (str): Unique identifier of the conversation.
step_summaries (list[StepUsageSummary]): Per-step usage summaries.
started_at (datetime | None): UTC timestamp captured at
`on_conversation_start`. `None` if timing was not tracked.
ended_at (datetime | None): UTC timestamp captured at
`on_conversation_end`. `None` if timing was not tracked.
"""

conversation_index: int
conversation_id: str
step_summaries: list[StepUsageSummary] = Field(default_factory=list)
started_at: datetime | None = None
ended_at: datetime | None = None


class ConversationStatisticsCallback(ConversationCallback):
"""Tracks per-conversation statistics (token usage per step and wall-clock
timing) and reports a summary at conversation end.

class UsageTrackingCallback(ConversationCallback):
"""Tracks token usage per step and reports a summary at conversation end.
The reported `UsageSummary` contains, for each conversation, the raw
``started_at`` and ``ended_at`` UTC timestamps alongside token usage.
Downstream consumers (e.g. `SimpleHtmlReporter`) are responsible for
deriving human-readable durations from those timestamps so the raw values
remain available for other uses.

Args:
reporter: Reporter to write the final usage summary to.
Expand All @@ -199,12 +225,14 @@ def __init__(
self._per_conversation_summaries: list[ConversationUsageSummary] = []
self._per_step_summaries: list[StepUsageSummary] = []
self._conversation_index: int = 0
self._conversation_started_at: datetime | None = None

@override
def on_conversation_start(self, conversation: Conversation) -> None:
    """Reset the per-conversation tracking state when a conversation begins.

    Args:
        conversation: The conversation that is starting. It is not read
            by this method.
    """
    # NOTE(review): presumably builds a fresh per-conversation accumulator
    # derived from the overall summary - confirm against
    # `UsageSummary.create_from`.
    self._per_conversation_usage = UsageSummary.create_from(self._summary)
    # Step summaries are collected per conversation, so start from empty.
    self._per_step_summaries = []
    # The counter is incremented before use, so reported indices are 1-based.
    self._conversation_index += 1
    # Timezone-aware UTC start time, stored for use at conversation end.
    self._conversation_started_at = datetime.now(tz=timezone.utc)

@override
def on_step_end(
Expand Down Expand Up @@ -237,9 +265,12 @@ def on_conversation_end(self, conversation: Conversation) -> None:
generated_steps: list[StepUsageSummary] = [
step_summary.generate() for step_summary in self._per_step_summaries
]
ended_at = datetime.now(tz=timezone.utc)
conversation_summary = self._create_conversation_summary(
conversation=conversation,
generated_step_summaries=generated_steps,
started_at=self._conversation_started_at,
ended_at=ended_at,
)
self._per_conversation_summaries.append(conversation_summary)
self._summary.per_conversation_summaries = list(
Expand Down Expand Up @@ -275,11 +306,15 @@ def _create_conversation_summary(
self,
conversation: Conversation,
generated_step_summaries: list[StepUsageSummary],
started_at: datetime | None = None,
ended_at: datetime | None = None,
) -> ConversationUsageSummary:
conversation_summary = ConversationUsageSummary(
conversation_index=self._conversation_index,
conversation_id=conversation.conversation_id,
step_summaries=generated_step_summaries,
started_at=started_at,
ended_at=ended_at,
input_tokens=self._per_conversation_usage.input_tokens,
output_tokens=self._per_conversation_usage.output_tokens,
cache_creation_input_tokens=(
Expand Down
63 changes: 58 additions & 5 deletions src/askui/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
if TYPE_CHECKING:
from PIL import Image

from askui.callbacks.usage_tracking_callback import UsageSummary
from askui.callbacks.conversation_statistics_callback import (
ConversationUsageSummary,
UsageSummary,
)


def normalize_to_pil_images(
Expand All @@ -37,6 +40,27 @@ def normalize_to_pil_images(
return [image]


def _format_duration(seconds: float) -> str:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice if we have a time object. Then we don't need to implement the format logic by our self

"""Format a duration given in seconds as ``HH:MM:SS`` or
``HH:MM:SS.mmm`` for sub-second precision.

Used by `SimpleHtmlReporter` to render both the overall execution time and
per-conversation durations consistently.
"""
total_seconds = max(float(seconds), 0.0)
whole_seconds = int(total_seconds)
millis = int(round((total_seconds - whole_seconds) * 1000))
if millis == 1000:
whole_seconds += 1
millis = 0
hours, remainder = divmod(whole_seconds, 3600)
minutes, secs = divmod(remainder, 60)
base = f"{hours:02d}:{minutes:02d}:{secs:02d}"
if whole_seconds == 0 and millis > 0:
return f"{base}.{millis:03d}"
return base


def truncate_base64_images(content: Any) -> Any:
"""Replace base64 image data with a placeholder to keep reports readable.

Expand Down Expand Up @@ -1003,13 +1027,17 @@ def generate(self) -> None:
</p>
<div class="usage-breakdown-list">
{% for conversation_usage in usage_summary.per_conversation_summaries %}
{% set conversation_duration = format_conversation_duration(conversation_usage) %}
<details class="usage-breakdown-item">
<summary>
<span class="usage-breakdown-title">
Conversation #{{ conversation_usage.conversation_index }}
</span>
<span class="usage-breakdown-meta">
{{ conversation_usage.step_summaries | length }} step(s),
{% if conversation_duration is not none %}
Duration: {{ conversation_duration }},
{% endif %}
Input {{ "{:,}".format(conversation_usage.input_tokens or 0) }},
Output {{ "{:,}".format(conversation_usage.output_tokens or 0) }},
Cache Create {{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }},
Expand All @@ -1026,6 +1054,9 @@ def generate(self) -> None:
<table class="nested-table">
<tr>
<th>Conversation ID</th>
{% if conversation_duration is not none %}
<th>Duration</th>
{% endif %}
<th>Input Tokens</th>
<th>Output Tokens</th>
<th>Cache Create</th>
Expand All @@ -1036,6 +1067,9 @@ def generate(self) -> None:
</tr>
<tr class="system">
<td class="mono">{{ conversation_usage.conversation_id }}</td>
{% if conversation_duration is not none %}
<td>{{ conversation_duration }}</td>
{% endif %}
<td>{{ "{:,}".format(conversation_usage.input_tokens or 0) }}</td>
<td>{{ "{:,}".format(conversation_usage.output_tokens or 0) }}</td>
<td>{{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }}</td>
Expand Down Expand Up @@ -1141,10 +1175,28 @@ def generate(self) -> None:
end_time = datetime.now(tz=timezone.utc)
execution_time_formatted: str | None = None
if self._start_time is not None:
total_secs = int((end_time - self._start_time).total_seconds())
hours, remainder = divmod(total_secs, 3600)
minutes, secs = divmod(remainder, 60)
execution_time_formatted = f"{hours:02d}:{minutes:02d}:{secs:02d}"
execution_time_formatted = _format_duration(
(end_time - self._start_time).total_seconds()
)

def _format_conversation_duration(
conversation_usage: "ConversationUsageSummary",
) -> str | None:
"""Derive the formatted conversation duration from stored timestamps.

Returns ``None`` if either ``started_at`` or ``ended_at`` is missing
so the template can skip rendering.
"""
if (
conversation_usage.started_at is None
or conversation_usage.ended_at is None
):
return None
return _format_duration(
(
conversation_usage.ended_at - conversation_usage.started_at
).total_seconds()
)

html = template.render(
timestamp=end_time,
Expand All @@ -1153,6 +1205,7 @@ def generate(self) -> None:
usage_summary=self.usage_summary,
cache_original_usage=self.cache_original_usage,
execution_time_formatted=execution_time_formatted,
format_conversation_duration=_format_conversation_duration,
)

report_path = (
Expand Down
19 changes: 14 additions & 5 deletions tests/unit/model_providers/test_model_pricing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

import pytest

from askui.callbacks.usage_tracking_callback import (
from askui.callbacks.conversation_statistics_callback import (
ConversationStatisticsCallback,
UsageSummary,
UsageTrackingCallback,
)
from askui.models.shared.agent_message_param import UsageParam
from askui.speaker.speaker import SpeakerResult
Expand Down Expand Up @@ -98,12 +98,12 @@ def _assert_close(
assert abs(actual - expected) <= tolerance


class TestUsageTrackingCallbackCost:
class TestConversationStatisticsCallbackCost:
def _make_callback(
self, pricing: ModelPricing | None = None
) -> tuple[UsageTrackingCallback, MagicMock]:
) -> tuple[ConversationStatisticsCallback, MagicMock]:
reporter = MagicMock()
callback = UsageTrackingCallback(reporter=reporter, pricing=pricing)
callback = ConversationStatisticsCallback(reporter=reporter, pricing=pricing)
return callback, reporter

@pytest.mark.parametrize(
Expand Down Expand Up @@ -245,6 +245,9 @@ def test_tracks_per_step_per_conversation_and_total_usage(self) -> None:
assert per_conversation_summary.output_tokens == 30
_assert_close(per_conversation_summary.total_cost, 0.0009)
assert len(per_conversation_summary.step_summaries) == 2
assert per_conversation_summary.started_at is not None
assert per_conversation_summary.ended_at is not None
assert per_conversation_summary.ended_at >= per_conversation_summary.started_at

first_step = per_conversation_summary.step_summaries[0]
assert first_step.step_index == 0
Expand Down Expand Up @@ -301,6 +304,12 @@ def test_accumulates_multiple_conversations(self) -> None:
assert len(summary.per_conversation_summaries) == 2
assert summary.per_conversation_summaries[0].conversation_id == "conversation-1"
assert summary.per_conversation_summaries[1].conversation_id == "conversation-2"
for per_conversation_summary in summary.per_conversation_summaries:
assert per_conversation_summary.started_at is not None
assert per_conversation_summary.ended_at is not None
assert (
per_conversation_summary.ended_at >= per_conversation_summary.started_at
)

def test_includes_cache_costs_from_provider_pricing(self) -> None:
pricing = ModelPricing(
Expand Down
Loading