Widen OneOrMoreScores from list[ScoreLike] to Sequence[ScoreLike] and add a
type test showing that a scorer annotated `-> list[Score]` can be assigned to
EvalScorer[str, str, str] and executed by Eval.

NOTE(review): this patch was recovered from a copy whose newlines had been
collapsed; blank context lines were reconstructed from the hunk counts, so
verify against the tree before applying.
NOTE(review): assumes `Sequence` is already imported in framework.py -- the
import block is outside this patch; confirm.

diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
index 747ccb01..11f652b3 100644
--- a/py/src/braintrust/framework.py
+++ b/py/src/braintrust/framework.py
@@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = float | int | bool | None | ScoreLike | list[ScoreLike]
+OneOrMoreScores = float | int | bool | None | ScoreLike | Sequence[ScoreLike]
 
 OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]]
 
diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py
index a0896159..8599431c 100644
--- a/py/src/braintrust/type_tests/test_autoevals_scorers.py
+++ b/py/src/braintrust/type_tests/test_autoevals_scorers.py
@@ -2,7 +2,7 @@
 
 import pytest
 from autoevals import Levenshtein  # type: ignore[import-untyped]
-from braintrust.framework import Eval, EvalAsync, EvalCase, EvalScorer
+from braintrust.framework import Eval, EvalAsync, EvalCase, EvalScorer, Score
 
 
 def accepts_autoevals_scorer(
@@ -36,6 +36,26 @@ async def autoevals_task_async(input: str) -> str:
 ]
 
 
+def test_eval_accepts_autoevals_scorers_typed_sequence():
+    """A scorer annotated as returning ``list[Score]`` is assignable to ``EvalScorer`` and runs under ``Eval``."""
+    def scorer(input: str, output: str, expected: str | None = None) -> list[Score]:
+        return [Score(name="match", score=1.0)]
+
+    typed_scorer: EvalScorer[str, str, str] = scorer
+
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=[typed_scorer],
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["match"]
+    assert score is not None
+    assert score > 0
+
+
 def test_eval_accepts_autoevals_scorers_typed():
     result = Eval(
         "test-autoevals-scorers",