diff --git a/avidtools/connectors/inspect.py b/avidtools/connectors/inspect.py
index 9addd5a..93645e2 100644
--- a/avidtools/connectors/inspect.py
+++ b/avidtools/connectors/inspect.py
@@ -20,7 +20,7 @@
     Reference,
 )
 from ..datamodels.enums import ClassEnum, MethodEnum, TypeEnum
-from .utils import apply_review_normalizations
+from .utils import apply_review_normalizations, choose_model_subject_label
 
 try:
     from inspect_ai.log import read_eval_log, EvalLog
@@ -320,12 +320,16 @@ def _build_new_description(
     model_name: str,
     overview: str,
     scoring: str,
+    subject_label: str,
 ) -> str:
     """Build standardized normalized report description text."""
 
+    subject_label_display = "LLM" if subject_label == "llm" else "AI system"
+
     return (
         f"{overview}\n\n"
-        f"We evaluated the LLM {model_name} on this benchmark.\n\n"
+        f"The {subject_label_display} {model_name} was evaluated "
+        "on this benchmark.\n\n"
         "## Measurement details\n\n"
         f"{scoring}"
     )
@@ -359,20 +363,22 @@ def normalize_report_data(report: dict):
     model_name = match.group(1)
     benchmark = match.group(2)
 
+    apply_review_normalizations(report, preferred_model_name=model_name)
+
     _, overview, scoring = _fetch_sections(benchmark)
     overview = _first_line(overview)
+    subject_label = choose_model_subject_label(report)
 
     description = report.setdefault("description", {})
     description["value"] = _build_new_description(
         model_name=model_name,
         overview=overview,
         scoring=scoring,
+        subject_label=subject_label,
     )
     if "lang" not in description:
         description["lang"] = "eng"
 
-    apply_review_normalizations(report, preferred_model_name=model_name)
-
 
 def process_report(file_path: Path):
     """Load, normalize, and rewrite a single Inspect report file."""
diff --git a/avidtools/connectors/utils.py b/avidtools/connectors/utils.py
index 28ad60d..77d3195 100644
--- a/avidtools/connectors/utils.py
+++ b/avidtools/connectors/utils.py
@@ -232,18 +232,27 @@ def apply_review_normalizations(
 
 def choose_model_subject_label(report: dict) -> str:
     """Choose whether descriptions should refer to an LLM or AI system."""
-    model_names = extract_model_names(report)
-    if any(name.strip() for name in model_names):
-        return "llm"
-
     affects = report.get("affects", {})
     artifacts = affects.get("artifacts")
     if isinstance(artifacts, list):
+        has_system_artifact = False
+        has_model_artifact = False
         for artifact in artifacts:
             if not isinstance(artifact, dict):
                 continue
             artifact_type = str(artifact.get("type", "")).strip().lower()
-            if artifact_type in {"model", "llm", "language model"}:
-                return "llm"
+            if artifact_type == "system":
+                has_system_artifact = True
+            elif artifact_type in {"model", "llm", "language model"}:
+                has_model_artifact = True
+
+        if has_system_artifact and not has_model_artifact:
+            return "AI system"
+        if has_model_artifact:
+            return "llm"
+
+    model_names = extract_model_names(report)
+    if any(name.strip() for name in model_names):
+        return "llm"
     return "AI system"
 
diff --git a/tests/unit/connectors/test_normalize_utils.py b/tests/unit/connectors/test_normalize_utils.py
index cdd9002..291f831 100644
--- a/tests/unit/connectors/test_normalize_utils.py
+++ b/tests/unit/connectors/test_normalize_utils.py
@@ -1,6 +1,9 @@
 """Unit tests for normalization utilities."""
 
-from avidtools.connectors.utils import apply_normalizations
+from avidtools.connectors.utils import (
+    apply_normalizations,
+    choose_model_subject_label,
+)
 
 
 def test_apply_normalizations_sets_openai_deployer_for_gpt_model():
@@ -37,3 +40,29 @@ def test_apply_normalizations_sets_together_ai_deployer():
     assert updated is True
     assert report["affects"]["deployer"] == ["Together AI"]
     assert report["affects"]["developer"] == ["Mistral"]
+
+
+def test_choose_model_subject_label_prefers_ai_system_for_system_artifacts():
+    """System artifact type should map description subject to AI system."""
+    report = {
+        "affects": {
+            "artifacts": [
+                {"type": "System", "name": "gpt-4o-mini-2024-07-18"}
+            ]
+        }
+    }
+
+    assert choose_model_subject_label(report) == "AI system"
+
+
+def test_choose_model_subject_label_uses_llm_for_model_artifacts():
+    """Model artifact type should keep description subject as LLM."""
+    report = {
+        "affects": {
+            "artifacts": [
+                {"type": "Model", "name": "gpt-4o-mini-2024-07-18"}
+            ]
+        }
+    }
+
+    assert choose_model_subject_label(report) == "llm"