Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions avidtools/connectors/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
Reference,
)
from ..datamodels.enums import ClassEnum, MethodEnum, TypeEnum
from .utils import apply_review_normalizations
from .utils import apply_review_normalizations, choose_model_subject_label

try:
from inspect_ai.log import read_eval_log, EvalLog
Expand Down Expand Up @@ -320,12 +320,16 @@ def _build_new_description(
model_name: str,
overview: str,
scoring: str,
subject_label: str,
) -> str:
"""Build standardized normalized report description text."""

subject_label_display = "LLM" if subject_label == "llm" else "AI system"

return (
f"{overview}\n\n"
f"We evaluated the LLM {model_name} on this benchmark.\n\n"
f"The {subject_label_display} {model_name} was evaluated "
"on this benchmark.\n\n"
"## Measurement details\n\n"
f"{scoring}"
)
Expand Down Expand Up @@ -359,20 +363,22 @@ def normalize_report_data(report: dict):
model_name = match.group(1)
benchmark = match.group(2)

apply_review_normalizations(report, preferred_model_name=model_name)

_, overview, scoring = _fetch_sections(benchmark)
overview = _first_line(overview)
subject_label = choose_model_subject_label(report)

description = report.setdefault("description", {})
description["value"] = _build_new_description(
model_name=model_name,
overview=overview,
scoring=scoring,
subject_label=subject_label,
)
if "lang" not in description:
description["lang"] = "eng"

apply_review_normalizations(report, preferred_model_name=model_name)


def process_report(file_path: Path):
"""Load, normalize, and rewrite a single Inspect report file."""
Expand Down
21 changes: 15 additions & 6 deletions avidtools/connectors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,18 +232,27 @@ def apply_review_normalizations(
def choose_model_subject_label(report: dict) -> str:
    """Choose whether descriptions should refer to an LLM or AI system.

    Artifact types declared in ``report["affects"]["artifacts"]`` take
    precedence: any model-like artifact yields ``"llm"``; artifacts that are
    exclusively of type ``system`` yield ``"AI system"``. Only when artifact
    types are inconclusive does the presence of extracted model names decide.

    Args:
        report: AVID report dict; ``affects.artifacts`` entries may carry a
            ``type`` field (matched case-insensitively).

    Returns:
        ``"llm"`` or ``"AI system"``.
    """
    # NOTE(review): the pasted diff retained the *removed* early
    # model-name check and the removed early `return "llm"` inside the
    # loop; with them present the system-artifact branch was unreachable,
    # contradicting the tests added in this same PR. They are dropped here.
    affects = report.get("affects", {})
    artifacts = affects.get("artifacts")
    if isinstance(artifacts, list):
        has_system_artifact = False
        has_model_artifact = False
        for artifact in artifacts:
            if not isinstance(artifact, dict):
                continue
            artifact_type = str(artifact.get("type", "")).strip().lower()
            if artifact_type == "system":
                has_system_artifact = True
            elif artifact_type in {"model", "llm", "language model"}:
                has_model_artifact = True

        if has_system_artifact and not has_model_artifact:
            return "AI system"
        if has_model_artifact:
            return "llm"

    # Artifact types were absent/inconclusive: fall back to whether any
    # non-blank model name can be extracted from the report.
    model_names = extract_model_names(report)
    if any(name.strip() for name in model_names):
        return "llm"

    return "AI system"
31 changes: 30 additions & 1 deletion tests/unit/connectors/test_normalize_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Unit tests for normalization utilities."""

from avidtools.connectors.utils import apply_normalizations
from avidtools.connectors.utils import (
apply_normalizations,
choose_model_subject_label,
)


def test_apply_normalizations_sets_openai_deployer_for_gpt_model():
Expand Down Expand Up @@ -37,3 +40,29 @@ def test_apply_normalizations_sets_together_ai_deployer():
assert updated is True
assert report["affects"]["deployer"] == ["Together AI"]
assert report["affects"]["developer"] == ["Mistral"]


def test_choose_model_subject_label_prefers_ai_system_for_system_artifacts():
    """System artifact type should map description subject to AI system."""
    system_artifact = {"type": "System", "name": "gpt-4o-mini-2024-07-18"}
    report = {"affects": {"artifacts": [system_artifact]}}

    label = choose_model_subject_label(report)

    assert label == "AI system"


def test_choose_model_subject_label_uses_llm_for_model_artifacts():
    """Model artifact type should keep description subject as LLM."""
    model_artifact = {"type": "Model", "name": "gpt-4o-mini-2024-07-18"}
    report = {"affects": {"artifacts": [model_artifact]}}

    label = choose_model_subject_label(report)

    assert label == "llm"