From 7002e3191f8a879767d6dea6b1583ba26d3bae55 Mon Sep 17 00:00:00 2001 From: shubhobm Date: Tue, 17 Mar 2026 18:12:27 +0530 Subject: [PATCH 1/6] wip --- avidtools/connectors/inspect.py | 85 +++++++++++++++++++++++++-- tests/unit/connectors/test_inspect.py | 40 +++++++++++++ 2 files changed, 119 insertions(+), 6 deletions(-) diff --git a/avidtools/connectors/inspect.py b/avidtools/connectors/inspect.py index f306888..917d0e1 100644 --- a/avidtools/connectors/inspect.py +++ b/avidtools/connectors/inspect.py @@ -4,7 +4,7 @@ import re from html import unescape from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any, Iterable, List, Optional, Tuple from urllib.parse import quote from urllib.error import HTTPError, URLError from urllib.request import urlopen @@ -44,6 +44,7 @@ def read_eval_log(file_path): "meta-llama": "Meta", "mistralai": "Mistral AI", "cohere": "Cohere", + "together": "Together AI", } SITE_ROOT = "https://ukgovernmentbeis.github.io/inspect_evals" @@ -85,6 +86,7 @@ def upload_eval_log_to_s3( key_prefix: str = "", region: Optional[str] = None, endpoint_url: Optional[str] = None, + skip_if_exists: bool = True, ) -> str: """Upload an Inspect eval log to S3 and return its URL.""" @@ -110,7 +112,13 @@ def upload_eval_log_to_s3( client_kwargs["endpoint_url"] = endpoint_url s3_client = boto3.client("s3", **client_kwargs) - s3_client.upload_file(str(source_path), bucket, key) + if skip_if_exists: + try: + s3_client.head_object(Bucket=bucket, Key=key) + except Exception: + s3_client.upload_file(str(source_path), bucket, key) + else: + s3_client.upload_file(str(source_path), bucket, key) quoted_key = quote(key, safe="/") if endpoint_url: @@ -120,6 +128,46 @@ def upload_eval_log_to_s3( return f"https://{bucket}.s3.amazonaws.com/{quoted_key}" +def _report_payload(report: Report) -> dict: + if hasattr(report, "model_dump"): + payload = report.model_dump(mode="json") + else: + payload = report.dict() + + metrics = payload.get("metrics") + if isinstance(metrics, list): + flat_metrics = [] + for metric in metrics: + if not isinstance(metric, dict): + continue + detection = metric.get("detection_method", {}) + results = metric.get("results", {}) + scorer = results.get("scorer") or detection.get("name") + flat_metrics.append( + { + "scorer": scorer, + "metrics": metric.get("name"), + "value": results.get("value"), + } + ) + payload["metrics"] = flat_metrics + + return payload + + +def write_reports_jsonl(reports: Iterable[Report], output_path: Path) -> int: + """Write reports to a JSONL file and return count written.""" + + output_path.parent.mkdir(parents=True, exist_ok=True) + count = 0 + with output_path.open("w", encoding="utf-8") as file_obj: + for report in reports: + file_obj.write(json.dumps(_report_payload(report))) + file_obj.write("\n") + count += 1 + return count + + def convert_eval_log( file_path: str, normalize: bool = False, @@ -164,7 +212,7 @@ def convert_eval_log( endpoint_url=s3_endpoint_url, ) - report = Report() + report = Report(data_version="0.3.1") model_prefix = eval_log.eval.model.split("/", 1)[0] developer_name = human_readable_name.get( model_prefix, @@ -188,9 +236,8 @@ def convert_eval_log( description=LangValue(lang="eng", value=description_value), ) - dataset_label = ( - f"Inspect Evaluation Log for dataset: {eval_log.eval.dataset.name}" - ) + dataset_name = getattr(eval_log.eval.dataset, "name", None) or task + dataset_label = f"Inspect Evaluation Log for dataset: {dataset_name}" dataset_location = ( 
eval_log.eval.dataset.location if getattr(eval_log.eval.dataset, "location", None) @@ -262,6 +309,30 @@ def convert_eval_log( return [report] +def convert_eval_logs( + file_paths: Iterable[Path], + normalize: bool = False, + s3_bucket: Optional[str] = None, + s3_key_prefix: str = "", + s3_region: Optional[str] = None, + s3_endpoint_url: Optional[str] = None, +) -> List[Report]: + """Convert multiple Inspect eval logs into AVID reports.""" + + all_reports: List[Report] = [] + for file_path in file_paths: + reports = convert_eval_log( + str(file_path), + normalize=normalize, + s3_bucket=s3_bucket, + s3_key_prefix=s3_key_prefix, + s3_region=s3_region, + s3_endpoint_url=s3_endpoint_url, + ) + all_reports.extend(reports) + return all_reports + + def _clean_html_to_text(fragment: str) -> str: """Strip HTML tags and normalize whitespace for section extraction.""" @@ -444,6 +515,8 @@ def _first_line(text: str) -> str: def normalize_report_data(report: dict): """Apply Inspect normalize transformations to a report dictionary.""" + report.setdefault("data_version", "0.3.1") + problem_desc = ( report.get("problemtype", {}) .get("description", {}) diff --git a/tests/unit/connectors/test_inspect.py b/tests/unit/connectors/test_inspect.py index 618079a..e121ab3 100644 --- a/tests/unit/connectors/test_inspect.py +++ b/tests/unit/connectors/test_inspect.py @@ -3,12 +3,15 @@ """ import pytest +from pathlib import Path from unittest.mock import Mock, patch from urllib.error import URLError from avidtools.connectors.inspect import ( import_eval_log, convert_eval_log, + convert_eval_logs, + write_reports_jsonl, human_readable_name, normalize_report_data, UnsupportedInspectBenchmarkError, @@ -88,6 +91,7 @@ def test_human_readable_name_mapping(self): assert human_readable_name["google"] == "Google" assert human_readable_name["huggingface"] == "Hugging Face" assert human_readable_name["meta-llama"] == "Meta" + assert human_readable_name["together"] == "Together AI" @patch('avidtools.connectors.inspect.import_eval_log') def test_convert_eval_log_basic(self, mock_import): @@ -103,6 +107,7 @@ def test_convert_eval_log_basic(self, mock_import): report = reports[0] assert report.data_type == "AVID" + assert report.data_version == "0.3.1" @patch('avidtools.connectors.inspect.import_eval_log') def test_convert_eval_log_affects(self, mock_import): @@ -277,6 +282,7 @@ def test_convert_eval_log_uses_s3_reference_url( ) assert len(reports) == 1 + assert len(reports[0].references) == 1 assert reports[0].references[0].url == ( "https://bucket.s3.amazonaws.com/run.eval" ) @@ -288,6 +294,40 @@ def test_convert_eval_log_uses_s3_reference_url( endpoint_url=None, ) + @patch('avidtools.connectors.inspect.convert_eval_log') + def test_convert_eval_logs_aggregates_reports(self, mock_convert): + """Batch conversion helper should aggregate per-file results.""" + report1 = Report() + report2 = Report() + mock_convert.side_effect = [[report1], [report2]] + + reports = convert_eval_logs([ + Path("/path/one.eval"), + Path("/path/two.eval"), + ]) + + assert len(reports) == 2 + assert reports[0] is report1 + assert reports[1] is report2 + + @patch('avidtools.connectors.inspect.import_eval_log') + def test_write_reports_jsonl_writes_all_records(self, mock_import, tmp_path): + """JSONL writer should serialize one line per report.""" + mock_import.return_value = MockEvalLog() + output = tmp_path / "reports.jsonl" + report = convert_eval_log("/path/to/eval.json")[0] + count = write_reports_jsonl([report, Report()], output) + + assert count == 2 
+ lines = output.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 2 + first = __import__("json").loads(lines[0]) + assert first["metrics"][0] == { + "scorer": "accuracy", + "metrics": "accuracy", + "value": 0.95, + } + @patch('avidtools.connectors.inspect.urlopen') def test_fetch_sections_raises_on_unresolved_benchmark(self, mock_urlopen): """Unresolvable benchmark should raise UnsupportedInspectBenchmarkError.""" From df7f36aaa6a444c1787c8f8d54b431364f0e4f2a Mon Sep 17 00:00:00 2001 From: shubhobm Date: Tue, 17 Mar 2026 18:32:48 +0530 Subject: [PATCH 2/6] together developers --- avidtools/connectors/inspect.py | 45 +++++++++++++++++++++++---- tests/unit/connectors/test_inspect.py | 28 +++++++++++++++++ 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/avidtools/connectors/inspect.py b/avidtools/connectors/inspect.py index 917d0e1..360776f 100644 --- a/avidtools/connectors/inspect.py +++ b/avidtools/connectors/inspect.py @@ -47,6 +47,19 @@ def read_eval_log(file_path): "together": "Together AI", } +_together_developer_name = { + "openai": "OpenAI", + "meta-llama": "Meta", + "mistralai": "Mistral AI", + "google": "Google", + "deepseek-ai": "DeepSeek", + "qwen": "Qwen", + "moonshotai": "Moonshot AI", + "minimaxai": "Minimax", + "liquidai": "Liquid AI", + "essentialai": "Essential AI", +} + SITE_ROOT = "https://ukgovernmentbeis.github.io/inspect_evals" CYSE2_URL = ( "https://ukgovernmentbeis.github.io/inspect_evals/evals/" @@ -80,6 +93,29 @@ def import_eval_log(file_path: str) -> Any: return read_eval_log(file_path) +def _resolve_parties_from_model(eval_model: str) -> Tuple[str, str, str]: + """Resolve developer, deployer, and artifact model name from eval model.""" + + if eval_model.startswith("together/"): + parts = eval_model.split("/") + if len(parts) >= 3: + together_dev = parts[1] + model_name = parts[-1] + developer_name = _together_developer_name.get( + together_dev.lower(), + together_dev, + ) + return developer_name, "Together AI", model_name + + model_prefix = eval_model.split("/", 1)[0] + developer_name = human_readable_name.get( + model_prefix, + model_prefix.replace("-", " ").title(), + ) + model_name = eval_model.rsplit("/", 1)[-1] + return developer_name, eval_model, model_name + + def upload_eval_log_to_s3( file_path: str, bucket: str, @@ -213,16 +249,13 @@ def convert_eval_log( ) report = Report(data_version="0.3.1") - model_prefix = eval_log.eval.model.split("/", 1)[0] - developer_name = human_readable_name.get( - model_prefix, - model_prefix.replace("-", " ").title(), + developer_name, deployer_name, model_name = _resolve_parties_from_model( + eval_log.eval.model ) task = eval_log.eval.task.rsplit("/", 1)[-1] - model_name = eval_log.eval.model.rsplit("/", 1)[-1] report.affects = Affects( developer=[developer_name], - deployer=[eval_log.eval.model], + deployer=[deployer_name], artifacts=[Artifact(type=ArtifactTypeEnum.model, name=model_name)], ) diff --git a/tests/unit/connectors/test_inspect.py b/tests/unit/connectors/test_inspect.py index e121ab3..093df11 100644 --- a/tests/unit/connectors/test_inspect.py +++ b/tests/unit/connectors/test_inspect.py @@ -231,6 +231,34 @@ def test_convert_eval_log_different_model(self, mock_import): assert report.affects.deployer == ["anthropic/claude-3"] assert report.affects.artifacts[0].name == "claude-3" + @patch('avidtools.connectors.inspect.import_eval_log') + def test_convert_eval_log_together_model_mapping(self, mock_import): + """Together model IDs should map developer and deployer correctly.""" + 
mock_eval_log = MockEvalLog() + mock_eval_log.eval.model = "together/MiniMaxAI/MiniMax-M2.5" + mock_import.return_value = mock_eval_log + + reports = convert_eval_log("/path/to/eval.json") + report = reports[0] + + assert report.affects.developer == ["MiniMax"] + assert report.affects.deployer == ["Together AI"] + assert report.affects.artifacts[0].name == "MiniMax-M2.5" + + @patch('avidtools.connectors.inspect.import_eval_log') + def test_convert_eval_log_together_openai_model_mapping(self, mock_import): + """Together OpenAI model IDs should map developer to OpenAI.""" + mock_eval_log = MockEvalLog() + mock_eval_log.eval.model = "together/openai/gpt-oss-20b" + mock_import.return_value = mock_eval_log + + reports = convert_eval_log("/path/to/eval.json") + report = reports[0] + + assert report.affects.developer == ["OpenAI"] + assert report.affects.deployer == ["Together AI"] + assert report.affects.artifacts[0].name == "gpt-oss-20b" + @patch('avidtools.connectors.inspect.import_eval_log') def test_convert_eval_log_missing_dataset_location_uses_file_uri( self, From 8c90cffcd18bce6e07c80028cfc2cb3d089a51f0 Mon Sep 17 00:00:00 2001 From: shubhobm Date: Wed, 18 Mar 2026 12:51:45 +0530 Subject: [PATCH 3/6] // --- tests/unit/connectors/test_inspect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/connectors/test_inspect.py b/tests/unit/connectors/test_inspect.py index 093df11..00658ab 100644 --- a/tests/unit/connectors/test_inspect.py +++ b/tests/unit/connectors/test_inspect.py @@ -241,7 +241,7 @@ def test_convert_eval_log_together_model_mapping(self, mock_import): reports = convert_eval_log("/path/to/eval.json") report = reports[0] - assert report.affects.developer == ["MiniMax"] + assert report.affects.developer == ["Minimax"] assert report.affects.deployer == ["Together AI"] assert report.affects.artifacts[0].name == "MiniMax-M2.5" From e0eadd68d5699dc91b31618a8ca9bc0cc1594205 Mon Sep 17 00:00:00 2001 From: shubhobm Date: Wed, 18 Mar 2026 13:43:22 +0530 Subject: [PATCH 4/6] fix(connectors): resolve developer/deployer via _resolve_parties_from_model in garak; add reported_date from completed_at in inspect --- avidtools/connectors/garak.py | 84 ++++++++++++++++++++------------- avidtools/connectors/inspect.py | 10 ++++ 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/avidtools/connectors/garak.py b/avidtools/connectors/garak.py index ce7db13..93f9c07 100644 --- a/avidtools/connectors/garak.py +++ b/avidtools/connectors/garak.py @@ -10,6 +10,7 @@ from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen +from .inspect import _resolve_parties_from_model from .utils import ( apply_normalizations, choose_model_subject_label, @@ -616,13 +617,36 @@ async def summarize_one(probe_name: str): return probe_cache, module_cache -def _shorten_artifact_model_names(report: dict) -> Optional[str]: +def _apply_party_normalization(report: dict) -> Optional[str]: + """Normalize developer, deployer, and artifact names. + + For Together AI deployments (deployer 'together' or 'litellm'), the full + model path is reconstructed and passed to ``_resolve_parties_from_model`` + so that human-readable developer and deployer names are derived alongside + the shortened model artifact name. + + For all other models the artifact name is shortened to its last path + component and the deployer field is left unchanged. + + Returns the preferred (shortened) model artifact name. 
+ """ affects = report.setdefault("affects", {}) + deployer_raw = to_list(affects.get("deployer")) artifacts = affects.get("artifacts") if not isinstance(artifacts, list): return None - preferred_model = None + _together_deployers = {"together", "litellm"} + is_together = any( + v.strip().lower() in _together_deployers for v in deployer_raw + ) + + preferred_model: Optional[str] = None + dev_names: list = [] + dep_names: list = [] + seen_dev: set = set() + seen_dep: set = set() + for artifact in artifacts: if not isinstance(artifact, dict): continue @@ -630,38 +654,33 @@ def _shorten_artifact_model_names(report: dict) -> Optional[str]: if not isinstance(name, str): continue - shortened = name.split("/", 1)[1] if "/" in name else name - artifact["name"] = shortened - if preferred_model is None: - preferred_model = shortened - - return preferred_model + if is_together and "/" in name: + full_path = f"together/{name}" + developer_name, deployer_name, model_name = ( + _resolve_parties_from_model(full_path) + ) + if developer_name.lower() not in seen_dev: + seen_dev.add(developer_name.lower()) + dev_names.append(developer_name) + if deployer_name.lower() not in seen_dep: + seen_dep.add(deployer_name.lower()) + dep_names.append(deployer_name) + else: + model_name = name.split("/", 1)[1] if "/" in name else name + artifact["name"] = model_name + if preferred_model is None: + preferred_model = model_name -def _apply_litellm_deployer_mapping(report: dict): - affects = report.setdefault("affects", {}) - deployer = to_list(affects.get("deployer")) + if dev_names: + affects["developer"] = dev_names + if dep_names: + affects["deployer"] = dep_names + elif is_together and not dep_names: + # fallback if no artifacts with org/model pattern were found + affects["deployer"] = ["Together AI"] - mapped = [] - changed = False - for value in deployer: - normalized = value.strip().lower() - if normalized == "litellm" or normalized == "together": - mapped.append("Together AI") - changed = True - else: - mapped.append(value) - - if changed: - deduped = [] - seen = set() - for value in mapped: - key = value.lower() - if key in seen: - continue - seen.add(key) - deduped.append(value) - affects["deployer"] = deduped + return preferred_model def _rebuild_text_descriptions( @@ -771,8 +790,7 @@ def _normalize_report( ): """Apply Garak-specific normalize transforms to a single report.""" - preferred_model_name = _shorten_artifact_model_names(report) - _apply_litellm_deployer_mapping(report) + preferred_model_name = _apply_party_normalization(report) apply_normalizations( report, preferred_model_name=preferred_model_name, diff --git a/avidtools/connectors/inspect.py b/avidtools/connectors/inspect.py index 360776f..09a4ff8 100644 --- a/avidtools/connectors/inspect.py +++ b/avidtools/connectors/inspect.py @@ -2,6 +2,7 @@ import json import re +from datetime import date, datetime from html import unescape from pathlib import Path from typing import Any, Iterable, List, Optional, Tuple @@ -330,6 +331,15 @@ def convert_eval_log( ) report.description = LangValue(lang="eng", value=full_description) + completed_at = getattr(eval_log.stats, "completed_at", None) + if completed_at: + try: + report.reported_date = datetime.fromisoformat( + str(completed_at) + ).date() + except (ValueError, TypeError): + pass + if normalize: report_payload = ( report.model_dump() From d5d3d554b29fcf254e277905ee37d2f99b1443c8 Mon Sep 17 00:00:00 2001 From: shubhobm Date: Wed, 18 Mar 2026 13:58:16 +0530 Subject: [PATCH 5/6] fix(lint): remove unused 
date import in inspect connector --- avidtools/connectors/inspect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/avidtools/connectors/inspect.py b/avidtools/connectors/inspect.py index 09a4ff8..1083f9f 100644 --- a/avidtools/connectors/inspect.py +++ b/avidtools/connectors/inspect.py @@ -2,7 +2,7 @@ import json import re -from datetime import date, datetime +from datetime import datetime from html import unescape from pathlib import Path from typing import Any, Iterable, List, Optional, Tuple From 5750abf35a53752e1c7b28316516cf58fd473568 Mon Sep 17 00:00:00 2001 From: shubhobm Date: Wed, 18 Mar 2026 14:01:46 +0530 Subject: [PATCH 6/6] test(inspect): add stats.completed_at to MockEvalLog; add reported_date assertion and test --- tests/unit/connectors/test_inspect.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/connectors/test_inspect.py b/tests/unit/connectors/test_inspect.py index 00658ab..5b2e08e 100644 --- a/tests/unit/connectors/test_inspect.py +++ b/tests/unit/connectors/test_inspect.py @@ -58,6 +58,10 @@ def __init__(self): self.results = Mock() self.results.scores = [mock_score] + # Stats with completion timestamp + self.stats = Mock() + self.stats.completed_at = "2024-03-15T10:30:00+00:00" + class TestInspectConnector: """Test cases for Inspect connector functions.""" @@ -108,6 +112,19 @@ def test_convert_eval_log_basic(self, mock_import): report = reports[0] assert report.data_type == "AVID" assert report.data_version == "0.3.1" + assert report.reported_date is not None + + @patch('avidtools.connectors.inspect.import_eval_log') + def test_convert_eval_log_reported_date(self, mock_import): + """Test that reported_date is set from eval log completed_at.""" + from datetime import date + mock_eval_log = MockEvalLog() + mock_import.return_value = mock_eval_log + + reports = convert_eval_log("/path/to/eval.json") + report = reports[0] + + assert report.reported_date == date(2024, 3, 15) @patch('avidtools.connectors.inspect.import_eval_log') def test_convert_eval_log_affects(self, mock_import):
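
Taken together, the series adds a batch path on top of the single-file converter: PATCH 1/6 introduces convert_eval_logs() for aggregating AVID reports across several Inspect eval logs and write_reports_jsonl() for serializing them one JSON object per line. A minimal sketch of how the two compose, assuming a local directory of .eval logs for benchmarks the connector recognizes (the directory and output paths below are hypothetical placeholders):

    from pathlib import Path

    from avidtools.connectors.inspect import convert_eval_logs, write_reports_jsonl

    # Hypothetical paths; point these at a real directory of Inspect .eval logs.
    log_dir = Path("logs")
    output_path = Path("out/reports.jsonl")

    # Convert every eval log in the directory into AVID reports, then write
    # them out one JSON object per line (parent directories are created).
    reports = convert_eval_logs(sorted(log_dir.glob("*.eval")), normalize=True)
    count = write_reports_jsonl(reports, output_path)
    print(f"wrote {count} report(s) to {output_path}")

Passing s3_bucket (plus the optional s3_key_prefix, s3_region, and s3_endpoint_url) to convert_eval_logs would additionally upload each source log and record its URL as the report reference; the skip_if_exists default on upload_eval_log_to_s3 avoids re-uploading a key that is already present in the bucket.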
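Party resolution is the other thread running through the series: PATCH 2/6 adds _resolve_parties_from_model(), and PATCH 4/6 reuses it from the garak connector by reconstructing together/<org>/<model> paths before delegating to it. A short sketch of the two branches it distinguishes (the model IDs mirror the ones exercised in the tests):

    from avidtools.connectors.inspect import _resolve_parties_from_model

    # Together-hosted model ID: developer comes from _together_developer_name,
    # deployer is pinned to "Together AI", and the artifact keeps the bare model name.
    assert _resolve_parties_from_model("together/openai/gpt-oss-20b") == (
        "OpenAI",
        "Together AI",
        "gpt-oss-20b",
    )

    # Any other model ID keeps the original behaviour: developer from
    # human_readable_name (or a title-cased prefix), deployer left as the full ID.
    assert _resolve_parties_from_model("anthropic/claude-3") == (
        "Anthropic",
        "anthropic/claude-3",
        "claude-3",
    )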