# Patch summary (free text ahead of the first "diff --git" header).
#
# scripts/bench-matrix.mjs
#   Adds a `matcher.scale` condition to the run filters shown in the
#   findRunSeries and groupRunSeries hunks. The condition only takes effect
#   when `matcher.scale` is defined; with it undefined, runs of every scale
#   still pass this check.
#
# scripts/__tests__/bench-matrix.test.mjs
#   Adds a test that passes evaluateH16 two otherwise-identical completed runs
#   that differ in scale ("hypothesis" with latency 10, "dev" with latency 120)
#   and asserts status "satisfied", sampleCount 1 and
#   interaction_latency_ms 10 on the first evidence entry.
#
# NOTE(review): this copy of the patch had its line breaks and indentation
# collapsed. The line structure below was rebuilt from the hunk line counts;
# the leading whitespace of context lines is assumed to follow 2-space
# Prettier formatting -- confirm against the target files, or apply with
# whitespace-insensitive matching.
diff --git a/scripts/__tests__/bench-matrix.test.mjs b/scripts/__tests__/bench-matrix.test.mjs
index 1a13347..c1798c3 100644
--- a/scripts/__tests__/bench-matrix.test.mjs
+++ b/scripts/__tests__/bench-matrix.test.mjs
@@ -2955,3 +2955,48 @@ test("H21 evidence array includes comparator entries when comparator runs are pr
     "tanstack",
   ]);
 });
+
+test("findRunSeries isolates runs by scale so a different-scale run cannot pollute the verdict", () => {
+  const makeRun = (scale, timestamp, latency) => ({
+    adapterId: "pretable",
+    profile: "default",
+    scenarioId: "S2",
+    scale,
+    scriptName: "select-range-extend",
+    browserName: "chromium",
+    browserVersion: "123.0",
+    timestamp,
+    seed: 1,
+    rowCount: 3000,
+    viewport: { width: 1440, height: 900 },
+    fontStack: "system-ui",
+    deviceScaleFactor: 1,
+    status: "completed",
+    notes: ["interaction mode: select-range-extend"],
+    tracePath: "status/traces/x.trace.zip",
+    metrics: {
+      interaction_latency_ms: latency,
+      settle_duration_ms: 16,
+      post_interaction_blank_gap_frames: 0,
+      post_interaction_anchor_shift_px: 0,
+      post_interaction_row_height_error_p95_px: 0,
+      result_row_count: 3000,
+      selected_row_preserved: 1,
+      focused_row_preserved: 1,
+      dom_nodes_peak: 400,
+    },
+  });
+
+  // H16 matches pretable/S2/hypothesis/select-range-extend. A bad "dev"-scale
+  // run for the same scenario+script must be excluded — before the scale
+  // filter it would have been aggregated in, dragging latency over budget and
+  // flipping the verdict to failing.
+  const result = evaluateH16([
+    makeRun("hypothesis", "2026-06-05T22:00:00.000Z", 10),
+    makeRun("dev", "2026-06-05T22:00:01.000Z", 120),
+  ]);
+
+  assert.equal(result.status, "satisfied");
+  assert.equal(result.evidence[0].sampleCount, 1);
+  assert.equal(result.evidence[0].metrics.interaction_latency_ms, 10);
+});
diff --git a/scripts/bench-matrix.mjs b/scripts/bench-matrix.mjs
index 1cfc434..d12cfc8 100644
--- a/scripts/bench-matrix.mjs
+++ b/scripts/bench-matrix.mjs
@@ -1646,6 +1646,7 @@ function findRunSeries(runs, matcher) {
         run.status === "completed" &&
         (matcher.adapterId === undefined ||
           run.adapterId === matcher.adapterId) &&
+        (matcher.scale === undefined || run.scale === matcher.scale) &&
         run.scenarioId === matcher.scenarioId &&
         run.scriptName === matcher.scriptName,
     )
@@ -1658,6 +1659,7 @@ function groupRunSeries(runs, matcher) {
   for (const run of runs) {
     if (
       run.status !== "completed" ||
+      (matcher.scale !== undefined && run.scale !== matcher.scale) ||
       run.scenarioId !== matcher.scenarioId ||
       run.scriptName !== matcher.scriptName
     ) {