From b9a6a17eeb8fc02178ef51b86f1bb99d4bd1a7c7 Mon Sep 17 00:00:00 2001
From: Andrew Scott
Date: Mon, 29 Sep 2025 12:46:12 -0700
Subject: [PATCH] feat(runner): add support for running and repairing tests

This commit introduces the ability to run tests against the generated code as part of the evaluation process.

A new optional `testCommand` can be specified in the environment configuration. If provided, this command will be executed after a successful build. If the tests fail, the tool will attempt to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable.

The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.

---
 docs/environment-reference.md                 |   5 +
 .../pages/report-viewer/report-viewer.html    |  64 +++++++-
 .../app/pages/report-viewer/report-viewer.ts  |  30 +++-
 runner/configuration/constants.ts             |   8 +-
 runner/configuration/environment-config.ts    |   5 -
 runner/configuration/package-managers.ts      |   4 +
 runner/eval-cli.ts                            |  20 +--
 runner/orchestration/build-serve-loop.ts      | 150 +++++++++++++-----
 runner/orchestration/codegen.ts               |  17 +-
 runner/orchestration/executors/executor.ts    |  14 ++
 .../executors/local-executor-config.ts        |   6 +-
 .../orchestration/executors/local-executor.ts |  48 +++++-
 runner/orchestration/generate.ts              |  11 +-
 .../{build-repair.ts => repair.ts}            |  64 ++++----
 runner/orchestration/test-worker.ts           |  42 +++++
 runner/progress/dynamic-progress-logger.ts    |   1 +
 runner/progress/progress-logger.ts            |  11 +-
 .../successful-tests-rating.ts                |  28 ++++
 runner/ratings/rate-code.ts                   |   9 ++
 runner/ratings/rating-types.ts                |   3 +
 runner/ratings/stats.ts                       |  24 +++
 runner/shared-interfaces.ts                   |  34 +++-
 22 files changed, 487 insertions(+), 111 deletions(-)
 create mode 100644 runner/configuration/package-managers.ts
 rename runner/orchestration/{build-repair.ts => repair.ts} (94%)
 create mode 100644 runner/orchestration/test-worker.ts
 create mode 100644 runner/ratings/built-in-ratings/successful-tests-rating.ts

diff --git a/docs/environment-reference.md b/docs/environment-reference.md
index 3b3f7ba..c37fbea 100644
--- a/docs/environment-reference.md
+++ b/docs/environment-reference.md
@@ -179,3 +179,8 @@ Defaults to ` run build`.
 Command used to start a local dev server as a part of the evaluation.
 
 Defaults to ` run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
+
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index d7c7603..08369b9 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -73,6 +73,20 @@

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -276,9 +290,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { + @if (hasBuildFailureDuringTestRepair(result)) { Build failed after a11y repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -350,12 +374,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+          @if (result.testResult.passed) {
+            ✔ Tests passed
+            @if ((result.testRepairAttempts || 0) > 0) {
+              after {{ result.testRepairAttempts }} repair attempt(s)
+            }
+          } @else {
+            ✘ Tests failed
+          }
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -380,6 +428,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -416,6 +473,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 08e7109..32c5cc0 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -21,6 +21,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -265,6 +266,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -400,7 +426,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts index 3151ec1..422a2dd 100644 --- a/runner/configuration/constants.ts +++ b/runner/configuration/constants.ts @@ -25,7 +25,13 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output'); * providing the build output and the code that causes the problem. */ // Note: When updating, also adjust the default description in `README.md`. -export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1; +export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1; + +/** + * Number of times we'll try to ask LLM to repair test failures + * E.g. Axe violations, or test command failures + */ +export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1; /** Name of the folder where we store all generated reports */ export const REPORTS_ROOT_DIR = join(rootDir, 'reports'); diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index a959738..085a342 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -73,11 +73,6 @@ export const environmentConfigSchema = z.object({ export type EnvironmentConfig = z.infer & Partial; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config. */ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported. 
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 39077ca..210fe26 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -3,7 +3,8 @@ import chalk from 'chalk'; import { BUILT_IN_ENVIRONMENTS, DEFAULT_AUTORATER_MODEL_NAME, - DEFAULT_MAX_REPAIR_ATTEMPTS, + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, DEFAULT_MODEL_NAME, } from './configuration/constants.js'; import {generateCodeAndAssess} from './orchestration/generate.js'; @@ -37,9 +38,9 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -151,11 +152,6 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { - type: 'number', - default: 0, - description: 'Number of repair attempts for discovered a11y violations', - }) .option('skip-lighthouse', { type: 'boolean', default: false, @@ -163,9 +159,15 @@ function builder(argv: Argv): Argv { }) .option('max-build-repair-attempts', { type: 'number', - default: DEFAULT_MAX_REPAIR_ATTEMPTS, + default: DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, description: 'Number of repair attempts when build errors are discovered', }) + .option('max-test-repair-attempts', { + type: 'number', + default: DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, + description: + 'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)', + }) .strict() .version(false) .help() @@ -209,9 +211,9 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, skipLighthouse: cliArgs.skipLighthouse, maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts, + maxTestRepairAttempts: cliArgs.maxTestRepairAttempts, }); logReportToConsole(runInfo); diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index f543add..957f58c 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -10,15 +10,21 @@ import { } from '../shared-interfaces.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; import {EvalID} from './executors/executor.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; -import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; +import { + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, +} from '../configuration/constants.js'; +import {repairAndBuild} from './repair.js'; /** - * Attempts to build the code that an LLM generated. If the build fails, attempts - * to fix the breakage and build again. + * Attempts to build and test the code that an LLM generated. + * + * * If the build fails, attempts to fix the breakage and build again. + * * If tests fail (like Axe or project tests), we may repair and retry. * * @param config Assessment config. * @param evalID ID of the eval being attempted for build. 
@@ -34,7 +40,7 @@ import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls). */ -export async function attemptBuild( +export async function attemptBuildAndTest( config: AssessmentConfig, evalID: EvalID, env: Environment, @@ -59,8 +65,9 @@ export async function attemptBuild( ); let repairAttempts = 0; const maxRepairAttempts = (await env.executor.shouldRepairFailedBuilds(evalID)) - ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_REPAIR_ATTEMPTS) + ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS) : 0; + const maxTestRepairAttempts = config.maxTestRepairAttempts ?? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS; const initialAttempt = { outputFiles: initialResponse.files, @@ -94,13 +101,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -121,31 +133,69 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < (config.a11yRepairAttempts ?? 
0) - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -154,28 +204,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. 
+ lastAttempt.serveTestingResult = await serveAndTestApp( config, evalID, directory, @@ -186,10 +236,26 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -197,6 +263,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index bacf398..47f6cbe 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -9,10 +9,8 @@ import { } from '../shared-interfaces.js'; import {LlmRunner, LocalLlmGenerateFilesContext, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID} from './executors/executor.js'; -import {LocalExecutor} from './executors/local-executor.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +92,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -118,13 +115,13 @@ export async function repairCodeWithAI( combinedPrompt: `${repairSystemInstructions}\n${repairPrompt}`, }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); const response = await env.executor.generateRepairFiles( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal, diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts index d6a37e1..297cd89 100644 --- a/runner/orchestration/executors/executor.ts +++ b/runner/orchestration/executors/executor.ts @@ -6,6 +6,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {BuildResult} from '../../workers/builder/builder-types.js'; import z from 'zod'; @@ -72,6 +73,19 @@ export const executorSchema = z.object({ ]), z.promise(z.custom()), ), + executeProjectTests: z.function( + z.tuple([ + z.custom().describe('ID of the eval'), + z.string().describe('Path to the application directory'), + z.custom().describe('Root prompt definition'), + z + .custom() + .describe('Worker concurrency queue. Use this for limiting local workers.'), + z.custom().describe('Abort Signal to fire when tests should be canceled.'), + z.custom().describe('Progress logger'), + ]), + z.promise(z.custom().nullable()), + ), finalizeEval: z.function( z.tuple([z.custom().describe('ID of the eval')]), z.promise(z.void()), diff --git a/runner/orchestration/executors/local-executor-config.ts b/runner/orchestration/executors/local-executor-config.ts index d90cfbb..ae6df7c 100644 --- a/runner/orchestration/executors/local-executor-config.ts +++ b/runner/orchestration/executors/local-executor-config.ts @@ -1,6 +1,6 @@ import z from 'zod'; import {mcpServerOptionsSchema} from '../../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; export const localExecutorConfigSchema = z.strictObject({ /** MCP servers that can be started for this environment. */ @@ -24,6 +24,10 @@ export const localExecutorConfigSchema = z.strictObject({ * Defaults to ` run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Optional command for executing project tests. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. 
diff --git a/runner/orchestration/executors/local-executor.ts b/runner/orchestration/executors/local-executor.ts index 7c3dcf8..afbcccf 100644 --- a/runner/orchestration/executors/local-executor.ts +++ b/runner/orchestration/executors/local-executor.ts @@ -10,6 +10,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {killChildProcessGracefully} from '../../utils/kill-gracefully.js'; import { @@ -21,7 +22,10 @@ import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Executor} from './executor.js'; import {LocalExecutorConfig} from './local-executor-config.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {executeCommand} from '../../utils/exec.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -117,6 +121,48 @@ export class LocalExecutor implements Executor { ); } + async executeProjectTests( + _id: EvalID, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise { + if (!this.config.testCommand) { + return Promise.resolve(null); + } + const testCommand = this.config.testCommand; + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + // Also add to the worker concurrency queue to not overload local systems. + const stdout = await workerConcurrencyQueue.add(() => + callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(testCommand, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary. Lots of parallelism can slow-down. + ), + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveWebApplication( _id: EvalID, appDirectoryPath: string, diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 9edf72b..dd076b0 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -31,7 +31,6 @@ import { } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; @@ -48,6 +47,7 @@ import {getRunnerByName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; import {LocalExecutor} from './executors/local-executor.js'; import {EvalID} from './executors/executor.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -56,7 +56,8 @@ import {EvalID} from './executors/executor.js'; * 1. Makes a request to Gemini to generate code. * 2. 
Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status. @@ -345,7 +346,7 @@ async function startEvaluationTask( // Try to build the files in the root prompt directory. // This will also attempt to fix issues with the generated code. - const attempt = await attemptBuild( + const attempt = await attemptBuildAndTest( config, evalID, env, @@ -378,6 +379,8 @@ async function startEvaluationTask( abortSignal, progress, config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, + attempt.testResult ?? null, + attempt.testRepairAttempts, ); results.push({ @@ -395,6 +398,8 @@ async function startEvaluationTask( userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, toolLogs, + testResult: attempt.testResult ?? null, + testRepairAttempts: attempt.testRepairAttempts, } satisfies AssessmentResult); } diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts similarity index 94% rename from runner/orchestration/build-repair.ts rename to runner/orchestration/repair.ts index 5e6b9e8..c7b52ac 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/repair.ts @@ -1,3 +1,4 @@ +import {Environment} from '../configuration/environment.js'; import PQueue from 'p-queue'; import { AttemptDetails, @@ -6,12 +7,11 @@ import { LlmResponseFile, RootPromptDefinition, } from '../shared-interfaces.js'; -import {Environment} from '../configuration/environment.js'; -import {repairCodeWithAI} from './codegen.js'; -import {writeResponseFiles} from './file-system.js'; import {runBuild} from './build-worker.js'; import {ProgressLogger} from '../progress/progress-logger.js'; -import {EvalID, Executor} from './executors/executor.js'; +import {EvalID} from './executors/executor.js'; +import {repairCodeWithAI} from './codegen.js'; +import {writeResponseFiles} from './file-system.js'; /** * Calls the LLM to repair code, handles the response, and attempts to build the project again. @@ -22,12 +22,11 @@ import {EvalID, Executor} from './executors/executor.js'; * @param directory The working directory. * @param finalOutputFiles The list of output files to be modified. * @param errorMessage The error message from the failed build. - * @param errorContext Additional context for the error. + * @param errors Additional context for the error. * @param contextFiles A list of context files for the LLM. * @param abortSignal An AbortSignal to cancel the operation. * @param workerConcurrencyQueue The queue for managing worker concurrency. * @param attempts The current attempt number. - * @param repairType The type of repair being performed. * @returns A promise that resolves to the new BuildResult. 
*/ export async function repairAndBuild( @@ -37,13 +36,13 @@ export async function repairAndBuild( rootPromptDef: RootPromptDefinition, directory: string, previousAttemptFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, workerConcurrencyQueue: PQueue, attempts: number, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairResponse = await repairCodeWithAI( evalID, @@ -52,11 +51,11 @@ export async function repairAndBuild( rootPromptDef, directory, previousAttemptFiles, - errorMessage, - errorContext, + errors, contextFiles, abortSignal, progress, + repairType, ); return await handleRepairResponse( @@ -73,6 +72,27 @@ export async function repairAndBuild( ); } +/** + * Merges a set of new or updated files from a repair attempt into the + * current set of files. + * @param repairOutputFiles The array of new or updated files to merge. + * @param finalFiles The array of files to be updated. + */ +function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { + // Merge the repair response into the original files. Otherwise we may end up dropping + // files that were valid in the initial response and the LLM decided not to touch, because + // they're still valid. + for (const file of repairOutputFiles) { + const existingFile = finalFiles.find(f => f.filePath === file.filePath); + + if (existingFile) { + existingFile.code = file.code; + } else { + finalFiles.push(file); + } + } +} + /** * Processes an LLM repair response by merging the suggested file changes, * writing them to disk, rebuilding the application, and logging the outcome. @@ -88,7 +108,7 @@ async function handleRepairResponse( abortSignal: AbortSignal, attempts: number, progress: ProgressLogger, -) { +): Promise { if (!repairResponse.success) { progress.log( rootPromptDef, @@ -99,7 +119,6 @@ async function handleRepairResponse( // Stop trying to repair if AI can't suggest a fix (API request fails) throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`); } - // Clone the previous files because `mergeRepairFiles` mutates the attempt files. // We don't want to change files of a previous attempt. const newAttemptFiles = previousAttemptFiles.map(f => ({...f})); @@ -126,24 +145,3 @@ async function handleRepairResponse( attempt: attempts, }; } - -/** - * Merges a set of new or updated files from a repair attempt into the - * current set of files. - * @param repairOutputFiles The array of new or updated files to merge. - * @param finalFiles The array of files to be updated. - */ -function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { - // Merge the repair response into the original files. Otherwise we may end up dropping - // files that were valid in the initial response and the LLM decided not to touch, because - // they're still valid. 
-  for (const file of repairOutputFiles) {
-    const existingFile = finalFiles.find(f => f.filePath === file.filePath);
-
-    if (existingFile) {
-      existingFile.code = file.code;
-    } else {
-      finalFiles.push(file);
-    }
-  }
-}
diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts
new file mode 100644
index 0000000..df08d0a
--- /dev/null
+++ b/runner/orchestration/test-worker.ts
@@ -0,0 +1,42 @@
+import PQueue from 'p-queue';
+import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js';
+import {ProgressLogger} from '../progress/progress-logger.js';
+import {Environment} from '../configuration/environment.js';
+import {EvalID} from './executors/executor.js';
+
+export async function runTest(
+  env: Environment,
+  evalID: EvalID,
+  appDirectoryPath: string,
+  rootPromptDef: RootPromptDefinition,
+  abortSignal: AbortSignal,
+  workerConcurrencyQueue: PQueue,
+  progress: ProgressLogger,
+): Promise<TestExecutionResult | null> {
+  progress.log(rootPromptDef, 'test', `Running tests`);
+
+  try {
+    const result = await env.executor.executeProjectTests(
+      evalID,
+      appDirectoryPath,
+      rootPromptDef,
+      workerConcurrencyQueue,
+      abortSignal,
+      progress,
+    );
+    if (result === null) {
+      return result;
+    }
+
+    if (result.passed) {
+      progress.log(rootPromptDef, 'success', 'Tests have passed');
+    } else {
+      progress.log(rootPromptDef, 'error', 'Tests have failed');
+    }
+
+    return result;
+  } catch (err) {
+    progress.log(rootPromptDef, 'error', `Error when executing tests`, err + '');
+    throw err;
+  }
+}
diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts
index 949cf96..0e68632 100644
--- a/runner/progress/dynamic-progress-logger.ts
+++ b/runner/progress/dynamic-progress-logger.ts
@@ -148,6 +148,7 @@ export class DynamicProgressLogger implements ProgressLogger {
     switch (type) {
       case 'success':
       case 'serve-testing':
+      case 'test':
       case 'build':
         return chalk.green;
       case 'error':
diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts
index c888aba..b029aa6 100644
--- a/runner/progress/progress-logger.ts
+++ b/runner/progress/progress-logger.ts
@@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js';
 import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js';
 
 /** Possible progress event types. */
-export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval';
+export type ProgressType =
+  | 'codegen'
+  | 'build'
+  | 'test'
+  | 'serve-testing'
+  | 'success'
+  | 'error'
+  | 'eval';
 
 /** Maps a ProgressType to an icon that can represent it. */
 export function progressTypeToIcon(type: ProgressType): string {
@@ -12,6 +19,8 @@
       return '🤖';
     case 'build':
       return '🔨';
+    case 'test':
+      return '🧪';
     case 'serve-testing':
       return '🌊';
     case 'success':
diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts
new file mode 100644
index 0000000..2941fd3
--- /dev/null
+++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts
@@ -0,0 +1,28 @@
+import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js';
+
+/** Rating which verifies that unit tests pass successfully. 
*/ +export const successfulTestsRating: PerBuildRating = { + name: 'Tests pass successfully', + description: 'Ensures tests run and pass without errors.', + id: 'common-successful-tests', + kind: RatingKind.PER_BUILD, + category: RatingCategory.MEDIUM_IMPACT, + scoreReduction: '30%', + // Reduce the amount of points in case we've had test repair attempts. + rate: ({testResult, testRepairAttempts}) => { + // If no test results are available, skip this rating + if (!testResult) { + return { + state: RatingState.SKIPPED, + message: 'Unit testing not configured.', + }; + } + + return { + state: RatingState.EXECUTED, + coefficient: testResult.passed + ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts + : 0, // No points if tests failed + }; + }, +}; diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 99d0874..c0500ec 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -8,6 +8,7 @@ import { IndividualAssessmentState, PromptDefinition, AssessmentCategory, + TestExecutionResult, } from '../shared-interfaces.js'; import { RatingState, @@ -56,6 +57,8 @@ export async function rateGeneratedCode( abortSignal: AbortSignal, progress: ProgressLogger, autoraterModel: string, + testResult: TestExecutionResult | null, + testRepairAttempts: number, ): Promise { let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; @@ -93,6 +96,8 @@ export async function rateGeneratedCode( buildResult, serveTestingResult, repairAttempts, + testResult, + testRepairAttempts, outputFiles.length, axeRepairAttempts, ratingsResult, @@ -173,6 +178,8 @@ function runPerBuildRating( buildResult: BuildResult, serveResult: ServeTestingResult | null, repairAttempts: number, + testResult: TestExecutionResult | null, + testRepairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, ratingsResult: RatingsResult, @@ -184,6 +191,8 @@ function runPerBuildRating( generatedFileCount, axeRepairAttempts, ratingsResult, + testResult, + testRepairAttempts, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index fceb104..6dcbf1c 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -5,6 +5,7 @@ import type { LlmResponseFile, PromptDefinition, SkippedIndividualAssessment, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {Environment} from '../configuration/environment.js'; @@ -64,6 +65,8 @@ const perBuildRatingSchema = z buildResult: z.custom(), serveResult: z.custom(), repairAttempts: z.number(), + testResult: z.custom(), + testRepairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), ratingsResult: z.record(z.custom()), diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts index 7d94753..a97e927 100644 --- a/runner/ratings/stats.ts +++ b/runner/ratings/stats.ts @@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag let successfulInitialBuilds = 0; let successfulBuildsAfterRepair = 0; let failedBuilds = 0; + let successfulInitialTests = 0; + let successfulTestsAfterRepair = 0; + let failedTests = 0; + let noTestsRun = 0; let runtimeStats: RuntimeStats | undefined; let accessibilityStats: | { @@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag } } + // Calculate test statistics + if (result.testResult) { + if (result.testResult.passed) { + if ((result.testRepairAttempts || 0) === 0) { + successfulInitialTests++; + } else { + successfulTestsAfterRepair++; + } + } else { + failedTests++; + } + } else { + noTestsRun++; + } if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) { runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0}; if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') { @@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag failedBuilds, errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined, }, + tests: { + successfulInitialTests, + successfulTestsAfterRepair, + failedTests, + noTestsRun, + }, buckets, runtime: runtimeStats ? { diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index e28c4b8..586cb32 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -27,8 +27,8 @@ export interface AssessmentConfig { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; - a11yRepairAttempts?: number; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -248,8 +248,12 @@ export interface AttemptDetails { // Note: May not be set in older reports. reasoning?: string; - /** Whether the build failed during an accessibility repair attempt. */ - buildFailedDuringA11yRepair?: boolean; + /** Whether the build failed during an test repair attempt (a11y or unit). */ + buildFailedDuringTestRepair?: boolean; + /** Result of running tests for this attempt. */ + testResult?: TestExecutionResult; + /** The number of repair attempts made for tests in this attempt. */ + testRepairAttempts?: number; } /** Statistics related to the build process of the generated applications. */ @@ -264,6 +268,18 @@ export interface RunSummaryBuilds { errorDistribution?: Partial>; } +/** Statistics related to the test process of the generated applications. */ +export interface RunSummaryTests { + /** The number of applications that had tests run and all tests passed on the first attempt. 
*/ + successfulInitialTests: number; + /** The number of applications that had tests run and all tests passed after repair attempts. */ + successfulTestsAfterRepair: number; + /** The number of applications that had tests run but tests failed even after repair attempts. */ + failedTests: number; + /** The number of applications that did not have tests run (no test command configured). */ + noTestsRun: number; +} + /** Buckets into which scores can be categorized. */ export interface ScoreBucket { /** Plain name of the bucket, e.g. "Good" */ @@ -298,6 +314,8 @@ export interface AggregatedRunStats { buckets: ScoreBucket[]; /** Runtime stats. Not present for reports that didn't request runtime error collection. */ runtime?: RuntimeStats; + /** Test stats. Not present for reports that didn't run tests or older reports. */ + tests?: RunSummaryTests; accessibility?: { appsWithErrors: number; @@ -476,6 +494,10 @@ export interface AssessmentResult { axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ toolLogs?: ToolLogEntry[]; + /** Result of running unit tests. */ + testResult: TestExecutionResult | null; + /** Number of repair attempts for tests. */ + testRepairAttempts?: number; } /** @@ -565,3 +587,9 @@ export interface LlmGenerateFilesRequest { /** Directory in which the generation will occur. */ directory: string; } + +/** Result of running tests. */ +export interface TestExecutionResult { + passed: boolean; + output: string; +}
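
For context when reviewing: a minimal local-executor environment configuration that opts into the new flow could look like the sketch below. Only the `buildCommand`, `serveCommand`, and `testCommand` property names come from this patch and its docs; the concrete script names are placeholders rather than part of the change.

```ts
// Illustrative only: an environment config object that enables test execution.
// The npm scripts are assumptions; any command that exits with code 0 on success works.
const exampleEnvironmentConfig = {
  buildCommand: 'npm run build',
  serveCommand: 'npm run start --port 0',
  // New in this patch: runs after a successful build. stdout/stderr are captured
  // and fed back to the LLM for repair attempts, and the command times out after 4 minutes.
  testCommand: 'npm run test -- --watch=false',
};
```

With a test command configured, the new `common-successful-tests` rating scores 1 for a clean pass, 1 / (repairAttempts + 1) when the tests only pass after repair (e.g. 0.5 after one repair attempt), and 0 when the tests still fail.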