Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion runner/configuration/environment-config.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import z from 'zod';
import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
import {UserFacingError} from '../utils/errors.js';
import {ratingSchema} from '../ratings/rating-types.js';
import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
import {executorSchema} from '../orchestration/executors/executor.js';
import {
Expand All @@ -21,6 +21,11 @@ export const environmentConfigSchema = z.object({
clientSideFramework: z.string(),
/** Ratings to run when evaluating the environment. */
ratings: z.array(ratingSchema),
/**
* Map used to override fields for specific ratings. The key is the unique ID of
* the rating and the value is the override fields.
*/
ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
/** Path to the prompt used by the LLM for generating files. */
generationSystemPrompt: z.string(),
/**
Expand Down Expand Up @@ -72,6 +77,20 @@ export const environmentConfigSchema = z.object({
'Executor to be used for this environment. ' +
'If unset, a local executor is derived from the full environment configuration.',
),

/**
* Map used to override fields for specific rating categories. The key is the unique ID of
* the category and the value is the override fields.
*/
categoryOverrides: z
.record(
z.custom<RatingCategory>(),
z.object({
name: z.string().optional(),
maxPoints: z.number().optional(),
}),
)
.optional(),
});

/**
Expand Down
56 changes: 51 additions & 5 deletions runner/configuration/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {readdirSync, readFileSync, statSync} from 'fs';
import {basename, extname, join, resolve} from 'path';
import {globSync} from 'tinyglobby';
import {Executor} from '../orchestration/executors/executor.js';
import {Rating} from '../ratings/rating-types.js';
import {Rating, RatingCategory} from '../ratings/rating-types.js';
import {
FrameworkInfo,
MultiStepPromptDefinition,
Expand Down Expand Up @@ -38,6 +38,12 @@ export class Environment {
readonly executor: Executor;
/** Timeout for a single eval prompt in minutes. */
readonly promptTimeoutMinutes: number | undefined;
/** Configuration for the individual rating categories. */
readonly ratingCategories: {
[RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
[RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
[RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
};

constructor(
rootPath: string,
Expand Down Expand Up @@ -65,11 +71,12 @@ export class Environment {
this.isBuiltIn = rootPath.includes('node_modules');
this.executor = config.executor;
this.promptTimeoutMinutes = config.promptTimeoutMinutes;
this.ratingCategories = this.getRatingCategories(config);
}

/** Prompts that should be executed as a part of the evaluation. */
executablePrompts = lazy(async () => {
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
});

systemPromptGeneration = lazy(async () => {
Expand Down Expand Up @@ -166,15 +173,32 @@ export class Environment {

/**
* Resolves the prompt configuration into prompt definitions.
* @param rootPath Root path of the project.
* @param prompts Prompts to be resolved.
* @param envRatings Environment-level ratings.
* @param config Configuration for the environment.
*/
private async resolveExecutablePrompts(
prompts: EnvironmentConfig['executablePrompts'],
envRatings: Rating[],
config: EnvironmentConfig,
): Promise<RootPromptDefinition[]> {
const result: Promise<RootPromptDefinition>[] = [];
let envRatings: Rating[];

if (config.ratingOverrides) {
Object.keys(config.ratingOverrides).forEach(id => {
if (!config.ratings.some(rating => rating.id === id)) {
throw new UserFacingError(
`Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
);
}
});

envRatings = config.ratings.map(rating => {
const override = config.ratingOverrides![rating.id];
return override ? {...rating, ...override} : rating;
});
} else {
envRatings = config.ratings;
}

for (const def of prompts) {
if (def instanceof MultiStepPrompt) {
Expand Down Expand Up @@ -353,4 +377,26 @@ export class Environment {

return result;
}

/**
 * Resolves the configuration for each rating category by layering any
 * user-supplied `categoryOverrides` on top of the built-in defaults.
 * @param config Configuration for the environment.
 */
private getRatingCategories(config: EnvironmentConfig) {
  const overrides = config.categoryOverrides;

  // Start from the default name/points and let an override replace either field.
  const resolve = (category: RatingCategory, name: string, maxPoints: number) => ({
    name,
    maxPoints,
    ...overrides?.[category],
  });

  return {
    [RatingCategory.HIGH_IMPACT]: resolve(RatingCategory.HIGH_IMPACT, 'High Impact', 60),
    [RatingCategory.MEDIUM_IMPACT]: resolve(RatingCategory.MEDIUM_IMPACT, 'Medium Impact', 30),
    [RatingCategory.LOW_IMPACT]: resolve(RatingCategory.LOW_IMPACT, 'Low Impact', 10),
  };
}
}
5 changes: 1 addition & 4 deletions runner/ratings/rate-code.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ import {
PerFileRatingContentType,
RatingKind,
RatingCategory,
POINTS_FOR_CATEGORIES,
Rating,
CATEGORY_NAMES,
RatingsResult,
} from './rating-types.js';
import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
Expand Down Expand Up @@ -82,10 +80,9 @@ export async function rateGeneratedCode(
RatingCategory.MEDIUM_IMPACT,
RatingCategory.LOW_IMPACT,
].map(category => ({
...environment.ratingCategories[category],
id: category,
name: CATEGORY_NAMES[category],
points: 0,
maxPoints: POINTS_FOR_CATEGORIES[category],
assessments: [],
}));

Expand Down
20 changes: 6 additions & 14 deletions runner/ratings/rating-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,6 @@ export enum RatingCategory {
LOW_IMPACT = 'low-impact',
}

/** Points correspond to each `RatingCategory`. */
export const POINTS_FOR_CATEGORIES = {
[RatingCategory.HIGH_IMPACT]: 60,
[RatingCategory.MEDIUM_IMPACT]: 30,
[RatingCategory.LOW_IMPACT]: 10,
};

/** Display names for each `RatingCategory`. */
export const CATEGORY_NAMES = {
[RatingCategory.HIGH_IMPACT]: 'High Impact',
[RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
[RatingCategory.LOW_IMPACT]: 'Low Impact',
};

const ratingCommonContextFields = {
ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
prompt: z.custom<PromptDefinition>(),
Expand Down Expand Up @@ -126,6 +112,12 @@ export const ratingSchema = z.union([
llmBasedRatingSchema,
]);

/**
 * Schema for overriding fields of an individual rating. All fields are
 * optional; only the fields that are present replace the rating's own values.
 */
export const ratingOverrideSchema = z.object({
  category: z.custom<RatingCategory>().optional(),
  // Percentage string such as "25%" — validated structurally only.
  scoreReduction: z.custom<`${number}%`>().optional(),
  // Was `.optional().optional()` — the duplicated call is a no-op; one suffices.
  groupingLabels: z.array(z.string()).optional(),
});

/** Result of a per-build rating. */
export type PerBuildRatingResult =
| {
Expand Down
56 changes: 34 additions & 22 deletions runner/reporting/report-ai-chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,6 @@ import {
} from '../shared-interfaces.js';
import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {BUCKET_CONFIG} from '../ratings/stats.js';
import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';

export const reportLlmEvalsToolContext = `## What is a report?
A report consists of many apps that were LLM generated. You will have information
about checks that failed for this LLM generated app.

Note that there may be multiple attempts for an app. E.g. an initial build may fail and
another attempt might have repaired the build failure. The last attempt reflects the final
state of the app. E.g. whether it does build, or if there are runtime errors.

## Scoring mechanism
Apps are rated based on their scores in the following buckets:
${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}

The overall score of an app is determined based on score reductions.
There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
Pillars are a split up of a 100% perfect score, allowing for individual ratings
to be less impactful than others. The pillars are distributed as follows:
${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
Within pillars, the available score can be reduced by individual ratings.
`;

const defaultAiChatPrompt = `Strictly follow the instructions here.
- You are an expert in LLM-based code generation evaluation and quality assessments.
Expand Down Expand Up @@ -90,7 +69,7 @@ export async function chatWithReportAI(
${message}
\`\`\`

${reportLlmEvalsToolContext}
${getContextPrompt(assessmentsToProcess)}

### How many apps are there?
There are ${allAssessments.length} apps in this report.
Expand Down Expand Up @@ -193,3 +172,36 @@ function isAssessmentResultWithID(
): value is AssessmentResultFromReportServer {
return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
}

/**
 * Builds the shared context prompt describing what a report is and how the
 * scoring mechanism (buckets + category "pillars") works for this run.
 * @param assessments Assessment results; the first entry is used to deduce the
 *   category configuration, since categories are identical across a run.
 */
function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
  // Deduce the categories from the first result since they're the same for the entire run.
  const pointsForCategories: Record<string, number> = {};

  if (assessments.length) {
    for (const category of assessments[0].score.categories) {
      pointsForCategories[category.id] = category.maxPoints;
    }
  }

  const categoryCount = Object.keys(pointsForCategories).length;

  return `## What is a report?
A report consists of many apps that were LLM generated. You will have information
about checks that failed for this LLM generated app.

Note that there may be multiple attempts for an app. E.g. an initial build may fail and
another attempt might have repaired the build failure. The last attempt reflects the final
state of the app. E.g. whether it does build, or if there are runtime errors.

## Scoring mechanism
Apps are rated based on their scores in the following buckets:
${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}

The overall score of an app is determined based on score reductions.
There are ${categoryCount} pillars: ${Object.keys(pointsForCategories).join(', ')}
Pillars are a split up of a 100% perfect score, allowing for individual ratings
to be less impactful than others. The pillars are distributed as follows:
${Object.entries(pointsForCategories)
  .map(([id, points]) => `* ${id}: ${points} points.`)
  .join('\n')}
Within pillars, the available score can be reduced by individual ratings.
`;
}
2 changes: 1 addition & 1 deletion runner/shared-interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ export interface LlmContextFile {
export interface AssessmentCategory {
/** Unique ID of the category. */
id: RatingCategory;
/** Display name of the cateogry. */
/** Display name of the category. */
name: string;
/** Points that have been awarded to the category. */
points: number;
Expand Down
Loading