Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions report-app/src/app/pages/report-viewer/report-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,18 @@ <h4>Repair System Prompt</h4>
</expansion-panel>
}

@if (report.details.summary.additionalAiAnalysis !== undefined) {
@for (item of report.details.summary.additionalAiAnalysis; track item) {
<expansion-panel size="large" class="root-section">
<expansion-panel-header>
<img src="gemini.webp" alt="Gemini Logo" height="30" width="30" />
{{item.name}}
</expansion-panel-header>
<div [innerHTML]="item.summary"></div>
</expansion-panel>
}
}

@if (missingDeps().length > 0) {
<expansion-panel size="large" class="root-section">
<expansion-panel-header>
Expand Down
19 changes: 19 additions & 0 deletions runner/configuration/environment-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
LocalExecutorConfig,
localExecutorConfigSchema,
} from '../orchestration/executors/local-executor-config.js';
import {RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js';

export const environmentConfigSchema = z.object({
/** Display name for the environment. */
Expand Down Expand Up @@ -98,6 +99,24 @@ export const environmentConfigSchema = z.object({
* It's useful to ensure that the set of ratings hasn't changed between two runs.
*/
expectedRatingHash: z.string().optional(),

/**
* Prompts to use for additional analysis of the eval results.
*/
analysisPrompts: z
.array(
z.object({
name: z.string(),
path: z.string(),
reportsFilter: z
.enum([ReportContextFilter.AllReports, ReportContextFilter.NonPerfectReports])
.optional(),
ratingsFilter: z
.enum([RatingContextFilter.AllRatings, RatingContextFilter.NonPerfectRatings])
.optional(),
}),
)
.optional(),
});

/**
Expand Down
36 changes: 33 additions & 3 deletions runner/configuration/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import {
FrameworkInfo,
MultiStepPromptDefinition,
PromptDefinition,
RatingContextFilter,
ReportContextFilter,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {UserFacingError} from '../utils/errors.js';
Expand All @@ -22,6 +24,13 @@ interface CategoryConfig {
maxPoints: number;
}

/** A user-defined analysis prompt, resolved from the environment configuration. */
interface AnalysisPrompt {
/** Name identifying this analysis. */
name: string;
/** Rendered text of the prompt. */
prompt: string;
/** Report context filter to apply when this prompt is run. */
reportsFilter: ReportContextFilter;
/** Rating context filter to apply when this prompt is run. */
ratingsFilter: RatingContextFilter;
}

/** Represents a single prompt evaluation environment. */
export class Environment {
/** Path at which the environment is defined. */
Expand Down Expand Up @@ -56,6 +65,9 @@ export class Environment {
*/
readonly ratingHash: string;

/** Additional analysis prompts defined by the user. */
readonly analysisPrompts: AnalysisPrompt[];

/** Ratings configured at the environment level. */
private readonly ratings: Rating[];

Expand Down Expand Up @@ -88,6 +100,7 @@ export class Environment {
this.ratingCategories = this.getRatingCategories(config);
this.ratings = this.resolveRatings(config);
this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
this.analysisPrompts = this.resolveAnalysisPrompts(config);
this.validateRatingHash(this.ratingHash, config);
}

Expand Down Expand Up @@ -262,7 +275,7 @@ export class Environment {
isEditing: boolean,
metadata: Metadata,
): Promise<PromptDefinition<Metadata>> {
const {result, contextFiles} = await this.renderEnvironmentPrompt(relativePath);
const {result, contextFiles} = this.renderEnvironmentPrompt(relativePath);

return {
name: name,
Expand Down Expand Up @@ -360,13 +373,13 @@ export class Environment {
}

/** Renders a prompt from a path relative to the environment config. */
private async renderEnvironmentPrompt(relativePath: string) {
private renderEnvironmentPrompt(relativePath: string) {
const path = resolve(this.rootPath, relativePath);
return this.renderPrompt(readFileSync(path, 'utf8'), path);
}

private async renderSystemPrompt(relativePath: string) {
const result = await this.renderEnvironmentPrompt(relativePath);
const result = this.renderEnvironmentPrompt(relativePath);

// Optional hooks for post processing environment system prompts. Useful for e.g.
// supporting `@` references from Gemini CLI or inside g3.
Expand Down Expand Up @@ -446,4 +459,21 @@ export class Environment {
);
}
}

/**
 * Turns the `analysisPrompts` entries of the environment config into
 * ready-to-use analysis prompts.
 *
 * Each entry's `path` is rendered as an environment prompt, and filters
 * that were not specified fall back to their "non-perfect" variants.
 *
 * @param config Environment configuration to read the entries from.
 * @returns One resolved prompt per configured entry, in configuration order;
 *     an empty array when no analysis prompts are configured.
 */
private resolveAnalysisPrompts(config: EnvironmentConfig): AnalysisPrompt[] {
  const configured = config.analysisPrompts ?? [];

  return configured.map(entry => ({
    name: entry.name,
    prompt: this.renderEnvironmentPrompt(entry.path).result,
    reportsFilter: entry.reportsFilter ?? ReportContextFilter.NonPerfectReports,
    ratingsFilter: entry.ratingsFilter ?? RatingContextFilter.NonPerfectRatings,
  }));
}
}
40 changes: 39 additions & 1 deletion runner/orchestration/generate-summary.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {redX} from '../reporting/format.js';
import {chatWithReportAI} from '../reporting/report-ai-chat.js';
import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';

Expand Down Expand Up @@ -43,7 +44,7 @@ export async function prepareSummary(

let aiSummary: string | undefined = undefined;
if (generateAiSummaryLlm) {
console.log(`✨ Generating AI summary for evaluation run..`);
console.log(`✨ Generating AI summary for evaluation run...`);
try {
const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments);
inputTokens += result.usage.inputTokens;
Expand All @@ -61,6 +62,42 @@ export async function prepareSummary(
}
}

// Additional, user-defined AI analyses. The prompts are run concurrently, but the
// results are collected from `Promise.all` (which preserves input order) rather than
// pushed from inside the callbacks, so the report lists them in the order in which
// they were configured instead of the order in which the requests happened to finish.
const additionalAiAnalysis: {name: string; summary: string}[] = [];
if (generateAiSummaryLlm && env.analysisPrompts.length > 0) {
  console.log(`✨ Generating additional AI analysis...`);

  const analysisResults = await Promise.all(
    env.analysisPrompts.map(async config => {
      try {
        const result = await chatWithReportAI(
          generateAiSummaryLlm,
          config.prompt,
          abortSignal,
          assessments,
          [],
          model,
          {
            reportContextFilter: config.reportsFilter,
            ratingContextFilter: config.ratingsFilter,
          },
          undefined,
        );
        // Token usage is only accumulated for analyses that succeeded.
        inputTokens += result.usage.inputTokens;
        outputTokens += result.usage.outputTokens;
        thinkingTokens += result.usage.thinkingTokens;
        totalTokens += result.usage.totalTokens;
        return {name: config.name, summary: result.responseHtml};
      } catch (e) {
        // A failing analysis is reported and skipped; it must not abort the others.
        console.log(`${redX()} Failed custom analysis called "${config.name}".`);

        if (process.env.DEBUG === '1' && (e as Partial<Error>).stack) {
          console.error((e as Error).stack);
        }
        return undefined;
      }
    }),
  );

  // Keep only the successful analyses, still in configuration order.
  for (const analysis of analysisResults) {
    if (analysis !== undefined) {
      additionalAiAnalysis.push(analysis);
    }
  }
}

const executorInfo = await env.executor.getExecutorInfo?.();

return {
Expand All @@ -78,6 +115,7 @@ export async function prepareSummary(
},
},
aiSummary,
additionalAiAnalysis,
completionStats: completionStats,
usage: {
inputTokens,
Expand Down
2 changes: 1 addition & 1 deletion runner/reporting/report-ai-chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ ${serializeReportForPrompt(assessmentsToProcess, contextFilters)}
includeThoughts: false,
},
timeout: {
description: `Generating summary for report`,
description: `Chatting with AI`,
durationInMins: 3,
},
abortSignal,
Expand Down
2 changes: 2 additions & 0 deletions runner/shared-interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,8 @@ export interface RunSummary {
completionStats?: CompletionStats;
/** AI summary (as HTML code) of all assessments in this run/report. */
aiSummary?: string;
/** Additional user-defined AI analysis. */
additionalAiAnalysis?: {name: string; summary: string}[];
/**
* Information about the runner that was used for the eval.
* Optional since some older reports might not have it.
Expand Down
Loading