In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Product Fidelity Evaluation with Gecko

This notebook illustrates how to assess product image fidelity by using Gemini to create a detailed ground-truth description of a reference image, which then serves as the prompt for the rubric-based Gecko evaluation metric to score candidate images.

Gemini is used to generate a detailed description of an original product image, and that description is then used as the "prompt" for Gecko.

By generating a high-fidelity text description of your product, you effectively convert an **image-to-image consistency** task (checking if the generated product matches the original) into the **text-to-image alignment** task that Gecko is built to perform.

The intent of this approach is to transform an image-consistency task into a text-alignment task, so that Gecko's "rubric-based" and "interpretable" nature can be used to receive granular, diagnostic feedback on exactly which product details (such as color, shape, or texture) were preserved or lost.

This process can be automated at scale using the Google Cloud GenAI Evaluation Suite. This notebook generates an evaluation report that can be used for human-in-the-loop (HITL) review.

## Evaluation Framework Overview

Gecko is architected as a reference-free "text-to-image" evaluation metric that assesses alignment by verifying if specific keywords and attributes from a text prompt appear in a generated image.

### The Rubric Generation Step 

This phase converts a text prompt into a comprehensive testing rubric. First, an LLM decomposes the prompt into key semantic elements (entities, attributes, and relationships) to ensure the evaluation covers the entire prompt rather than just parts of it. The system then generates specific Question-Answer (QA) pairs to verify these elements. Uniquely, Gecko employs a Natural Language Inference (NLI) model to filter these pairs, removing any hallucinated questions that are not factually grounded in the original text.


### The Validator Step 
In this final stage, a multimodal model (such as Gemini) serves as the "rubric validator," probing the generated media against the created QA pairs. Rather than simply marking answers as correct or incorrect, the validator calculates a normalized score based on the probability of the predicted answers. This approach captures the model's uncertainty, offering a more nuanced assessment of alignment than binary scoring.

# Get started

In [1]:
# @title ### Install Vertex AI SDK for Python and other required packages

# Installs the Vertex AI SDK together with its `evaluation` extra; later cells
# call the SDK's `evals` client (`generate_rubrics` and `evaluate`).
%pip install --upgrade --quiet "google-cloud-aiplatform[evaluation]>=1.122.0"

Note: you may need to restart the kernel to use updated packages.


In [None]:
# # @title ### Authenticate your notebook environment (Colab only)
# # @markdown If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

# import sys

# if "google.colab" in sys.modules:
#     from google.colab import auth

#     auth.authenticate_user()



In [1]:
# @title ### Set Google Cloud project information
# @markdown To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).
# @markdown Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

# @markdown ---

# NOTE(review): hard-coded values - replace them with your own project ID and
# location before running this notebook.
PROJECT_ID = "cpg-cdp"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION= "global"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}

# from vertexai import Client, types
# The Vertex AI SDK names are imported under aliases so that they do not collide
# with `google.genai.types`, which is imported below as plain `types`.
from vertexai import types as vertex_types
from vertexai import Client as VertexClient
from google import genai
from google.genai import types
import pandas as pd

# Gen AI SDK client configured to go through Vertex AI; it is used further down
# to call Gemini when generating the ground-truth description.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Generate Faithful Description based on Original Products

Use Gemini to review the original product image and generate a description that accurately captures the details, fidelity, and faithfulness of the product. This description will act as the "prompt" that Gecko uses to evaluate text-to-image generation.

In [2]:
# Gemini model used to write the ground-truth product description.
MODEL_ID = "gemini-3-pro-preview"

# System instruction: fixes the model's role and the rules for the description
# (objective wording, ignore humans, exhaustive attributes, no invented details,
# single-paragraph output). Passed as `system_instruction` in the request config.
system_instruction = """
You are an automated Visual Quality Assurance Specialist designed to support the text-to-image evaluation metric. Your task is to analyze an original product image and generate a comprehensive "Ground Truth Description."

Your output will be used as the reference prompt to evaluate if a generated image faithfully reproduces the original product. Therefore, precision, objectivity, and completeness are critical.

**GUIDELINES:**

1.  **Strict Objectivity:** Do not use subjective adjectives (e.g., "beautiful," "delicious," "stunning," "premium"). Only describe what is visibly verifiable (e.g., "matte finish," "steaming," "high-contrast," "serif font").

2.  **Subject Isolation & Inference:**
    *   **IGNORE HUMANS:** Treat any human elements (hands, faces, models, mannequins) as invisible background noise. Do not describe skin tone, poses, or body parts. Focus 100% of the description on the inanimate object being held, worn, or used.
    *   **Product Identification:** If the product type is not explicitly labeled, use visual cues (form factor, ports, buttons, context) to make a high-confidence deduction of the product's identity (e.g., identifying a rectangular device with a lens as a "compact digital camera" rather than just a "black box").

3.  **Attribute Exhaustiveness:** You must explicitly describe:
    *   **Entities:** The core object(s) present (e.g., "a ceramic coffee mug").
    *   **Colors:** Specific shades and color distribution (e.g., "navy blue body with a white rim").
    *   **Textures/Materials:** Surface details (e.g., "condensation droplets on glass," "brushed aluminum," "woven fabric pattern").
    *   **Text & Logos:** Transcribe visible text *exactly* as it appears, including capitalization and approximate location (e.g., "the word 'COFFEE' printed in bold white letters at the center").
    *   **Spatial Relationships:** Where items are located relative to each other (e.g., "the spoon rests to the right of the saucer").

4.  **Avoid Hallucination:** Do not invent details that are obscured or implied. If a label is unreadable, do not guess the text. Only describe what is clearly visible in pixels.

5.  **Format:** Output a single, dense, coherent paragraph. Semantic decomposition works best with complete sentences that establish relationships between objects and their attributes.
"""

# User prompt sent together with the reference image(s): asks for one merged
# description across all supplied views, excluding people and backgrounds, and
# caps the output at 750 words.
text_prompt = """
I am providing reference images of the SAME product. Your goal is to synthesize a single, definitive "Ground Truth" description that unifies all visual data from these inputs.

**CRITICAL INSTRUCTIONS:**
*   **Human Exclusion:** If the image features a person holding, wearing, or interacting with the product, completely ignore the person in your description. Do not mention hands, fingers, or models. Act as if the product is floating in the air or placed on a neutral surface.
*  **Background Exclusion:** Do not describe the surrounding environment, scenery, or surface on which the product rests. Whether the background is a solid color, a cluttered room, or an outdoor landscape, treat it as invisible. Your description must remain strictly within the physical boundaries of the product itself.
*   **Holistic Synthesis:** Merge details from all angles (front, back, side) into one coherent object profile. If a feature (e.g., a port, label, or texture) is visible in only one image, treat it as a permanent feature of the product.
*   **Intrinsic Properties:** Filter out lighting artifacts (glare, shadows, flash) to describe the object's actual local colors and material finishes.
*   **High-Fidelity Nuances:** You must capture the following specific dimensions:
    *   **Deductive Object Identity:** Explicitly name the product based on its visual features. If the exact model is unknown, provide the most accurate category description possible (e.g., "wireless over-ear noise-canceling headphones").
    *   **Materiality:** Specific surface qualities (e.g., "brushed aluminum," "knitted wool," "matte rubberized grip," "transparent glass").
    *   **Surface Graphics:** Explicitly describe any patterns, gradients, prints, or woven designs.
    *   **Typography & Data:** Transcribe all visible text, logos, and numbers *verbatim*, noting their color and placement.

**OUTPUT FORMAT:**
Produce a single, dense, highly descriptive paragraph. 750 words max.
"""

In [3]:
# @title Input Images (GCS)
import mimetypes

# Replace these with your actual GCS paths
gcs_image_uris = [
    "gs://sandbox-401718-product-fidelity-eval/SKU007-unicorn.jpg",
    # "gs://sandbox-401718-product-fidelity-eval/side_view.png",
]

# One Part per reference image. The MIME type is derived from each URI's file
# extension so that, for example, a .jpg object is declared as image/jpeg
# instead of always being sent as image/png. PNG is kept as the fallback for
# URIs whose extension is missing or unknown.
content_parts = [
    types.Part.from_uri(
        file_uri=uri,
        mime_type=mimetypes.guess_type(uri)[0] or "image/png",
    )
    for uri in gcs_image_uris
]

# The text instructions follow the images in the same request.
content_parts.append(text_prompt)

# Generate the description
response = client.models.generate_content(
    model=MODEL_ID,
    contents=content_parts,
    config=types.GenerateContentConfig(
        system_instruction=system_instruction,
        temperature=1,
    ),
)

generated_ground_truth_prompt = response.text

# Fail here rather than later: an empty description would otherwise be used as
# the evaluation prompt in the cells that follow.
if not generated_ground_truth_prompt:
    raise ValueError(
        "Gemini returned no text for the ground-truth description; "
        "check the input image URIs and the model response."
    )

print(generated_ground_truth_prompt)



This visible object is an inflatable, full-body costume designed to resemble a standing, anthropomorphic unicorn. The costume is constructed from a lightweight, white synthetic material, likely nylon or polyester, which exhibits a puffy, billowy volume with soft wrinkles characteristic of fan-inflated garments. The large, oversized head features a conical yellow horn with spiral segmentation at the crown and two ears with white exteriors and purple triangular interiors. A prominent, bulbous pink snout section defines the face, detailed with a simple black curved line depicting a mouth and a vertical center seam. On the cheek, there is a small, dark graphic element resembling a star. Positioned directly beneath the unicorn's chin, at the neck level, is a clear, rectangular plastic viewing window. The torso is dominated by a large, light teal-blue oval panel covering the belly area, bisected by a vertical seam. The upper limbs consist of inflated white sleeves that terminate in distinct 

# Prepare the Evaluation dataset

In the following dataset, the ground-truth description generated above is used as the prompt for every generated candidate. To demonstrate the difference in the Gecko evaluation between high-quality and low-quality responses, you can also add a counterexample: a candidate that is similar to the reference product but does not exactly match it.

In [5]:
# @title Create Evaluation Dataset (GCS)

# GCS locations of the generated candidate videos to score
# (e.g., one high quality, one low quality)
eval_video_uris = [
    "gs://sandbox-401718-product-fidelity-eval/generated/01-unicorn-veo.mp4",
]

# The description generated in the previous step is the prompt for every candidate.
ground_truth_prompt = generated_ground_truth_prompt

# One model-role response per candidate, referencing the video by its GCS URI.
responses = [
    {
        "parts": [
            {"file_data": {"mime_type": "video/mp4", "file_uri": video_uri}},
        ],
        "role": "model",
    }
    for video_uri in eval_video_uris
]

# Pair each response with the same ground-truth prompt.
prompts = [ground_truth_prompt for _ in responses]

# Assemble the two aligned columns into the evaluation DataFrame.
eval_dataset = pd.DataFrame({"prompt": prompts, "response": responses})

print(f"Dataset created with {len(eval_dataset)} items.")
eval_dataset.head()

Dataset created with 1 items.


Unnamed: 0,prompt,response
0,"This visible object is an inflatable, full-bod...",{'parts': [{'file_data': {'mime_type': 'video/...


# Run evaluation

In [7]:
# from vertexai import Client, types

# Vertex AI SDK client (the `Client` class is imported earlier under the alias
# `VertexClient`); its `evals` interface is used by the cells below.
vertex_client = VertexClient(project=PROJECT_ID, location=LOCATION)

In [8]:
# @title ### Generate rubrics
# @markdown First we generate rubrics for the user prompts.

# Rubrics are generated with the predefined Gecko text-to-video spec and are
# stored under `rubric_group_name`; a later cell reads them back by that name.
data_with_rubrics = vertex_client.evals.generate_rubrics(
    src=eval_dataset,
    rubric_group_name="gecko_video_rubrics",
    predefined_spec_name=vertex_types.RubricMetric.GECKO_TEXT2VIDEO,
)



In [23]:
# data_with_rubrics.show() # crashes because of bytes input

In [12]:
# Select the first row's 'rubric_groups' cell
first_row_data = data_with_rubrics.eval_dataset_df['rubric_groups'].iloc[0]

# Look up the rubric list by its group name; this is the same string that was
# passed as `rubric_group_name` when the rubrics were generated.
rubrics_list = first_row_data['gecko_video_rubrics']

# Optional inspection: uncomment the lines below to print each generated rubric.
# Iterate and print the details
# print(f"Found {len(rubrics_list)} rubric questions:\n")

# for i, rubric in enumerate(rubrics_list):
#     print(f"--- Question {i+1} ---")
#     print(rubric)
#     # If the output is still cluttered, try printing specific attributes:
#     # print(f"Question: {rubric.description}")
#     # print(f"Score: {rubric.score}")
#     print("\n")

In [13]:
# @title ### Evaluate with rubrics
# @markdown Then we use the generated rubrics to evaluate the quality of the responses.

# Score the responses using the same Gecko text-to-video metric that was used
# as the predefined spec when the rubrics were generated.
eval_result = vertex_client.evals.evaluate(
    dataset=data_with_rubrics,
    metrics=[vertex_types.RubricMetric.GECKO_TEXT2VIDEO],
)

# eval_result.show()

Computing Metrics for Evaluation Dataset: 100%|██████████| 1/1 [00:27<00:00, 27.81s/it]


### Generate Product Report for HITL Review

In [26]:
import base64
import html
import os
from google.cloud import storage

def _load_media_as_base64(path: str) -> "tuple[str | None, str | None, str | None]":
    """
    Load an image or video and return it base64-encoded for inline HTML embedding.

    Args:
        path: A local file path or a GCS URI of the form ``gs://bucket/object``.

    Returns:
        A ``(base64_data, mime_type, media_category)`` tuple, where
        ``media_category`` is ``'image'`` or ``'video'``. The MIME type is chosen
        from the file extension; unknown or missing extensions fall back to
        ``('image/png', 'image')``. If the media cannot be loaded for any reason,
        a warning is printed and ``(None, None, None)`` is returned.
    """
    # Supported extensions -> (MIME type, media category).
    mime_map = {
        'png': ('image/png', 'image'),
        'jpg': ('image/jpeg', 'image'),
        'jpeg': ('image/jpeg', 'image'),
        'gif': ('image/gif', 'image'),
        'webp': ('image/webp', 'image'),
        'mp4': ('video/mp4', 'video'),
        'mov': ('video/quicktime', 'video'),
        'webm': ('video/webm', 'video'),
    }

    try:
        # Only look at the final path component, so that a dot inside a
        # directory or bucket name is never mistaken for the file extension.
        file_name = path.rsplit('/', 1)[-1]
        ext = file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else ''
        mime_type, media_category = mime_map.get(ext, ('image/png', 'image'))

        if path.startswith('gs://'):
            # Split "bucket/object/path" at the first slash only.
            bucket_name, _, blob_path = path[len('gs://'):].partition('/')
            if not bucket_name or not blob_path:
                # Reject bucket-only URIs up front instead of asking the
                # storage API for an object with an empty name.
                raise ValueError("expected a URI of the form gs://bucket/object")

            storage_client = storage.Client()
            blob = storage_client.bucket(bucket_name).blob(blob_path)
            media_bytes = blob.download_as_bytes()
        else:
            # Local file
            with open(path, 'rb') as f:
                media_bytes = f.read()

        b64_data = base64.b64encode(media_bytes).decode('utf-8')
        return b64_data, mime_type, media_category

    except Exception as e:
        # Deliberately best-effort: the report shows a placeholder for media
        # that could not be loaded instead of aborting.
        print(f"Warning: Could not load media {path}: {e}")
        return None, None, None

def create_gecko_html_report(eval_result, source_uris=None, filename="gecko_report.html"):
  """
  Generates a standalone HTML report with collapsible candidate media sections.

  The report contains the source media next to the ground-truth prompt, a
  summary bar (mean score and number of candidates), and one collapsible
  section per evaluated candidate showing its score badge, embedded media and
  rubric verdicts. Candidates scoring below 0.7 are expanded by default.

  Args:
    eval_result: Evaluation result object. This function reads
      `evaluation_dataset[0].eval_dataset_df`, `eval_case_results` and
      `summary_metrics` from it.
    source_uris: Optional iterable of local paths or GCS URIs of the reference
      media to show in the ground-truth section.
    filename: Path of the HTML file to write.

  Returns:
    None. On success the report is written to `filename` and a confirmation
    message is printed. If the evaluation dataset is missing or empty, an
    error message is printed and no file is written.
  """

  # 1. Access Data safely
  try:
    df = eval_result.evaluation_dataset[0].eval_dataset_df
  except (AttributeError, IndexError, TypeError):
    print("Error: Could not find evaluation dataset in result object.")
    return

  # The global prompt is read from the first row, so an empty dataset cannot
  # be reported on.
  if df is None or len(df) == 0:
    print("Error: Evaluation dataset is empty; nothing to report.")
    return

  results = eval_result.eval_case_results or []

  # Extract the Global Prompt
  global_prompt_text = str(df.iloc[0]['prompt'])

  # Handle Summary Metrics
  if eval_result.summary_metrics:
    summary = eval_result.summary_metrics[0]
    s_val = summary.mean_score if summary.mean_score is not None else 0.0
    mean_score = f"{s_val:.2f}"
  else:
    mean_score = "N/A"

  # --- BUILD SOURCE MEDIA HTML (with actual media) ---
  source_media_html = ""
  if source_uris:
    for uri in source_uris:
      name = uri.split('/')[-1]
      b64_data, mime_type, media_category = _load_media_as_base64(uri)

      if b64_data:
        if media_category == 'video':
          media_tag = f'<video src="data:{mime_type};base64,{b64_data}" controls style="max-height: 160px; max-width: 200px; border-radius: 4px; border: 1px solid #ccc;"></video>'
        else:
          media_tag = f'<img src="data:{mime_type};base64,{b64_data}" alt="{html.escape(name)}" style="max-height: 160px; max-width: 200px; border-radius: 4px; border: 1px solid #ccc;">'
      else:
        media_tag = f'''<div style="height: 160px; width: 160px; background: #eee; display: flex; align-items: center; justify-content: center; border: 1px solid #ccc; border-radius: 4px;">
           <span style="font-size: 0.8em; color: #555;">Failed to load:<br>{html.escape(name)}</span>
        </div>'''

      source_media_html += f"""
      <div class="orig-item">
        {media_tag}
        <div class="orig-label">{html.escape(name)}</div>
      </div>
      """

  # 2. Start HTML String with updated CSS for collapsible sections
  html_content = f"""
  <!DOCTYPE html>
  <html lang="en">
  <head>
    <meta charset="UTF-8">
    <title>Gecko Evaluation Report</title>
    <style>
      body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; margin: 0; background: #f4f4f4; color: #333; }}
      .main-container {{ max-width: 1200px; margin: 30px auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 15px rgba(0,0,0,0.08); }}
      h1 {{ color: #1a73e8; margin-top: 0; border-bottom: 2px solid #eee; padding-bottom: 10px; }}
      .ground-truth-section {{ display: flex; gap: 20px; margin-bottom: 30px; background: #fff; border: 1px solid #e0e0e0; border-radius: 8px; padding: 20px; }}
      .gt-images {{ flex: 1; display: flex; gap: 15px; flex-wrap: wrap; }}
      .orig-item {{ text-align: center; }}
      .gt-prompt-box {{ flex: 1.2; background: #f8f9fa; border: 1px solid #eee; border-radius: 4px; padding: 15px; display: flex; flex-direction: column; }}
      .gt-prompt-label {{ font-weight: bold; color: #1a73e8; margin-bottom: 8px; font-size: 0.9em; text-transform: uppercase; }}
      .gt-prompt-text {{ font-size: 0.9em; line-height: 1.5; color: #444; overflow-y: auto; max-height: 200px; white-space: pre-wrap; }}
      .summary-bar {{ background: #e8f0fe; padding: 12px 20px; border-radius: 5px; margin-bottom: 25px; border-left: 5px solid #1a73e8; display: flex; gap: 30px; align-items: center; }}
      .summary-val {{ font-weight: bold; color: #1a73e8; }}

      /* Collapsible candidate styles */
      .candidate-list {{ display: flex; flex-direction: column; gap: 12px; }}
      .candidate-item {{ border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; }}
      .candidate-item[open] {{ box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
      .candidate-item summary {{
        padding: 15px 20px;
        background: #fafafa;
        cursor: pointer;
        display: flex;
        align-items: center;
        gap: 15px;
        list-style: none;
        border-bottom: 1px solid transparent;
      }}
      .candidate-item[open] summary {{
        border-bottom-color: #e0e0e0;
        background: #f5f5f5;
      }}
      .candidate-item summary::-webkit-details-marker {{ display: none; }}
      .candidate-item summary::before {{
        content: "\\25B6";
        font-size: 0.7em;
        color: #666;
        transition: transform 0.2s;
      }}
      .candidate-item[open] summary::before {{
        transform: rotate(90deg);
      }}
      .summary-info {{ flex: 1; display: flex; align-items: center; gap: 15px; }}
      .summary-name {{ font-weight: 500; color: #333; }}
      .summary-stats {{ color: #666; font-size: 0.85em; }}

      .candidate-content {{ padding: 20px; display: flex; gap: 20px; background: #fff; }}
      .candidate-media {{ flex: 0 0 450px; }}
      .candidate-media img, .candidate-media video {{ max-width: 100%; height: auto; border-radius: 4px; border: 1px solid #eee; }}
      .candidate-media .gcs-placeholder {{
        height: 200px;
        background: #f0f0f0;
        display: flex;
        align-items: center;
        justify-content: center;
        border-radius: 4px;
        text-align: center;
        padding: 10px;
      }}
      .candidate-verdicts {{ flex: 1; }}

      .score-badge {{ display: inline-block; padding: 5px 12px; border-radius: 15px; font-weight: bold; font-size: 0.9em; }}
      .score-high {{ background: #e6f4ea; color: #188038; }}
      .score-medium {{ background: #fef7e0; color: #b06000; }}
      .score-low {{ background: #fce8e6; color: #d93025; }}

      .rubric-list {{ list-style-type: none; padding: 0; margin: 0; }}
      .rubric-item {{ margin-bottom: 6px; padding: 10px 12px; background: #fafafa; border: 1px solid #eee; border-left-width: 4px; border-radius: 4px; font-size: 0.9em; }}
      .rubric-pass {{ border-left-color: #188038; background: #f6fef7; }}
      .rubric-fail {{ border-left-color: #d93025; background: #fef7f6; }}
      .rubric-icon {{ margin-right: 8px; }}

      .verdict-summary {{ margin-bottom: 15px; padding: 10px 15px; background: #f8f9fa; border-radius: 4px; font-size: 0.85em; color: #555; }}

      /* Expand/Collapse All buttons */
      .controls {{ margin-bottom: 15px; display: flex; gap: 10px; }}
      .controls button {{
        padding: 8px 16px;
        border: 1px solid #1a73e8;
        background: white;
        color: #1a73e8;
        border-radius: 4px;
        cursor: pointer;
        font-size: 0.85em;
      }}
      .controls button:hover {{ background: #e8f0fe; }}
    </style>
  </head>
  <body>
  <div class="main-container">
    <h1>Gecko Evaluation Report</h1>
    <div class="ground-truth-section">
      <div class="gt-images">{source_media_html}</div>
      <div class="gt-prompt-box">
        <div class="gt-prompt-label">Ground Truth Prompt</div>
        <div class="gt-prompt-text">{html.escape(global_prompt_text)}</div>
      </div>
    </div>
    <div class="summary-bar">
      <div class="summary-item">Average Score: <span class="summary-val">{mean_score} / 1.0</span></div>
      <div class="summary-item">Total Candidates: <span class="summary-val">{len(results)}</span></div>
    </div>

    <div class="controls">
      <button onclick="document.querySelectorAll('.candidate-item').forEach(d => d.open = true)">Expand All</button>
      <button onclick="document.querySelectorAll('.candidate-item').forEach(d => d.open = false)">Collapse All</button>
    </div>

    <div class="candidate-list">
  """

  # 3. Iterate through results
  for case in results:
    index = case.eval_case_index
    # Only the first candidate and the first metric of each case are reported.
    metric_data = case.response_candidate_results[0].metric_results
    metric_key = list(metric_data.keys())[0]
    data = metric_data[metric_key]
    score = data.score if data.score is not None else 0.0
    verdicts = data.rubric_verdicts

    # Get Original Data
    row = df.iloc[index]

    # Determine score class
    if score >= 0.7:
      score_class = "score-high"
    elif score >= 0.4:
      score_class = "score-medium"
    else:
      score_class = "score-low"

    # Extract media name and build media HTML
    media_name = f"Candidate {index + 1}"
    try:
      parts = row['response'].get('parts', [])
      if parts and 'file_data' in parts[0]:
        file_uri = parts[0]['file_data']['file_uri']
        media_name = file_uri.split('/')[-1]

        # Load actual media from GCS
        b64_data, mime_type, media_category = _load_media_as_base64(file_uri)
        if b64_data:
          if media_category == 'video':
            media_html = f'<video src="data:{mime_type};base64,{b64_data}" controls></video>'
          else:
            media_html = f'<img src="data:{mime_type};base64,{b64_data}" alt="{html.escape(media_name)}">'
        else:
          gcs_link = f"https://console.cloud.google.com/storage/browser/_details/{file_uri.replace('gs://', '')}"
          # The link is escaped because it is built from the URI and placed
          # inside an HTML attribute.
          media_html = f"""<div class="gcs-placeholder">
                <div>
                  <div style="font-size: 2em; margin-bottom: 10px;">⚠️</div>
                  <div style="font-size: 0.85em; color: #d93025;">Failed to load media</div>
                  <a href="{html.escape(gcs_link)}" target="_blank" style="font-size: 0.8em;">{html.escape(media_name)}</a>
                </div>
              </div>"""
      elif parts and 'inline_data' in parts[0]:
        media_bytes = parts[0]['inline_data']['data']
        b64_media = base64.b64encode(media_bytes).decode('utf-8')
        mime_type = parts[0]['inline_data'].get('mime_type', 'image/png')
        if mime_type.startswith('video/'):
          media_html = f'<video src="data:{mime_type};base64,{b64_media}" controls></video>'
        else:
          media_html = f'<img src="data:{mime_type};base64,{b64_media}" alt="Generated Media">'
      else:
        media_html = "<div class='gcs-placeholder'><div style='color:orange;'>No media data</div></div>"
    except Exception as e:
      # Best-effort: a broken candidate renders as an error box instead of
      # aborting the report. The message is escaped so that any markup
      # characters in it cannot break or inject into the page.
      media_html = f"<div class='gcs-placeholder'><div style='color:red;'>Media Error: {html.escape(str(e))}</div></div>"

    # Count pass/fail verdicts
    pass_count = 0
    fail_count = 0
    verdicts_html = "<ul class='rubric-list'>"
    if verdicts:
      for v in verdicts:
        raw_verdict = getattr(v, 'verdict', False)
        # The verdict is compared as text, so both True and "true" count as a pass.
        is_pass = str(raw_verdict).lower() == 'true'
        if is_pass:
          pass_count += 1
          css_class = "rubric-pass"
          icon = "✓"
        else:
          fail_count += 1
          css_class = "rubric-fail"
          icon = "✗"
        try:
          text = v.evaluated_rubric.content.property.description
        except AttributeError:
          # Fall back to the verdict's string form when it has no description.
          text = str(v)
        verdicts_html += f"<li class='rubric-item {css_class}'><span class='rubric-icon'>{icon}</span>{html.escape(str(text))}</li>"
    else:
      verdicts_html += "<li class='rubric-item'>No details available</li>"
    verdicts_html += "</ul>"

    total_verdicts = pass_count + fail_count
    verdict_summary = f"{pass_count} passed, {fail_count} failed" if total_verdicts > 0 else "No verdicts"

    # Build collapsible section (open by default for low scores)
    open_attr = "open" if score < 0.7 else ""

    html_content += f"""
      <details class="candidate-item" {open_attr}>
        <summary>
          <div class="summary-info">
            <span class="summary-name">{html.escape(media_name)}</span>
            <span class="score-badge {score_class}">{score:.2f}</span>
            <span class="summary-stats">{verdict_summary}</span>
          </div>
        </summary>
        <div class="candidate-content">
          <div class="candidate-media">
            {media_html}
          </div>
          <div class="candidate-verdicts">
            <div class="verdict-summary">
              <strong>Rubric Results:</strong> {pass_count}/{total_verdicts} criteria passed
            </div>
            {verdicts_html}
          </div>
        </div>
      </details>
    """

  html_content += """
    </div>
  </div>
  </body>
  </html>
  """

  with open(filename, "w", encoding="utf-8") as f:
    f.write(html_content)

  print(f"Report generated successfully: {filename}")

# Build the HTML report; the reference product images are passed as source
# media so they are shown next to the ground-truth prompt.
create_gecko_html_report(eval_result, source_uris=gcs_image_uris)


Report generated successfully: gecko_report.html
