# Prompt Evaluation & Optimization with Vertex AI SDK

---

This notebook demonstrates how to evaluate and optimize prompts
using the Vertex AI Gen AI Evaluation SDK.
It covers dataset preparation, metric setup, an evaluation run, and a prompt-optimization step followed by a re-evaluation.

## Setup and installation


In [1]:
!pip install --upgrade google-cloud-aiplatform[evaluation] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m684.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.1/278.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m753.1/753.1 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os

import pandas as pd
import vertexai
from google.cloud import aiplatform
from google.cloud.aiplatform.utils import gcs_utils
from vertexai.evaluation import EvalTask, PointwiseMetric, PointwiseMetricPromptTemplate


In [None]:
# Project and region are taken from the environment when the variables are
# set, so the notebook can run unmodified on hosts that export them; otherwise
# the placeholder values below are used and must be edited by hand.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "your-gcp-project-id")
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Bind the SDK to the chosen project/region for all later calls in this notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION)
print(f"Initialized Vertex AI for project {PROJECT_ID}, location {LOCATION}")

## Prepare evaluation dataset


In [3]:
# Each record pairs a prompt with a pre-written response and a reference answer.
eval_records = [
    {
        "prompt": "Summarize the benefits of AI in healthcare.",
        "response": "AI improves diagnostics and personalized care.",
        "reference": "AI in healthcare aids diagnosis and treatment personalization.",
    },
    {
        "prompt": "Explain cloud computing in simple terms.",
        "response": "Cloud computing means using remote servers for storage and processing.",
        "reference": "Cloud computing uses internet-based servers instead of local ones.",
    },
]
df = pd.DataFrame.from_records(eval_records, columns=["prompt", "response", "reference"])

# Persist as JSON Lines: one JSON object per row.
df.to_json("eval_dataset.jsonl", orient="records", lines=True)
print("Evaluation dataset created and saved as eval_dataset.jsonl")


Evaluation dataset created and saved as eval_dataset.jsonl


## Define evaluation metrics


In [None]:
# Custom pointwise metric assembled from two criteria and a three-level
# rating rubric; the prompt, response and reference columns are exposed to
# the metric prompt template as input variables.
readability_criteria = {
    "grade_level": "Estimate the U.S. grade-level readability of the response. Lower is simpler, higher is complex.",
    "conciseness": "Evaluate how concise the response is: avoids unnecessary words while preserving meaning.",
}
readability_rubric = {
    "1": "Excellent readability and conciseness.",
    "0": "Average readability and conciseness.",
    "-1": "Poor readability or verbosity.",
}
readability_template = PointwiseMetricPromptTemplate(
    criteria=readability_criteria,
    rating_rubric=readability_rubric,
    input_variables=["prompt", "response", "reference"],
)

readability_metric = PointwiseMetric(
    metric="readability_grade_level",
    metric_prompt_template=readability_template,
)
print("Custom readability metric defined.")


## Run evaluation

In [None]:
# Evaluation over the saved JSONL dataset with the custom readability metric
# plus two reference-based metrics ("bleu", "rouge_l_sum"), logged under a
# named experiment.
eval_task = EvalTask(
    dataset="eval_dataset.jsonl",
    metrics=[readability_metric, "bleu", "rouge_l_sum"],
    experiment="prompt_eval_experiment",
)

print("Running evaluation task...")
# Bring-your-own-response run: the dataset already has a `response` column,
# so the stored answers are scored as-is. `model` / `prompt_template` are
# deliberately omitted -- the SDK rejects `model=` when a `response` column is
# present, and `model` must be a GenerativeModel or a callable rather than a
# resource-name string.
eval_result = eval_task.evaluate()
print("Evaluation completed.")


## View evaluation results


In [None]:
# Aggregate statistics across the whole dataset (EvalResult.summary_metrics).
print(eval_result.summary_metrics)

# Per-row results live in EvalResult.metrics_table (a DataFrame); each metric's
# score is stored in a "<metric name>/score" column.
per_instance = eval_result.metrics_table
per_instance.sort_values(by="readability_grade_level/score", ascending=True).head(5)

## Prompt optimization


In [4]:
# NOTE(review): could not confirm that `vertexai.preview.generative_models`
# exports `PromptOptimizer`, nor the `optimize(...)` keyword arguments used
# below -- verify both against the installed SDK before relying on this cell.
from vertexai.preview.generative_models import PromptOptimizer
optimizer = PromptOptimizer()

# Ask the optimizer to rewrite a single prompt toward the stated goal; the
# result's `optimized_prompt_text` attribute is printed here and reused when
# the re-evaluation dataset is built.
optimized_prompt = optimizer.optimize(
    model="gemini-1.5-pro",
    task_type="text-generation",
    prompt="Explain cloud computing in simple terms.",
    optimization_goal="improve conciseness and clarity"
)
print("Optimized prompt:", optimized_prompt.optimized_prompt_text)


In [None]:
# Single-row dataset for re-scoring: the optimized prompt text paired with a
# fixed response string and a fixed reference string.
optimized_row = {
    "prompt": optimized_prompt.optimized_prompt_text,
    "response": "Cloud computing is using online servers to store and process data instead of your computer.",
    "reference": "Cloud computing uses internet-based servers instead of local ones.",
}
optimized_data = pd.DataFrame([optimized_row], columns=["prompt", "response", "reference"])

# Persist as JSON Lines: one JSON object per row.
optimized_data.to_json("eval_optimized.jsonl", orient="records", lines=True)


In [None]:
# Evaluation task over the optimized-prompt dataset: same three metrics as the
# first run, logged under its own experiment name.
re_eval_metrics = [readability_metric, "bleu", "rouge_l_sum"]
re_eval = EvalTask(
    experiment="prompt_re_eval_experiment",
    metrics=re_eval_metrics,
    dataset="eval_optimized.jsonl",
)


In [None]:
# Bring-your-own-response run: the re-evaluation dataset already has a
# `response` column, so it is scored as-is. `model` / `prompt_template` are
# deliberately omitted -- the SDK rejects `model=` when a `response` column is
# present, and `model` must be a GenerativeModel or a callable rather than a
# resource-name string.
optimized_result = re_eval.evaluate()
print("Re-evaluation completed. Metrics:")
# Aggregate statistics are exposed as EvalResult.summary_metrics.
print(optimized_result.summary_metrics)
