In [None]:
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

# Use Gen AI Evaluation SDK to Evaluate Models

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_models_in_vertex_ai_studio_and_model_garden.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluate_models_in_vertex_ai_studio_and_model_garden.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_models_in_vertex_ai_studio_and_model_garden.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluate_models_in_vertex_ai_studio_and_model_garden.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>    
</table>

## Preparation

### Install required packages

In [None]:
%uv run pip install -U -q google-cloud-aiplatform[evaluation]
%uv run pip install -U -q datasets
%uv run pip install -U -q google-genai

### Restart Runtime

In [None]:
import IPython

# Ask the running kernel to shut down with restart=True; this cell comes right
# after the install step, so the restarted kernel starts from a clean state.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Import Libraries

In [None]:
import vertexai
from vertexai.evaluation import (
    EvalTask,
    MetricPromptTemplateExamples,
    PairwiseMetric,
)
from vertexai.generative_models import GenerativeModel
from vertexai.preview.evaluation import notebook_utils
import sys

### Authenticate your notebook environment (Colab only)

In [None]:
# Only authenticate interactively when the Colab runtime module is loaded;
# in any other environment this cell is a no-op.
if sys.modules.get("google.colab") is not None or "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

In [None]:
# Notebook form fields: fill these in before running the cell.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialise the SDK only when a real project ID was supplied; an empty value
# or the untouched placeholder stops the notebook with a clear error instead.
if PROJECT_ID and PROJECT_ID != "[your-project-id]":
    vertexai.init(location=LOCATION, project=PROJECT_ID)
else:
    raise ValueError("Please set your PROJECT_ID")

## Load an evaluation dataset

Load a subset of the `OpenOrca` dataset using the Hugging Face `datasets` library. We take the first 100 rows of the "train" split of the `OpenOrca` dataset and shuffle them to demonstrate evaluating prompts and model responses in this notebook.

### Dataset Summary

The OpenOrca dataset is a collection of augmented [FLAN Collection data](https://arxiv.org/abs/2301.13688). Currently ~1M GPT-4 completions, and ~3.2M GPT-3.5 completions. It is tabularized in alignment with the distributions presented in the ORCA paper and currently represents a partial completion of the full intended dataset, with ongoing generation to expand its scope. The data is primarily used for training and evaluation in the field of natural language processing.


In [None]:
from datasets import load_dataset

# Number of rows pulled from the head of the "train" split, and the seed used
# to shuffle them so that every Restart & Run All yields the same row order.
NUM_SAMPLES = 100
SAMPLE_SEED = 42

# Load the slice, convert it to a pandas DataFrame, drop the "id" column and
# rename "response" to "reference".
ds = (
    load_dataset(
        "Open-Orca/OpenOrca",
        data_files="1M-GPT4-Augmented.parquet",
        split=f"train[:{NUM_SAMPLES}]",
    )
    .to_pandas()
    .drop(columns=["id"])
    .rename(columns={"response": "reference"})
)

# Shuffle the loaded rows with a fixed seed; without `random_state` the order
# changes on every execution, which makes the displayed results irreproducible.
dataset = ds.sample(n=NUM_SAMPLES, random_state=SAMPLE_SEED)

### Preview Dataset

In [None]:
# Show the first rows of the sampled evaluation set as a quick sanity check.
dataset.head()

## Initialize Models to be Evaluated

In [None]:
# Notebook form fields: the model name / endpoint resource name of the two
# models that will be evaluated.
first_model_id = "gemini-2.0-flash"  # @param {type:"string"}
second_model_id = "[your-gemini-1.5-finetuned-endpoint-URI]"  # @param {type:"string"}

# Fail fast when the tuned-model endpoint placeholder has not been replaced,
# rather than silently falling back to an endpoint in someone else's project.
if not second_model_id or second_model_id == "[your-gemini-1.5-finetuned-endpoint-URI]":
    raise ValueError(
        "Please set second_model_id to your tuned model endpoint, e.g. "
        "projects/<project>/locations/<location>/endpoints/<endpoint-id>"
    )

# Build both models from the form values above (previously the IDs were
# hardcoded and the form fields were ignored). Both use temperature 0.
first_model = GenerativeModel(
    first_model_id,
    generation_config={"temperature": 0},
)

second_model = GenerativeModel(
    second_model_id,
    generation_config={"temperature": 0},
)

## Evaluation

Details of the available metrics can be viewed [here](https://cloud.google.com/vertex-ai/generative-ai/docs/models/metrics-templates).

### Pointwise Evaluation

This evaluation type produces a score for each evaluated metric, e.g. 1-5, where higher is better.

In [None]:
# Prompt sent to each model; the {system_prompt} and {question} placeholders
# are filled from the dataset columns of the same name.
pointwise_prompt_template = "# System_prompt\n{system_prompt} # Question\n{question}"

# One EvalTask is shared by both models so they are scored on the same
# dataset with the same three metrics.
pointwise_eval_task = EvalTask(
    metrics=[
        "groundedness",
        "instruction_following",
        "question_answering_quality",
    ],
    dataset=dataset,
)

# Evaluate the first model, then the second, with an identical prompt template.
first_model_result = pointwise_eval_task.evaluate(
    prompt_template=pointwise_prompt_template,
    model=first_model,
)

second_model_result = pointwise_eval_task.evaluate(
    prompt_template=pointwise_prompt_template,
    model=second_model,
)

#### Show Results

In [None]:
# Display the evaluation result of the first model. The result object returned
# by `evaluate()` must be passed here, not the model itself.
notebook_utils.display_eval_result(
    first_model_result, title="Gemini 2.0 Flash Evaluation Results"
)

In [None]:
# Display the evaluation result of the second (fine-tuned) model. The result
# object returned by `evaluate()` must be passed here, not the model itself.
notebook_utils.display_eval_result(
    second_model_result, title="Gemini 1.5 Flash Finetuned Evaluation Results"
)

### Pairwise Evaluation

This evaluation type directly compares two models against each other and shows the win rate of each model.

In [None]:
# Define an EvalTask with a list of metrics
listed_metrics = ["groundedness", "instruction_following", "question_answering_quality"]

pairwise_metrics = [
    PairwiseMetric(
        metric=metric,
        metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(metric),
        # Define a baseline model to compare against
        baseline_model=first_model,
    )
    for metric in listed_metrics
]

pairwise_result = EvalTask(
    dataset=dataset,
    metrics=pairwise_metrics,
).evaluate(
    model=second_model,
    prompt_template="# System_prompt\n{system_prompt} # Question\n{question}",
)

#### Show Results

In [6]:
# Display the pairwise comparison stored in `pairwise_result` under a descriptive title.
notebook_utils.display_eval_result(
    pairwise_result, title="Gemini 2.0 Flash vs Gemini 1.5 Flash Finetuned Results"
)

Check the detailed explanations

In [None]:
# Show the explanations attached to the pairwise result.
# NOTE(review): num=1 presumably limits the output to a single example — confirm against the SDK docs.
notebook_utils.display_explanations(pairwise_result, num=1)