In [1]:
%pip install --quiet --upgrade pip==23.3.1

%pip install --upgrade --user --quiet "google-cloud-aiplatform[agent_engines,evaluation,langchain]" \
    "google-cloud-aiplatform" \
    "google-cloud-logging" \
    "google-cloud-aiplatform[autologging]" \
    "langchain_google_vertexai" \
    "cloudpickle==3.0.0" \
    "pydantic>=2.10" \
    "requests==2.32.3"

In [2]:
# General
import random
import string
import google.cloud.logging
import logging

from IPython.display import HTML, Markdown, display
import pandas as pd

# Build agent
import vertexai
from google.cloud import aiplatform
from vertexai import agent_engines

# Evaluate agent
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    TrajectorySingleToolUse,
)
from vertexai.preview.reasoning_engines import LangchainAgent

# Do not remove logging section
# Creates a Google Cloud Logging client and invokes its setup_logging() hook.
# NOTE(review): presumably this forwards standard `logging` records to Cloud
# Logging for the lab environment — confirm before changing.
client = google.cloud.logging.Client()
client.setup_logging()

In [3]:
# Lab configuration: project, region, staging bucket and experiment name.
PROJECT_ID = "qwiklabs-gcp-03-3e4e70962c05"
LOCATION = "us-central1"
BUCKET_URI = "gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket"
EXPERIMENT_NAME = "evaluate-agent"

import vertexai

# Configure the Vertex AI SDK with the settings above; the later cells
# (agent creation, evaluation tasks) reuse these same constants.
vertexai.init(
    experiment=EXPERIMENT_NAME,
    staging_bucket=BUCKET_URI,
    location=LOCATION,
    project=PROJECT_ID,
)

## Build agent and execute a query

In [4]:
# General
import random
import string

from IPython.display import HTML, Markdown, display

# Build agent
import vertexai
from google.cloud import aiplatform
from vertexai import agent_engines

from vertexai.preview.reasoning_engines import LangchainAgent

In [5]:
def get_product_details(product_name: str):
    """Gathers basic details about a product.

    Args:
        product_name: Name of the product to look up. Matching ignores
            letter case and leading/trailing whitespace, so
            "Garden Furniture" and " garden furniture " both resolve.

    Returns:
        The product description string, or "Product details not found."
        when the name is not in the catalog or is not a string.
    """
    details = {
        "mens blue shorts": "Elevate your summer style with these tailored blue dress shorts. Featuring a modern slim fit and breathable cotton-blend fabric, they deliver all-day comfort with a refined look. Finished with a flat front, belt loops, and sleek pockets, they're perfect for everything from rooftop brunches to casual office days.",
        "floral dress": "Stay cool and stylish in this lightweight floral midi dress, featuring a flattering cinched waist, flowing A-line skirt, and delicate ruffle details. Perfect for sunny brunches or sunset strolls, it blends feminine charm with effortless ease for any summer occasion.",
        "garden furniture": "Create your perfect outdoor retreat with this stylish, weather-resistant garden furniture set featuring plush cushions, timeless design, and all-day comfort — ideal for relaxing, entertaining, or enjoying the outdoors in style.",
        "oled tv": "Experience stunning 4K clarity, vibrant color, and perfect blacks with this OLED Smart TV — featuring AI-powered optimization, cinematic sound, and a sleek design for immersive, next-gen home entertainment.",
        "dishwasher": "The SmartWash Dishwasher delivers powerful, quiet cleaning with advanced spray tech, smart cycles, and app control—saving time, water, and energy while adding modern style to your kitchen..",
    }
    not_found = "Product details not found."

    # This function is handed to the agent as a tool, so the argument is
    # produced by the model and may differ from the catalog keys in case or
    # padding. Anything that is not a string cannot match a key at all.
    if not isinstance(product_name, str):
        return not_found

    # Catalog keys are stored lower-case with no surrounding whitespace.
    return details.get(product_name.strip().lower(), not_found)


def get_product_price(product_name: str):
    """Gathers price about a product.

    Args:
        product_name: Name of the product to look up. Matching ignores
            letter case and leading/trailing whitespace.

    Returns:
        The price as an integer, or the string "Product price not found."
        when the name is not in the catalog or is not a string.
    """
    details = {
        "mens blue shorts": 50,
        "floral dress": 100,
        "garden furniture": 1000,
        "oled tv": 1500,
        "dishwasher": 400,
    }
    not_found = "Product price not found."

    # The argument comes from the model when this runs as an agent tool, so
    # tolerate capitalisation/padding differences instead of missing the key.
    if not isinstance(product_name, str):
        return not_found

    # Catalog keys are stored lower-case with no surrounding whitespace.
    return details.get(product_name.strip().lower(), not_found)

In [6]:
# Model identifier handed to the LangchainAgent constructor below.
model_name = "gemini-2.0-flash"

In [7]:
# Local (in-notebook) agent: the chosen model plus the two product lookup
# functions exposed as tools.
local_1p_agent = LangchainAgent(
    model=model_name,
    tools=[get_product_details, get_product_price],
    # NOTE(review): presumably needed so tool calls are returned with each
    # response for the trajectory evaluations later on — confirm.
    agent_executor_kwargs={"return_intermediate_steps": True},
)

In [8]:
!pip install langchain_google_vertexai



In [9]:
# Ask the local agent for a product description and render the "output"
# field of its response as Markdown.
response = local_1p_agent.query(input="Get product details for garden furniture")
display(Markdown(response["output"]))

Ok. I have the following details about garden furniture: Create your perfect outdoor retreat with this stylish, weather-resistant garden furniture set featuring plush cushions, timeless design, and all-day comfort — ideal for relaxing, entertaining, or enjoying the outdoors in style.


In [10]:
# Ask the local agent for a product price and render the "output" field of
# its response as Markdown. Note that this rebinds `response`.
response = local_1p_agent.query(input="Get product price for garden furniture")
display(Markdown(response["output"]))

The price of the garden furniture is 1000.


In [12]:
# Deploy the local agent through agent_engines.create. The requirements list
# names the packages to install for the deployed agent; the pinned versions
# match the ones installed in the first cell of this notebook.
remote_1p_agent = agent_engines.create(
    local_1p_agent,
    requirements=[
        "google-cloud-aiplatform[agent_engines,langchain]",
        "langchain_google_vertexai",
        "cloudpickle==3.0.0",
        "pydantic>=2.10",
        "requests==2.32.3",
    ],
)

INFO:vertexai.agent_engines:Identified the following requirements: {'google-cloud-aiplatform': '1.104.0', 'cloudpickle': '3.0.0', 'pydantic': '2.11.7'}
INFO:vertexai.agent_engines:The final list of requirements: ['google-cloud-aiplatform[agent_engines,langchain]', 'langchain_google_vertexai', 'cloudpickle==3.0.0', 'pydantic>=2.10', 'requests==2.32.3']
INFO:vertexai.agent_engines:Using bucket qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket
INFO:vertexai.agent_engines:Wrote to gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket/agent_engine/agent_engine.pkl
INFO:vertexai.agent_engines:Writing to gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket/agent_engine/requirements.txt
INFO:vertexai.agent_engines:Creating in-memory tarfile of extra_packages
INFO:vertexai.agent_engines:Writing to gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket/agent_engine/dependencies.tar.gz
INFO:vertexai.agent_engines:Creating AgentEngine
INFO:vertexai.agent_engines:Create A

## Evaluate the agent using Single Tool Selection

In [13]:
# Evaluation set: each prompt is paired, by position, with the ordered list
# of tool calls the agent is expected to make for it.
eval_data = {
    "prompt": [
        "Get price for mens blue shorts",
        "Get product details and price for floral dress",
    ],
    "reference_trajectory": [
        # Prompt 1: a single price lookup.
        [
            {"tool_name": "get_product_price", "tool_input": {"product_name": "mens blue shorts"}},
        ],
        # Prompt 2: details first, then price.
        [
            {"tool_name": "get_product_details", "tool_input": {"product_name": "floral dress"}},
            {"tool_name": "get_product_price", "tool_input": {"product_name": "floral dress"}},
        ],
    ],
}

# One row per prompt; columns are "prompt" and "reference_trajectory".
eval_sample_dataset = pd.DataFrame(data=eval_data)

In [None]:
# Run name is built from PROJECT_ID (set in the configuration cell) instead of
# repeating the project id by hand; with the current configuration this yields
# the same value as before.
# NOTE(review): the run name is fixed, so re-running this cell may collide
# with an already existing experiment run — confirm.
EXPERIMENT_RUN = f"{PROJECT_ID}-single-tool-use"

# Single-tool-use metric for the price lookup tool.
single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name="get_product_price")]

# Evaluation task over the sample dataset; results are written under the
# staging bucket.
single_tool_call_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=single_tool_usage_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=f"{BUCKET_URI}/single-metric-eval",
)

# Run the evaluation against the deployed agent.
single_tool_call_eval_result = single_tool_call_eval_task.evaluate(
    runnable=remote_1p_agent,
    experiment_run_name=EXPERIMENT_RUN,
)


In [17]:
def display_eval_report(eval_result) -> None:
    """Display the evaluation results.

    Args:
        eval_result: Result object returned by ``EvalTask.evaluate`` (not a
            DataFrame). Two attributes are read: ``summary_metrics``, a
            mapping of metric name to aggregate value, and
            ``metrics_table``, the per-row results table.
    """
    # Turn the {metric: value} mapping into a single-row table so it renders
    # with one column per metric.
    metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient="index").T
    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    display(Markdown("### Row-wise Metrics"))
    display(eval_result.metrics_table)

In [18]:
# Show the summary and per-row metrics of the single-tool-use evaluation.
display_eval_report(single_tool_call_eval_result)

### Summary Metrics

Unnamed: 0,row_count,trajectory_single_tool_use/mean,trajectory_single_tool_use/std,latency_in_seconds/mean,latency_in_seconds/std,failure/mean,failure/std
0,2.0,1.0,0.0,1.278935,0.205988,0.0,0.0


### Row-wise Metrics

Unnamed: 0,prompt,reference_trajectory,response,latency_in_seconds,failure,predicted_trajectory,trajectory_single_tool_use/score
0,Get price for mens blue shorts,"[{'tool_name': 'get_product_price', 'tool_inpu...",The price for mens blue shorts is 50.\n,1.13328,0,"[{'tool_name': 'get_product_price', 'tool_inpu...",1.0
1,Get product details and price for floral dress,"[{'tool_name': 'get_product_details', 'tool_in...",The floral dress is a lightweight floral midi ...,1.424591,0,"[{'tool_name': 'get_product_details', 'tool_in...",1.0


## Evaluate the agent using Multiple Tool Selection (Trajectory) Evaluation

In [19]:
# Trajectory metrics, referenced by name; this list is passed as `metrics`
# to the trajectory EvalTask in the next cell.
trajectory_metrics = [
    "trajectory_exact_match",
    "trajectory_in_order_match",
    "trajectory_any_order_match",
    "trajectory_recall",
]

In [20]:
# Run name is built from PROJECT_ID (set in the configuration cell) instead of
# repeating the project id by hand; with the current configuration this yields
# the same value as before.
# NOTE(review): the run name is fixed, so re-running this cell may collide
# with an already existing experiment run — confirm.
EXPERIMENT_RUN = f"{PROJECT_ID}-agent-trajectory-evaluation"

# Evaluation task computing every trajectory metric over the sample dataset.
trajectory_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=trajectory_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=f"{BUCKET_URI}/multiple-metric-eval",
)

# Run the evaluation against the deployed agent.
trajectory_eval_result = trajectory_eval_task.evaluate(
    runnable=remote_1p_agent,
    experiment_run_name=EXPERIMENT_RUN,
)

INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'output_file': 'gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket/multiple-metric-eval/eval_results_2025-07-21-18-17-48-a64f9.csv'}
100%|██████████| 2/2 [00:01<00:00,  1.63it/s]
INFO:vertexai.preview.evaluation._evaluation:All 2 responses are successfully generated from the runnable.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 8 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 8/8 [00:00<00:00, 10.60it/s]
INFO:vertexai.preview.evaluation._evaluation:All 8 metric requests are successfully computed.
INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:0.7774964099999124 seconds


In [21]:
# Show the summary and per-row metrics of the trajectory evaluation.
display_eval_report(trajectory_eval_result)

### Summary Metrics

Unnamed: 0,row_count,trajectory_exact_match/mean,trajectory_exact_match/std,trajectory_in_order_match/mean,trajectory_in_order_match/std,trajectory_any_order_match/mean,trajectory_any_order_match/std,trajectory_recall/mean,trajectory_recall/std,latency_in_seconds/mean,latency_in_seconds/std,failure/mean,failure/std
0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.207397,0.016993,0.0,0.0


### Row-wise Metrics

Unnamed: 0,prompt,reference_trajectory,response,latency_in_seconds,failure,predicted_trajectory,trajectory_exact_match/score,trajectory_in_order_match/score,trajectory_any_order_match/score,trajectory_recall/score
0,Get price for mens blue shorts,"[{'tool_name': 'get_product_price', 'tool_inpu...",The price for mens blue shorts is 50.\n,1.19538,0,"[{'tool_name': 'get_product_price', 'tool_inpu...",1.0,1.0,1.0,1.0
1,Get product details and price for floral dress,"[{'tool_name': 'get_product_details', 'tool_in...",OK. The floral dress is a lightweight floral m...,1.219413,0,"[{'tool_name': 'get_product_details', 'tool_in...",1.0,1.0,1.0,1.0


## Evaluate the quality of the agent response to prompts

In [22]:
# Response-quality metrics, referenced by name; passed as `metrics` to the
# response EvalTask in the next cell.
response_metrics = [
    "safety",
    "coherence",
]

In [23]:
# Run name is built from PROJECT_ID (set in the configuration cell) instead of
# repeating the project id by hand; with the current configuration this yields
# the same value as before.
# NOTE(review): the run name is fixed, so re-running this cell may collide
# with an already existing experiment run — confirm.
EXPERIMENT_RUN = f"{PROJECT_ID}-agent-response-evaluation"

# Evaluation task scoring the agent's final responses on the sample dataset.
response_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=response_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=f"{BUCKET_URI}/response-metric-eval",
)

# Run the evaluation against the deployed agent.
response_eval_result = response_eval_task.evaluate(
    runnable=remote_1p_agent,
    experiment_run_name=EXPERIMENT_RUN,
)

# Per-row results, including the score and explanation columns per metric.
display(response_eval_result.metrics_table)


INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'output_file': 'gs://qwiklabs-gcp-03-3e4e70962c05-experiments-staging-bucket/response-metric-eval/eval_results_2025-07-21-18-18-44-c59d4.csv'}
100%|██████████| 2/2 [00:01<00:00,  1.45it/s]
INFO:vertexai.preview.evaluation._evaluation:All 2 responses are successfully generated from the runnable.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 4 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 4/4 [00:01<00:00,  3.87it/s]
INFO:vertexai.preview.evaluation._evaluation:All 4 metric requests are successfully computed.
INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:1.0503984059996583 seconds


Unnamed: 0,prompt,reference_trajectory,response,latency_in_seconds,failure,predicted_trajectory,safety/explanation,safety/score,coherence/explanation,coherence/score
0,Get price for mens blue shorts,"[{'tool_name': 'get_product_price', 'tool_inpu...",The price for mens blue shorts is 50.\n,0.987222,0,"[{'tool_name': 'get_product_price', 'tool_inpu...",The response is safe because it does not conta...,1.0,The response is coherent because it directly a...,4.0
1,Get product details and price for floral dress,"[{'tool_name': 'get_product_details', 'tool_in...",OK. The floral dress is a lightweight floral m...,1.37581,0,"[{'tool_name': 'get_product_details', 'tool_in...",The response is harmless and does not contain ...,1.0,The response is completely coherent as it logi...,5.0
