In [7]:
# One-time setup: install Phoenix with these extras if the imports below fail.
# pip install 'arize-phoenix[evals,trace,default]'

In [1]:
import phoenix as px
import os
import json
from tqdm import tqdm
from phoenix.evals import (
    TOOL_CALLING_PROMPT_TEMPLATE, 
    llm_classify,
    OpenAIModel
)
from phoenix.trace import SpanEvaluations
from phoenix.client.types.spans import SpanQuery
from openinference.instrumentation import suppress_tracing

import nest_asyncio
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Phoenix project name; passed as `project_identifier` when spans are queried below.
PROJECT_NAME: str = "evaluating-agent"

In [3]:
from utils import run_agent, start_main_span, tools

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: evaluating-agent
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://app.phoenix.arize.com/s/gangwaranshu3/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {'authorization': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [4]:
# Questions sent to the agent, one run per question. The runs presumably emit
# the spans that the evaluation cells below fetch from Phoenix.
agent_questions = [
    "What was the most popular product SKU?",
    "What was the total revenue across all stores?",
    "Which store had the highest sales volume?",
    "Create a bar chart showing total sales by store",
    "What percentage of items were sold on promotion?",
    "What was the average transaction value?"
]

# Best-effort batch run: a failing question must not abort the remaining ones,
# but every failure is recorded so it is not lost inside the progress output.
agent_responses = {}    # question -> value returned by start_main_span
failed_questions = []   # (question, exception) pairs, in run order

for question in tqdm(agent_questions, desc="Processing questions"):
    try:
        ret = start_main_span([{"role": "user", "content": question}])
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        failed_questions.append((question, e))
    else:
        agent_responses[question] = ret

# Summarize failures after the loop so partial runs are obvious before evaluating.
if failed_questions:
    print(f"{len(failed_questions)} of {len(agent_questions)} questions failed:")
    for failed_question, error in failed_questions:
        print(f"  - {failed_question!r}: {type(error).__name__}: {error}")

Processing questions:   0%|          | 0/6 [00:00<?, ?it/s]

Starting main span with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the most popular product SKU?'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  17%|█▋        | 1/6 [00:21<01:47, 21.43s/it]

Starting main span with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the total revenue across all stores?'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  33%|███▎      | 2/6 [00:29<00:55, 13.79s/it]

Starting main span with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Running agent with messages: [{'role': 'user', 'content': 'Which store had the highest sales volume?'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  50%|█████     | 3/6 [00:36<00:31, 10.42s/it]

Starting main span with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Running agent with messages: [{'role': 'user', 'content': 'Create a bar chart showing total sales by store'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: True
Starting tool calls span
{'chart_type': 'line', 'x_axis': 'date', 'y_axis': 'value', 'title': 'Create a bar chart showing total sales by store', 'data': 'Store_Number    Total_Sales\n0           2970  836341.327191\n1           3300  619660.167018\n2           1320  592832.067579\n3           1650  580443.007953\n4           1210  508393.767785\n5           1100  497509.528013\n6           3080  495458.238811\n7           2750  453664.808068\n8           1540  427777.427815\n9            880  420302.088397\n10          2310  412579.388504\n11          3410  410567.848126\n12          2420  406715.767402\n13    

Processing questions:  67%|██████▋   | 4/6 [01:30<00:55, 27.66s/it]

Starting main span with messages: [{'role': 'user', 'content': 'What percentage of items were sold on promotion?'}]
Running agent with messages: [{'role': 'user', 'content': 'What percentage of items were sold on promotion?'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: False
No tool calls, returning final response


Processing questions:  83%|████████▎ | 5/6 [01:37<00:20, 20.19s/it]

Starting main span with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Running agent with messages: [{'role': 'user', 'content': 'What was the average transaction value?'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Starting router call span
Received response with tool calls: False
No tool calls, returning final response


Processing questions: 100%|██████████| 6/6 [01:49<00:00, 18.19s/it]


## Link to Phoenix UI

You can open this link to check out the Phoenix UI and observe the collected spans. You can use the same link to check out the results of the evaluations you'll run in this notebook. 

**Note**: 
- Since each notebook of this course runs in an isolated environment, each notebook links to a different Phoenix server. This is why you won't see the project "tracing-agent" you worked on in the previous notebook (as shown in the video).
- Make sure that the notebook's kernel is running when checking the Phoenix UI. If the link does not open, it might be because the notebook has been open or inactive for a long time. In that case, make sure to refresh the browser, run all previous cells and then check this link. 

In [17]:
# Base URL of the Phoenix workspace; passed as `base_url` to the client created below.
# NOTE(review): hardcoded to one specific workspace — consider reading it from configuration.
phoenix_endpoint = "https://app.phoenix.arize.com/s/gangwaranshu3/"

## Router Evals using LLM-as-a-Judge

To evaluate the router, you will pass the following prompt template, provided by Phoenix, to the LLM-as-a-Judge.

In [18]:
# Print the judge prompt so its placeholders ({question}, {tool_call}) are visible.
print(TOOL_CALLING_PROMPT_TEMPLATE)


You are an evaluation assistant evaluating questions and tool calls to
determine whether the tool called would answer the question. The tool
calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

    [BEGIN DATA]
    ************
    [Question]: {question}
    ************
    [Tool Called]: {tool_call}
    [END DATA]

Your response must be single word, either "correct" or "incorrect",
and should not contain any text or characters aside from that word.
"incorrect" means that the chosen tool would not answer the question,
the tool includes information that is not presented in the question,
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the question, the tool call generated is runnable and correct,
and tha

In [19]:
from phoenix.client import Client

In [20]:
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file); PHOENIX_API_KEY is read with os.getenv below.
load_dotenv()

True

In [21]:
# Phoenix client used for the span queries and the annotation upload below.
# The API key is taken from the environment (None when PHOENIX_API_KEY is unset).
client = Client(api_key=os.getenv("PHOENIX_API_KEY"), base_url=phoenix_endpoint)

In [22]:
# Fetch the spans whose kind is TOOL from the project, capped at 500 rows.
query = SpanQuery().where("span_kind == 'TOOL'")
filtered_df = client.spans.get_spans_dataframe(
    project_identifier=PROJECT_NAME,
    limit=500,
    query=query,
)

llm_classify |██████████| 22/22 (100.0%) | ⏳ 1:16:06<00:00 | 207.59s/it


In [23]:
# Inspect the fetched TOOL spans with all of their attribute columns.
filtered_df

Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,attributes.tool.parameters,attributes.tool.name,attributes.output.value,attributes.input.value,attributes.tool.description,attributes.output.mime_type,attributes.input.mime_type,attributes.openinference.span.kind
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
dfd8a21a1e069627,look_up_sales_data,TOOL,aedf6c6ef29816c8,2025-11-23 04:10:46.390397+00:00,2025-11-23 04:10:49.706611+00:00,OK,,[],dfd8a21a1e069627,f16bf25c3fc07bf53b422c0f3825f896,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,average_transaction_value\n0 ...,"{""prompt"": ""What was the average transaction v...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
77f5b3531e282dd8,look_up_sales_data,TOOL,d38a881361d00b37,2025-11-23 04:10:36.104746+00:00,2025-11-23 04:10:38.060829+00:00,OK,,[],77f5b3531e282dd8,42c70a8d4b6cdb08e76e956804b6cf91,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,percentage_sold_on_promotion\n0 ...,"{""prompt"": ""What percentage of items were sold...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
4f49f4c508143e38,generate_visualization,TOOL,59a3a456a34f0c13,2025-11-23 04:09:55.506633+00:00,2025-11-23 04:10:22.775769+00:00,OK,,[],4f49f4c508143e38,ed9c3faf71e327e3b7d466dfe50d55c1,"{'type': 'object', 'title': 'generate_visualiz...",generate_visualization,import pandas as pd\nimport matplotlib.pyplot ...,"{""data"": ""Store_Number Total_Sales\n0 ...",Generate a visualization based on the data and...,text/plain,application/json,TOOL
80d380432b578fd6,look_up_sales_data,TOOL,c32409a71f3803c5,2025-11-23 04:09:42.457006+00:00,2025-11-23 04:09:44.205868+00:00,OK,,[],80d380432b578fd6,ed9c3faf71e327e3b7d466dfe50d55c1,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,Store_Number Total_Sales\n0 2...,"{""prompt"": ""Create a bar chart showing total s...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
b444e08a882faaca,look_up_sales_data,TOOL,9e68803a504ccd05,2025-11-23 04:09:35.178585+00:00,2025-11-23 04:09:36.813759+00:00,OK,,[],b444e08a882faaca,9773fec0647a7d9ef687d593cddad662,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,Store_Number Total_Sales_Volume\n0 ...,"{""prompt"": ""Which store had the highest sales ...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
48553a2aef5badfc,look_up_sales_data,TOOL,06c436397fcd88bf,2025-11-23 04:09:27.660974+00:00,2025-11-23 04:09:29.119290+00:00,OK,,[],48553a2aef5badfc,6601045d2beb40aaf9e3960895aafd62,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,total_revenue\n0 1.327264e+07,"{""prompt"": ""What was the total revenue across ...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
459395773f229839,analyze_sales_data,TOOL,d5fe150aa6efc11c,2025-11-23 04:09:16.691556+00:00,2025-11-23 04:09:21.097246+00:00,OK,,[],459395773f229839,a7b1109c85c0c547c81bf80afa0cf23e,"{'type': 'object', 'title': 'analyze_sales_dat...",analyze_sales_data,"Based on the data provided, I can see that the...","{""prompt"": ""What was the most popular product ...",Implementation of AI-powered sales data analysis,text/plain,application/json,TOOL
9fff9919d19ded33,look_up_sales_data,TOOL,8531438a3b9db24c,2025-11-23 04:09:06.228344+00:00,2025-11-23 04:09:08.105523+00:00,OK,,[],9fff9919d19ded33,a7b1109c85c0c547c81bf80afa0cf23e,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,SKU_Coded\n0 6200700,"{""prompt"": ""What was the most popular product ...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
3ee2a4c1d4c80283,look_up_sales_data,TOOL,576fa2221306d5ee,2025-11-21 08:07:00.928164+00:00,2025-11-21 08:07:04.821603+00:00,OK,,[],3ee2a4c1d4c80283,ac946ecacffe61d972357fa328330e4d,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,average_transaction_value\n0 ...,"{""prompt"": ""What was the average transaction v...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL
a76e13a09bd2423e,look_up_sales_data,TOOL,a01a951fab3ea748,2025-11-21 08:06:45.971611+00:00,2025-11-21 08:06:47.832580+00:00,OK,,[],a76e13a09bd2423e,35507969d1be4f95f11adac03baa2f76,"{'type': 'object', 'title': 'look_up_sales_dat...",look_up_sales_data,percentage_sold_on_promotion\n0 ...,"{""prompt"": ""What percentage of items were sold...",Implementation of sales lookup from parquet fi...,text/plain,application/json,TOOL


In [24]:
# Narrow the view to the tool parameter schema and the raw tool input.
filtered_df.loc[:, ['attributes.tool.parameters', 'attributes.input.value']]

Unnamed: 0_level_0,attributes.tool.parameters,attributes.input.value
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1
dfd8a21a1e069627,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the average transaction v..."
77f5b3531e282dd8,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What percentage of items were sold..."
4f49f4c508143e38,"{'type': 'object', 'title': 'generate_visualiz...","{""data"": ""Store_Number Total_Sales\n0 ..."
80d380432b578fd6,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""Create a bar chart showing total s..."
b444e08a882faaca,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""Which store had the highest sales ..."
48553a2aef5badfc,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the total revenue across ..."
459395773f229839,"{'type': 'object', 'title': 'analyze_sales_dat...","{""prompt"": ""What was the most popular product ..."
9fff9919d19ded33,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the most popular product ..."
3ee2a4c1d4c80283,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the average transaction v..."
a76e13a09bd2423e,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What percentage of items were sold..."


In [25]:
# Peek at the raw tool input of the first span. The frame is indexed by span id
# (strings), so positional access goes through .iloc; a bare [0] relies on the
# deprecated integer-position fallback of Series.__getitem__.
filtered_df['attributes.input.value'].iloc[0]

'{"prompt": "What was the average transaction value?"}'

In [26]:
# Advanced querying with SpanQuery: keep only the two attributes fed to the judge.
query = (
    SpanQuery()
    .where("span_kind == 'TOOL'")
    .select("input.value", "tool.parameters")
)

# Rename the selected attributes to the placeholder names used by the
# tool-calling template ({question}, {tool_call}).
# NOTE(review): `tool.parameters` appears to hold the tool's parameter schema,
# not the user's question (see the values displayed for this frame) — confirm
# that this mapping is what the judge should receive.
filtered_df = client.spans.get_spans_dataframe(
    project_identifier=PROJECT_NAME,
    limit=500,
    query=query,
).rename(columns={"input.value": "tool_call", "tool.parameters": "question"})

tool_calls_df = filtered_df[['question', 'tool_call']]

In [27]:
# Preview the two columns that will be substituted into the judge template.
tool_calls_df.head()

Unnamed: 0_level_0,question,tool_call
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1
a76e13a09bd2423e,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What percentage of items were sold..."
dead01d417c62f95,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the most popular product ..."
66a7a7cecb3bfa0e,"{'type': 'object', 'title': 'generate_visualiz...","{""data"": ""Store_Number,Total_Sales\n2970,83634..."
05bfc26b40d2d5a7,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""What was the total revenue across ..."
91cd12f33c277e45,"{'type': 'object', 'title': 'look_up_sales_dat...","{""prompt"": ""Which store had the highest sales ..."


### Evaluating Tool Calling

In [45]:
# Build the judge prompt: substitute the {tool_definitions} placeholder by hand.
# Braces inside the tool JSON are replaced with quotes, presumably so they are
# not mistaken for template placeholders — TODO confirm.
judge_template = TOOL_CALLING_PROMPT_TEMPLATE.template[0].template.replace(
    "{tool_definitions}",
    json.dumps(tools).replace("{", '"').replace("}", '"'),
)

# Classify every row as 'correct' or 'incorrect' with an explanation.
# The call runs inside suppress_tracing(), presumably so the judge's own LLM
# calls are not recorded as spans.
with suppress_tracing():
    tool_call_eval = llm_classify(
        dataframe=tool_calls_df,
        template=judge_template,
        rails=['correct', 'incorrect'],
        model=OpenAIModel(model="gpt-4o"),
        provide_explanation=True,
    )

# Numeric score: 1 for 'correct', 0 for anything else. The vectorized comparison
# also works on an empty result, where a row-wise apply would return a DataFrame
# and break the column assignment.
tool_call_eval['score'] = (tool_call_eval['label'] == 'correct').astype(int)

tool_call_eval.head()

llm_classify |██████████| 22/22 (100.0%) | ⏳ 00:04<00:00 |  4.67it/s

Unnamed: 0_level_0,label,explanation,exceptions,execution_status,execution_seconds,prompt_tokens,completion_tokens,total_tokens,score
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
a76e13a09bd2423e,correct,"The tool called is 'look_up_sales_data', which...",[],COMPLETED,1.810731,621,79,700,1
dead01d417c62f95,correct,The tool call is correct because the question ...,[],COMPLETED,1.442217,620,55,675,1
66a7a7cecb3bfa0e,correct,The tool call matches the question's requireme...,[],COMPLETED,2.201156,958,81,1039,1
05bfc26b40d2d5a7,correct,The tool call is correct because the question ...,[],COMPLETED,1.802776,621,72,693,1
91cd12f33c277e45,correct,The tool call is correct because the question ...,[],COMPLETED,1.61709,620,67,687,1


In [46]:
# Turn the `context.span_id` index into an ordinary column.
tool_call_eval = tool_call_eval.reset_index()

In [48]:
# Give the id column the shorter name `span_id`.
tool_call_eval = tool_call_eval.rename(columns={"context.span_id": "span_id"})

In [49]:
# Check that the span id is now a regular `span_id` column.
tool_call_eval.head()

Unnamed: 0,span_id,label,explanation,exceptions,execution_status,execution_seconds,prompt_tokens,completion_tokens,total_tokens,score
0,a76e13a09bd2423e,correct,"The tool called is 'look_up_sales_data', which...",[],COMPLETED,1.810731,621,79,700,1
1,dead01d417c62f95,correct,The tool call is correct because the question ...,[],COMPLETED,1.442217,620,55,675,1
2,66a7a7cecb3bfa0e,correct,The tool call matches the question's requireme...,[],COMPLETED,2.201156,958,81,1039,1
3,05bfc26b40d2d5a7,correct,The tool call is correct because the question ...,[],COMPLETED,1.802776,621,72,693,1
4,91cd12f33c277e45,correct,The tool call is correct because the question ...,[],COMPLETED,1.61709,620,67,687,1


In [50]:
# Keep only the id, score, label and explanation columns.
tool_call_eval = tool_call_eval.loc[:, ['span_id', 'score', 'label', 'explanation']]

In [51]:
# Show the trimmed evaluation table, one row per evaluated span.
tool_call_eval

Unnamed: 0,span_id,score,label,explanation
0,a76e13a09bd2423e,1,correct,"The tool called is 'look_up_sales_data', which..."
1,dead01d417c62f95,1,correct,The tool call is correct because the question ...
2,66a7a7cecb3bfa0e,1,correct,The tool call matches the question's requireme...
3,05bfc26b40d2d5a7,1,correct,The tool call is correct because the question ...
4,91cd12f33c277e45,1,correct,The tool call is correct because the question ...
5,92d963e0e84d7d1d,0,incorrect,The question is about looking up sales data us...
6,1362f3b15e5482d5,0,incorrect,"The tool called is 'look_up_sales_data', which..."
7,1857748e85193d4f,1,correct,The tool call is correct because the question ...
8,e82acaf13cf1f4ce,1,correct,The tool call is correct because the question ...
9,8cc74f9873d71d38,0,incorrect,The question is about looking up sales data us...


In [52]:

# Upload the per-span judge results to Phoenix as span annotations.
# The annotation is named after what was evaluated (tool calling) rather than
# "QA Correctness", so it is not mistaken for an answer-quality metric.
client.spans.log_span_annotations_dataframe(
    dataframe=tool_call_eval,
    annotation_name="Tool Calling Eval",  # metric name attached to each annotation
    annotator_kind="LLM",                 # results come from an LLM judge
)

v1/span_annotations
