<table align="left">
  <tr>
    <td><img src="fleet.png" alt="fleet of icecream trucks" width="120"/></td>
    <td align="left"><h1>Lesson 4: Monitoring and Evaluating your Agent</h1></td>
  </tr>
</table>


<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Access <code>requirements.txt</code> file:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>.</p>

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>

<p> 📒 &nbsp; For more help, please see the <em>"Appendix – Tips, Help, and Download"</em> Lesson.</p>

</div>

<p style="background-color:#f7fff8; padding:15px; border-width:3px; border-color:#e0f0e0; border-style:solid; border-radius:6px"> 🚨
&nbsp; <b>Different Run Results:</b> The output generated by AI chat models can vary with each execution due to their dynamic, probabilistic nature. Don't be surprised if your results differ from those shown in the video.</p>

## Setup Tracing

In [1]:
# Name of the Phoenix project under which this notebook's traces are recorded.
# It is passed to `register` when tracing is set up and used again when the
# recorded spans are queried back.
PROJECT_NAME = "Customer-Success"

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

# Load variables from a local .env file (if one exists) *before* the first
# environment lookup below. By default python-dotenv does not override
# variables that are already set in the environment.
load_dotenv()

# URL template of the Phoenix instance; it contains a `{port}` placeholder.
phoenix_url_template = os.getenv('DLAI_LOCAL_URL')
if phoenix_url_template is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(
        "Environment variable DLAI_LOCAL_URL is not set; "
        "cannot determine the Phoenix collector endpoint."
    )

# Strip any trailing slash so the endpoint is well-formed whether or not the
# template ends with '/'.
phoenix_base_url = phoenix_url_template.format(port='6006').rstrip('/')

# Register a tracer provider that sends spans for PROJECT_NAME to Phoenix,
# then instrument smolagents so model/agent/tool calls are traced through it.
tracer_provider = register(
    project_name=PROJECT_NAME,
    endpoint=phoenix_base_url + "/v1/traces",
)
SmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: Customer-Success
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://s172-29-70-252p6006.lab-aws-production.deeplearning.ai/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [3]:
from dotenv import load_dotenv, find_dotenv

# Pull the variables defined in the local .env file into the environment.
load_dotenv()

from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the key stored in HF_API_KEY.
login(token=os.getenv('HF_API_KEY'))

In [4]:
from smolagents import HfApiModel

# Chat model used by the agents in this notebook.
model = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    provider="together",
)

# Smoke test: send a single user message; the reply is shown as the cell output.
model([dict(role="user", content="Hello!")])

ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Hello! How can I assist you today?', tool_calls=[], raw=ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Hello! How can I assist you today?', tool_call_id=None, tool_calls=[]), logprobs=None, seed=13618980564928662000)], created=1745329835, id='nr7pchw-4yUbBN-9345974f3e93058b', model='Qwen/Qwen2.5-Coder-32B-Instruct', system_fingerprint=None, usage=ChatCompletionOutputUsage(completion_tokens=10, prompt_tokens=31, total_tokens=41), object='chat.completion', prompt=[]))

In [5]:
# Base URL of the Phoenix instance the traces are sent to (port 6006);
# open it in a browser to inspect the traces.
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

https://s172-29-70-252p6006.lab-aws-production.deeplearning.ai/


## Trace an agent run

In [6]:
from smolagents import HfApiModel, CodeAgent

# Minimal agent: no custom tools, driven by the model created above.
agent = CodeAgent(tools=[], model=model)

>Note: the following cell will sometimes time out while communicating with the tracing package over the network. If this happens, run it again.


In [7]:
# Run the tool-less agent on a simple task. The return value of `run` is the
# cell output; the run is recorded by the instrumentation set up earlier.
agent.run("What is the 100th Fibonacci number?")

354224848179261915075

In [8]:
# Base URL of the Phoenix instance the traces are sent to (port 6006);
# open it in a browser to inspect the trace of the run above.
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

https://s172-29-70-252p6006.lab-aws-production.deeplearning.ai/


## Setup ice cream production system

In [9]:
from smolagents import tool
from typing import Dict

# Price list of the menu, keyed by item name (reported in dollars by get_prices).
menu_prices = {
    "crepe nutella": 1.5,
    "vanilla ice cream": 2,
    "maple pancake": 1.0,
}

# In-memory order store: session id -> quantities of the order placed for it.
ORDER_BOOK = dict()

# NOTE: the docstring below is exposed by `@tool` as the tool description, so
# its wording is part of what the agent sees and is kept verbatim.
#
# Raises AssertionError when `quantities` is not a dict, is empty, or names an
# item that is not on the menu.
@tool
def place_order(quantities: Dict[str, int], session_id: int) -> None:
    """Places a pre-order of snacks.

    Args:
        quantities: a dictionary with names as keys and quantities as values
        session_id: the id for the client session
    """
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # An empty order was already rejected before (an empty list is falsy), but
    # with a misleading message about food names; reject it explicitly instead.
    assert quantities, "The order must contain at least one item!"
    # `all(...)` is required: asserting on a bare list only tests that the list
    # is non-empty, so unknown item names were never rejected.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    # Item assignment mutates the module-level dict in place; no `global` needed.
    # A later order for the same session id replaces the earlier one.
    ORDER_BOOK[session_id] = quantities

# NOTE: the docstring below is exposed by `@tool` as the tool description, so
# its wording is part of what the agent sees and is kept verbatim.
#
# Returns a message containing the full price list and the order total.
# Raises AssertionError when `quantities` is not a dict or names an item that
# is not on the menu. An empty dict is accepted and yields a total of 0.
@tool
def get_prices(quantities: Dict[str, int]) -> str:
    """Gets price for certain quantities of ice cream.

    Args:
        quantities: a dictionary with names as keys and quantities as values
    """
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # `all(...)` is required: asserting on a bare list only tests that the list
    # is non-empty, so unknown names used to fall through to a KeyError below.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    total_price = sum(menu_prices[name] * count for name, count in quantities.items())
    return (
        f"Given the current menu prices:\n{menu_prices}\nThe total price for your order would be: ${total_price}"
    )

In [10]:
# Agent for the ordering system: it can quote prices and place orders through
# the two tools defined above, and uses its own model instance.
order_agent = CodeAgent(
    model=HfApiModel(
        model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
        provider="together",
    ),
    tools=[place_order, get_prices],
)

In [11]:
# Single trial request. `additional_args` supplies extra values to the run
# alongside the task text; here it carries the session id that `place_order`
# takes as its `session_id` parameter.
order_agent.run(
    "Could I come and collect one crepe nutella?",
    additional_args={"session_id": 192}
)

'Order placed for one crepe nutella.'

### Try multiple orders

In [12]:
# Evaluation set: (request text, name of the tool the agent is expected to
# call). `None` means no ordering tool is expected for that request.
client_requests = [
    (
        "Could I come and collect one crepe nutella?",
        "place_order",
    ),
    (
        "What would be the price for 1 crêpe nutella + 2 pancakes?",
        "get_prices",
    ),
    (
        "How did you start your ice-cream business?",
        None,
    ),
    (
        "What's the weather at the Louvre right now?",
        None,
    ),
    (
        "I'm not sure if I should order. I want a vanilla ice cream. "
        "but if it's more expensive than $1, I don't want it. "
        "If it's below, I'll order it, please.",
        "place_order",
    ),
]

In [13]:
# Replay every sample request through the order agent so each run is traced.
# All runs share session id 0 and are also handed the price list.
for request_text, _expected_tool in client_requests:
    order_agent.run(
        request_text,
        additional_args=dict(session_id=0, menu_prices=menu_prices),
    )

In [14]:
import phoenix as px

# Fetch the spans recorded for this project from Phoenix as a DataFrame
# (one row per span) and preview the first 20 rows.
spans = px.Client().get_spans_dataframe(project_name=PROJECT_NAME)
spans.head(20)



Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.llm.token_count.total,attributes.openinference.span.kind,attributes.output.value,attributes.input.value,attributes.llm.token_count.prompt,attributes.llm.input_messages,attributes.tool.description,attributes.tool.name,attributes.tool.parameters,attributes.smolagents
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35dc96a5cd252e47,HfApiModel.__call__,LLM,,2025-04-23 17:33:52.916623+00:00,2025-04-23 17:33:53.077994+00:00,OK,,[],35dc96a5cd252e47,b88d0d897c1dd16dc3cc387fe7e957af,...,41.0,LLM,"{""role"": ""assistant"", ""content"": ""Hello! How c...","{""messages"": [{""role"": ""user"", ""content"": ""Hel...",31.0,,,,,
51153a488eb91ce4,HfApiModel.__call__,LLM,2ab6684d6a2f2299,2025-04-23 17:33:53.189991+00:00,2025-04-23 17:33:53.215385+00:00,OK,,[],51153a488eb91ce4,f815d9b433b495ea94d917eb117f0952,...,2150.0,LLM,"{""role"": ""assistant"", ""content"": ""Thought: To ...","{""messages"": [{""role"": ""system"", ""content"": [{...",2020.0,"[{'message.role': 'system', 'message.content':...",,,,
75d9d598bf3e2575,FinalAnswerTool,TOOL,2ab6684d6a2f2299,2025-04-23 17:33:53.271558+00:00,2025-04-23 17:33:53.271626+00:00,OK,,[],75d9d598bf3e2575,f815d9b433b495ea94d917eb117f0952,...,,TOOL,,"{""args"": [354224848179261915075], ""sanitize_in...",,,Provides a final answer to the given problem.,final_answer,"{'answer': {'type': 'any', 'description': 'The...",
2ab6684d6a2f2299,Step 1,CHAIN,8cb27821e70119f2,2025-04-23 17:33:53.189770+00:00,2025-04-23 17:33:53.298794+00:00,OK,,[],2ab6684d6a2f2299,f815d9b433b495ea94d917eb117f0952,...,,CHAIN,Execution logs:\nLast output from code snippet...,"{""memory_step"": ""ActionStep(model_input_messag...",,,,,,
8cb27821e70119f2,CodeAgent.run,AGENT,,2025-04-23 17:33:53.181789+00:00,2025-04-23 17:33:53.327488+00:00,OK,,[],8cb27821e70119f2,f815d9b433b495ea94d917eb117f0952,...,2150.0,AGENT,354224848179261915075,"{""task"": ""What is the 100th Fibonacci number?""...",2020.0,,,,,"{'max_steps': 20, 'tools_names': ['final_answe..."
e3172f53bdaf61b5,HfApiModel.__call__,LLM,0f23f396686395d2,2025-04-23 17:33:55.360668+00:00,2025-04-23 17:33:55.375512+00:00,OK,,[],e3172f53bdaf61b5,d0902e4aedb1b38037e0c3d2b9e5a424,...,2261.0,LLM,"{""role"": ""assistant"", ""content"": ""Thought: I n...","{""messages"": [{""role"": ""system"", ""content"": [{...",2193.0,"[{'message.role': 'system', 'message.content':...",,,,
34b870b90802dd91,SimpleTool,TOOL,0f23f396686395d2,2025-04-23 17:33:55.402042+00:00,2025-04-23 17:33:55.402104+00:00,OK,,[],34b870b90802dd91,d0902e4aedb1b38037e0c3d2b9e5a424,...,,TOOL,,"{""args"": [], ""sanitize_inputs_outputs"": false,...",,,Places a pre-order of snacks.,place_order,"{'quantities': {'type': 'object', 'additionalP...",
6276e1b97d35fd51,FinalAnswerTool,TOOL,0f23f396686395d2,2025-04-23 17:33:55.427925+00:00,2025-04-23 17:33:55.427975+00:00,OK,,[],6276e1b97d35fd51,d0902e4aedb1b38037e0c3d2b9e5a424,...,,TOOL,,"{""args"": [""Order placed for one crepe nutella....",,,Provides a final answer to the given problem.,final_answer,"{'answer': {'type': 'any', 'description': 'The...",
0f23f396686395d2,Step 1,CHAIN,0f6bbb3aa5904b13,2025-04-23 17:33:55.360412+00:00,2025-04-23 17:33:55.454953+00:00,OK,,[],0f23f396686395d2,d0902e4aedb1b38037e0c3d2b9e5a424,...,,CHAIN,Execution logs:\nLast output from code snippet...,"{""memory_step"": ""ActionStep(model_input_messag...",,,,,,
0f6bbb3aa5904b13,CodeAgent.run,AGENT,,2025-04-23 17:33:55.353038+00:00,2025-04-23 17:33:55.480614+00:00,OK,,[],0f6bbb3aa5904b13,d0902e4aedb1b38037e0c3d2b9e5a424,...,2261.0,AGENT,Order placed for one crepe nutella.,"{""task"": ""Could I come and collect one crepe n...",2193.0,,,,,"{'additional_args': '{""session_id"": 192}', 'to..."


### Add processing to extract desired information

In [15]:
import pandas as pd
import json

def _extract_task(raw_input):
    """Return the 'task' field of a JSON-encoded span input; None for non-strings."""
    if not isinstance(raw_input, str):
        return None
    return json.loads(raw_input).get('task')


# One row per agent run, with the task text decoded from the span input.
is_agent_span = spans['span_kind'] == 'AGENT'
agents = spans[is_agent_span].copy()
agents['task'] = agents['attributes.input.value'].apply(_extract_task)

# One row per tool call, keeping only the columns needed for the join.
is_tool_span = spans['span_kind'] == 'TOOL'
tool_columns = ["attributes.tool.name", "attributes.input.value", "context.trace_id"]
tools = spans.loc[is_tool_span, tool_columns].copy()

# Attach to each agent run the tool calls that share its trace id. The left
# join keeps agent runs for which no tool call was recorded.
agent_columns = ["name", "start_time", "task", "context.trace_id"]
tools_per_task = pd.merge(
    agents[agent_columns],
    tools,
    on="context.trace_id",
    how="left",
)
tools_per_task.head()

Unnamed: 0,name,start_time,task,context.trace_id,attributes.tool.name,attributes.input.value
0,CodeAgent.run,2025-04-23 17:33:53.181789+00:00,What is the 100th Fibonacci number?,f815d9b433b495ea94d917eb117f0952,final_answer,"{""args"": [354224848179261915075], ""sanitize_in..."
1,CodeAgent.run,2025-04-23 17:33:55.353038+00:00,Could I come and collect one crepe nutella?,d0902e4aedb1b38037e0c3d2b9e5a424,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
2,CodeAgent.run,2025-04-23 17:33:55.353038+00:00,Could I come and collect one crepe nutella?,d0902e4aedb1b38037e0c3d2b9e5a424,final_answer,"{""args"": [""Order placed for one crepe nutella...."
3,CodeAgent.run,2025-04-23 17:33:58.939387+00:00,Could I come and collect one crepe nutella?,7528d9f130bdf70f4a23683ecabf9629,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
4,CodeAgent.run,2025-04-23 17:33:58.939387+00:00,Could I come and collect one crepe nutella?,7528d9f130bdf70f4a23683ecabf9629,final_answer,"{""args"": [""Order placed for one crepe nutella...."


### Now, compare tool calls with expected tool calls

In [16]:
from typing import Iterable, Optional


def score_request(expected_tool: Optional[str], tool_calls: Iterable[str]) -> bool:
    """Check whether the tools an agent called match the expectation.

    Args:
        expected_tool: name of the tool the request should trigger, or None
            when the agent is expected to call nothing but `final_answer`.
        tool_calls: names of the tools that were called (list, set or any
            other iterable; duplicates and order are ignored).

    Returns:
        True if `expected_tool` is among the calls, or, when `expected_tool`
        is None, if `final_answer` is the only tool that was called.
    """
    # Normalise to a set first: comparing a list to a set with `==` is always
    # False, which mis-scored list inputs in the "no tool expected" case.
    performed = set(tool_calls)
    if expected_tool is None:
        return performed == {"final_answer"}
    return expected_tool in performed

# Score every sample request against the tool calls found in the traces.
results = []
for request, expected_tool in client_requests:
    # Runs are matched by task text, so the calls of all runs that used the
    # same request text are pooled into one set.
    matching_tool_names = tools_per_task.loc[
        tools_per_task["task"] == request, "attributes.tool.name"
    ]
    # Agent runs without any recorded tool call carry NaN here (left merge);
    # drop those so the set only ever contains real tool names.
    tool_calls = set(matching_tool_names.dropna().tolist())
    results.append(
        {
            "request": request,
            "tool_calls_performed": tool_calls,
            "is_correct": score_request(expected_tool, tool_calls)
        }
    )
pd.DataFrame(results)

Unnamed: 0,request,tool_calls_performed,is_correct
0,Could I come and collect one crepe nutella?,"{final_answer, place_order}",True
1,What would be the price for 1 crêpe nutella + ...,"{final_answer, get_prices}",True
2,How did you start your ice-cream business?,"{final_answer, get_prices}",False
3,What's the weather at the Louvre right now?,{final_answer},True
4,I'm not sure if I should order. I want a vanil...,{},False
