In [1]:
import os
import json
from azure.ai.evaluation import (
    evaluate, 
    IntentResolutionEvaluator,
    ToolCallAccuracyEvaluator,
    TaskAdherenceEvaluator,
    AzureOpenAIModelConfiguration,
)
from pprint import pprint
from dotenv import load_dotenv

load_dotenv('../.env')

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.environ.get("AZURE_OPENAI_GPT4o_DEPLOYMENT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
)

In [2]:
tool_definitions=[
    {
        "name": "BookingPlugin-check_availability",
        "description": "Check if a room is available on a certain date.",
        "parameters": {
            "type": "object",
            "properties": {
                "room_type": {
                    "type": "string",
                    "description": "Type of room."
                },
                "date": {
                    "type": "string",
                    "description": "Booking date in YYYY-MM-DD format."
                }
            }
        }
    },
    {
        "name": "BookingPlugin-confirm_booking",
        "description": "Confirm booking and reduce room count.",
        "parameters": {
            "type": "object",
            "properties": {
                "room_type": {
                    "type": "string",
                    "description": "Type of room."
                },
                "date": {
                    "type": "string",
                    "description": "Booking date in YYYY-MM-DD format."
                },
                "count": {
                    "type": "integer",
                    "description": "Number of rooms to book."
                }
            }
        }
    },
    {
        "name": "DiningPlugin-get_specials",
        "description": "Provides today's dining specials.",
        "parameters": {
            "type": "object",
            "properties": {}
        }
    },
    {
        "name": "DiningPlugin-get_item_price",
        "description": "Provides the price of a specified menu item.",
        "parameters": {
            "type": "object",
            "properties": {
                "menu_item": {
                    "type": "string",
                    "description": "Menu item name."
                }
            }
        }
    },
    {
        "name": "DiningPlugin-reserve_table",
        "description": "Simulates table reservation at the hotel restaurant.",
        "parameters": {
            "type": "object",
            "properties": {
                "time": {
                    "type": "string",
                    "description": "Reservation time (e.g., HH:MM)."
                },
                "party_size": {
                    "type": "integer",
                    "description": "Number of people for the reservation."
                }
            }
        }
    },
    {
        "name": "SemanticSearchPlugin-search_rooms_by_description",
        "description": "Search for hotel rooms by semantic meaning based on room description.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The description of the type of room the user is looking for."
                }
            }
        }
    },
    {
        "name": "TimePlugin-get_today",
        "description": "Returns today's date in YYYY-MM-DD format.",
        "parameters": {
            "type": "object",
            "properties": {}
        }
    },
    {
        "name": "TimePlugin-get_relative_date",
        "description": "Returns a relative date based on offset in days.",
        "parameters": {
            "type": "object",
            "properties": {
                "days_offset": {
                    "type": "integer",
                    "description": "Number of days to add to today."
                }
            }
        }
    }
]

In [3]:
intent_resolution_evaluator = IntentResolutionEvaluator(model_config=model_config)

# Open and read the entire JSONL file
print("Intent Resolution Evaluation")
print("--------------------------------------------------")
with open("evaluation_dataset.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        entry = json.loads(line)
        query = entry.get("query", "")
        response = entry.get("response", "")

        print(f"--- Entry {i} ---")
        print(f"Query           : {query}")
        print(f"Response        : {response}")
        print(f"Tool Definitions : {tool_definitions}")
        result = intent_resolution_evaluator(
            query=query,
            response=response,
            tool_definitions=tool_definitions,
        )
        print(f"Intent Resolution Result: {result}")
        print("--------------------------------------------------")

Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Intent Resolution Evaluation
--------------------------------------------------
--- Entry 1 ---
Query           : I need a deluxe room for tomorrow. Can you check if any are available?
Response        : I'm sorry, but there are no deluxe rooms available for tomorrow. Would you like me to check availability for a different type of room or on another date?
Tool Definitions : [{'name': 'BookingPlugin-check_availability', 'description': 'Check if a room is available on a certain date.', 'parameters': {'type': 'object', 'properties': {'room_type': {'type': 'string', 'description': 'Type of room.'}, 'date': {'type': 'string', 'description': 'Booking date in YYYY-MM-DD format.'}}}}, {'name': 'BookingPlugin-confirm_booking', 'description': 'Confirm booking and reduce room count.', 'parameters': {'type': 'object', 'properties': {'room_type': {'type': 'string', 'description': 'Type of room.'}, 'date': {'type': 'string', 'description': 'Booking date in YYYY-MM-DD format.'}, 'count': {'type': 'int

In [4]:
query = "I need a deluxe room for tomorrow. Can you check if any are available?"

tool_calls=[
    {
        "type": "tool_call",
        "tool_call_id": "call_1744299539059",
        "name": "TimePlugin-get_relative_date",
        "arguments": {
            "days_offset": 1
        },
    },
    {
        "type": "tool_call",
        "tool_call_id": "call_1744299539059",
        "name": "BookingPlugin-check_availability",
        "arguments": "{\"room_type\":\"deluxe\",\"date\":\"2025-04-11\"}"
    }
]

tool_definitions=[
    {
        "name": "BookingPlugin-check_availability",
        "description": "Check if a room is available on a certain date.",
        "parameters": {
            "type": "object",
            "properties": {
                "room_type": {
                    "type": "string",
                    "description": "Type of room."
                },
                "date": {
                    "type": "string",
                    "description": "Booking date in YYYY-MM-DD format."
                }
            }
        }
    },
    {
        "name": "BookingPlugin-confirm_booking",
        "description": "Confirm booking and reduce room count.",
        "parameters": {
            "type": "object",
            "properties": {
                "room_type": {
                    "type": "string",
                    "description": "Type of room."
                },
                "date": {
                    "type": "string",
                    "description": "Booking date in YYYY-MM-DD format."
                },
                "count": {
                    "type": "integer",
                    "description": "Number of rooms to book."
                }
            }
        }
    },
    {
        "name": "DiningPlugin-get_specials",
        "description": "Provides today's dining specials.",
        "parameters": {
            "type": "object",
            "properties": {}
        }
    },
    {
        "name": "DiningPlugin-get_item_price",
        "description": "Provides the price of a specified menu item.",
        "parameters": {
            "type": "object",
            "properties": {
                "menu_item": {
                    "type": "string",
                    "description": "Menu item name."
                }
            }
        }
    },
    {
        "name": "DiningPlugin-reserve_table",
        "description": "Simulates table reservation at the hotel restaurant.",
        "parameters": {
            "type": "object",
            "properties": {
                "time": {
                    "type": "string",
                    "description": "Reservation time (e.g., HH:MM)."
                },
                "party_size": {
                    "type": "integer",
                    "description": "Number of people for the reservation."
                }
            }
        }
    },
    {
        "name": "SemanticRoomSearchPlugin-search_rooms_by_description",
        "description": "Search for hotel rooms by semantic meaning based on room description.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The description of the type of room the user is looking for."
                }
            }
        }
    },
    {
        "name": "TimePlugin-get_today",
        "description": "Returns today's date in YYYY-MM-DD format.",
        "parameters": {
            "type": "object",
            "properties": {}
        }
    },
    {
        "name": "TimePlugin-get_relative_date",
        "description": "Returns a relative date based on offset in days.",
        "parameters": {
            "type": "object",
            "properties": {
                "days_offset": {
                    "type": "integer",
                    "description": "Number of days to add to today."
                }
            }
        }
    }
]
tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config)

response = tool_call_accuracy_evaluator(
    query=query, 
    tool_calls=tool_calls, 
    tool_definitions=tool_definitions,
)
pprint(response)

Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'per_tool_call_details': [{'tool_call_accurate': True,
                            'tool_call_accurate_reason': 'The TOOL CALL is '
                                                         'directly relevant to '
                                                         "the user's request "
                                                         'for a room for '
                                                         'tomorrow, uses the '
                                                         'correct parameter as '
                                                         'per the TOOL '
                                                         'DEFINITION, and the '
                                                         'parameter value is '
                                                         'correctly inferred '
                                                         'from the '
                                                         'conversation. It is '
            

In [5]:
file_path = "evaluation_dataset.jsonl"
tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)

with open(file_path, "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue

        record = json.loads(line)
        query = record.get("query", "No query provided")
        tool_calls = record.get("tool_calls", [])
        response = record.get("response", "No response provided")

        print(f"Query: {query}")
        print("\nTool Calls:")
        pprint(tool_calls)

        if tool_calls:  # Only evaluate if tool_calls is not empty
            metric = tool_call_accuracy_evaluator(
                query=query, 
                tool_calls=tool_calls, 
                tool_definitions=tool_definitions,
            )
            print("\nTool Call Accuracy Result: ")
            pprint(metric)
        else:
            print("\nNo tool calls to evaluate.")
        print("-----------------------------------------------")


Query: I need a deluxe room for tomorrow. Can you check if any are available?

Tool Calls:
[{'arguments': '{"days_offset":1}',
  'name': 'TimePlugin-get_relative_date',
  'tool_call_id': 'call_1744299539059',
  'type': 'tool_call'},
 {'arguments': '{"room_type":"deluxe","date":"2025-04-11"}',
  'name': 'BookingPlugin-check_availability',
  'tool_call_id': 'call_1744299539059',
  'type': 'tool_call'}]

Tool Call Accuracy Result: 
{'per_tool_call_details': [{'tool_call_accurate': True,
                            'tool_call_accurate_reason': 'The TOOL CALL is '
                                                         'directly relevant to '
                                                         "the user's request "
                                                         'for a room for '
                                                         'tomorrow, uses the '
                                                         'correct parameter '
                                          

In [6]:
task_adherence_evaluator = TaskAdherenceEvaluator(model_config=model_config)
file_path = "evaluation_dataset.jsonl"

# do the same than before, except that the inputs needed are query, response and tool_definitions

with open(file_path, "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue

        record = json.loads(line)
        query = record.get("query", "No query provided")
        response = record.get("response", "No response provided")

        print(f"Query: {query}")
        print(f"Response: {response}")

        metric = task_adherence_evaluator(
            query=query, 
            response=response, 
            tool_definitions=tool_definitions,
        )
        print("\nTask Adherence Result: ")
        pprint(metric)
        print("-----------------------------------------------")

Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Query: I need a deluxe room for tomorrow. Can you check if any are available?
Response: I'm sorry, but there are no deluxe rooms available for tomorrow. Would you like me to check availability for a different type of room or on another date?

Task Adherence Result: 
{'task_adherence': 4.0,
 'task_adherence_reason': 'The response is clear, accurate, and aligns with '
                          'the instructions, with only a minor issue of not '
                          'explicitly mentioning the tool usage. Therefore, it '
                          'is mostly adherent to the task.',
 'task_adherence_result': 'pass',
 'task_adherence_threshold': 3}
-----------------------------------------------
Query: Please book 1 deluxe room for tomorrow.
Response: Unfortunately, we don't have any deluxe rooms available for tomorrow. Can I help you find a different room or assist with another date?

Task Adherence Result: 
{'task_adherence': 3.0,
 'task_adherence_reason': 'The response meets the core 

In [7]:
import random

agentic_evals = evaluate(
    data=file_path,
    evaluation_name=f"agentic_evals_{random.randint(1, 10000)}",
    evaluators={
        "intent_resolution": intent_resolution_evaluator,
        "tool_call_accuracy": tool_call_accuracy_evaluator,
        "task_adherence": task_adherence_evaluator,
    },
    evaluator_config={
        "tool_call_accuracy": {
            "tool_definitions": tool_definitions,
            "query": "{data.query}",
            "tool_calls": "{data.tool_calls}",
        },
        "task_adherence": {
            "tool_definitions": tool_definitions,
            "query": "{data.query}",
            "response": "{data.response}",
        },
        "intent_resolution": {
            "tool_definitions": tool_definitions,
            "query": "{data.query}",
            "response": "{data.response}",
        },
    },
    azure_ai_project={
        "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"],
        "project_name": os.environ["PROJECT_NAME"],
        "resource_group_name": os.environ["RESOURCE_GROUP_NAME"],
    }
)
print(agentic_evals)

[2025-04-13 18:49:39 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_task_adherence_20250413_184938_419413, log path: C:\Users\alevret\.promptflow\.runs\azure_ai_evaluation_evaluators_task_adherence_20250413_184938_419413\logs.txt
[2025-04-13 18:49:39 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_intent_resolution_20250413_184938_417611, log path: C:\Users\alevret\.promptflow\.runs\azure_ai_evaluation_evaluators_intent_resolution_20250413_184938_417611\logs.txt
[2025-04-13 18:49:39 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_tool_call_accuracy_20250413_184938_419413, log path: C:\Users\alevret\.promptflow\.runs\azure_ai_evaluation_evaluators_tool_call_accuracy_20250413_184938_419413\logs.txt


2025-04-13 18:49:39 +0100   21708 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Finished 1 / 5 lines.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Average execution time for completed lines: 16.33 seconds. Estimated time for incomplete lines: 65.32 seconds.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Finished 2 / 5 lines.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Average execution time for completed lines: 8.22 seconds. Estimated time for incomplete lines: 24.66 seconds.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Finished 4 / 5 lines.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Average execution time for completed lines: 4.18 seconds. Estimated time for incomplete lines: 4.18 seconds.
2025-04-13 18:49:56 +0100   21708 execution.bulk     INFO     Finished 5 / 5 lines.
20

 Please check out C:/Users/alevret/.promptflow/.runs/azure_ai_evaluation_evaluators_tool_call_accuracy_20250413_184938_419413 for more details.


2025-04-13 18:49:39 +0100   21708 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-04-13 18:50:59 +0100   21708 execution.bulk     INFO     Finished 5 / 5 lines.
2025-04-13 18:50:59 +0100   21708 execution.bulk     INFO     Average execution time for completed lines: 15.82 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-04-13 18:50:59 +0100   21708 execution          ERROR    5/5 flow run failed, indexes: [1,4,3,0,2], exception of index 1: (UserError) response does not have tool calls. Either provide tool_calls or response with tool calls.

Run name: "azure_ai_evaluation_evaluators_tool_call_accuracy_20250413_184938_419413"
Run status: "Completed"
Start time: "2025-04-13 18:49:38.536110+01:00"
Duration: "0:01:20.787992"
Output path: "C:\Users\alevret\.promptflow\.runs\azure_ai_evaluation_evaluators_tool_call_accuracy_20250413_184938_419413"


{
    "intent_resolution": {
        "status": "Completed",
 