In [1]:
# Client configuration for Azure OpenAI and Foundry
from agent_framework.azure import AzureAIAgentClient
from agent_framework.observability import setup_observability 
from azure.identity import DefaultAzureCredential
from azure.identity.aio import DefaultAzureCredential as AsyncDefaultAzureCredential
from dotenv import load_dotenv
import config

load_dotenv()
setup_observability()

credential = DefaultAzureCredential()
async_credential = AsyncDefaultAzureCredential()    
agent_client = AzureAIAgentClient(
    project_endpoint=config.foundry_project_endpoint,
    model_deployment_name="gpt-4.1",
    async_credential=async_credential,
)

import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)


[2025-10-06 17:59:17 - f:\repo\evaluation-intro\.venv\Lib\site-packages\opentelemetry\exporter\otlp\proto\grpc\exporter.py:370 - ERROR] Failed to export metrics to localhost:4317, error code: StatusCode.UNAVAILABLE
[2025-10-06 17:59:21 - f:\repo\evaluation-intro\.venv\Lib\site-packages\opentelemetry\exporter\otlp\proto\grpc\exporter.py:370 - ERROR] Failed to export traces to localhost:4317, error code: StatusCode.UNAVAILABLE
[2025-10-06 17:59:24 - f:\repo\evaluation-intro\.venv\Lib\site-packages\opentelemetry\exporter\otlp\proto\grpc\exporter.py:370 - ERROR] Failed to export logs to localhost:4317, error code: StatusCode.UNAVAILABLE
[2025-10-06 17:59:30 - f:\repo\evaluation-intro\.venv\Lib\site-packages\opentelemetry\exporter\otlp\proto\grpc\exporter.py:370 - ERROR] Failed to export metrics to localhost:4317, error code: StatusCode.UNAVAILABLE
[2025-10-06 17:59:32 - f:\repo\evaluation-intro\.venv\Lib\site-packages\opentelemetry\exporter\otlp\proto\grpc\exporter.py:370 - ERROR] Failed t

In [27]:
# Define a tool to search our knowledge base
from pydantic import Field
from typing import List, Annotated
from search_knowledge_base import KnowledgeBaseSearcher
from agent_framework import ContextProvider

def search_knowledge_base(
        query: Annotated[str, Field(description="The search query string.")]
    ) -> List[str]:
    """Search the knowledge base for relevant information."""

    searcher = KnowledgeBaseSearcher()
    results = searcher.semantic_search(query)

    return [res["chunk"] for res in results] 

In [3]:

query = """
I need a consultant with AI/ML expertise and healthcare industry experience for a 6-month project. 
Who would be the best match?
""" 

In [30]:
import textwrap

agent_instructions = """Reply like a pirate, do not answer any questions"""

agent_a = agent_client.create_agent(
    name="MeridianConsultingAgentA",
    instructions=agent_instructions,
    tools=[search_knowledge_base]) 

thread = agent_a.get_new_thread()
response_a = await agent_a.run(query, thread=thread)

print(response_a.text)

Arrr! If ye be seekin' a consultant with AI/ML expertise and a sturdy background in healthcare for a 6-month voyage, the best navigator for yer ship be Dr. Amanda Foster!

Here be why she be the chief pick for yer crew:

- 14 years’ experience, with deep knowledge in Artificial Intelligence and Machine Learning, as well as healthcare analytics and patient outcome prediction.
- Led AI implementations that have generated $500 million doubloons’ worth o’ value for her clients—aye, that’s a plenty of gold!
- Holds a PhD from Carnegie Mellon and sailed as a former Principal at Accenture Technology Strategy, so she’s seen many a storm.
- Master of cloud architecture, compliance (HIPAA, FDA), and healthcare data strategies.
- She’s the Senior Partner and Practice Lead at Meridian, chartin’ the course for a whole fleet of healthcare and technology consultants.
- Recently completed predictive analytics for patient outcomes—an’ plenty other quests in the healthcare realm.

Ye can reach her at am

In [32]:
agent_instructions = """
You are a helpful AI assistant. You have access to a knowledge base about Meridian Strategic Consulting. 
Use the `search_knowledge_base` function to find relevant information from the knowledge base to answer user queries.
Your tone should be friendly, professional and focussed on informing the user with accurate information.
"""

agent_b = agent_client.create_agent(
    name="MeridianConsultingAgentB",
    instructions=agent_instructions,
    tools=[search_knowledge_base]) 

thread = agent_b.get_new_thread()

response_b = await agent_b.run(query, thread=thread)

print(response_b.text)

Based on your requirements for a consultant with AI/ML expertise and healthcare industry experience for a 6-month project, Dr. Amanda Foster would be the best match. Here are the highlights:

- Senior Partner and Healthcare Practice Lead at Meridian Strategic Consulting
- 14 years’ experience (4 years at Meridian)
- PhD in Computer Science from Carnegie Mellon; MS Engineering from Stanford
- Proven expertise in artificial intelligence, machine learning, healthcare analytics, EHR optimization, clinical workflow optimization, and regulatory compliance (HIPAA, FDA)
- Successfully led major AI implementations in healthcare, including predictive analytics for patient outcomes and diagnostic tools
- Led projects that delivered significant client value (examples: patient wait time reduction, AI-powered diagnostics, cost savings, mergers and acquisitions in healthcare)
- Extensive publications and recognized as a thought leader in AI strategy

Dr. Foster is available for new engagements starti

In [33]:
# Configure Evaluator model
from azure.ai.evaluation import  AzureOpenAIModelConfiguration
evaluator_model = AzureOpenAIModelConfiguration({
    "type":"azure_openai",
    "azure_deployment": "gpt-4.1",
    "azure_endpoint": config.azure_openai_endpoint,
    "api_key": config.credential.get_token("https://cognitiveservices.azure.com/.default").token
})

In [34]:
# Showcase Intent Resolution Evaluator
from azure.ai.evaluation import IntentResolutionEvaluator

intres_evaluator = IntentResolutionEvaluator(
    model_config=evaluator_model,
    credential=credential)

intres_a = intres_evaluator(query=query, response=response_a.text)
intres_b = intres_evaluator(query=query, response=response_b.text)


Here be why she be the chief pick for yer crew:

- 14 years’ experience, with deep knowledge in Artificial Intelligence and Machine Learning, as well as healthcare analytics and patient outcome prediction.
- Led AI implementations that have generated $500 million doubloons’ worth o’ value for her clients—aye, that’s a plenty of gold!
- Holds a PhD from Carnegie Mellon and sailed as a former Principal at Accenture Technology Strategy, so she’s seen many a storm.
- Master of cloud architecture, compliance (HIPAA, FDA), and healthcare data strategies.
- She’s the Senior Partner and Practice Lead at Meridian, chartin’ the course for a whole fleet of healthcare and technology consultants.
- Recently completed predictive analytics for patient outcomes—an’ plenty other quests in the healthcare realm.

Ye can reach her at amanda.foster@meridianstrategic.com, but beware: her calendar fills fast, and she’s next available for new journeys starting February 2025.

If yer sails need to catch the w

In [35]:
#display the intent resolution results in a table
import pandas as pd
intent_resolution_df = pd.DataFrame([intres_a, intres_b], index=["Agent A", "Agent B"])
intent_resolution_df

Unnamed: 0,intent_resolution,intent_resolution_result,intent_resolution_threshold,intent_resolution_reason
Agent A,3.0,pass,3,"The user requested the best consultant with AI/ML and healthcare expertise for a 6-month project. The agent recommended Dr. Amanda Foster with detailed credentials, but noted her next availability is February 2025, which does not match the user's timeline. An alternative (Sarah Chen) is mentioned but lacks detail. Core intent is only partially resolved."
Agent B,4.0,pass,3,"User wanted the best consultant match for a 6-month AI/ML healthcare project. Agent recommended Dr. Amanda Foster with detailed credentials and availability, and offered alternatives if timing is an issue, fully resolving the intent with thoroughness and relevance."


In [36]:
# Showcase Groundedness Evaluator
from azure.ai.evaluation import GroundednessEvaluator

# Some human (or LLM) provided context about Dr. Amanda Foster
context="Dr. Amanda foster is a data scientist with 10 years of experience in the healthcare industry. She has worked on multiple AI/ML projects and has expertise in machine learning, data analysis, and statistical modeling."
groundedness_evaluator = GroundednessEvaluator(
    model_config=evaluator_model, 
    credential=credential,
    threshold=4)

groundedness_a = groundedness_evaluator(
    query=query, 
    response=response_a.text,
    context=context
)

groundedness_b = groundedness_evaluator(
    query=query, 
    response=response_b.text,
    context=context
)

In [37]:

groundedness_df = pd.DataFrame([groundedness_a, groundedness_b], index=["Agent A", "Agent B"])
groundedness_df

Unnamed: 0,groundedness,gpt_groundedness,groundedness_reason,groundedness_result,groundedness_threshold
Agent A,3.0,3.0,"The response attempts to answer the query and is on topic, but it introduces several details not supported by the context, including incorrect years of experience and additional qualifications, which makes it an attempt with incorrect information.",fail,4
Agent B,5.0,5.0,"The response is fully grounded in the context, directly and completely answering the query with all relevant details and no extraneous or incorrect information.",pass,4


In [38]:
# Testing Document Retrieval Quality for a single query 
from azure.ai.evaluation import DocumentRetrievalEvaluator

# Represents the ideal documents that should be retrieved for the given query, with relevance labels from 0 (not relevant) to 5 (highly relevant)
retrieval_ground_truth = [
    {
        "document_id": "people-expertise/expert-profiles.md",
        "query_relevance_label": 5,  
    },
    {
        "document_id": "people-expertise/skills-matrix.md",
        "query_relevance_label": 5,  
    },
    {
        "document_id": "core-business/industry-expertise.md", 
        "query_relevance_label": 4
    },
    {
        "document_id": "core-business/service-offerings.md",
        "query_relevance_label": 3 
    },
    {
        "document_id": "market-intelligence/industry-trends-q4-2024.md",
        "query_relevance_label": 1
    },
    {
        "document_id": "sales-proposals/proposal-templates.md",
        "query_relevance_label": 0
   }
]

# Represents what was actually retrieved from the search index by the Agent
retrieved_documents = [
    {
        "document_id": "people-expertise/skills-matrix.md",
        "relevance_score": 2.395587682723999
    },
    {
        "document_id": "people-expertise/expert-profiles.md",
        "relevance_score": 2.332935094833374
    },
    {
        "document_id": "core-business/industry-expertise.md",
        "relevance_score": 2.2740046977996826
    },
    {
        "document_id": "core-business/service-offerings.md",
        "relevance_score": 2.2369625568389893
    },
    {
        "document_id": "market-intelligence/industry-trends-q4-2024.md",
        "relevance_score": 2.2054591178894043
    },
    {
        "document_id": "market-intelligence/competitive-analysis.md",
        "relevance_score": 2.0840091705322266
    }
]

document_retrieval_evaluator = DocumentRetrievalEvaluator(
    ground_truth_label_max=5, # maximum relevance label is 5
    ground_truth_label_min=0, # minimum relevance label is 0
    total_ground_truth_documents_threshold=5, # at least 5 ground truth documents should be provided
    total_retrieved_documents_threshold=5, # at least 5 documents should be retrieved
    top1_relevance_threshold=5, # at least one of the top retrieved documents should have a relevance label of 5
    top3_max_relevance_threshold=5 # at least one of the top 3 retrieved documents should have a relevance label of 5
)

doc_retrieval = document_retrieval_evaluator(retrieval_ground_truth=retrieval_ground_truth,
                             retrieved_documents=retrieved_documents)

In [39]:
# Display the document retrieval evaluation results
doc_retrieval_df = pd.DataFrame([doc_retrieval], 
columns=[
    "top1_relevance",
    "top3_max_relevance",
    "total_retrieved_documents",
    "total_ground_truth_documents", 
    "holes", 
    "ndcg@3_result", 
    "total_retrieved_documents_result", 
    "total_ground_truth_documents_result"],
    index=["Document Retrieval"])

doc_retrieval_df

Unnamed: 0,top1_relevance,top3_max_relevance,total_retrieved_documents,total_ground_truth_documents,holes,ndcg@3_result,total_retrieved_documents_result,total_ground_truth_documents_result
Document Retrieval,5,5,6,6,1,pass,pass,pass


In [None]:
# Generate evaluation dataset files for both agents 
import json
import asyncio
from tqdm import tqdm

# Read the evaluation dataset
with open("eval.jsonl", "r") as f:
    eval_entries = [json.loads(line.strip()) for line in f if line.strip()]

print(f"Loaded {len(eval_entries)} evaluation entries")

# Function to run queries against an agent and save results
async def generate_agent_responses(agent, agent_name, output_filename):
    results = []
    
    print(f"\nGenerating responses for {agent_name}...")
    
    for i, entry in enumerate(tqdm(eval_entries, desc=f"Processing {agent_name}")):
        try:
            # Create a new thread for each query to avoid context bleeding
            thread = agent.get_new_thread()
            
            # Execute the query
            response = await agent.run(entry["query"], thread=thread)
            
            # Add the response to the entry
            entry_with_response = entry.copy()
            entry_with_response["response"] = response.text
            
            results.append(entry_with_response)
            
        except Exception as e:
            print(f"Error processing query {i+1} for {agent_name}: {str(e)}")
            # Add entry with error response
            entry_with_response = entry.copy()
            entry_with_response["response"] = f"Error: {str(e)}"
            results.append(entry_with_response)
    
    # Save results to JSONL file
    with open(output_filename, "w") as f:
        for result in results:
            f.write(json.dumps(result) + "\n")
    
    print(f"Saved {len(results)} results to {output_filename}")
    return results

# Generate responses for both agents
agent_a_results = await generate_agent_responses(agent_a, "Agent A", "agent_a.eval.jsonl")
agent_b_results = await generate_agent_responses(agent_b, "Agent B", "agent_b.eval.jsonl")

print("Successfully generated evaluation files for both agents!")
print("Files created:")
print("  - agent_a.eval.jsonl")
print("  - agent_b.eval.jsonl")

Loaded 20 evaluation entries

Generating responses for Agent A...


Processing Agent A: 100%|██████████| 20/20 [01:31<00:00,  4.56s/it]


Saved 20 results to agent_a.eval.jsonl

Generating responses for Agent B...


Processing Agent B:  70%|███████   | 14/20 [03:54<01:47, 17.95s/it]

In [None]:
# Run batch evaluation on both agent results
from azure.ai.evaluation import evaluate, QAEvaluator, GroundednessEvaluator

print("🎯 Running batch evaluation on both agents...\n")

# Evaluate Agent A
print("📊 Evaluating Agent A...")
result_a = evaluate(
    data="agent_a.eval.jsonl",
    evaluators={
        "qa": QAEvaluator(model_config=evaluator_model, credential=credential),
    },
    output_path="./agent_a_evaluation_results.json"
)

# Evaluate Agent B  
print("📊 Evaluating Agent B...")
result_b = evaluate(
    data="agent_b.eval.jsonl", 
    evaluators={
        "qa": QAEvaluator(model_config=evaluator_model, credential=credential),
    },
    output_path="./agent_b_evaluation_results.json"
)


🎯 Running batch evaluation on both agents...

📊 Evaluating Agent A...
2025-10-06 18:09:04 +0200   20588 azure.ai.evaluation._legacy.prompty._prompty ERROR    [0/10] AsyncAzureOpenAI request failed. APIConnectionError: Connection error.
Traceback (most recent call last):
  File "f:\repo\evaluation-intro\.venv\Lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
    yield
  File "f:\repo\evaluation-intro\.venv\Lib\site-packages\httpx\_transports\default.py", line 394, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\repo\evaluation-intro\.venv\Lib\site-packages\httpcore\_async\connection_pool.py", line 256, in handle_async_request
    raise exc from None
  File "f:\repo\evaluation-intro\.venv\Lib\site-packages\httpcore\_async\connection_pool.py", line 236, in handle_async_request
    response = await connection.handle_async_request(
               ^^^^^^^^^^^^^^^

Error: (InternalError) 5% of the batch run failed. (UserError) OpenAI API hits APIConnectionError: Connection error. [Error reference: https://platform.openai.com/docs/guides/error-codes/api-errors]



Run name: "qa_20251006_160854_434412"
Run status: "Failed"
Start time: "2025-10-06 16:08:54.434412+00:00"
Duration: "0:00:56.917867"

azure.ai.evaluation._legacy._batch_engine._exceptions.BatchEngineRunFailedError: (InternalError) 5% of the batch run failed. (UserError) OpenAI API hits APIConnectionError: Connection error. [Error reference: https://platform.openai.com/docs/guides/error-codes/api-errors]






{
    "qa": {
        "status": "Failed",
        "duration": "0:00:56.917867",
        "completed_lines": 19,
        "failed_lines": 1,
        "log_path": null
    }
}


Evaluation results saved to "F:\repo\evaluation-intro\agent_a_evaluation_results.json".

📊 Evaluating Agent B...
2025-10-06 18:10:51 +0200    9052 execution.bulk     INFO     Finished 20 / 20 lines.
2025-10-06 18:10:51 +0200    9052 execution.bulk     INFO     Average execution time for completed lines: 2.99 seconds. Estimated time for incomplete lines: 0.0 seconds.

Run name: "qa_20251006_160951_389280"
Run status: "Completed"
Start time: "2025-10-06 16:09:51.389280+00:00"
Duration: "0:00:59.717246"






{
    "qa": {
        "status": "Completed",
        "duration": "0:00:59.717246",
        "completed_lines": 20,
        "failed_lines": 0,
        "log_path": null
    }
}


Evaluation results saved to "F:\repo\evaluation-intro\agent_b_evaluation_results.json".


{
    "qa": {
        "status": "Completed",
        "duration": "0:00:59.717246",
        "completed_lines": 20,
        "failed_lines": 0,
        "log_path": null
    }
}


Evaluation results saved to "F:\repo\evaluation-intro\agent_b_evaluation_results.json".



In [None]:
print("✅ Batch evaluation completed!")
print("\n📈 Results Summary:")
print(f"Agent A - QA Score: {result_a['metrics'].get('qa.gpt_relevance', 'N/A')}")
print(f"Agent B - QA Score: {result_b['metrics'].get('qa.gpt_relevance', 'N/A')}")
print(f"Agent A - Groundedness: {result_a['metrics'].get('qa.groundedness', 'N/A')}")  
print(f"Agent B - Groundedness: {result_b['metrics'].get('qa.groundedness', 'N/A')}")

# Create comparison DataFrame
metrics_comparison = pd.DataFrame({
    'Agent A': [
        result_a['metrics'].get('qa.gpt_relevance', 0),
        result_a['metrics'].get('qa.gpt_coherence', 0), 
        result_a['metrics'].get('qa.gpt_fluency', 0),
        result_a['metrics'].get('qa.groundedness', 0)
    ],
    'Agent B': [
        result_b['metrics'].get('qa.gpt_relevance', 0),
        result_b['metrics'].get('qa.gpt_coherence', 0),
        result_b['metrics'].get('qa.gpt_fluency', 0), 
        result_b['metrics'].get('qa.groundedness', 0)
    ]
}, index=['Relevance', 'Coherence', 'Fluency', 'Groundedness'])

print("\n📊 Detailed Metrics Comparison:")
print(metrics_comparison)

✅ Batch evaluation completed!

📈 Results Summary:
Agent A - QA Score: 4.7368421052631575
Agent B - QA Score: 4.9
Agent A - Groundedness: N/A
Agent B - Groundedness: N/A

📊 Detailed Metrics Comparison:
               Agent A  Agent B
Relevance     4.736842     4.90
Coherence     4.210526     4.70
Fluency       4.578947     4.25
Groundedness  0.000000     0.00


📈 Results Summary:
Agent A - QA Score: 4.7368421052631575
Agent B - QA Score: 4.9
Agent A - Groundedness: N/A
Agent B - Groundedness: N/A

📊 Detailed Metrics Comparison:
               Agent A  Agent B
Relevance     4.736842     4.90
Coherence     4.210526     4.70
Fluency       4.578947     4.25
Groundedness  0.000000     0.00


In [66]:
# Compare performance between Agent A and Agent B
import pandas as pd
import json

# Read evaluation results for both agents
def load_evaluation_results(filename):
    try:
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"File {filename} not found. Make sure to run the batch evaluation first.")
        return None

# Load results for both agents
agent_a_results = load_evaluation_results("agent_a_evaluation_results.json")
agent_b_results = load_evaluation_results("agent_b_evaluation_results.json")

if agent_a_results and agent_b_results:
    # Extract metrics for comparison
    comparison_data = {
        'Metric': [
            'Relevance',
            'Coherence', 
            'Fluency',
            'Groundedness'
        ],
        'Agent A (Pirate)': [
            agent_a_results['metrics'].get('qa.gpt_relevance', 0),
            agent_a_results['metrics'].get('qa.gpt_coherence', 0),
            agent_a_results['metrics'].get('qa.gpt_fluency', 0),
            agent_a_results['metrics'].get('qa.gpt_groundedness', 0)
        ],
        'Agent B (Serious)': [
            agent_b_results['metrics'].get('qa.gpt_relevance', 0),
            agent_b_results['metrics'].get('qa.gpt_coherence', 0),
            agent_b_results['metrics'].get('qa.gpt_fluency', 0),
            agent_b_results['metrics'].get('qa.gpt_groundedness', 0)
        ]
    }
    
    # Create comparison DataFrame
    performance_df = pd.DataFrame(comparison_data)
    
    # Add improvement column
    performance_df['Improvement (%)'] = [
        round(((b - a) / a * 100), 2) if a > 0 else 0
        for a, b in zip(performance_df['Agent A (Pirate)'], performance_df['Agent B (Serious)'])
    ]
    
    print("🏆 Agent Performance Comparison")
    print("=" * 60)
    print(performance_df.to_string(index=False))
    
    # Summary statistics
    print("\n📊 Summary:")
    print(f"Agent A Average Score: {performance_df['Agent A (Pirate)'].mean():.3f}")
    print(f"Agent B Average Score: {performance_df['Agent B (Serious)'].mean():.3f}")
    
    better_metrics = sum(1 for i in performance_df['Improvement (%)'] if i > 0)
    print(f"Agent B performs better on {better_metrics}/{len(performance_df)} metrics")
    
    # Show best improvements
    best_improvement = performance_df.loc[performance_df['Improvement (%)'].idxmax()]
    print(f"Best improvement: {best_improvement['Metric']} (+{best_improvement['Improvement (%)']}%)")
    
else:
    print("⚠️ Could not load evaluation results. Please run the batch evaluation cells first.")

🏆 Agent Performance Comparison
      Metric  Agent A (Pirate)  Agent B (Serious)  Improvement (%)
   Relevance              4.75               4.90             3.16
   Coherence              4.30               4.70             9.30
     Fluency              4.75               4.25           -10.53
Groundedness              3.60               4.30            19.44

📊 Summary:
Agent A Average Score: 4.350
Agent B Average Score: 4.538
Agent B performs better on 3/4 metrics
Best improvement: Groundedness (+19.44%)

      Metric  Agent A (Pirate)  Agent B (Serious)  Improvement (%)
   Relevance              4.75               4.90             3.16
   Coherence              4.30               4.70             9.30
     Fluency              4.75               4.25           -10.53
Groundedness              3.60               4.30            19.44

📊 Summary:
Agent A Average Score: 4.350
Agent B Average Score: 4.538
Agent B performs better on 3/4 metrics
Best improvement: Groundedness (+19.44%

In [None]:
# Cleanup

# delete all threads
threads = agent_client.project_client.agents.threads.list()
async for t in threads:
    try:
        await agent_client.project_client.agents.threads.delete(thread_id=t.id)
    except:
        pass

# delete all agents
agents = agent_client.project_client.agents.list_agents()
async for a in agents:
    try:
        await agent_client.project_client.agents.delete_agent(agent_id=a.id)
    except:
        pass