In [1]:
# Client configuration for Azure OpenAI and Foundry
import asyncio

from agent_framework.azure import AzureAIAgentClient, AzureOpenAIChatClient
from agent_framework.observability import setup_observability 
from azure.identity import DefaultAzureCredential
from azure.identity.aio import DefaultAzureCredential as AsyncDefaultAzureCredential
from openai import AzureOpenAI
from dotenv import load_dotenv
import config

# Populate the environment from .env, then enable observability.
# NOTE(review): `config` is imported above, before load_dotenv() runs; if it
# reads environment variables at import time, .env values would not be visible
# to it yet — confirm.
load_dotenv()
setup_observability()

# Scope requested when fetching tokens for the AzureOpenAI client.
COGNITIVE_SERVICES_SCOPE = "https://cognitiveservices.azure.com/.default"
# Deployment name shared by the chat client and the agent client.
MODEL_DEPLOYMENT_NAME = "gpt-5-chat"

# One synchronous and one asynchronous credential; the agent client below
# takes the async variant, the other two clients use the sync one.
credential = DefaultAzureCredential()
async_credential = AsyncDefaultAzureCredential()


def _openai_token_provider() -> str:
    """Return the token string `credential` yields for the Cognitive Services scope."""
    return credential.get_token(COGNITIVE_SERVICES_SCOPE).token


# Raw Azure OpenAI client, authenticated through the token provider above.
openai_client = AzureOpenAI(
    api_version="2024-02-01",
    azure_ad_token_provider=_openai_token_provider,
    azure_endpoint=config.azure_openai_endpoint,
)

# Agent-framework chat client (used later as the evaluation judge).
chat_client = AzureOpenAIChatClient(
    endpoint=config.azure_openai_endpoint,
    deployment_name=MODEL_DEPLOYMENT_NAME,
    credential=credential,
)

# Agent-framework client bound to the Foundry project endpoint.
agent_client = AzureAIAgentClient(
    async_credential=async_credential,
    model_deployment_name=MODEL_DEPLOYMENT_NAME,
    project_endpoint=config.foundry_project_endpoint,
)


In [None]:
# Evaluation set: every case pairs a query with the reference answer that the
# agent's reply is later scored against, plus a category label.
_TEST_CASE_FIELDS = ("category", "query", "reference_answer")

test_cases = [
    dict(zip(_TEST_CASE_FIELDS, case_values))
    for case_values in (
        (
            "Company Information",
            "When was Meridian founded?",
            "Meridian Strategic Consulting was founded in 2018",
        ),
        (
            "Service Offerings",
            "What specific AI and automation services does Meridian offer, and what kind of ROI can clients expect?",
            """Meridian offers comprehensive AI & Automation Implementation services including AI opportunity assessment, solution design and architecture, implementation and testing, and change management support. These engagements typically last 16-28 weeks with teams of 5-10 consultants. Clients can expect a typical ROI of 400-600%, making this one of our highest-value service offerings. Dr. Amanda Foster leads this practice with expertise in machine learning, having 12 patents in ML and automation, and the team has delivered $500M+ in client value through AI implementations.""",
        ),
        (
            "Expert Identification",
            "I need a consultant with AI/ML expertise and healthcare industry experience for a 6-month project. Who would be the best match?",
            """Dr. Amanda Foster would be the ideal match for this project. She is a Senior Partner with expertise in AI/ML and leads both the Digital Transformation practice and Healthcare vertical. She has a PhD in Computer Science from Carnegie Mellon, holds 12 patents in machine learning and automation, and has led AI implementations generating $500M+ in client value. She has specific healthcare experience with projects including AI-powered diagnostic tools, predictive analytics for patient outcomes, and clinical decision support systems. However, she is currently available starting February 2025, so timeline coordination would be needed.""",
        ),
    )
]

📋 Loaded 3 test cases for evaluation


In [23]:
# Prompt template, response schema, and helper for LLM-judged answer evaluation
from typing import Any
from pydantic import BaseModel


# Prompt template for the judge model. It is filled with str.format(), so the
# two named placeholders ({reference_answer}, {agent_response}) are substituted
# and the doubled braces around the JSON example are emitted as literal braces.
# NOTE(review): the Relevance metric refers to "the user's question" and the
# intro mentions "evaluation criteria", but the template has no placeholder for
# either — the judge only ever sees the reference answer and the agent response.
evaluation_instructions = """
You are an expert evaluator of AI generated answers.
Given a reference answer, evaluation criteria, and an agent-generated response, evaluate the quality of the agent's response based on the following metrics. 
Provide a score from 1 to 5 for each metric, where 1 is poor and 5 is excellent. Also, provide an overall score with justification.

REFERENCE ANSWER:
{reference_answer}

AGENT RESPONSE:
{agent_response}

EVALUATION METRICS:
1. Accuracy: Are the facts and figures correct?
2. Completeness: Does it cover all key points mentioned in the reference answer?
3. Clarity: Is the response clear and well-structured?
4. Relevance: Does it directly answer the user's question?

OUTPUT FORMAT:
You MUST return your evaluation as a JSON object with the following structure:
{{
    "accuracy": X,
    "completeness": X,
    "clarity": X,
    "relevance": X,
    "overall": X,
    "justification": "Detailed explanation of your evaluation"
}}

Where X is a score from 1-5 (1=Poor, 2=Below Average, 3=Average, 4=Good, 5=Excellent).
"""

# Pydantic model whose fields mirror the JSON object requested in the OUTPUT
# FORMAT section of `evaluation_instructions`.
# NOTE(review): nothing in the visible cells references this class —
# `evaluate_answer` returns the judge's raw text without validating it.
# NOTE(review): the prompt asks for scores from 1 to 5, but the plain `int`
# annotations below do not enforce that range.
class EvaluationResponse(BaseModel):
    # Metric scores, named exactly as the keys in the prompt's JSON example.
    accuracy: int
    completeness: int
    clarity: int
    relevance: int
    # Overall score for the response.
    overall: int
    # Free-text explanation accompanying the scores.
    justification: str

async def evaluate_answer(response: str, reference: str) -> Any:
    """Ask the judge model to score an agent response against a reference answer.

    Args:
        response: The agent-generated answer to be evaluated.
        reference: The reference answer it is compared with.

    Returns:
        The ``text`` of the chat client's reply. The prompt asks for a JSON
        object of per-metric scores, but the text is handed back as received,
        without parsing or validation.
    """
    # Substitute both placeholders of the evaluation template.
    judge_prompt = evaluation_instructions.format(
        agent_response=response,
        reference_answer=reference,
    )

    # Per-request options forwarded unchanged to the chat client.
    request_options = {
        "model": "o3-mini",
        "temperature": 0.0,
        "tool_choice": "auto",
    }
    judge_reply = await chat_client.get_response([judge_prompt], **request_options)

    return judge_reply.text


In [24]:
# Initialize the agent
from main import MeridianKnowledgeBaseAgent

# System instructions handed to the knowledge-base agent. The {question} and
# {context} tokens inside the fenced example are illustrative only: this cell
# never calls str.format() on the string, so they reach the agent verbatim.
# NOTE(review): "exerpts"/"Exerpts" and "untill" are misspelled. The
# "Question:"/"Exerpts:" labels presumably have to match the message layout
# built inside MeridianKnowledgeBaseAgent — confirm against main.py before
# correcting the spelling here.
agent_instructions = """
You are a helpful AI assistant. 

You will receive a question and exerpts of documents returned from a search service in the following format
```
Question: {question}
Exerpts: {context}
```

Keep thinking untill you have a complete answer. If the question contains multiple parts, split the query into parts and be sure to answer all parts of the question.

Use ONLY the provided knowledge base excerpts to answer the user's question as accurately as possible. 
Do not make assumptions or add information not contained in the excerpts.
If you cannot answer (part of the question) state which part of the question you are unable to answer.
"""

# Build the agent with the instructions above as its only constructor argument.
agent = MeridianKnowledgeBaseAgent(agent_instructions)


In [None]:
for test_case in test_cases:
    test_query = test_case["query"]
    reference = test_case["reference_answer"]

    agent_response = await agent.ask(test_query)
    evaluation = await evaluate_answer(agent_response, reference)
    print(evaluation)
    


{
    "accuracy": 5,
    "completeness": 5,
    "clarity": 5,
    "relevance": 5,
    "overall": 5,
    "justification": "The agent's response exactly matches the reference answer, with the correct founding year of Meridian Strategic Consulting. It covers the sole key point from the reference answer, is clear and concise, and directly addresses the question without unnecessary information. Therefore, it scores full marks across all metrics."
}
{
    "accuracy": 4,
    "completeness": 4,
    "clarity": 5,
    "relevance": 5,
    "overall": 4,
    "justification": "The agent's response is largely accurate, correctly reflecting key facts from the reference answer such as the service offerings, engagement duration (16–28 weeks), ROI range (400–600%), and $500M+ in client value. However, it introduces additional services (predictive modeling, integration, industry-specific applications) and ROI figures (250–500% for predictive models) that are not mentioned in the reference answer, which sl