# Phase 5: Evaluate answers comparing with ground truth
In this notebook:
- Import libraries, load configuration variables and create clients
- Test the end-to-end process with one query and with queries and expected answers in an Excel file:
    + Hybrid search with Semantic ranker
    + Filter the chunks, keeping only those most relevant to the user's question
    + Generate the answer for the query using the most relevant chunks as the context
    + Evaluate with AI Foundry SDK evaluations

In [None]:
#%pip install azure-ai-evaluation
#%pip install openpyxl

In [None]:
# Import libraries
import os
import sys
import json
import time
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from openai import AzureOpenAI
from azure.ai.evaluation import QAEvaluator

sys.path.append('..')
from common_utils import *

# Read the Azure OpenAI and AI Search settings through the shared helper;
# the returned dicts also carry the ready-made clients used by later cells.
openai_config, ai_search_config = load_config()

# Build the Azure OpenAI model configuration handed to the QA evaluator.
# The evaluator is pointed at the deployment stored under "aoai_rerank_model".
model_config = dict(
    azure_endpoint=openai_config["aoai_endpoint"],
    api_key=openai_config["aoai_key"],
    azure_deployment=openai_config["aoai_rerank_model"],
    api_version=openai_config["api_version"],
)

# Single evaluator instance reused by every evaluation in this notebook
qa_eval = QAEvaluator(model_config=model_config)

### Test the end-to-end process with one question:
- Generate query for AI Search
- Hybrid search with Semantic ranker
- Filter chunks comparing with the question
- Generate the answer with the relevant chunks as context
- Evaluate the answer compared with the expected answer

In [None]:
# Single-question run of the full pipeline: search -> filter -> answer -> evaluate
question = "What is included in my Northwind Health Plus plan?"

# Ground-truth answer that the generated answer is scored against
expected_answer = """The Northwind Health Plus plan includes the following benefits coverage:
- Deductible: $2,000 per year.
- Coinsurance: 20% of the cost of a covered service after the deductible is met.
- Out-of-Pocket Maximum: $4,000 per year, including deductible, coinsurance, and copayments.
- In-Network Provider: Lower copayments and coinsurance amounts.
- Out-of-Network Provider: Higher copayments and coinsurance amounts.
- Preventive Care: Covered at 100% with no copayment, deductible, or coinsurance.
- Prescription Drugs: Subject to a copayment, varying by drug type. Generic drugs usually have lower copayments.
- Mental Health and Substance Abuse Services: Subject to a copayment and deductible, varying by service type."""

# Handles needed by the retrieval step
search_client = ai_search_config["ai_search_client_docs"]
aoai_client = openai_config["openai_client"]
embedding_model = openai_config["aoai_embedding_model"]

# Step 1 - hybrid search with semantic ranker
# (the trailing 10 is presumably the maximum number of results - confirm in common_utils)
results, num_results = semantic_hybrid_search(
    search_client, aoai_client, embedding_model, question, 10
)
print(f"num results: {num_results}")
show_results(results, question)

# Step 2 - keep only the chunks judged valid for this question
rerank_model = openai_config["aoai_rerank_model"]
valid_chunks, num_chunks = get_filtered_chunks(
    aoai_client, rerank_model, results, question
)

# Step 3 - generate the answer using the retained chunks as context
answer_model = openai_config["aoai_deployment_name"]
answer = generate_answer(aoai_client, answer_model, valid_chunks, question)
print(f"\n>> Answer:\n{answer}")

# Step 4 - score the generated answer against the ground truth
qa_score = evaluate_answer(qa_eval, question, valid_chunks, answer, expected_answer)
print(f'Evaluation results:\n{qa_score}')

In [None]:
# Test with all questions in an Excel file.
# Each row supplies the question and its own expected (ground-truth) answer.
input_file = "ground_truth.xlsx"

# NOTE(review): assumed header of the ground-truth column - confirm against ground_truth.xlsx
expected_answer_column = "EXPECTED_ANSWER"

# Previously this answer was hard-coded for EVERY question, which scored unrelated
# questions against the wrong ground truth. It is kept only as a fallback for rows
# (or files) that do not provide an expected answer.
default_expected_answer = "Your Northwind Health Plus plan includes coverage for medical, vision, and dental services. It also provides coverage for prescription drugs, mental health and substance abuse services, and preventive care. You can choose from a variety of in-network providers, including primary care physicians, specialists, hospitals, and pharmacies. Emergency services are covered, both in-network and out-of-network. Co-pays, deductibles, and out-of-pocket maximums may apply to your plan. Your plan may also include separate deductibles for different services, such as prescription drugs and hospitalization. It is important to know what your plan covers and what the cost-sharing requirements are"

df = pd.read_excel(input_file)
data_dict = df.to_dict(orient='records')

# Make a missing ground-truth column visible instead of silently mis-scoring every row
if expected_answer_column not in df.columns:
    print(f"WARNING: column '{expected_answer_column}' not found in {input_file}; "
          "all questions will be evaluated against the default expected answer")

# For each line in the input Excel file
for i, line in enumerate(data_dict):
    question = line['QUESTION']
    print(f"[{i+1}] question: {question}")

    # Hybrid search
    results, num_results = semantic_hybrid_search(ai_search_config["ai_search_client_docs"],
                                                  openai_config["openai_client"],
                                                  openai_config["aoai_embedding_model"],
                                                  question, 10)
    print(f"\tnum results: {num_results}")

    # Valid chunks for the user question
    valid_chunks, num_chunks = get_filtered_chunks(openai_config["openai_client"],
                                                   openai_config["aoai_rerank_model"],
                                                   results, question)
    print(f"\tnum valid chunks: {num_chunks}")

    # Generate answer:
    answer = generate_answer(openai_config["openai_client"],
                             openai_config["aoai_deployment_name"],
                             valid_chunks, question)
    print(f"\n>> Answer:\n{answer}")

    # Evaluate the answer against this row's own ground truth.
    # Empty Excel cells are read as NaN, so treat both a missing key and NaN as "not provided".
    row_expected_answer = line.get(expected_answer_column)
    if row_expected_answer is None or pd.isna(row_expected_answer):
        expected_answer = default_expected_answer
    else:
        expected_answer = str(row_expected_answer)

    qa_score = evaluate_answer(qa_eval, question, valid_chunks, answer, expected_answer)
    print(f'Evaluation results:\n{qa_score}')
    print('--------------------------------------------------')