In [1]:
from IPython.display import clear_output

In [2]:
%pip install azure-ai-evaluation
clear_output()

# Evaluating Response Quality with RelevanceEvaluator

In [3]:
import os
from azure.ai.evaluation import RelevanceEvaluator
from dotenv import load_dotenv

# Read settings from the .env file one directory above the notebook.
load_dotenv('../.env')

# Azure OpenAI model configuration, reused by the evaluators in later cells.
# Any variable missing from the environment ends up as None here.
model_config = dict(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
)

# Build the Relevance evaluator on top of that configuration.
relevance_evaluator = RelevanceEvaluator(model_config)

# Run it on one query/response pair and print whatever it returns.
result = relevance_evaluator(query="What is the capital of France?", response="The capital of France is Paris.")

print(result)

{'relevance': 4.0, 'gpt_relevance': 4.0, 'relevance_reason': 'The response is accurate and directly answers the query, providing all necessary information.'}


# Evaluating Response Quality with CoherenceEvaluator

In [4]:
from azure.ai.evaluation import CoherenceEvaluator

# Coherence evaluator built from the shared model configuration.
coherence_evaluator = CoherenceEvaluator(model_config)

# Evaluate a single query/response pair and print the result.
result = coherence_evaluator(query="What's the capital of France?", response="Paris.")
print(result)

{'coherence': 4.0, 'gpt_coherence': 4.0, 'coherence_reason': 'The response is coherent because it directly and correctly answers the query without any unnecessary information or confusion.'}


# Evaluating Response Quality with FluencyEvaluator

In [5]:
from azure.ai.evaluation import FluencyEvaluator

# Fluency evaluator built from the shared model configuration.
fluency_evaluator = FluencyEvaluator(model_config)

# Only the response text is passed here; no query is supplied.
result = fluency_evaluator(response="Paris.")
print(result)

{'fluency': 1.0, 'gpt_fluency': 1.0, 'fluency_reason': 'The RESPONSE is a single word and does not demonstrate any fluency in terms of sentence structure, grammar, or vocabulary. It is largely incomprehensible as it does not convey any meaningful message.'}


In [6]:
from azure.ai.evaluation import GroundednessEvaluator

# Groundedness evaluator built from the shared model configuration.
groundedness_evaluator = GroundednessEvaluator(model_config)

# Evaluate one response together with its query and the supporting context.
result = groundedness_evaluator(
    query="Who discovered penicillin?",
    context=(
        "Alexander Fleming discovered penicillin in 1928 while working at St. Mary's Hospital in London."
    ),
    response="Alexander Fleming discovered penicillin in 1928.",
)
print(result)

{'groundedness': 5.0, 'gpt_groundedness': 5.0, 'groundedness_reason': 'The response accurately and completely answers the query using information from the context, without introducing any errors or irrelevant details.'}


# Creating custom evaluators

## Code-based evaluator

### Function-based evaluator

In [7]:
# Custom code-based evaluator: reports how long a response is
def response_length_evaluator(response, **kwargs):
    """Return the length of ``response`` under the key ``"response_length"``.

    Args:
        response: Any object supporting ``len()``; a string in the example.
        **kwargs: Accepted and ignored.

    Returns:
        dict: ``{"response_length": len(response)}``.
    """
    length = len(response)
    return dict(response_length=length)

# Example usage: call the evaluator on a short string and print the returned dict
result = response_length_evaluator(response="Hello, world!")
print(result)

{'response_length': 13}


### Class-based evaluator

In [8]:
import re


# Custom class-based evaluator to check for blocked words
class BlocklistEvaluator:
    """Flag responses that contain any word from a blocklist.

    Matching is whole-word and case-insensitive, so "bad" is found in
    "Bad idea" but not in "badge".
    """

    def __init__(self, blocklist):
        """Store the iterable of blocked words to look for."""
        self.blocklist = blocklist

    def __call__(self, *, response: str, **kwargs):
        """Check ``response`` against the blocklist.

        Args:
            response: Text to inspect (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            dict: ``{"contains_blocked_word": bool}``.
        """
        contains_blocked_word = any(
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character (e.g. "f***") still match as whole tokens.
            re.search(rf"(?<!\w){re.escape(word)}(?!\w)", response, flags=re.IGNORECASE)
            for word in self.blocklist
            if word  # an empty entry would otherwise match every response
        )
        return {"contains_blocked_word": contains_blocked_word}
    
# Example usage: the response below contains "worst", which is on the blocklist
blocklist_evaluator = BlocklistEvaluator(blocklist=["bad", "evil", "worst"])
result = blocklist_evaluator(response="This is the worst response ever!")
print(result)

{'contains_blocked_word': True}


## Prompt-based evaluators

### Helpfulness evaluator

In [9]:
from helpfulness import HelpfulnessEvaluator

# Custom evaluator imported from the `helpfulness` module, built from the shared model configuration.
helpfulness_evaluator = HelpfulnessEvaluator(model_config)

# Evaluate one response against its query and context, then print the result.
helpfulness_score = helpfulness_evaluator(
    query="What's the meaning of life?",
    context=(
        "Arthur Schopenhauer was the first to explicitly ask the question, in an essay entitled 'Character'."
    ),
    response="The answer is 42.",
)
print(helpfulness_score)

{'helpfulness': 1.0, 'helpfulness_reason': 'The RESPONSE is entirely unhelpful as it does not address the philosophical nature of the QUERY or provide any meaningful information related to the CONTEXT.'}


### JSON accuracy evaluator

In [10]:
import json
from json_accuracy import JSONAccuracyEvaluator

# Load the JSON schema the output is checked against.
# The `with` block closes the file handle as soon as parsing finishes, and the
# explicit UTF-8 encoding avoids depending on the platform's default codec.
with open('jsons/example_schema.json', 'r', encoding='utf-8') as schema_file:
    example_json_schema = json.load(schema_file)

# Load the example JSON output to evaluate
with open('jsons/poor_output.json', 'r', encoding='utf-8') as output_file:
    sample_json_data = json.load(output_file)

# Evaluate the sample output against the schema and print the result
accuracy_evaluator = JSONAccuracyEvaluator(model_config)
accuracy_score = accuracy_evaluator(json_output=sample_json_data, schema=example_json_schema)
print(accuracy_score)

{'json_accuracy': 0.5, 'json_accuracy_reason': 'The JSON output is mostly correct but is missing the required "companyName" field in the "companyInfo" object, which affects its completeness according to the schema.'}


# Evaluating a dataset

In [11]:
import pandas as pd
from azure.ai.evaluation import evaluate
from pprint import pprint
from model_endpoint import ModelEndpoint
from IPython.display import clear_output

# Azure AI project coordinates, read from environment variables
azure_ai_project = dict(
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RG_NAME"),
    project_name=os.environ.get("PROJECT_NAME"),
)

# Define your evaluators
relevance_evaluator = RelevanceEvaluator(model_config)

# Evaluate the dataset: ModelEndpoint(model_config) is the target, and each
# evaluator receives its inputs through the column mapping configured below
# (`${data.*}` names dataset columns, `${target.*}` names the target's output).
result = evaluate(
    data="evaluation_dataset.jsonl",
    target=ModelEndpoint(model_config),
    evaluators={
        # Performance and quality evaluators (AI-assisted)
        "relevance": relevance_evaluator,
        "coherence": coherence_evaluator,
        "fluency": fluency_evaluator,
        "groundedness": groundedness_evaluator,
        # Custom evaluators (code and prompt based)
        "helpfulness": helpfulness_evaluator,
    },
    evaluator_config={
        "relevance": {
            "column_mapping": {
                "response": "${target.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            },
        },
        "coherence": {
            "column_mapping": {
                "response": "${target.response}",
                "query": "${data.query}",
            },
        },
        "fluency": {
            "column_mapping": {
                "response": "${target.response}",
            },
        },
        "groundedness": {
            "column_mapping": {
                "response": "${target.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            },
        },
        "helpfulness": {
            "column_mapping": {
                "response": "${target.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            },
        },
    },
    azure_ai_project=azure_ai_project,
    output_path="./evaluation_results.json",
)
# Clear whatever the evaluation run printed to the cell output
clear_output()

In [12]:
# Render the "rows" entry of the evaluation result as a DataFrame (displayed as the cell's output)
pd.DataFrame(result["rows"])

Unnamed: 0,outputs.query,outputs.response,inputs.query,inputs.context,inputs.ground_truth,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.coherence.coherence,outputs.coherence.gpt_coherence,outputs.coherence.coherence_reason,outputs.fluency.fluency,outputs.fluency.gpt_fluency,outputs.fluency.fluency_reason,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.helpfulness.helpfulness,outputs.helpfulness.helpfulness_reason,line_number
0,"What event started on July 28, 1914?","World War I began on July 28, 1914, when Austr...","What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,5,5,The response accurately and completely address...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE demonstrates proficient fluency w...,5,5,The response accurately identifies the event a...,5,The RESPONSE is fully helpful as it accurately...,0
1,Who was the first person to walk on the moon?,The first person to walk on the moon was Neil ...,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,5,5,The response accurately and completely answers...,4,4,The response is coherent and directly addresse...,4,4,"The RESPONSE is well-articulated, with good co...",5,5,"The response is fully correct and complete, di...",5,The RESPONSE is fully helpful as it accurately...,1
2,What was the significance of the year 1776 in ...,The year 1776 is highly significant in America...,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,5,5,The response fully addresses the query with ac...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE should receive a high score becau...,5,5,"The RESPONSE is fully correct and complete, di...",5,The RESPONSE is entirely helpful as it fully a...,2
3,"Which wall fell in 1989, symbolizing the end o...","The Berlin Wall fell in 1989, symbolizing the ...","Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,4,4,The RESPONSE fully addresses the QUERY with ac...,4,4,The RESPONSE is coherent as it directly answer...,3,3,The RESPONSE is clear and coherent with correc...,5,5,"The response is fully correct and complete, di...",5,The RESPONSE is fully helpful as it accurately...,3
4,What ancient city was buried by the eruption o...,The ancient city buried by the eruption of Mou...,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,5,5,The RESPONSE not only answers the QUERY accura...,4,4,The RESPONSE is coherent and effectively addre...,3,3,"The RESPONSE is clear and coherent, with corre...",5,5,The RESPONSE is fully correct and complete in ...,5,The RESPONSE is entirely helpful as it accurat...,4
5,Who was the British Prime Minister during Worl...,"During World War II, the British Prime Ministe...",Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,5,5,The response accurately and completely answers...,4,4,"The RESPONSE is coherent, directly answers the...",4,4,The response is well-articulated with good con...,5,5,"The response is fully correct and complete, di...",5,The RESPONSE is fully helpful as it accurately...,5
6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,4,4,The RESPONSE accurately and completely answers...,4,4,The response is coherent because it directly a...,3,3,The response is clear and grammatically correc...,5,5,"The response is fully correct and complete, di...",5,The RESPONSE is fully helpful as it accurately...,6
7,Which empire was ruled by Genghis Khan?,Genghis Khan ruled the Mongol Empire. He found...,Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,5,5,The response not only accurately and completel...,4,4,The response is coherent and effectively addre...,3,3,"The RESPONSE is clear and coherent, with corre...",5,5,The response accurately and completely answers...,5,The RESPONSE is entirely helpful as it accurat...,7
8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,5,5,The response fully addresses the query with ac...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE should receive a high score becau...,5,5,"The response is fully grounded in the context,...",5,The RESPONSE is entirely helpful as it accurat...,8
9,Which ancient wonder was located in Egypt and ...,The ancient wonder located in Egypt that serve...,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,5,5,The RESPONSE fully addresses the QUERY with ac...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE is well-articulated with correct ...,5,5,"The response is fully correct and complete, di...",5,The RESPONSE is entirely helpful as it accurat...,9
