# Evaluation with Azure AI Foundry

This notebook demonstrates how to evaluate data using built-in and custom evaluators and send the results to [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio).

### Prerequisites

- An Azure subscription.
- An Azure AI Foundry workspace.
- An Azure AI Foundry project.
- An Azure OpenAI resource.

### Install the required packages

```bash
pip install -r requirements.txt
```

### Create the following environment variables or add them to an `.env` file

```bash
AZURE_OPENAI_ENDPOINT=<your-azure-openai-endpoint>
AZURE_OPENAI_API_KEY=<your-azure-openai-api-key>
AZURE_OPENAI_DEPLOYMENT=<your-azure-openai-deployment>
AZURE_OPENAI_API_VERSION=<your-azure-openai-api-version>
AZURE_SUBSCRIPTION_ID=<your-azure-subscription-id>
AZURE_RESOURCE_GROUP=<your-azure-resource-group>
AZURE_AI_FOUNDRY_PROJECT=<your-azure-ai-foundry-project>
```

### References

- [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio)
- [Evaluate your Generative AI application locally with the Azure AI Evaluation SDK](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk#evaluating-direct-and-indirect-attack-jailbreak-vulnerability)

In [None]:
!pip install -r requirements.txt

## Imports

In [None]:
import json
import pandas as pd

In [None]:
# Import necessary libraries
import os
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.tracing import start_trace

# Only fall back to the .env file when the API key has not already been exported
if os.environ.get("AZURE_OPENAI_API_KEY") is None:
    load_dotenv()

# Open a tracing session; a URL for inspecting the trace is printed
start_trace()

## Setup Credentials and Configuration

In [None]:
import os
from dotenv import load_dotenv

import warnings

# Load environment variables from the .env file
load_dotenv()

# Read every setting exactly once so the configuration objects below cannot drift apart.
env_settings = {
    name: os.getenv(name)
    for name in (
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_SUBSCRIPTION_ID",
        "AZURE_RESOURCE_GROUP",
        "AZURE_AI_FOUNDRY_PROJECT",
    )
}

# Surface missing settings right away; otherwise they only show up later as
# hard-to-read errors (for example calling .title() on a None deployment name).
missing_settings = [name for name, value in env_settings.items() if not value]
if missing_settings:
    warnings.warn(
        "Missing environment variables: "
        + ", ".join(missing_settings)
        + ". Set them in your shell or in a .env file before running the evaluation cells."
    )

# Project coordinates used when results are sent to Azure AI Foundry
azure_ai_project = {
    "subscription_id": env_settings["AZURE_SUBSCRIPTION_ID"],
    "resource_group_name": env_settings["AZURE_RESOURCE_GROUP"],
    "project_name": env_settings["AZURE_AI_FOUNDRY_PROJECT"],
}

# Dictionary-style model configuration passed to the azure.ai.evaluation evaluators
model_config = {
    "api_key": env_settings["AZURE_OPENAI_API_KEY"],
    "azure_endpoint": env_settings["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": env_settings["AZURE_OPENAI_DEPLOYMENT"],
}

# promptflow model configuration passed to the custom FriendlinessEvaluator
configuration = AzureOpenAIModelConfiguration(
    azure_deployment=env_settings["AZURE_OPENAI_DEPLOYMENT"],
    azure_endpoint=env_settings["AZURE_OPENAI_ENDPOINT"],
    api_version=env_settings["AZURE_OPENAI_API_VERSION"],
    api_key=env_settings["AZURE_OPENAI_API_KEY"],
)

credential = DefaultAzureCredential()

In [None]:
# Print the project settings to confirm the environment variables were picked up.
# NOTE(review): the output includes the subscription ID — consider clearing it before sharing the notebook.
print(azure_ai_project)



## Groundedness Evaluator

In [None]:
from azure.ai.evaluation import GroundednessProEvaluator, GroundednessEvaluator

# Initialize the Groundedness evaluator
groundedness_eval = GroundednessEvaluator(model_config, threshold=3)

# Groundedness is judged against the supplied context, so the examples need a
# non-empty context for the score to be meaningful.
sample_query = "Is it allowed to bring bottled water on domestic flights?"
sample_context = (
    "Airline regulations state that passengers are not permitted to bring bottled water "
    "through security checkpoints. Only liquids purchased after the security screening are "
    "allowed on board. This rule is in place to ensure passenger safety and comply with "
    "transportation security guidelines."
)

# Response that agrees with the context
query_response_pass = dict(
    query=sample_query,
    context=sample_context,
    response="No. According to the airline regulations, passengers are not permitted to bring bottled water through security checkpoints. Only liquids purchased after the security screening are allowed on board. This rule is in place to ensure passenger safety and comply with transportation security guidelines."
)

# Response that contradicts the context
query_response_fail = dict(
    query=sample_query,
    context=sample_context,
    response="Yes. You can bring any amount of bottled water on a plane without restrictions. There are no rules about liquids on domestic flights."
)

# Run the Groundedness evaluator on the contradicting example
# (swap in query_response_pass to compare against the agreeing one)
groundedness_score = groundedness_eval(
    **query_response_fail
)
print(groundedness_score)



In [None]:
# Score every record of the dataset with the Groundedness evaluator,
# using each record's ground truth as the response under evaluation.
records = []
with open("data/data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (such as a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        item = json.loads(line)
        result = groundedness_eval(
            query=item["query"],
            context=item.get("context", ""),  # records without a context fall back to an empty string
            response=item["ground_truth"]
        )
        records.append({
            "query": item["query"],
            "response": item["ground_truth"],
            "groundedness": result["groundedness"],
            "groundedness_reason": result["groundedness_reason"],
            "groundedness_result": result["groundedness_result"],
            "groundedness_threshold": result["groundedness_threshold"],
        })

# Load into a DataFrame and display without truncating long text columns
df = pd.DataFrame(records)
pd.set_option('display.max_colwidth', None)
display(df)


## Relevance Evaluator

In [None]:
from azure.ai.evaluation import RelevanceEvaluator

# Initialize the Relevance evaluator
relevance_eval = RelevanceEvaluator(model_config, threshold=3)

# Example whose response cites the regulations
query_response_pass = {
    "query": "Is it allowed to bring bottled water on domestic flights?",
    "context": "",
    "response": "No. According to the airline regulations, passengers are not permitted to bring bottled water through security checkpoints. Only liquids purchased after the security screening are allowed on board. This rule is in place to ensure passenger safety and comply with transportation security guidelines.",
}

# Example whose response claims there are no restrictions
query_response_fail = {
    "query": "Is it allowed to bring bottled water on domestic flights?",
    "context": "",
    "response": "Yes. You can bring any amount of bottled water on a plane without restrictions. There are no rules about liquids on domestic flights.",
}

# Run the Relevance evaluator on the first example
relevance_eval_score = relevance_eval(**query_response_pass)
print(relevance_eval_score)

In [None]:
# Score every record of the dataset with the Relevance evaluator,
# using each record's ground truth as the response under evaluation.
records = []
with open("data/data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (such as a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        item = json.loads(line)
        result = relevance_eval(
            query=item["query"],
            context=item.get("context", ""),  # records without a context fall back to an empty string
            response=item["ground_truth"]
        )
        # Column names mirror the evaluator's own result keys (relevance_*),
        # matching the naming used for the groundedness table.
        records.append({
            "query": item["query"],
            "response": item["ground_truth"],
            "relevance": result["relevance"],
            "relevance_reason": result["relevance_reason"],
            "relevance_result": result["relevance_result"],
            "relevance_threshold": result["relevance_threshold"],
        })

# Load into a DataFrame and display without truncating long text columns
df = pd.DataFrame(records)
pd.set_option('display.max_colwidth', None)
display(df)

## Friendliness Evaluator

In [None]:
from friendliness.friendliness import FriendlinessEvaluator

# Build the custom evaluator from the promptflow model configuration
friendliness_eval = FriendlinessEvaluator(configuration)

# Score a single sample response and show the raw result
sample_response = "I will not apologize for my behavior!"
friendliness_score = friendliness_eval(response=sample_response)
print(friendliness_score)

## Evaluate with both built-in and custom evaluators

In [None]:
import os
import pathlib
from pathlib import Path

from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    FluencyEvaluator,
    SimilarityEvaluator,
)
from model_endpoint import ModelEndpoint

# Disable local snapshot to speed up evaluation
os.environ["PROMPTFLOW_DISABLE_LOCAL_SNAPSHOT"] = "true"

# Built-in evaluators. The content-safety evaluator talks to the Azure AI project,
# so it reuses the credential created in the configuration cell.
content_safety_evaluator = ContentSafetyEvaluator(
    azure_ai_project=azure_ai_project, credential=credential
)
relevance_evaluator = RelevanceEvaluator(model_config)
coherence_evaluator = CoherenceEvaluator(model_config)
groundedness_evaluator = GroundednessEvaluator(model_config)
fluency_evaluator = FluencyEvaluator(model_config)
similarity_evaluator = SimilarityEvaluator(model_config)

# Build the output location once and reuse it, so the pre-created file and the
# path handed to evaluate() are guaranteed to be the same file.
output_file = Path.cwd() / "results.jsonl"
output_path = str(output_file)
print(f"Output path: {output_path}")

# Pre-create the results file to avoid snapshot copy issues
output_file.touch()

# Column mappings shared by several evaluators.
# NOTE(review): every mapping reads `response` from the dataset (${data.response}) even
# though a `target` is supplied — confirm whether the target's output should be mapped instead.
query_response = {"query": "${data.query}", "response": "${data.response}"}
query_response_context = {**query_response, "context": "${data.context}"}
query_response_ground_truth = {**query_response, "ground_truth": "${data.ground_truth}"}

deployment_name = model_config["azure_deployment"]

results = evaluate(
    # Single hyphen between the prefix and the deployment name
    evaluation_name=f"Eval-Run-{deployment_name.title()}",
    data="./data/data_3.jsonl",
    target=ModelEndpoint(model_config),
    evaluators={
        "content_safety": content_safety_evaluator,
        "coherence": coherence_evaluator,
        "relevance": relevance_evaluator,
        "groundedness": groundedness_evaluator,
        "fluency": fluency_evaluator,
        "similarity": similarity_evaluator,
        "friendliness": friendliness_eval,  # custom evaluator
    },
    # Column mapping: each evaluator gets its own copy of the shared mapping
    evaluator_config={
        "content_safety": {"column_mapping": dict(query_response)},
        "coherence": {"column_mapping": dict(query_response)},
        "relevance": {"column_mapping": dict(query_response_context)},
        "groundedness": {"column_mapping": dict(query_response_context)},
        "fluency": {"column_mapping": dict(query_response_context)},
        "similarity": {"column_mapping": dict(query_response_ground_truth)},
        "friendliness": {"column_mapping": dict(query_response_context)},
    },
    # Optionally provide your Azure AI project information to track your evaluation results in your Azure AI project
    azure_ai_project=azure_ai_project,
    output_path=output_path,
)



In [None]:
import json
import pandas as pd
from IPython.display import display

# Load the evaluation output from "results.jsonl". Despite the .jsonl extension,
# the whole file is parsed below as one JSON document.
# The encoding is given explicitly so reading does not depend on the platform default.
with open("results.jsonl", "r", encoding="utf-8") as f:
    jsonl_text = f.read()

data = json.loads(jsonl_text)

# Create a DataFrame using the "rows" key from the JSON data (empty if the key is absent)
df = pd.DataFrame(data.get("rows", []))

# Convert the DataFrame to CSV format as a string
csv_content = df.to_csv(index=False)
# Display the DataFrame as a table in the notebook
display(df)


In [None]:
# Write the results table to results.csv, leaving out the DataFrame index
df.to_csv("results.csv", index=False)