In [1]:
import os
from athina.llms.openai_service import OpenAiService
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, RagasContextRelevancy, RagasAnswerRelevancy
from athina.evals import FunctionEvaluator
from athina.loaders import RagLoader, ResponseLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.interfaces.athina import AthinaFilters
import pandas as pd
from athina.llms.openai_service import OpenAiService

# Pull both API keys from the environment (os.getenv yields None when a
# variable is unset) and hand them to the key holders from athina.keys,
# so that no secret is hardcoded in the notebook.
openai_key = os.getenv('OPENAI_API_KEY')
OpenAiApiKey.set_key(openai_key)

athina_key = os.getenv('ATHINA_API_KEY')
AthinaApiKey.set_key(athina_key)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a small batch dataset from plain dict records.
# Every record carries the same three fields: query, context and response.
raw_data_one = [
    dict(
        query="What is the capital of Greece?",
        context="Greece is often called the cradle of Western civilization.",
        response="Athens",
    ),
    dict(
        query="What is the price of a Tesla Model 3?",
        context="Tesla Model 3 is a fully electric car.",
        response="I cannot answer this question as prices vary from country to country.",
    ),
    dict(
        query="What is a shooting star?",
        context="Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
        response="A shooting star is a meteor that burns up in the atmosphere.",
    ),
]

# Load the records through RagLoader, then render them as a DataFrame.
dataset_one = RagLoader().load_dict(raw_data_one)
pd.DataFrame(dataset_one)

Unnamed: 0,query,context,response
0,What is the capital of Greece?,Greece is often called the cradle of Western c...,Athens
1,What is the price of a Tesla Model 3?,Tesla Model 3 is a fully electric car.,I cannot answer this question as prices vary f...
2,What is a shooting star?,Black holes are stars that have collapsed unde...,A shooting star is a meteor that burns up in t...


In [3]:
# Run the Ragas context-relevancy evaluator over every record of dataset_one
# and show the per-record results as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_relevancy_eval = RagasContextRelevancy(model=eval_model)
context_relevancy_results = context_relevancy_eval.run_batch(data=dataset_one)
context_relevancy_results.to_df()

evaluating with [context_relevancy]


100%|██████████| 1/1 [00:00<00:00,  1.30it/s]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:00<00:00,  1.93it/s]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:00<00:00,  1.99it/s]


Unnamed: 0,query,context,response,display_name,failed,grade_reason,runtime,model,ragas_context_relevancy
0,What is the capital of Greece?,Greece is often called the cradle of Western civilization.,Athens,Context Relevancy,,This metric is calulated by dividing the number of sentences in context that are relevant for answering the given query by the total number of sentences in the retrieved context,1792,gpt-3.5-turbo,0.0
1,What is the price of a Tesla Model 3?,Tesla Model 3 is a fully electric car.,I cannot answer this question as prices vary from country to country.,Context Relevancy,,This metric is calulated by dividing the number of sentences in context that are relevant for answering the given query by the total number of sentences in the retrieved context,1317,gpt-3.5-turbo,0.0
2,What is a shooting star?,"Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",A shooting star is a meteor that burns up in the atmosphere.,Context Relevancy,,This metric is calulated by dividing the number of sentences in context that are relevant for answering the given query by the total number of sentences in the retrieved context,844,gpt-3.5-turbo,0.0


In [4]:
# Two records that share one query and one context and differ only in the
# response, so the evaluator output can be compared between the two answers.
france_query = "Where is France and what is it's capital?"
france_context = "France is the country in europe known for delicious cuisine. Paris is the capital of france"

raw_data_two = [
    {
        "query": france_query,
        "context": france_context,
        "response": france_response,
    }
    for france_response in (
        "Tesla is an electric car",
        "France is in western Europe and Paris is its capital",
    )
]

# Load the records through RagLoader, then render them as a DataFrame.
dataset_two = RagLoader().load_dict(raw_data_two)
pd.DataFrame(dataset_two)

Unnamed: 0,query,context,response
0,Where is France and what is it's capital?,France is the country in europe known for delicious cuisine. Paris is the capital of france,Tesla is an electric car
1,Where is France and what is it's capital?,France is the country in europe known for delicious cuisine. Paris is the capital of france,France is in western Europe and Paris is its capital


In [5]:
# Run the Ragas answer-relevancy evaluator over every record of dataset_two
# and show the per-record results as a DataFrame.
eval_model = "gpt-3.5-turbo"
answer_relevancy_eval = RagasAnswerRelevancy(model=eval_model)
answer_relevancy_results = answer_relevancy_eval.run_batch(data=dataset_two)
answer_relevancy_results.to_df()

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:02<00:00,  2.53s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:02<00:00,  2.09s/it]


Unnamed: 0,query,context,response,display_name,failed,grade_reason,runtime,model,ragas_answer_relevancy
0,Where is France and what is it's capital?,France is the country in europe known for delicious cuisine. Paris is the capital of france,Tesla is an electric car,Answer Relevancy,,"A response is deemed relevant when it directly and appropriately addresses the original query. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the response lacks completeness or contains redundant details",3168,gpt-3.5-turbo,0.749017
1,Where is France and what is it's capital?,France is the country in europe known for delicious cuisine. Paris is the capital of france,France is in western Europe and Paris is its capital,Answer Relevancy,,"A response is deemed relevant when it directly and appropriately addresses the original query. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the response lacks completeness or contains redundant details",2473,gpt-3.5-turbo,0.975475


In [None]:
# Run the ContextContainsEnoughInformation evaluator over dataset_one
# and show the per-record results as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_sufficiency_eval = ContextContainsEnoughInformation(model=eval_model)
context_sufficiency_eval.run_batch(data=dataset_one).to_df()

In [None]:
# Checks whether each LLM response sufficiently answers its user query.
eval_model = "gpt-3.5-turbo"
answers_query_eval = DoesResponseAnswerQuery(model=eval_model)
answers_query_eval.run_batch(data=dataset_one).to_df()

In [None]:
# Checks whether the LLM response is faithful to the information it was given.
# Unlike the batch cells, this evaluates a single record: its fields are
# unpacked into run() as keyword arguments, and context is a list of passages.
eval_model = "gpt-3.5-turbo"
data = dict(
    query="What is the capital of Greece?",
    context=[
        "Greece is often called the cradle of Western civilization.",
        "Greece is the ancient birthplace of the Olympic Games.",
    ],
    response="Athens",
)
faithfulness_eval = Faithfulness(model=eval_model)
faithfulness_eval.run(**data).to_df()

In [None]:
# Checks if the context contains enough information to answer the user query provided
eval_model = "gpt-3.5-turbo"
# Actually run the evaluator described above so the cell produces a result
# instead of only assigning eval_model.
ContextContainsEnoughInformation(model=eval_model).run_batch(data=dataset_one).to_df()

In [None]:
# Custom evaluator: plain-language grading criteria.
# The criteria fail any response that mentions black holes and pass all others.
# NOTE(review): no cell visible here consumes grading_criteria — confirm which
# evaluator is meant to receive it.
grading_criteria = (
    "If the response mentions black holes, then fail. "
    "Otherwise pass."
)

In [None]:
# Records whose context is given as a list of passages rather than one string.
raw_data = [
    {
        "query": "What is the capital of Greece?",
        "context": ["Greece is often called the cradle of Western civilization.", "Greece is the ancient birthplace of the Olympic Games."],
        "response": "Athens",
    },
    {
        "query": "What is the price of a Tesla Model 3?",
        "context": ["Tesla Model 3 is a fully electric car."],
        "response": "I cannot answer this question as prices vary from country to country.",
    },
    {
        "query": "What is a shooting star?",
        "context": ["Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light."],
        "response": "A shooting star is a meteor that burns up in the atmosphere.",
    }
]
dataset = ResponseLoader().load_dict(raw_data)

# Select the built-in "ContainsAny" function by name and pass it the keyword list.
functionEvaluator = FunctionEvaluator(
    function_name="ContainsAny",
    function_arguments={"keywords": ["tesla", "animal", "star"]},
)

# Convert the batch result to a DataFrame so this cell renders a table,
# consistent with every other evaluator cell in the notebook.
functionEvaluator.run_batch(data=dataset).to_df()


### You can run our function-based evaluators as follows

In [None]:
# Imports
from athina.evals import ContainsAny, Regex
from athina.loaders import ResponseLoader

In [None]:
# Load a response-only dataset: each record has just a "response" field.
responses = [
    "I cannot answer this question as prices vary from country to country.",
    "A shooting star is a meteor that burns up in the atmosphere.",
]
raw_data = [{"response": text} for text in responses]

dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Eval checks whether each response contains any of the given keywords.
contains_star_eval = ContainsAny(keywords=["star"])
contains_star_eval.run_batch(data=dataset).to_df()

In [None]:
# Load a response-only dataset; the second response contains an email address.
responses = [
    "I cannot answer this question as prices vary from country to country.",
    "Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production.",
]
raw_data = [{"response": text} for text in responses]

dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Eval checks if the response matches the regex (an email-address pattern).
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal dot; in a normal string literal `\.` is an invalid escape
# sequence that newer Python versions warn about.
email_pattern = r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
Regex(regex=email_pattern).run_batch(data=dataset).to_df()