In [1]:
import os
from athina.llms.openai_service import OpenAiService
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, CustomGrader
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.interfaces.athina import AthinaFilters

# Register the API keys from environment variables instead of hardcoding them.
# os.getenv returns None when a variable is unset, so a missing key is passed
# through as None here rather than raising at this point.
# NOTE(review): how set_key treats None is not visible here — confirm.
OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

In [2]:
# Create batch dataset from list of dict objects.
# Every record has the same three string fields: the user "query", the
# "context" supplied alongside it, and the "response" being evaluated.
raw_data = [
    # Context mentions Greece but never names its capital.
    {
        "query": "What is the capital of Greece?",
        "context": "Greece is often called the cradle of Western civilization.",
        "response": "Athens",
    },
    # Response declines to give a price; context only says the car is electric.
    {
        "query": "What is the price of a Tesla Model 3?",
        "context": "Tesla Model 3 is a fully electric car.",
        "response": "I cannot answer this question as prices vary from country to country.",
    },
    # Context is about black holes, while the query and response are about shooting stars.
    {
        "query": "What is a shooting star?",
        "context": "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
        "response": "A shooting star is a meteor that burns up in the atmosphere.",
    }
]

# Load the raw dicts through RagLoader; `dataset` is what every evaluator
# cell below receives as `data=`.
dataset = RagLoader().load_dict(raw_data)

In [3]:
# Model names used in this notebook (no inference call is made in this cell).
# NOTE(review): `model` is not referenced by any cell visible here — presumably
# it is a placeholder for your own inference call; confirm or remove.
model = "gpt-3.5-turbo"
# Model passed to every evaluator below via `model=eval_model`.
eval_model = "gpt-4-1106-preview"

# Show the loaded dataset.
# NOTE(review): the saved output lists two records although raw_data defines
# three — likely stale; re-run the notebook to refresh it.
print(dataset)

[{'query': 'What is the capital of Greece?', 'context': 'Greece is often called the cradle of Western civilization.', 'response': 'Athens'}, {'query': 'What is the price of a Tesla Model 3?', 'context': 'Tesla Model 3 is a fully electric car.', 'response': 'I cannot answer this question as prices vary from country to country.'}]


In [4]:
# Evaluator: does each LLM response sufficiently answer its user query?
answers_query_eval = DoesResponseAnswerQuery(model=eval_model)
# Run over the whole dataset; the returned list is displayed as the cell output.
answers_query_eval.run_batch(data=dataset)

[{'name': 'DoesResponseAnswerQuery',
  'data': {'query': 'What is the capital of Greece?',
   'context': 'Greece is often called the cradle of Western civilization.',
   'response': 'Athens'},
  'failure': False,
  'reason': "The response 'Athens' directly answers the user's query about the capital of Greece. It is specific and covers the aspect of the user's query that asks for the name of the capital.",
  'runtime': 3801,
  'model': 'gpt-4-1106-preview'},
 {'name': 'DoesResponseAnswerQuery',
  'data': {'query': 'What is the price of a Tesla Model 3?',
   'context': 'Tesla Model 3 is a fully electric car.',
   'response': 'I cannot answer this question as prices vary from country to country.'},
  'failure': True,
  'reason': "The response does not answer the user's query sufficiently. It does not provide any specific price information or a range for the Tesla Model 3, nor does it offer to provide a source where the user could find the price. Instead, it only states that prices vary, w

In [5]:
# Evaluator: is each LLM response faithful to the information provided to it?
faithfulness_eval = Faithfulness(model=eval_model)
# Run over the whole dataset; the result is displayed as the cell output.
faithfulness_eval.run_batch(data=dataset)

In [6]:
# Evaluator: does the context contain enough information to answer the user query?
context_sufficiency_eval = ContextContainsEnoughInformation(model=eval_model)
# Run over the whole dataset; the result is displayed as the cell output.
context_sufficiency_eval.run_batch(data=dataset)

In [7]:
# Custom evaluator: grade each record against a plain-language criterion.
# Here a response fails if it mentions black holes and passes otherwise.
grading_criteria = "If the response mentions black holes, then fail. Otherwise pass."
custom_grader = CustomGrader(model=eval_model, grading_criteria=grading_criteria)
# Run over the whole dataset; the result is displayed as the cell output.
custom_grader.run_batch(data=dataset)