In [None]:
import os
from athina.llms.openai_service import OpenAiService
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, LlmEvaluator
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.interfaces.athina import AthinaFilters

# Hand each credential from the environment to its key holder.
# os.environ.get yields None for a variable that is not set, and that value
# is passed to set_key unchanged.
for key_holder, env_var in (
    (OpenAiApiKey, 'OPENAI_API_KEY'),
    (AthinaApiKey, 'ATHINA_API_KEY'),
):
    key_holder.set_key(os.environ.get(env_var))

In [None]:
# Build the batch dataset from a list of dict objects.
# Each row below is one example, in the order given by `field_names`.
# NOTE(review): the contexts do not all contain the answer (the third one is
# about black holes, not shooting stars) — presumably deliberate, so the
# evaluators in later cells have something to flag; confirm.
field_names = ("query", "context", "response")
example_rows = [
    (
        "What is the capital of Greece?",
        "Greece is often called the cradle of Western civilization.",
        "Athens",
    ),
    (
        "What is the price of a Tesla Model 3?",
        "Tesla Model 3 is a fully electric car.",
        "I cannot answer this question as prices vary from country to country.",
    ),
    (
        "What is a shooting star?",
        "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
        "A shooting star is a meteor that burns up in the atmosphere.",
    ),
]
raw_data = [dict(zip(field_names, row)) for row in example_rows]

rag_loader = RagLoader()
dataset = rag_loader.load_dict(raw_data)

In [None]:
# Model names. `model` is referenced only by the commented-out inference
# example below; `eval_model` is the model handed to the evaluators.
model, eval_model = "gpt-3.5-turbo", "gpt-4-1106-preview"

# Optional inference step (disabled): uncomment to overwrite each row's
# 'response' with a fresh completion from OpenAI instead of using the
# hand-written responses already in the dataset.
# openai_service = OpenAiService()
# for row in dataset:
#     messages = [
#         {
#             "role": "system",
#             "content": f"Use the information provided to you to answer the user's question. Information: {row.get('context')}"
#         },
#         {
#             "role": "user",
#             "content": row['query']
#         }
#     ]
#     row['response'] = openai_service.chat_completion(messages, model=model)

# Show the dataset that the evaluators will receive.
print(dataset)

In [None]:
# DoesResponseAnswerQuery: does the LLM response sufficiently answer the
# user's query?
# run_batch is kept as the cell's final expression so the notebook displays
# whatever it returns.
answer_evaluator = DoesResponseAnswerQuery(model=eval_model)
answer_evaluator.run_batch(data=dataset)

In [None]:
# Faithfulness: is the LLM response faithful to the information (context)
# it was given?
# run_batch is kept as the cell's final expression so the notebook displays
# whatever it returns.
faithfulness_evaluator = Faithfulness(model=eval_model)
faithfulness_evaluator.run_batch(data=dataset)

In [None]:
# ContextContainsEnoughInformation: does the supplied context hold enough
# information to answer the user's query?
# run_batch is kept as the cell's final expression so the notebook displays
# whatever it returns.
context_evaluator = ContextContainsEnoughInformation(model=eval_model)
context_evaluator.run_batch(data=dataset)

In [None]:
# Custom evaluator (disabled — uncomment the two lines below to run it)
# Fails any response that mentions black holes; every other response passes
# grading_criteria="If the response mentions black holes, then fail. Otherwise pass."
# LlmEvaluator(model=eval_model, grading_criteria=grading_criteria).run_batch(data=dataset)