In [1]:
import os
from athina.llms.openai_service import OpenAiService
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, CustomGrader
from athina.evals.llm.custom.evaluator import CustomLlmEval
from athina.evals.llm.faithfulness.examples import FAITHFULNESS_EVAL_EXAMPLES
from athina.keys import AthinaApiKey, OpenAiApiKey

# Read the OpenAI key from the environment so that no secret is hardcoded in the notebook.
# os.getenv returns None when the variable is unset, so validate it here: a missing key
# then surfaces in this setup cell with a clear message instead of being passed on as None.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
OpenAiApiKey.set_key(openai_api_key)
# Athina key registration is left disabled.
# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

In [2]:
# The question posed by the user
query = "What is a shooting star?"

# Retrieved context supplied alongside the query; this passage directly defines a shooting star
context = (
    "A shooting star is a small piece of space debris, such as a rock or dust particle, "
    "that enters Earth's atmosphere and burns up due to friction with the air. "
    "This creates a streak of light in the sky, which is commonly referred to as a shooting star."
)

# Example inference call to OpenAI (kept commented out; it is not executed in this cell)
# model = "gpt-3.5-turbo"
# prompt = [
#     {
#         "role": "system",
#         "content": f"Use the information provided to you to answer the user's question. Information: {context}"
#     },
#     {
#         "role": "user",
#         "content": query
#     }
# ]
# openai_service = OpenAiService()
# response = openai_service.chat_completion(prompt, model=model)

# Hardcoded response used in place of the commented-out inference call above
response = (
    "A shooting star is a small piece of space debris, "
    "such as a rock or dust particle, that enters Earth's atmosphere "
    "and burns up due to friction with the air. "
    "This creates a streak of light in the sky, "
    "which is commonly referred to as a shooting star."
)

In [3]:
# Evaluation settings: model name handed to evaluators through their `model=` argument
eval_model: str = "gpt-3.5-turbo"

In [4]:
# Answer-relevance check: does the LLM response sufficiently answer the user query?
answer_relevance_eval = DoesResponseAnswerQuery()
answer_relevance_eval.run(query=query, response=response)

{'eval_request_id': [],
 'eval_results': [{'name': 'Draq',
   'display_name': 'Does Response Answer Query',
   'data': {'query': 'What is a shooting star?',
    'response': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star."},
   'failure': False,
   'reason': "The response answers the user's query sufficiently. It provides a clear and concise explanation of what a shooting star is, including the fact that it is a small piece of space debris that enters Earth's atmosphere and burns up due to friction with the air, creating a streak of light in the sky.",
   'runtime': 3048,
   'model': 'gpt-3.5-turbo',
   'metric': {'id': 'failed', 'value': 0.0}}]}

In [5]:
# Faithfulness check: is the LLM response faithful to the context it was given?
faithfulness_eval = Faithfulness()
faithfulness_eval.run(context=context, response=response)

{'eval_request_id': [],
 'eval_results': [{'name': 'Irftc',
   'display_name': 'Faithfulness',
   'data': {'context': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star.",
    'response': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star."},
   'failure': False,
   'reason': 'The response can be directly inferred from the context as it is an exact repetition of the information provided. The context defines what a shooting star is and the response repeats that definition verbatim.',
   'runtime': 3171,
   'model': 'gpt-4-1106-preview',
   'metric': {'id': 'failed', 'value': 0.0}}]}

In [6]:
# Context-sufficiency check: does the context hold enough information to answer the query?
# NOTE(review): the saved output for this cell reports model 'gpt-4-1106-preview' while
# eval_model is "gpt-3.5-turbo" — the output may be stale; re-run to confirm.
context_sufficiency_eval = ContextContainsEnoughInformation(model=eval_model)
context_sufficiency_eval.run(context=context, query=query)

{'eval_request_id': [],
 'eval_results': [{'name': 'Ccei',
   'display_name': 'Context Contains Enough Information',
   'data': {'context': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star.",
    'query': 'What is a shooting star?'},
   'failure': False,
   'reason': "The context provided contains sufficient information to answer the user's query about what a shooting star is. It explains that a shooting star is not actually a star, but rather a small piece of space debris that burns up upon entering Earth's atmosphere, resulting in a visible streak of light in the sky. This directly addresses the user's question.",
   'runtime': 15258,
   'model': 'gpt-4-1106-preview',
   'metric': {'id': 'failed', 'value': 0.0}}]}

In [7]:
# Custom evaluator graded against a free-text criterion:
# the response fails if it mentions black holes and passes otherwise
grading_criteria = "If the response mentions black holes, then fail. Otherwise pass."
black_hole_grader = CustomGrader(grading_criteria=grading_criteria)
black_hole_grader.run(context=context, query=query, response=response)

{'eval_request_id': [],
 'eval_results': [{'name': 'Custom',
   'display_name': 'Custom',
   'data': {'context': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star.",
    'query': 'What is a shooting star?',
    'response': "A shooting star is a small piece of space debris, such as a rock or dust particle, that enters Earth's atmosphere and burns up due to friction with the air. This creates a streak of light in the sky, which is commonly referred to as a shooting star."},
   'failure': False,
   'reason': 'The response does not mention black holes and therefore passes according to the grading criteria.',
   'runtime': 2796,
   'model': 'gpt-4-1106-preview',
   'metric': {'id': 'failed', 'value': 0.0}}]}