In [1]:
import os
from athina.llms.openai_service import OpenAiService
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, RagasContextRelevancy, RagasAnswerRelevancy, RagasContextPrecision, RagasFaithfulness, RagasContextRecall, RagasAnswerSemanticSimilarity
from athina.evals import FunctionEvaluator
from athina.loaders import RagLoader, ResponseLoader, RagasLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.interfaces.athina import AthinaFilters
import pandas as pd
from athina.llms.openai_service import OpenAiService

# Hand each API key, read from its environment variable, to the matching
# athina key holder. A missing variable is passed through as None.
for key_holder, env_var in (
    (OpenAiApiKey, 'OPENAI_API_KEY'),
    (AthinaApiKey, 'ATHINA_API_KEY'),
):
    key_holder.set_key(os.environ.get(env_var))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Two sample records that share one query and one context sentence; only the
# remaining contexts and the response differ between them.
france_query = "Where is France and what is it's capital?"
france_context = "France is the country in europe known for delicious cuisine"

raw_data_ragas = [
    dict(
        query=france_query,
        contexts=[france_context, "Tesla is an electric car", "Elephant is an animal"],
        response="Tesla is an electric car",
    ),
    dict(
        query=france_query,
        contexts=[france_context, "Paris is the capital of france"],
        response="France is in western Europe and Paris is its capital",
    ),
]

# Load the records through RagasLoader and preview the result as a DataFrame.
dataset_raw_data_ragas = RagasLoader().load_dict(raw_data_ragas)
pd.DataFrame(dataset_raw_data_ragas)

In [None]:
# Run the RagasAnswerRelevancy evaluator over the Ragas dataset and display
# the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
answer_relevancy_eval = RagasAnswerRelevancy(model=eval_model)
answer_relevancy_eval.run_batch(data=dataset_raw_data_ragas).to_df()

In [2]:
# Sample records that also carry an "expected_response" field alongside the
# query, contexts and response.
france_record = dict(
    query="Where is France and what is it's capital?",
    contexts=[
        "France is the country in europe known for delicious cuisine",
        "Tesla is an electric car",
        "Elephant is an animal",
    ],
    response="Tesla is an electric car",
    expected_response="France is in europe. Paris is it's capital",
)
tesla_record = dict(
    query="What is Tesla? Who founded it?",
    contexts=[
        "Tesla is the electric car company. Tesla is registerd in United States",
        "Elon Musk founded it",
    ],
    response="France is in western Europe and Paris is its capital",
    expected_response="Tesla is an electric car company. Elon Musk founded it.",
)
raw_data_ragas_with_expected_response = [france_record, tesla_record]

# Load the records through RagasLoader and preview the result as a DataFrame.
dataset_raw_data_ragas_with_expected_response = RagasLoader().load_dict(
    raw_data_ragas_with_expected_response
)
pd.DataFrame(dataset_raw_data_ragas_with_expected_response)

Unnamed: 0,query,contexts,response,expected_response
0,Where is France and what is it's capital?,[France is the country in europe known for del...,Tesla is an electric car,France is in europe. Paris is it's capital
1,What is Tesla? Who founded it?,[Tesla is the electric car company. Tesla is r...,France is in western Europe and Paris is its c...,Tesla is an electric car company. Elon Musk fo...


In [None]:
# Run the RagasContextPrecision evaluator over the dataset that includes
# expected responses and display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_precision_eval = RagasContextPrecision(model=eval_model)
context_precision_eval.run_batch(
    data=dataset_raw_data_ragas_with_expected_response
).to_df()

In [None]:
# Run the RagasContextRelevancy evaluator over the Ragas dataset and display
# the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_relevancy_eval = RagasContextRelevancy(model=eval_model)
context_relevancy_eval.run_batch(data=dataset_raw_data_ragas).to_df()

In [None]:
# Run the RagasFaithfulness evaluator over the Ragas dataset and display the
# batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
ragas_faithfulness_eval = RagasFaithfulness(model=eval_model)
ragas_faithfulness_eval.run_batch(data=dataset_raw_data_ragas).to_df()

In [None]:
# Run the RagasContextRecall evaluator over the dataset that includes expected
# responses and display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_recall_eval = RagasContextRecall(model=eval_model)
context_recall_eval.run_batch(
    data=dataset_raw_data_ragas_with_expected_response
).to_df()

In [3]:
# Run the RagasAnswerSemanticSimilarity evaluator over the dataset that
# includes expected responses and display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
semantic_similarity_eval = RagasAnswerSemanticSimilarity(model=eval_model)
semantic_similarity_eval.run_batch(
    data=dataset_raw_data_ragas_with_expected_response
).to_df()

evaluating with [answer_similarity]


100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


evaluating with [answer_similarity]


100%|██████████| 1/1 [00:00<00:00,  1.28it/s]


Unnamed: 0,query,contexts,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_answer_semantic_similarity
0,Where is France and what is it's capital?,"[France is the country in europe known for delicious cuisine, Tesla is an electric car, Elephant is an animal]",Tesla is an electric car,France is in europe. Paris is it's capital,Context Precision,,"Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated response and the ground truth. This evaluation is based on the ground truth and the response, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated response and the ground truth.",2672,gpt-3.5-turbo,0.744801
1,What is Tesla? Who founded it?,"[Tesla is the electric car company. Tesla is registerd in United States, Elon Musk founded it]",France is in western Europe and Paris is its capital,Tesla is an electric car company. Elon Musk founded it.,Context Precision,,"Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated response and the ground truth. This evaluation is based on the ground truth and the response, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated response and the ground truth.",1602,gpt-3.5-turbo,0.737235


In [None]:
# Create batch dataset from list of dict objects.
# Each record carries the user query, the retrieved context and the LLM response.
raw_data_two = [
    {
        "query": "What is the capital of Greece?",
        "context": "Greece is often called the cradle of Western civilization.",
        "response": "Athens",
    },
    {
        "query": "What is the price of a Tesla Model 3?",
        "context": "Tesla Model 3 is a fully electric car.",
        "response": "I cannot answer this question as prices vary from country to country.",
    },
    {
        "query": "What is a shooting star?",
        "context": "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
        "response": "A shooting star is a meteor that burns up in the atmosphere.",
    }
]

# Load the list defined above. The original cell passed `raw_data_one`, a name
# that is never defined in this notebook, so it raised NameError on a fresh
# kernel (Restart & Run All). `dataset_one` keeps its name because the
# evaluator cells below read it.
dataset_one = RagLoader().load_dict(raw_data_two)
pd.DataFrame(dataset_one)

In [None]:
# Run the ContextContainsEnoughInformation evaluator over dataset_one and
# display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
context_sufficiency_eval = ContextContainsEnoughInformation(model=eval_model)
context_sufficiency_eval.run_batch(data=dataset_one).to_df()

In [None]:
# Evaluate whether each LLM response sufficiently answers the user's query,
# then display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
answers_query_eval = DoesResponseAnswerQuery(model=eval_model)
answers_query_eval.run_batch(data=dataset_one).to_df()

In [None]:
# Evaluate whether each LLM response stays faithful to the information it was
# given, then display the batch result as a DataFrame.
eval_model = "gpt-3.5-turbo"
faithfulness_eval = Faithfulness(model=eval_model)
faithfulness_eval.run_batch(data=dataset_one).to_df()

### You can run our function-based evaluators as follows

In [None]:
# Imports
from athina.evals import ContainsAny, Regex
from athina.loaders import ResponseLoader

In [None]:
# Build a response-only dataset: one {"response": ...} record per sample text.
sample_responses = [
    "I cannot answer this question as prices vary from country to country.",
    "A shooting star is a meteor that burns up in the atmosphere.",
]
raw_data = [{"response": text} for text in sample_responses]

# Load the records through ResponseLoader and preview them as a DataFrame.
dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Evaluate whether each response contains any of the listed keywords, then
# display the batch result as a DataFrame.
contains_keyword_eval = ContainsAny(keywords=["star"])
contains_keyword_eval.run_batch(data=dataset).to_df()

In [None]:
# Build a response-only dataset: one {"response": ...} record per sample text.
# The second sample contains an email address.
sample_responses = [
    "I cannot answer this question as prices vary from country to country.",
    "Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production.",
]
raw_data = [{"response": text} for text in sample_responses]

# Load the records through ResponseLoader and preview them as a DataFrame.
dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Eval checks if the response matches the regex (an email-address-shaped
# pattern). The pattern is a raw string so that `\.` reaches the regex engine
# as an escaped dot without relying on Python passing an unrecognised string
# escape through unchanged, which emits a warning on recent Python versions.
# The pattern value is identical to the original.
Regex(regex=r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)').run_batch(data=dataset).to_df()