In [1]:
import os
from athina.evals import (
    DoesResponseAnswerQuery,
    ContextContainsEnoughInformation,
    Faithfulness,
    RagasContextRelevancy,
    RagasAnswerRelevancy,
    RagasContextPrecision,
    RagasFaithfulness,
    RagasContextRecall,
    RagasAnswerSemanticSimilarity,
    RagasAnswerCorrectness,
    RagasHarmfulness,
    RagasMaliciousness,
    RagasCoherence,
    RagasConciseness
)
from athina.loaders import Loader
from athina.keys import AthinaApiKey, OpenAiApiKey
import pandas as pd


from dotenv import load_dotenv
# Load environment variables through python-dotenv before reading the keys.
load_dotenv()

# Register both API keys with the athina SDK. `os.environ.get` yields None for
# a variable that is not set, and that None is handed to `set_key` unchanged.
for key_holder, env_name in (
    (OpenAiApiKey, 'OPENAI_API_KEY'),
    (AthinaApiKey, 'ATHINA_API_KEY'),
):
    key_holder.set_key(os.environ.get(env_name))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Two hand-written RAG samples: each one pairs a user query with the retrieved
# context chunks and the response that was generated for it.
tesla_sample = dict(
    query="Who founded Tesla",
    context=[
        "Tesla is an automative manufacturer.",
        "Tesla was founded by Elon Musk in 2003 and is headquartered in Palo Alto, California.",
        "Tesla makes electric cars.",
    ],
    response="Tesla is an electric car company",
)
france_sample = dict(
    query="Where is France and what is it's capital?",
    context=[
        "France is the country in europe known for delicious cuisine",
        "Paris is the capital of france",
    ],
    response="France is in western Europe and Paris is its capital",
)
raw_data = [tesla_sample, france_sample]

# Convert the plain dicts with the athina Loader and preview the result as a
# DataFrame (the last expression is what the notebook displays).
sample_loader = Loader()
dataset_raw_data = sample_loader.load_dict(raw_data)
pd.DataFrame(dataset_raw_data)

Unnamed: 0,query,context,response,expected_response
0,Who founded Tesla,"[Tesla is an automative manufacturer., Tesla w...",Tesla is an electric car company,
1,Where is France and what is it's capital?,[France is the country in europe known for del...,France is in western Europe and Paris is its c...,


In [3]:
# Ragas answer relevancy: scores how directly each response addresses its
# query. Every row of the small dataset is graded with the model named below.
eval_model = "gpt-3.5-turbo"
answer_relevancy_eval = RagasAnswerRelevancy(model=eval_model)
answer_relevancy_eval.run_batch(data=dataset_raw_data).to_df()

evaluating with [answer_relevancy]
evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
100%|██████████| 1/1 [00:01<00:00,  1.44s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_answer_relevancy
0,Who founded Tesla,"[Tesla is an automative manufacturer., Tesla was founded by Elon Musk in 2003 and is headquartered in Palo Alto, California., Tesla makes electric cars.]",Tesla is an electric car company,,Ragas Answer Relevancy,,"A response is deemed relevant when it directly and appropriately addresses the original query. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the response lacks completeness or contains redundant details",1743,gpt-3.5-turbo,0.820244
1,Where is France and what is it's capital?,"[France is the country in europe known for delicious cuisine, Paris is the capital of france]",France is in western Europe and Paris is its capital,,Ragas Answer Relevancy,,"A response is deemed relevant when it directly and appropriately addresses the original query. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the response lacks completeness or contains redundant details",1642,gpt-3.5-turbo,0.975397


In [4]:
# A single datapoint, evaluated on its own with `run` rather than `run_batch`.
france_context = [
    "France is a country in Europe known for delicious cuisine",
    "The capital of France is Paris.",
    "French fries were not invented in France.",
]
data = {
    "query": "Where is France and what is its capital?",
    "context": france_context,
    "response": "Paris is the capital of France",
}
eval_model = "gpt-3.5-turbo"

# `**data` unpacks the dict into the keyword arguments query=, context= and
# response= of `run`.
single_relevancy_eval = RagasAnswerRelevancy(model=eval_model)
single_relevancy_eval.run(**data).to_df()

evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


An error occurred while posting eval results 'str' object has no attribute 'get'


Unnamed: 0,query,context,response,display_name,failed,grade_reason,runtime,model,ragas_answer_relevancy
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,Ragas Answer Relevancy,,"A response is deemed relevant when it directly and appropriately addresses the original query. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the response lacks completeness or contains redundant details",1483,gpt-3.5-turbo,0.946617


In [5]:
# Samples that also carry an `expected_response` (the reference answer), used
# by the Ragas evaluators in the cells that follow.
france_with_reference = dict(
    query="Where is France and what is its capital?",
    context=[
        "France is a country in Europe known for delicious cuisine",
        "The capital of France is Paris.",
        "French fries were not invented in France.",
    ],
    response="Paris is the capital of France",
    expected_response="France is in europe. Paris is it's capital",
)
tesla_with_reference = dict(
    query="What is Tesla? Who founded it?",
    context=[
        "Tesla is an electric car company.",
        "Tesla is registered in United States",
        "Elon Musk founded Tesla",
    ],
    response="Tesla is an electric car company",
    expected_response="Tesla is an electric car company, founded by Elon Musk.",
)
raw_data_ragas_with_expected_response = [france_with_reference, tesla_with_reference]

# Convert with the athina Loader and preview the dataset as a DataFrame.
reference_loader = Loader()
ragas_dataset_with_expected_response = reference_loader.load_dict(
    raw_data_ragas_with_expected_response
)
pd.DataFrame(ragas_dataset_with_expected_response)

Unnamed: 0,query,context,response,expected_response
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk."


In [6]:
# Ragas context precision: checks whether the relevant context chunks are
# ranked ahead of the irrelevant ones.
eval_model = "gpt-3.5-turbo"
context_precision_eval = RagasContextPrecision(model=eval_model)
context_precision_eval.run_batch(data=ragas_dataset_with_expected_response).to_df()

evaluating with [context_precision]
evaluating with [context_precision]


100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_context_precision
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Context Precision,,This metric evaluates whether all of the ground-truth relevant items present in the context are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks,1711,gpt-3.5-turbo,0.5
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Context Precision,,This metric evaluates whether all of the ground-truth relevant items present in the context are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks,1673,gpt-3.5-turbo,0.333333


In [7]:
# Ragas context relevancy: share of the retrieved context sentences that are
# relevant for answering the query.
eval_model = "gpt-3.5-turbo"
context_relevancy_result = RagasContextRelevancy(model=eval_model).run_batch(
    data=ragas_dataset_with_expected_response
)
context_relevancy_result.to_df()

evaluating with [context_relevancy]
evaluating with [context_relevancy]


100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_context_relevancy
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Context Relevancy,,This metric is calulated by dividing the number of sentences in context that are relevant for answering the given query by the total number of sentences in the retrieved context,1321,gpt-3.5-turbo,0.666667
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Context Relevancy,,This metric is calulated by dividing the number of sentences in context that are relevant for answering the given query by the total number of sentences in the retrieved context,873,gpt-3.5-turbo,0.333333


In [8]:
# Ragas faithfulness: a response counts as faithful when every claim it makes
# can be inferred from the supplied context.
eval_model = "gpt-3.5-turbo"
ragas_faithfulness_eval = RagasFaithfulness(model=eval_model)
ragas_faithfulness_eval.run_batch(data=ragas_dataset_with_expected_response).to_df()

evaluating with [faithfulness]
evaluating with [faithfulness]


100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_faithfulness
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Faithfulness,,The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. To calculate this a set of claims from the generated answer is first identified. Then each one of these claims are cross checked with given context to determine if it can be inferred from given context or not,3235,gpt-3.5-turbo,1.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Faithfulness,,The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. To calculate this a set of claims from the generated answer is first identified. Then each one of these claims are cross checked with given context to determine if it can be inferred from given context or not,3223,gpt-3.5-turbo,1.0


In [9]:
# Ragas context recall: share of the reference-answer sentences that can be
# attributed to the retrieved context.
eval_model = "gpt-3.5-turbo"
context_recall_result = RagasContextRecall(model=eval_model).run_batch(
    data=ragas_dataset_with_expected_response
)
context_recall_result.to_df()

evaluating with [context_recall]
evaluating with [context_recall]


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_context_recall
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Context Recall,,Context Recall metric is calculated by dividing the number of sentences in the ground truth that can be attributed to retrieved context by the total number of sentences in the grouund truth,1907,gpt-3.5-turbo,1.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Context Recall,,Context Recall metric is calculated by dividing the number of sentences in the ground truth that can be attributed to retrieved context by the total number of sentences in the grouund truth,1906,gpt-3.5-turbo,1.0


In [10]:
# Ragas answer semantic similarity: how closely the generated response
# resembles the reference answer in meaning (scores fall between 0 and 1).
eval_model = "gpt-3.5-turbo"
semantic_similarity_eval = RagasAnswerSemanticSimilarity(model=eval_model)
semantic_similarity_eval.run_batch(data=ragas_dataset_with_expected_response).to_df()

evaluating with [answer_similarity]evaluating with [answer_similarity]



100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
100%|██████████| 1/1 [00:00<00:00,  3.29it/s]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_answer_semantic_similarity
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Answer Semantic Similarity,,"Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated response and the ground truth. This evaluation is based on the ground truth and the response, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated response and the ground truth",583,gpt-3.5-turbo,0.937311
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Answer Semantic Similarity,,"Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated response and the ground truth. This evaluation is based on the ground truth and the response, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated response and the ground truth",574,gpt-3.5-turbo,0.955073


In [11]:
# Ragas answer correctness: a weighted blend of semantic similarity and
# factual similarity between the response and the reference answer.
eval_model = "gpt-3.5-turbo"
answer_correctness_result = RagasAnswerCorrectness(model=eval_model).run_batch(
    data=ragas_dataset_with_expected_response
)
answer_correctness_result.to_df()

evaluating with [answer_correctness]evaluating with [answer_correctness]



100%|██████████| 1/1 [00:01<00:00,  1.52s/it]
100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_answer_correctness
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Answer Correctness,,"Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score",2003,gpt-3.5-turbo,0.734328
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Answer Correctness,,"Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score",1814,gpt-3.5-turbo,0.738768


In [12]:
# Ragas harmfulness: potential of the response to cause harm to individuals,
# groups, or society at large.
eval_model = "gpt-3.5-turbo"
harmfulness_eval = RagasHarmfulness(model=eval_model)
harmfulness_eval.run_batch(data=ragas_dataset_with_expected_response).to_df()

evaluating with [harmfulness]


  0%|          | 0/1 [00:00<?, ?it/s]

evaluating with [harmfulness]


100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
100%|██████████| 1/1 [00:01<00:00,  1.36s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_harmfulness
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Harmfulness,,"This is calculated by how much potential generated response has to cause harm to individuals, groups, or society at large",1652,gpt-3.5-turbo,0.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Harmfulness,,"This is calculated by how much potential generated response has to cause harm to individuals, groups, or society at large",1622,gpt-3.5-turbo,0.0


In [13]:
# Ragas maliciousness: potential of the response to harm, deceive, or exploit
# users.
eval_model = "gpt-3.5-turbo"
maliciousness_result = RagasMaliciousness(model=eval_model).run_batch(
    data=ragas_dataset_with_expected_response
)
maliciousness_result.to_df()

evaluating with [maliciousness]
evaluating with [maliciousness]


100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_maliciousness
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Maliciousness,,"This is calculated by how much potential generated response has to harm, deceive, or exploit users",1166,gpt-3.5-turbo,0.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Maliciousness,,"This is calculated by how much potential generated response has to harm, deceive, or exploit users",1580,gpt-3.5-turbo,0.0


In [14]:
# Ragas coherence: whether the response presents its ideas in a logical and
# organized manner.
eval_model = "gpt-3.5-turbo"
coherence_eval = RagasCoherence(model=eval_model)
coherence_eval.run_batch(data=ragas_dataset_with_expected_response).to_df()

evaluating with [coherence]
evaluating with [coherence]


100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_coherence
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Coherence,,"This is calculated by how coherent is the generated llm response and how able it is able to present ideas, information, or arguments in a logical and organized manner",1599,gpt-3.5-turbo,1.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Coherence,,"This is calculated by how coherent is the generated llm response and how able it is able to present ideas, information, or arguments in a logical and organized manner",1316,gpt-3.5-turbo,1.0


In [15]:
# Ragas conciseness: whether the response conveys its information without
# unnecessary or redundant details.
eval_model = "gpt-3.5-turbo"
conciseness_result = RagasConciseness(model=eval_model).run_batch(
    data=ragas_dataset_with_expected_response
)
conciseness_result.to_df()

evaluating with [conciseness]
evaluating with [conciseness]


100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,ragas_conciseness
0,Where is France and what is its capital?,"[France is a country in Europe known for delicious cuisine, The capital of France is Paris., French fries were not invented in France.]",Paris is the capital of France,France is in europe. Paris is it's capital,Ragas Conciseness,,"This is calculated by how efficiently generated llm response conveys information or ideas clearly and efficiently, without unnecessary or redundant details",1149,gpt-3.5-turbo,1.0
1,What is Tesla? Who founded it?,"[Tesla is an electric car company., Tesla is registered in United States, Elon Musk founded Tesla]",Tesla is an electric car company,"Tesla is an electric car company, founded by Elon Musk.",Ragas Conciseness,,"This is calculated by how efficiently generated llm response conveys information or ideas clearly and efficiently, without unnecessary or redundant details",1092,gpt-3.5-turbo,1.0


In [16]:
# Build a batch dataset from a list of plain dicts (query / context / response).
# NOTE(review): in the cells visible here, `dataset` is reassigned by the
# following load_athina_inferences cell before any evaluator reads it, so these
# three samples appear to go unused. Confirm whether that is intended.
greece_sample = dict(
    query="What is the capital of Greece?",
    context=["Greece is often called the cradle of Western civilization."],
    response="Athens",
)
tesla_price_sample = dict(
    query="What is the price of a Tesla Model 3?",
    context=["Tesla Model 3 is a fully electric car."],
    response="I cannot answer this question as prices vary from country to country.",
)
shooting_star_sample = dict(
    query="What is a shooting star?",
    context=["Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light."],
    response="A shooting star is a meteor that burns up in the atmosphere.",
)
raw_data = [greece_sample, tesla_price_sample, shooting_star_sample]

batch_loader = Loader()
dataset = batch_loader.load_dict(raw_data)
pd.DataFrame(dataset)

Unnamed: 0,query,context,response,expected_response
0,What is the capital of Greece?,[Greece is often called the cradle of Western civilization.],Athens,
1,What is the price of a Tesla Model 3?,[Tesla Model 3 is a fully electric car.],I cannot answer this question as prices vary from country to country.,
2,What is a shooting star?,"[Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]",A shooting star is a meteor that burns up in the atmosphere.,


In [20]:
# Replace `dataset` with up to five inferences loaded through the athina Loader
# (presumably fetched from the Athina account behind the configured API key;
# confirm), then preview them as a DataFrame.
athina_loader = Loader()
dataset = athina_loader.load_athina_inferences(limit=5)
pd.DataFrame(data=dataset)

Unnamed: 0,query,context,response,expected_response
0,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
1,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
2,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
3,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
4,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,


In [22]:
# NOTE(review): this cell issues the same load_athina_inferences(limit=5) call
# as the cell before it, and the execution counts jump (16 -> 20 -> 22). It
# looks like a leftover re-run; consider deleting one of the two cells and
# re-running the notebook top to bottom.
inference_loader = Loader()
dataset = inference_loader.load_athina_inferences(limit=5)
pd.DataFrame(dataset)

Unnamed: 0,query,context,response,expected_response
0,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
1,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
2,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
3,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,
4,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,


In [23]:
# Checks whether the retrieved context holds enough information to answer the
# user query; each row of the result frame carries a pass/fail verdict.
eval_model = "gpt-3.5-turbo"
context_sufficiency_eval = ContextContainsEnoughInformation(model=eval_model)
context_sufficiency_eval.run_batch(data=dataset).to_df()

Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,passed
0,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Context Contains Enough Information,False,"The context provided contains sufficient information about India, including its official name, location, population ranking, and political system. Therefore, the chatbot should be able to generate a tweet about India based on this information.",1208,gpt-3.5-turbo,1.0
1,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Context Contains Enough Information,False,"The context provided contains relevant information about India, including its official name, location, population, and political system. This information is sufficient for the chatbot to generate a tweet about India.",944,gpt-3.5-turbo,1.0
2,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Context Contains Enough Information,False,"The context provided contains relevant information about India, including its official name, location, population, and political system. This information is sufficient for the chatbot to generate a tweet about India.",889,gpt-3.5-turbo,1.0
3,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Context Contains Enough Information,False,"The context provided contains relevant information about India, including its official name, location, population, and political system. This information is sufficient for the chatbot to generate a tweet about India.",1078,gpt-3.5-turbo,1.0
4,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Context Contains Enough Information,False,"The context provided contains sufficient information about India, including its official name, location, size, population, and political system. This information can be used to generate a tweet about India.",961,gpt-3.5-turbo,1.0


In [24]:
# Does the LLM response sufficiently answer the user query?
# This cell grades with gpt-4-turbo-preview rather than gpt-3.5-turbo.
eval_model = "gpt-4-turbo-preview"
answers_query_result = DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset)
answers_query_result.to_df()

Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,passed
0,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Does Response Answer Query,True,"The response 'test response' does not answer the user's query about writing a tweet about India. It does not provide any information or content related to India, thus failing to cover any aspect of the user's query.",3844,gpt-4-turbo-preview,0.0
1,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Does Response Answer Query,True,"The response 'test response' does not answer the user's query sufficiently. It does not provide any information or content related to India that could be used in a tweet, thus failing to cover the aspects of the user's query.",2658,gpt-4-turbo-preview,0.0
2,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Does Response Answer Query,True,"The response 'test response' does not answer the user's query about writing a tweet about India. It does not provide any content related to India, nor does it attempt to form a tweet as requested by the user.",3334,gpt-4-turbo-preview,0.0
3,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Does Response Answer Query,True,"The response 'test response' does not answer the user's query about writing a tweet about India. It does not provide any information or content related to India, nor does it attempt to form a tweet as requested by the user.",4502,gpt-4-turbo-preview,0.0
4,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Does Response Answer Query,True,"The response 'test response' does not answer the user's query sufficiently. It does not provide any information about India, nor does it format the content in the style of a tweet, which was specifically requested by the user.",4125,gpt-4-turbo-preview,0.0


In [25]:
# Is the LLM response faithful to the context that was provided to it?
eval_model = "gpt-3.5-turbo"
faithfulness_eval = Faithfulness(model=eval_model)
faithfulness_eval.run_batch(data=dataset).to_df()

Unnamed: 0,query,context,response,expected_response,display_name,failed,grade_reason,runtime,model,passed
0,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Faithfulness,True,The response 'test response' cannot be inferred from the provided context. The context does not provide any information that would lead to the response 'test response'.,1064,gpt-3.5-turbo,0.0
1,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Faithfulness,True,The response 'test response' cannot be inferred from the provided context. The context does not provide any information that would lead to the response 'test response'.,1199,gpt-3.5-turbo,0.0
2,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Faithfulness,True,The response 'test response' cannot be inferred from the provided context. The context does not provide any information that would lead to the response 'test response'.,967,gpt-3.5-turbo,0.0
3,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Faithfulness,True,The response 'test response' cannot be inferred from the provided context. The context does not provide any information that would lead to the response 'test response'.,861,gpt-3.5-turbo,0.0
4,Write a tweet about India,"[{'information': ""India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.""}]",test response,,Faithfulness,True,The response 'test response' cannot be inferred from the provided context. The context does not provide any information that would lead to the response 'test response'.,3243,gpt-3.5-turbo,0.0


### You can run our function-based evaluators as follows:

In [26]:
# Imports
from athina.evals import ContainsAny, Regex
from athina.loaders import ResponseLoader

In [27]:
# Build one {"response": ...} record per sample answer.
raw_data = [
    {"response": text}
    for text in (
        "I cannot answer this question as prices vary from country to country.",
        "A shooting star is a meteor that burns up in the atmosphere.",
    )
]

# Load the records with ResponseLoader; `dataset` is consumed by the next evaluator cell.
dataset = ResponseLoader().load_dict(raw_data)

# Preview the loaded rows as a DataFrame.
pd.DataFrame(dataset)

Unnamed: 0,response
0,I cannot answer this question as prices vary from country to country.
1,A shooting star is a meteor that burns up in the atmosphere.


In [28]:
# ContainsAny: a row passes when at least one of the given keywords is found in the response.
(
    ContainsAny(keywords=["star"])
    .run_batch(data=dataset)
    .to_df()
)


Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,I cannot answer this question as prices vary from country to country.,ContainsAny,True,No keywords found in output,0,,0.0
1,A shooting star is a meteor that burns up in the atmosphere.,ContainsAny,False,One or more keywords were found in output: star,0,,1.0


In [29]:
# Sample responses: only the second one includes an email address.
raw_data = [
    dict(response="I cannot answer this question as prices vary from country to country."),
    dict(response="Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production."),
]

# Load the records with ResponseLoader; `dataset` is consumed by the Regex cell below.
dataset = ResponseLoader().load_dict(raw_data)

# Preview the loaded rows as a DataFrame.
pd.DataFrame(dataset)

Unnamed: 0,response
0,I cannot answer this question as prices vary from country to country.
1,Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production.


In [30]:
# Eval checks if the response matches the regex.
# The pattern is written as a raw string so that `\.` reaches the regex engine
# unchanged; in a normal string literal `\.` is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
Regex(regex=r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)').run_batch(data=dataset).to_df()

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,I cannot answer this question as prices vary from country to country.,Regex,True,regex pattern ([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+) not found in output,0,,0.0
1,Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production.,Regex,False,regex pattern ([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+) found in output,0,,1.0


In [31]:
from athina.evals import ContainsNone

# Sample responses: only the first one mentions the word "keyword".
raw_data = [
    {"response": "This text does not contain the specified keyword."},
    {"response": "This is a text without any specified search word."},
]

# Load the records, then run ContainsNone with a single keyword.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsNone(keywords=["keyword"])
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,This text does not contain the specified keyword.,ContainsNone,True,One or more keywords were found in output: keyword,0,,0.0
1,This is a text without any specified search word.,ContainsNone,False,No keywords found in output,0,,1.0


In [32]:
from athina.evals import Contains

# Sample responses: the first one includes the literal "YC", the second does not.
raw_data = [
    {"response": "The keyword YC present in this text."},
    {"response": "This text does not contain the specified word."},
]

# Load the records, then run Contains with the single keyword "YC".
dataset = ResponseLoader().load_dict(raw_data)
(
    Contains(keyword="YC")
    .run_batch(data=dataset)
    .to_df()
)


Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,The keyword YC present in this text.,Contains,False,keyword yc found in output,0,,1.0
1,This text does not contain the specified word.,Contains,True,keyword not found in output: yc,0,,0.0


In [33]:
from athina.evals import ContainsAll

# Sample responses: the first one mentions both keywords, the second mentions neither.
raw_data = [
    {
        "response": "This text contains both keyword1 and keyword2.",
    },
    {
        "response": "This text does not contain all specified keywords.",
    },
]

# Load the records, then run ContainsAll with both keywords.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsAll(keywords=["keyword1", "keyword2"])
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,This text contains both keyword1 and keyword2.,ContainsAll,False,2/2 keywords found in output,0,,1.0
1,This text does not contain all specified keywords.,ContainsAll,True,"keywords not found in output: keyword1, keyword2",0,,0.0


In [34]:
from athina.evals import ContainsJson

# Sample responses: a well-formed JSON object, and one with a missing closing quote on the key.
raw_data = [
    {"response": text}
    for text in (
        '{"key": "value"}',
        '{"invalid : "json"}',
    )
]

# Load the records, then run ContainsJson (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsJson()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,"{""key"": ""value""}",ContainsJson,False,Output contains JSON,0,,1.0
1,"{""invalid : ""json""}",ContainsJson,True,Output contains a potential JSON but it is invalid,0,,0.0


In [35]:
from athina.evals import ContainsEmail

# Sample responses: the first one embeds an email address, the second has none.
raw_data = [
    {
        "response": "Contact us at contact@example.com.",
    },
    {
        "response": "This text does not contain any email address.",
    },
]

# Load the records, then run ContainsEmail (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsEmail()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Contact us at contact@example.com.,ContainsEmail,False,regex pattern [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+ found in output,0,,1.0
1,This text does not contain any email address.,ContainsEmail,True,regex pattern [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+ not found in output,0,,0.0


In [36]:
from athina.evals import IsJson

# Sample responses: a JSON object string and a plain non-JSON token.
raw_data = [
    {"response": text}
    for text in (
        '{"key": "value"}',
        'invalid_json',
    )
]

# Load the records, then run IsJson (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    IsJson()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,"{""key"": ""value""}",IsJson,False,Output contains JSON,0,,1.0
1,invalid_json,IsJson,True,Output does not contain JSON,0,,0.0


In [37]:
from athina.evals import IsEmail

# Sample responses: one string shaped like an email address, one without an "@".
raw_data = [
    {
        "response": "john.doe@example.com",
    },
    {
        "response": "invalid.email",
    },
]

# Load the records, then run IsEmail (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    IsEmail()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,john.doe@example.com,IsEmail,False,regex pattern ^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$ found in output,0,,1.0
1,invalid.email,IsEmail,True,regex pattern ^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$ not found in output,0,,0.0


In [38]:
from athina.evals import ContainsLink

# Sample responses: the first one includes an https URL, the second has no URL.
raw_data = [
    {
        "response": "For more information, visit https://example.com.",
    },
    {
        "response": "This text does not contain any link.",
    },
]

# Load the records, then run ContainsLink (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsLink()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,"For more information, visit https://example.com.",ContainsLink,False,Link found in output,0,,1.0
1,This text does not contain any link.,ContainsLink,True,No link found in output,0,,0.0


In [39]:
from athina.evals import ContainsValidLink

# Sample responses: two that include a URL (example.com and exampleasdf.com) and one with no URL.
raw_data = [
    {"response": text}
    for text in (
        "Visit our official website at http://example.com.",
        "Visit our official website at https://exampleasdf.com",
        "This text does not contain any valid link.",
    )
]

# Load the records, then run ContainsValidLink (no arguments) over them.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsValidLink()
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Visit our official website at http://example.com.,ContainsValidLink,False,link http://example.com. found in output and is valid,34,,1.0
1,Visit our official website at https://exampleasdf.com,ContainsValidLink,True,link https://exampleasdf.com found in output but is invalid,18,,0.0
2,This text does not contain any valid link.,ContainsValidLink,True,no link found in output,0,,0.0


In [40]:
from athina.evals import NoInvalidLinks

# Example data: two responses that include a URL and one with no URL at all
raw_data = [
    {"response": "Visit our website at https://example.com."},
    {"response": "Visit our official website at https://exampleasdf.com"},
    {"response": "This text does not contain any valid link."},
]

# Load data into dataset
dataset = ResponseLoader().load_dict(raw_data)

# Run the evaluator exactly once. The cell previously issued the same call twice
# and discarded the first result, so the batch (and whatever link checks it
# performs) was repeated for no benefit; only the last expression is displayed.
NoInvalidLinks().run_batch(data=dataset).to_df()


Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Visit our website at https://example.com.,NoInvalidLinks,False,link https://example.com. found in output and is valid,37,,1.0
1,Visit our official website at https://exampleasdf.com,NoInvalidLinks,True,link https://exampleasdf.com found in output but is invalid,3,,0.0
2,This text does not contain any valid link.,NoInvalidLinks,False,no invalid link found in output,0,,1.0


In [41]:
from athina.evals import ApiCall

# A single sample response for the API-based evaluator.
raw_data = [
    {
        "response": "Response to be sent to the your own API based evaluator",
    },
]

# Load the record with ResponseLoader.
dataset = ResponseLoader().load_dict(raw_data)

# Run ApiCall with a URL, a payload and an Authorization header.
# NOTE(review): the URL and "Bearer token" look like placeholders — presumably
# meant to be replaced with a real evaluator endpoint and credentials; confirm.
ApiCall(
    url="https://8e714940905f4022b43267e348b8a71.api.mockbin.io/",
    payload={"evaluator": "custom_api_based_evaluator"},
    headers={"Authorization": "Bearer token"},
).run_batch(data=dataset).to_df()


Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Response to be sent to the your own API based evaluator,ApiCall,True,Bad Request: The server could not understand the request due to invalid syntax.,355,,0.0


In [42]:
from athina.evals import Equals

# Sample responses: the first is identical to the expected text, the second differs.
raw_data = [
    {
        "response": "This is the expected response",
    },
    {
        "response": "This is an unexpected response",
    },
]

# Load the records, then compare each response against the expected string.
dataset = ResponseLoader().load_dict(raw_data)
(
    Equals(expected_response="This is the expected response")
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,This is the expected response,Equals,False,✅ output exactly matches expected response,0,,1.0
1,This is an unexpected response,Equals,True,output does not exactly match expected response,0,,0.0


In [43]:
from athina.evals import StartsWith

# Sample responses: only the first one begins with "The text starts with".
raw_data = [
    {"response": text}
    for text in (
        "The text starts with this substring.",
        "This text does not start with the specified substring.",
    )
]

# Load the records, then run StartsWith with that leading substring.
dataset = ResponseLoader().load_dict(raw_data)
(
    StartsWith(substring="The text starts with")
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,The text starts with this substring.,StartsWith,False,output starts with the text starts with,0,,1.0
1,This text does not start with the specified substring.,StartsWith,True,output does not start with the text starts with,0,,0.0


In [44]:
from athina.evals import EndsWith

# Sample responses: only the first one finishes with "with this substring.".
raw_data = [
    {"response": text}
    for text in (
        "The text ends with this substring.",
        "This text does not end with the specified substring.",
    )
]

# Load the records, then run EndsWith with that trailing substring.
dataset = ResponseLoader().load_dict(raw_data)
(
    EndsWith(substring="with this substring.")
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,The text ends with this substring.,EndsWith,False,output ends with with this substring.,0,,1.0
1,This text does not end with the specified substring.,EndsWith,True,output does not end with with this substring.,0,,0.0


In [45]:
from athina.evals import LengthLessThan

# Sample responses: a 10-character string and a 22-character string.
raw_data = [
    {
        "response": "Short text",
    },
    {
        "response": "This is a longer text.",
    },
]

# Load the records, then run LengthLessThan with a limit of 20.
dataset = ResponseLoader().load_dict(raw_data)
(
    LengthLessThan(max_length=20)
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Short text,LengthLessThan,False,output length is less than 20 characters,0,,1.0
1,This is a longer text.,LengthLessThan,True,output length is greater than 20 characters,0,,0.0


In [46]:
from athina.evals import LengthGreaterThan

# Sample responses: a 10-character string and a 22-character string.
raw_data = [
    {"response": text}
    for text in (
        "Short text",
        "This is a longer text.",
    )
]

# Load the records, then run LengthGreaterThan with a minimum of 20.
dataset = ResponseLoader().load_dict(raw_data)
(
    LengthGreaterThan(min_length=20)
    .run_batch(data=dataset)
    .to_df()
)

Unnamed: 0,response,display_name,failed,grade_reason,runtime,model,passed
0,Short text,LengthGreaterThan,True,output length is less than 20 characters,0,,0.0
1,This is a longer text.,LengthGreaterThan,False,output length is greater than 20 characters,0,,1.0
