In [None]:
import os

import pandas as pd
from athina.evals import (
    ContextContainsEnoughInformation,
    DoesResponseAnswerQuery,
    Faithfulness,
    FunctionEvaluator,
    RagasAnswerCorrectness,
    RagasAnswerRelevancy,
    RagasAnswerSemanticSimilarity,
    RagasCoherence,
    RagasConciseness,
    RagasContextPrecision,
    RagasContextRecall,
    RagasContextRelevancy,
    RagasFaithfulness,
    RagasHarmfulness,
    RagasMaliciousness,
)
from athina.interfaces.athina import AthinaFilters
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.llms.openai_service import OpenAiService
from athina.loaders import RagasLoader, RagLoader, ResponseLoader
from dotenv import load_dotenv

# Run python-dotenv first so the lookups below can see values it loads,
# then register the OpenAI and Athina API keys read from the environment.
load_dotenv()

OpenAiApiKey.set_key(os.environ.get("OPENAI_API_KEY"))
AthinaApiKey.set_key(os.environ.get("ATHINA_API_KEY"))

In [None]:
# Two records sharing the same query: the first pairs it with an off-topic
# response, the second with a response that addresses the query.
raw_data_ragas = [
    {
        "query": "Where is France and what is it's capital?",
        "contexts": [
            "France is the country in europe known for delicious cuisine",
            "Tesla is an electric car",
            "Elephant is an animal",
        ],
        "response": "Tesla is an electric car",
    },
    {
        "query": "Where is France and what is it's capital?",
        "contexts": [
            "France is the country in europe known for delicious cuisine",
            "Paris is the capital of france",
        ],
        "response": "France is in western Europe and Paris is its capital",
    },
]

# Convert the raw dicts with RagasLoader and preview the result as a table.
dataset_raw_data_ragas = RagasLoader().load_dict(raw_data_ragas)
pd.DataFrame(dataset_raw_data_ragas)

In [None]:
# A single datapoint; its keys are unpacked as keyword arguments to `run`.
data = {
    "query": "Where is France and what is it's capital?",
    "contexts": [
        "France is the country in europe known for delicious cuisine",
        "Tesla is an electric car",
        "Elephant is an animal",
    ],
    "response": "Tesla is an electric car",
}
eval_model = "gpt-3.5-turbo"
(
    RagasAnswerRelevancy(model=eval_model)
    .run(**data)
    .to_df()
)

In [None]:
# Run RagasAnswerRelevancy over the whole Ragas dataset and show the results as a DataFrame.
eval_model = "gpt-3.5-turbo"
(
    RagasAnswerRelevancy(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas)
    .to_df()
)

In [None]:
# Records that additionally carry an "expected_response" field alongside
# the query, contexts and response.
raw_data_ragas_with_expected_response = [
    {
        "query": "Where is France and what is it's capital?",
        "contexts": [
            "France is the country in europe known for delicious cuisine",
            "Tesla is an electric car",
            "Elephant is an animal",
        ],
        "response": "Tesla is an electric car",
        "expected_response": "France is in europe. Paris is it's capital",
    },
    {
        "query": "What is Tesla? Who founded it?",
        "contexts": [
            "Tesla is the electric car company. Tesla is registerd in United States",
            "Elon Musk founded it",
        ],
        "response": "France is in western Europe and Paris is its capital",
        "expected_response": "Tesla is an electric car company. Elon Musk founded it.",
    },
]

# Convert the raw dicts with RagasLoader and preview the result as a table.
dataset_raw_data_ragas_with_expected_response = RagasLoader().load_dict(
    raw_data_ragas_with_expected_response
)
pd.DataFrame(dataset_raw_data_ragas_with_expected_response)

In [None]:
# Run RagasContextPrecision over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasContextPrecision(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasContextRelevancy over the Ragas dataset and show the results as a DataFrame.
eval_model = "gpt-3.5-turbo"
(
    RagasContextRelevancy(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas)
    .to_df()
)

In [None]:
# Run RagasFaithfulness over the Ragas dataset and show the results as a DataFrame.
eval_model = "gpt-3.5-turbo"
(
    RagasFaithfulness(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas)
    .to_df()
)

In [None]:
# Run RagasContextRecall over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasContextRecall(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasAnswerSemanticSimilarity over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasAnswerSemanticSimilarity(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasAnswerCorrectness over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasAnswerCorrectness(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasHarmfulness over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasHarmfulness(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasMaliciousness over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasMaliciousness(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasCoherence over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasCoherence(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Run RagasConciseness over the dataset that includes expected responses.
eval_model = "gpt-3.5-turbo"
(
    RagasConciseness(model=eval_model)
    .run_batch(data=dataset_raw_data_ragas_with_expected_response)
    .to_df()
)

In [None]:
# Build a batch dataset from a list of dicts, each holding a query,
# a single context string and the response to evaluate.
raw_data = [
    {
        "query": "What is the capital of Greece?",
        "context": "Greece is often called the cradle of Western civilization.",
        "response": "Athens",
    },
    {
        "query": "What is the price of a Tesla Model 3?",
        "context": "Tesla Model 3 is a fully electric car.",
        "response": "I cannot answer this question as prices vary from country to country.",
    },
    {
        "query": "What is a shooting star?",
        "context": "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
        "response": "A shooting star is a meteor that burns up in the atmosphere.",
    },
]

# Convert the raw dicts with RagLoader and preview the result as a table.
dataset = RagLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Run ContextContainsEnoughInformation over the current `dataset` and show the results as a DataFrame.
eval_model = "gpt-3.5-turbo"
(
    ContextContainsEnoughInformation(model=eval_model)
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
# Evaluates whether each LLM response sufficiently answers the user's query.
eval_model = "gpt-3.5-turbo"
(
    DoesResponseAnswerQuery(model=eval_model)
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
# Evaluates whether each LLM response stays faithful to the information it was given.
eval_model = "gpt-3.5-turbo"
(
    Faithfulness(model=eval_model)
    .run_batch(data=dataset)
    .to_df()
)

### You can run our function-based evaluators as follows

In [None]:
# Imports
from athina.evals import ContainsAny, Regex
from athina.loaders import ResponseLoader

In [None]:
# Response-only records; no query or context is supplied.
raw_data = [
    {"response": "I cannot answer this question as prices vary from country to country."},
    {"response": "A shooting star is a meteor that burns up in the atmosphere."},
]

# Convert the raw dicts with ResponseLoader and preview the result as a table.
dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Evaluates whether each response contains at least one of the given keywords.
(
    ContainsAny(keywords=["star"])
    .run_batch(data=dataset)
    .to_df()
)


In [None]:
# Response-only records; the second one embeds an email address.
raw_data = [
    {"response": "I cannot answer this question as prices vary from country to country."},
    {"response": "Contact us at hello@athina.ai to get access to our LLM observability platform where you can run the tests you've defined here against your LLM responses in production."},
]

# Convert the raw dicts with ResponseLoader and preview the result as a table.
dataset = ResponseLoader().load_dict(raw_data)
pd.DataFrame(dataset)

In [None]:
# Eval checks if the response matches the regex.
# The pattern is a raw string: "\." is not a valid Python string escape, so a
# plain literal triggers an invalid-escape-sequence warning on recent Python
# versions. The pattern handed to Regex is character-for-character the same.
Regex(regex=r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)').run_batch(data=dataset).to_df()

In [None]:
from athina.evals import ContainsNone

# Sample responses: the first includes the word "keyword", the second does not.
raw_data = [
    {"response": "This text does not contain the specified keyword."},
    {"response": "This is a text without any specified search word."},
]

# Wrap the raw dicts with ResponseLoader, then run ContainsNone on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsNone(keywords=["keyword"])
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import Contains

# Sample responses: only the first includes the string "YC".
raw_data = [
    {"response": "The keyword YC present in this text."},
    {"response": "This text does not contain the specified word."},
]

# Wrap the raw dicts with ResponseLoader, then run Contains on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    Contains(keyword="YC")
    .run_batch(data=dataset)
    .to_df()
)


In [None]:
from athina.evals import ContainsAll

# Sample responses: only the first includes both "keyword1" and "keyword2".
raw_data = [
    {
        "response": "This text contains both keyword1 and keyword2.",
    },
    {
        "response": "This text does not contain all specified keywords.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run ContainsAll on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsAll(keywords=["keyword1", "keyword2"])
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import ContainsJson

# Sample responses: a well-formed JSON object and a malformed one
# (the second is missing a closing quote after the key).
raw_data = [
    {
        "response": '{"key": "value"}',
    },
    {
        "response": '{"invalid : "json"}',
    },
]

# Wrap the raw dicts with ResponseLoader, then run ContainsJson on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsJson()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import ContainsEmail

# Sample responses: only the first includes an email address.
raw_data = [
    {
        "response": "Contact us at contact@example.com.",
    },
    {
        "response": "This text does not contain any email address.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run ContainsEmail on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsEmail()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import IsJson

# Sample responses: a JSON object string and a bare word that is not JSON.
raw_data = [
    {
        "response": '{"key": "value"}',
    },
    {
        "response": 'invalid_json',
    },
]

# Wrap the raw dicts with ResponseLoader, then run IsJson on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    IsJson()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import IsEmail

# Sample responses: an email address and a string without an "@" part.
raw_data = [
    {
        "response": "john.doe@example.com",
    },
    {
        "response": "invalid.email",
    },
]

# Wrap the raw dicts with ResponseLoader, then run IsEmail on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    IsEmail()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import ContainsLink

# Sample responses: only the first includes a URL.
raw_data = [
    {
        "response": "For more information, visit https://example.com.",
    },
    {
        "response": "This text does not contain any link.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run ContainsLink on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsLink()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import ContainsValidLink

# Sample responses: two that include a URL and one with no URL at all.
# NOTE(review): exampleasdf.com is presumably meant as an unreachable link - confirm.
raw_data = [
    {
        "response": "Visit our official website at http://example.com.",
    },
    {
        "response": "Visit our official website at https://exampleasdf.com",
    },
    {
        "response": "This text does not contain any valid link.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run ContainsValidLink on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    ContainsValidLink()
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import NoInvalidLinks

# Example data: two responses that include a URL and one with no URL at all.
# NOTE(review): exampleasdf.com is presumably meant as an unreachable link - confirm.
raw_data = [
    {"response": "Visit our website at https://example.com."},
    {"response": "Visit our official website at https://exampleasdf.com"},
    {"response": "This text does not contain any valid link."},
]

# Load data into dataset
dataset = ResponseLoader().load_dict(raw_data)

# Run the evaluator once. Only the final expression of a cell is displayed,
# so a second identical call would just repeat the evaluation.
NoInvalidLinks().run_batch(data=dataset).to_df()


In [None]:
from athina.evals import ApiCall

# A single sample response to hand to the API-backed evaluator.
raw_data = [
    {
        "response": "Response to be sent to the your own API based evaluator",
    },
]

# Wrap the raw dicts with ResponseLoader, then run ApiCall with the target URL,
# an extra payload dict and request headers (placeholder bearer token).
dataset = ResponseLoader().load_dict(raw_data)
(
    ApiCall(
        url="https://8e714940905f4022b43267e348b8a71.api.mockbin.io/",
        payload={"evaluator": "custom_api_based_evaluator"},
        headers={"Authorization": "Bearer token"},
    )
    .run_batch(data=dataset)
    .to_df()
)


In [None]:
from athina.evals import Equals

# Sample responses: the first is identical to the expected string, the second is not.
raw_data = [
    {
        "response": "This is the expected response",
    },
    {
        "response": "This is an unexpected response",
    },
]

# Wrap the raw dicts with ResponseLoader, then run Equals on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    Equals(expected_response="This is the expected response")
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import StartsWith

# Sample responses: only the first begins with "The text starts with".
raw_data = [
    {
        "response": "The text starts with this substring.",
    },
    {
        "response": "This text does not start with the specified substring.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run StartsWith on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    StartsWith(substring="The text starts with")
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import EndsWith

# Sample responses: only the first finishes with "with this substring.".
raw_data = [
    {
        "response": "The text ends with this substring.",
    },
    {
        "response": "This text does not end with the specified substring.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run EndsWith on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    EndsWith(substring="with this substring.")
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import LengthLessThan

# Sample responses: a 10-character string and a 22-character string.
raw_data = [
    {
        "response": "Short text",
    },
    {
        "response": "This is a longer text.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run LengthLessThan on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    LengthLessThan(max_length=20)
    .run_batch(data=dataset)
    .to_df()
)

In [None]:
from athina.evals import LengthGreaterThan

# Sample responses: a 10-character string and a 22-character string.
raw_data = [
    {
        "response": "Short text",
    },
    {
        "response": "This is a longer text.",
    },
]

# Wrap the raw dicts with ResponseLoader, then run LengthGreaterThan on the batch.
dataset = ResponseLoader().load_dict(raw_data)
(
    LengthGreaterThan(min_length=20)
    .run_batch(data=dataset)
    .to_df()
)