In [None]:
import os
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.runner.run import EvalRunner
from athina.datasets import yc_query_mini
from athina.interfaces.athina import AthinaExperiment
import pandas as pd

# Hand the OpenAI and Athina credentials to the library. Both values are read
# from environment variables so no secret is stored in the notebook; an unset
# variable yields None, which is passed to set_key unchanged.
OpenAiApiKey.set_key(os.environ.get('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.environ.get('ATHINA_API_KEY'))

In [None]:
# Build the evaluation dataset from the sample records bundled with athina.
# `raw_data` is whatever `yc_query_mini.data` exposes (described upstream as a
# list of dict objects); RagLoader.load_dict turns it into `dataset`.
raw_data = yc_query_mini.data

rag_loader = RagLoader()
dataset = rag_loader.load_dict(raw_data)

# Last expression of the cell: render the loaded dataset as a table.
pd.DataFrame(data=dataset)

In [None]:
# Configure the experiment and the evaluators, then run them over `dataset`.

# Model handed to every evaluator below.
eval_model = "gpt-4-1106-preview"

# Experiment metadata passed to the runner alongside the evals.
# NOTE(review): language_model_id presumably names the model that produced the
# responses under evaluation (as opposed to `eval_model`) -- confirm.
experiment = AthinaExperiment(
    experiment_name="yc-question-chatbot",
    experiment_description="YC Q&A with RAG model and few-shot examples",
    language_model_provider="openai",
    language_model_id="gpt-3.5-turbo",
    prompt_template=[],
    dataset_name="yc_dataset_mini",
)

# One instance of each evaluator, all sharing the same model. The order here is
# the order in which they are given to the runner.
eval_suite = [
    evaluator_cls(model=eval_model)
    for evaluator_cls in (
        DoesResponseAnswerQuery,
        Faithfulness,
        ContextContainsEnoughInformation,
    )
]

# Execute every evaluator in the suite against the loaded dataset.
# NOTE(review): max_parallel_evals presumably caps concurrent eval calls --
# confirm against the athina documentation.
batch_eval_result = EvalRunner.run_suite(
    evals=eval_suite,
    data=dataset,
    experiment=experiment,
    max_parallel_evals=10,
)

# Last expression of the cell: show the results as a table.
pd.DataFrame(batch_eval_result)