In [1]:
import os
from athina.evals import DoesResponseAnswerQuery, ContextContainsEnoughInformation, Faithfulness, CustomGrader
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from athina.runner.run import EvalRunner
from athina.datasets import yc_query_mini
from athina.interfaces.athina import AthinaExperiment
import pandas as pd

# Register each provider's API key from its environment variable
# (os.getenv yields None when the variable is not set).
for key_holder, env_var in (
    (OpenAiApiKey, 'OPENAI_API_KEY'),
    (AthinaApiKey, 'ATHINA_API_KEY'),
):
    key_holder.set_key(os.getenv(env_var))

In [2]:
# Build the batch dataset from the sample data (a list of dict objects)
raw_data = yc_query_mini.data

rag_loader = RagLoader()
dataset = rag_loader.load_dict(raw_data)

# Final expression: render the loaded rows as a table
pd.DataFrame(data=dataset)

Unnamed: 0,query,context,response
0,What are some successful companies that went t...,Y Combinator has invested in companies in vari...,"Airbnb, Dropbox, Stripe, Reddit, Coinbase, Ins..."
1,In which city is YC located?,"Y Combinator is located in Mountain View, Cali...",Y Combinator is located in San Francisco
2,How much equity does YC take?,Y Combinator invests $500k in 200 startups twi...,YC invests $150k for 7%.
3,How much equity does YC take?,Y Combinator invests $500k in 200 startups twi...,I cannot answer this question as I do not have...
4,Who founded YC and when was it founded?,Y Combinator was founded in March 2005 by Paul...,Y Combinator was founded in 2005
5,Does Y Combinator invest in startups outside t...,Y Combinator invests in startups from all over...,"Yes, Y Combinator invests in international sta..."
6,How much does YC invest in startups?,YC invests $150k for 7%.,$150k
7,What is YC's motto?,Y Combinator's motto is 'Make something people...,Make something people want


In [3]:
# Model identifier passed to every evaluator below
eval_model = "gpt-4-1106-preview"

# Experiment metadata handed to the runner alongside the evals
experiment = AthinaExperiment(
    experiment_name="yc-question-chatbot",
    experiment_description="YC Q&A with RAG model and few-shot examples",
    language_model_provider="openai",
    language_model_id="gpt-3.5-turbo",
    prompt_template=[],
    dataset_name="yc_dataset_mini",
)

# Evaluators that only need the model, instantiated in a fixed order
eval_suite = [
    evaluator_cls(model=eval_model)
    for evaluator_cls in (
        DoesResponseAnswerQuery,
        Faithfulness,
        ContextContainsEnoughInformation,
    )
]
# Custom grader driven by a plain-language grading criterion
eval_suite.append(
    CustomGrader(
        model=eval_model,
        grading_criteria="If response contains profanity, then fail. Otherwise, pass.",
    )
)

# Execute every evaluator against the dataset
batch_eval_result = EvalRunner.run_suite(
    evals=eval_suite,
    data=dataset,
    experiment=experiment,
    max_parallel_evals=10,
)

# Final expression: show the batch result as a table
pd.DataFrame(batch_eval_result)

Unnamed: 0,results,total_runtime,passed_evals,failed_evals,total_evals,total_datapoints
data,"{'query': 'What is YC's motto?', 'context': 'Y...",22131,8,0,8,8
display_name,Custom,22131,8,0,8,8
failure,False,22131,8,0,8,8
metric,"{'id': 'failed', 'value': 0.0}",22131,8,0,8,8
model,gpt-4-1106-preview,22131,8,0,8,8
name,Custom,22131,8,0,8,8
reason,The response 'Make something people want' does...,22131,8,0,8,8
runtime,2836,22131,8,0,8,8
