In [1]:
import os
from athina.evals import CustomGrader
from athina.loaders import ResponseLoader
from athina.keys import OpenAiApiKey

# Read the key from the environment and fail fast with a clear message when it
# is missing; os.getenv returns None for an unset variable, which would
# otherwise be passed to set_key without any warning.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")
OpenAiApiKey.set_key(openai_api_key)

### Initialize your dataset

The [`ResponseLoader`](https://github.com/athina-ai/athina-evals/blob/main/athina/loaders/response_loader.py) class is used to load your dataset. 

This loader ensures that the data contains a "response" field and is in the correct format for the `LlmEvaluator` class.

In [2]:
# Build the raw batch: one dict per LLM response, keyed by "response"
raw_data = [
    {"response": text}
    for text in (
        "I'm sorry but I can't help you with that query",
        "I don't know the answer to that question",
    )
]

# Load the list of dicts through ResponseLoader to obtain the dataset to evaluate
loader = ResponseLoader()
dataset = loader.load_dict(raw_data)

### Configure and Run Evaluator

The easiest way to configure a custom evaluator is to use our [`CustomGrader`](https://github.com/athina-ai/athina-evals/blob/main/athina/evals/llm/custom_grader/evaluator.py) class.

This evaluator simply takes grading criteria in the following format:

```
If X, then fail. Otherwise, pass.
```

Optionally, you can also specify which model you would like to use for grading.

In [3]:
# Grade every response in the dataset against the criteria below:
# a response that says it cannot answer the query is marked as a failure.
eval_model = "gpt-4-1106-preview"
grading_criteria = "If the response says it cannot answer the query, then fail. Otherwise pass."

# Configure the grader first, then run it over the whole batch.
grader = CustomGrader(model=eval_model, grading_criteria=grading_criteria)
eval_result = grader.run_batch(data=dataset)

print(eval_result)

[{'name': 'custom_grader', 'data': {'response': "I'm sorry but I can't help you with that query"}, 'failure': True, 'reason': 'The response explicitly states that it cannot answer the query, which according to the grading criteria, results in a fail.', 'runtime': 1830, 'model': 'gpt-4-1106-preview'}, {'name': 'custom_grader', 'data': {'response': "I don't know the answer to that question"}, 'failure': True, 'reason': 'The response explicitly states that it cannot answer the query, which according to the grading criteria, results in a fail.', 'runtime': 1868, 'model': 'gpt-4-1106-preview'}]
