# Experiment Notebook - Test Evaluation
This notebook illustrates how to evaluate the results of a Test Run.

In [None]:
import sqlite3
# Open the SQLite database that the experiment models in the following cells read from
DB_PATH = "experiment.db"
db_connection = sqlite3.connect(DB_PATH)


In [None]:
# Load the test run
from packages.data.src.eval_data.models.testrun import TestRunModel, TestRunType

# Look the run up through get_test_run_by_name; the following cells key off test_run.id
test_run_model = TestRunModel(db_connection)
test_run = test_run_model.get_test_run_by_name(description="Test Run 4")
print(f"Test Run found with ID: {test_run.id}")

## - Setting up Evaluation Functions
In order to evaluate a test, we need to create a library of functions that will evaluate the responses. We can then assign these Evaluation Functions to different test runs.

The following will create a list of Evaluation Functions

In [None]:
from packages.data.src.eval_data.models.evalfunction import EvalFunctionModel, EvalFunctionType
from eval_functions import eval_ragas_precision, eval_ragas_recall, eval_ragas_answer_relevancy, eval_ragas_faithfulness

# Create Test Evals
eval_function_model = EvalFunctionModel(db_connection)

# One (name, description, callable) row per Ragas metric, in registration order
ragas_metric_specs = [
    ("ragas-precision", "Ragas - Context precision", eval_ragas_precision),
    ("ragas-recall", "Ragas - Context recall", eval_ragas_recall),
    ("ragas-answer-relevancy", "Ragas - Answer Relevancy", eval_ragas_answer_relevancy),
    ("ragas-faithfulness", "Ragas - Faithfulness", eval_ragas_faithfulness),
]

# Pass every spec through add_or_get_eval_function and keep what it returns
eval_function_list = [
    eval_function_model.add_or_get_eval_function(
        EvalFunctionType(
            name=metric_name,
            description=metric_description,
            eval_function=metric_callable,
        )
    )
    for metric_name, metric_description, metric_callable in ragas_metric_specs
]

registered_ids = [registered.id for registered in eval_function_list]
print(f"Eval Function: {registered_ids}")

## - Assign Evaluation Functions to a Test Run

In [None]:
# Create a Test Eval Config to assign Test Evals to the Test Run
from packages.data.src.eval_data.models.testevalconfig import TestEvalConfigModel, TestEvalConfigType
from typing import List

test_eval_config_model = TestEvalConfigModel(db_connection)
test_eval_list: List[TestEvalConfigType] = test_eval_config_model.get_test_eval_configs_by_test_run_id(test_run.id)

# Eval function ids already attached to this run; kept in sync as configs are added below
assigned_eval_function_ids = {existing.eval_function_id for existing in test_eval_list}

# TODO: Resume an existing eval run
for eval_function in eval_function_list:
    if eval_function.id in assigned_eval_function_ids:
        print(f"Eval Function already assigned to Test Run: {eval_function.id}")
        continue

    # Not yet assigned: store a new config and record the id returned by the model
    test_eval_config = TestEvalConfigType(test_run_id=test_run.id, eval_function_id=eval_function.id)
    test_eval_config.id = test_eval_config_model.add_test_eval_config(test_eval_config)
    test_eval_list.append(test_eval_config)
    assigned_eval_function_ids.add(test_eval_config.eval_function_id)
    print(f"Added Test Eval Config: {test_eval_config.id}")

## - Run the Evaluation Functions on all Responses and Contexts

In [None]:
import nest_asyncio
nest_asyncio.apply()

# Get all responses
from packages.data.src.eval_data.models.response import ResponseModel
from packages.data.src.eval_data.models.question import QuestionModel
from packages.data.src.eval_data.models.context import ContextModel
from packages.data.src.eval_data.models.responseeval import ResponseEvalModel, ResponseEvalType

from pandas import DataFrame

question_model = QuestionModel(db_connection)
context_model = ContextModel(db_connection)
response_model = ResponseModel(db_connection)
response_eval_model = ResponseEvalModel(db_connection)

responses = response_model.get_responses_by_test_run_id(test_run.id)
response_evals = response_eval_model.get_response_evals_by_test_run_id(test_run.id)

# A response counts as finished only when EVERY eval config assigned to the run
# has a stored result for it. Filtering on "has at least one eval" would skip,
# for good, any response whose evaluation was interrupted part-way or that was
# evaluated before another eval function was assigned to the run.
required_config_ids = {test_eval.id for test_eval in test_eval_list}
done_config_ids_by_response = {}
for response_eval in response_evals:
    done_config_ids_by_response.setdefault(response_eval.response_id, set()).add(
        response_eval.test_eval_config_id
    )

# Keep the responses that still miss at least one required eval
responses_trimed = [
    r for r in responses
    if required_config_ids - done_config_ids_by_response.get(r.id, set())
]
count_skipped = len(responses) - len(responses_trimed)

# Resolve each eval function once per run instead of once per response
eval_function_cache = {}

for i, r in enumerate(responses_trimed):
    print(f"Processing Response {i+count_skipped+1}/{len(responses)}")

    question = question_model.get_question_by_id(r.question_id)
    context = context_model.get_contexts_by_response_id(response_id=r.id)
    evals = response_eval_model.get_response_evals_by_response_id(response_id=r.id)

    # Run only the configs that have no stored result for this response yet
    test_eval_ids = [e.test_eval_config_id for e in evals]
    test_eval_list_trimed = [e for e in test_eval_list if e.id not in test_eval_ids]
    for test_eval in test_eval_list_trimed:
        if test_eval.eval_function_id not in eval_function_cache:
            eval_function_cache[test_eval.eval_function_id] = eval_function_model.get_eval_function_by_id(
                test_eval.eval_function_id
            ).eval_function
        func = eval_function_cache[test_eval.eval_function_id]

        # Each eval function receives single-element batches: one question, one
        # answer, one ground truth, and the list of context texts for this response
        eval_result: float = func(
            questions=[question.question],
            answers=[r.response],
            ground_truths=[question.answer],
            contexts=[[c.text for c in context]]
        )
        # NOTE(review): a None result is stored as 0, so a failed evaluation is
        # indistinguishable from a genuine zero score and, once stored, is treated
        # as done on the next resume - confirm this is intended.
        response_eval_model.add_response_eval(
            ResponseEvalType(
                test_run_id=test_run.id,
                question_id=question.id,
                response_id=r.id,
                test_eval_config_id=test_eval.id,
                eval_score=eval_result if eval_result is not None else 0
            )
        )
    