In [1]:
%load_ext autoreload 
%autoreload 2

import sys
# Put the parent directory on the import path so the `src.*` packages used below can be imported.
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the discovered .env file; override=True lets its values
# replace variables that are already set. The return value is bound to `_` to keep the cell output empty.
_ = load_dotenv(find_dotenv(), override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.preprocessor.preprocessing import FileIO
from src.reranker import ReRanker
from src.evaluation.custom_eval_models import AnswerCorrectnessMetric, EvalResponse, CustomAzureOpenAI
from src.llm.llm_interface import LLM
from src.llm.llm_utils import load_azure_openai
from src.llm.prompt_templates import (huberman_system_message, question_answering_prompt_series,
                                     create_context_blocks, generate_prompt_series)
from deepeval.test_case import LLMTestCase
from deepeval.models import DeepEvalBaseLLM
from nest_asyncio import apply
from tqdm import tqdm
import asyncio 
apply()

### Load Data

In [3]:
# Golden dataset location, relative to the notebook's working directory.
data_path = '../data/golden_datasets/golden_256.json'
data = FileIO().load_json(data_path)
# Keep only the values stored under the 'queries' key; the keys themselves are discarded.
queries = list(data['queries'].values())

### Set Components

In [79]:
# Shared, notebook-level components used by the cells below.
client = get_weaviate_client()
# Name of the collection passed to client.hybrid_search.
collection_name = 'Huberman_minilm_256'
# Reranker applied to the hybrid search results.
reranker= ReRanker()
# LLM that generates the answers being evaluated.
llm = load_azure_openai()
# Model handed to the evaluation metric; the argument is presumably an Azure deployment name -- TODO confirm.
azure_eval_model = CustomAzureOpenAI('graphrag-gpt4-turbo')

In [80]:
async def aget_actual_outputs(user_messages: list[str]):
    '''
    Requests one chat completion per user message and awaits them all concurrently.

    Every call goes to the notebook-level `llm` with `huberman_system_message`
    as the system prompt and temperature=1.0.

    Args:
        user_messages: fully rendered user prompts, one per query.

    Returns:
        List of completion results, in the same order as `user_messages`.
    '''
    pending = []
    for message in user_messages:
        # Calling the async method only creates the coroutine; gather() below drives them.
        pending.append(llm.achat_completion(huberman_system_message, message, temperature=1.0))
    return await asyncio.gather(*pending)

In [81]:
async def acreate_test_cases( queries: list[str],
                              client: WeaviateWCS,
                              collection_name: str,
                              llm: LLM,
                              ) -> list[LLMTestCase]:
    '''
    Creates a list of LLM Test Cases based on query retrievals.

    For each query this runs a hybrid search (limit=200), reranks the hits down
    to the top 3, renders the user prompt from those hits, asks `llm` for an
    answer, and bundles query / answer / context blocks into an LLMTestCase.

    Args:
        queries: query strings to build test cases for.
        client: Weaviate client used for the hybrid search.
        collection_name: name of the collection to search.
        llm: model that generates the answers; all completions are awaited concurrently.

    Returns:
        One LLMTestCase per query, in the same order as `queries`.

    Note:
        Reranking uses the notebook-level `reranker` object.
    '''
    results = [client.hybrid_search(query, collection_name, limit=200) for query in tqdm(queries, 'QUERIES')]
    # tqdm goes first in zip() so the progress bar is exhausted (and closed) on the final step.
    reranked = [reranker.rerank(result, query, top_k=3)
                for result, query in zip(tqdm(results, 'RERANKING'), queries)]
    user_messages = [generate_prompt_series(query, rerank, 1) for query, rerank in zip(queries, reranked)]
    # Generate answers with the `llm` argument itself rather than a global model,
    # so the parameter actually controls which model is evaluated.
    actual_outputs = await asyncio.gather(
        *(llm.achat_completion(huberman_system_message, message, temperature=1.0)
          for message in user_messages)
    )
    retrieval_contexts = [create_context_blocks(rerank) for rerank in reranked]
    return [LLMTestCase(input=query, actual_output=output, retrieval_context=context)
            for query, output, context in zip(queries, actual_outputs, retrieval_contexts)]

### Get Test Cases

In [26]:
import time
# Blocking 50-second pause before any requests are issued -- presumably a cool-down
# for API rate limits; TODO confirm it is still needed.
time.sleep(50)
# Top-level await: relies on the notebook's support for awaiting inside a cell.
test_cases = await acreate_test_cases(queries, client, collection_name, llm)

QUERIES: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.10it/s]
RERANKING: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:21<00:00,  4.66it/s]


In [30]:
# Turn every test case into a plain dict of its attributes so the list can be written out as JSON
test_case_list = [vars(case) for case in test_cases]

In [16]:
# Saving is left disabled; uncomment the next line to persist freshly generated test cases.
# FileIO.save_as_json('../data/test_cases_256.json', test_case_list)

# Rebuild LLMTestCase objects from the records previously saved to disk.
test_cases = [
    LLMTestCase(input=record['input'],
                actual_output=record['actual_output'],
                retrieval_context=record['retrieval_context'])
    for record in FileIO.load_json('../data/test_cases_256.json')
]

### Launch Testing

In [82]:
async def asingle_eval_call(test_case: LLMTestCase,
                            model: str | DeepEvalBaseLLM,
                            metric: AnswerCorrectnessMetric,
                            threshold: float=None,
                            return_context_data: bool=True
                            ) -> EvalResponse:
    '''
    Scores a single test case with the given metric and wraps the result.

    Args:
        test_case: the case to evaluate.
        model: evaluation model handed to the metric when it is instantiated here.
        metric: either the AnswerCorrectnessMetric class itself (an instance is then
            created with `model`) or an already constructed metric object.
        threshold: accepted for interface compatibility; not used by the current code.
        return_context_data: forwarded to load_eval_response.

    Returns:
        The EvalResponse produced by load_eval_response (which must already be
        defined in the kernel when this coroutine runs).
    '''
    # Passing the class instead of an instance means "build one for this model".
    evaluator = AnswerCorrectnessMetric(model) if metric == AnswerCorrectnessMetric else metric
    await evaluator.a_measure(test_case)
    return load_eval_response(evaluator, test_case, return_context_data)

In [83]:
async def asystem_evaluation(test_cases: list[LLMTestCase],
                             model: DeepEvalBaseLLM,
                             metric: AnswerCorrectnessMetric,
                             batch_size: int=10,
                             threshold: float=None,
                             sleep_seconds: float=30
                            ):
    '''
    Evaluates test cases in concurrent batches, pausing between batches.

    Args:
        test_cases: cases to evaluate.
        model: evaluation model forwarded to asingle_eval_call.
        metric: metric class or instance forwarded to asingle_eval_call.
        batch_size: number of cases evaluated concurrently per batch (must be >= 1).
        threshold: forwarded unchanged to asingle_eval_call.
        sleep_seconds: pause between consecutive batches; no pause follows the last batch.

    Returns:
        List with one response per test case, in input order.

    Raises:
        ValueError: if batch_size is smaller than 1.
    '''
    from tqdm import tqdm
    from math import ceil
    # A zero batch size would divide by zero and a negative one would silently evaluate nothing.
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    completed = []
    batches = ceil(len(test_cases)/batch_size)
    for i in tqdm(range(batches), 'BATCHES'):
        batch = test_cases[i*batch_size:(i+1)*batch_size]
        tasks = [asingle_eval_call(case, model, metric, threshold) for case in batch]
        completed.extend(await asyncio.gather(*tasks))
        # Pause only when another batch follows; waiting after the final batch just delays the result.
        if i < batches - 1:
            await asyncio.sleep(sleep_seconds)
    return completed

In [87]:
# Evaluate all test cases with the Azure evaluation model; the trailing 20 is the batch_size argument.
responses = await asystem_evaluation(test_cases, azure_eval_model, AnswerCorrectnessMetric, 20)

BATCHES:   0%|                                                                                                                   | 0/5 [00:00<?, ?it/s]

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

BATCHES:   0%|                                                                                                                   | 0/5 [00:03<?, ?it/s]

In [42]:
def load_eval_response(metric, test_case, return_context_data: bool=True):
    '''
    Packs the outcome of a measured metric into an EvalResponse.

    Args:
        metric: metric object that has already been measured; its score, reason,
            evaluation_cost, evaluation_model and evaluation_steps are copied over,
            and its class name is recorded as the metric name.
        test_case: the evaluated case; its input and actual_output are included
            only when return_context_data is True.
        return_context_data: when False, input and actual_output are set to None.

    Returns:
        EvalResponse populated from the metric and test case.
    '''
    fields = dict(
        score=metric.score,
        reason=metric.reason,
        metric=metric.__class__.__name__,
        cost=metric.evaluation_cost,
        eval_model=metric.evaluation_model,
        eval_steps=metric.evaluation_steps,
    )
    if return_context_data:
        fields.update(input=test_case.input, actual_output=test_case.actual_output)
    else:
        fields.update(input=None, actual_output=None)
    # NOTE(review): the retrieval context is always dropped, regardless of return_context_data.
    fields['retrieval_context'] = None
    return EvalResponse(**fields)

In [76]:
import pandas as pd
# One-column frame of per-test-case scores, taken from whatever `responses` currently holds.
turbo35_df = pd.DataFrame([r.score for r in responses])

In [78]:
turbo35_df.describe()

Unnamed: 0,0
count,100.0
mean,0.901
std,0.269491
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [85]:
# NOTE(review): this reads the same `responses` variable as turbo35_df, and the two describe()
# summaries in this notebook are identical -- confirm `responses` was regenerated with the
# intended evaluation model before this cell was run.
gpt4_df=pd.DataFrame([r.score for r in responses])

In [86]:
gpt4_df.describe()

Unnamed: 0,0
count,100.0
mean,0.901
std,0.269491
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [89]:
len(responses)

100

In [90]:
responses[:10]

[EvalResponse(score=1.0, reason='The actual output effectively addresses the specific information requirement, comprehensively covers all key aspects mentioned in the input, and accurately explains the practices for deliberately increasing adrenaline while staying calm mentally and their utility in dealing with unwanted events.', metric='AnswerCorrectnessMetric', cost=None, eval_model='Custom Azure OpenAI Model', eval_steps=['Compare the actual output with the retrieval context to verify factual accuracy.', 'Assess if the actual output effectively addresses the specific information requirement stated in the input.', 'Determine the comprehensiveness of the actual output in addressing all key aspects mentioned in the input.', 'Score the actual output between 0 and 1, based on the accuracy and comprehensiveness of the information provided.', 'If there is not enough information in the retrieval context to correctly answer the input, and the actual output indicates that the input cannot be 