In [204]:
%load_ext autoreload 
%autoreload 2

import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [205]:
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from deepeval.models import DeepEvalBaseLLM
from evaluation.custom_eval_models import (CustomAzureOpenAI, AnswerCorrectnessMetric, 
                                            EvalResponse, TestCaseBundle)
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.preprocessor.preprocessing import FileIO
from src.reranker import ReRanker
from src.llm.prompt_templates import context_block
from src.llm.llm_interface import LLM
from src.llm.llm_utils import load_azure_openai
from src.llm.prompt_templates import (huberman_system_message, question_answering_prompt_series,
                                     create_context_blocks, generate_prompt_series)
from litellm import model_cost

from loguru import logger
from random import sample
from tqdm import tqdm
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [112]:
def get_sortable_model_cost(model_cost_dict: dict) -> list[dict]:
    '''
    Flattens a {model_name: metadata} mapping into a list of records.

    Each record is the model's metadata dict with a leading 'model' key
    holding the model name.  Models whose metadata has no
    'input_cost_per_token' key are dropped.  If the metadata itself
    contains a 'model' key, that value takes precedence over the name.
    '''
    return [
        {'model': name, **metadata}
        for name, metadata in model_cost_dict.items()
        if 'input_cost_per_token' in metadata
    ]

In [114]:
models = get_sortable_model_cost(model_cost)

### Load Data

In [202]:
# Load the golden dataset; the evaluation questions are the values of its 'queries' mapping.
data_path = '../data/golden_datasets/golden_256.json'
data = FileIO().load_json(data_path)
queries = list(data['queries'].values())

# Draw a random set of 25 questions for evaluation.
# NOTE(review): no random seed is set, so the sample differs on every run.
random_questions = sample(queries, k=25)
# Sanity check: the sampled question texts must all be distinct.
assert len(random_questions) == len(set(random_questions))

### Set System Components

In [217]:
# Components of the system under evaluation.
# NOTE: `reranker` is read as a module-level global by the evaluation functions
# defined further down (it is not passed as an argument), and
# `aget_actual_outputs` reads the global `llm` — so this cell must run first.
client = get_weaviate_client()
collection_name = 'Huberman_minilm_256'
reranker= ReRanker()
llm = load_azure_openai()

  client = get_weaviate_client()


### Create Retrieval Contexts

In [7]:
def create_context_blocks(results: list[dict],
                          summary_key: str='summary',
                          guest_key: str='guest',
                          content_key: str='content'):
    '''
    Renders one context string per search result.

    Every result dict is formatted into the `context_block` template,
    filling its `summary`, `guest` and `transcript` fields from the
    result's `summary_key`, `guest_key` and `content_key` entries.
    Returns the rendered strings in the same order as `results`.
    '''
    rendered_blocks = []
    for record in results:
        rendered = context_block.format(summary=record[summary_key],
                                        guest=record[guest_key],
                                        transcript=record[content_key])
        rendered_blocks.append(rendered)
    return rendered_blocks

### Set Eval Model and Metrics

In [8]:
# Judge model used by the evaluation metrics below.
azure_eval_model = CustomAzureOpenAI('gpt-4')
# `.get_metric()` unwraps the project wrapper into the metric object that is actually measured.
acm = AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric()
# No threshold is passed here, so the metric's own default applies.
faith = FaithfulnessMetric(model=azure_eval_model)
# metrics = [AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric(), FaithfulnessMetric(threshold=0.7, model=azure_eval_model)]

In [195]:
LLM.valid_models

{'cohere': ['command-r', 'command-r-plus'],
 'anthropic': ['claude-3-haiku-20240307',
  'claude-3-sonnet-2024022',
  'claude-3-opus-20240229'],
 'openai': ['gpt-4-turbo-preview',
  'gpt-4-0125-preview',
  'gpt-4-1106-preview',
  'gpt-4',
  'gpt-4-0613',
  'gpt-3.5-turbo',
  'gpt-3.5-turbo-1106',
  'gpt-3.5-turbo-0125']}

### Create Test Case(s)

In [10]:
def system_evaluation(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      metric: FaithfulnessMetric | AnswerCorrectnessMetric | None = None,
                      ) -> list[EvalResponse]:
    '''
    LLM Evaluation harness that given a list of queries does the following:
       1. Retrieves relevant context and reranks
       2. Generates evaluated LLM actual output
       3. Creates retrieval context for test case
       4. Creates a test case on the fly
       5. Given a metric execute metric evaluation
       6. Returns list of metric evaluations

    Args:
        queries: questions to run through the retrieval + generation pipeline.
        client: search client; its `hybrid_search` method is used for retrieval.
        collection_name: name of the collection to search.
        llm: model under evaluation; its `chat_completion` method produces the answer.
        metric: an already-instantiated metric object exposing `measure`, `score`,
            `reason` and `evaluation_model` (e.g. the notebook's `faith` or `acm`).

    Returns:
        One EvalResponse per query that was evaluated successfully.  Queries
        that raise anywhere in the pipeline are logged and skipped, so the
        result may be shorter than `queries`.

    Raises:
        ValueError: if no metric is supplied.

    Note:
        Reranking uses the module-level `reranker` object, which must be
        defined before this function is called.
    '''
    if metric is None:
        raise ValueError('system_evaluation requires a metric instance, e.g. system_evaluation(..., metric=faith)')

    eval_results = []
    for query in tqdm(queries):
        try:
            result = client.hybrid_search(query, collection_name, limit=200)
            reranked = reranker.rerank(result, query, top_k=3)
            user_message = generate_prompt_series(query, reranked)
            actual_output = llm.chat_completion(huberman_system_message, user_message, temperature=1.0)
            retrieval_context = create_context_blocks(reranked)
            test_case = LLMTestCase(input=query, actual_output=actual_output, retrieval_context=retrieval_context)
            metric.measure(test_case)
            response = EvalResponse(metric=metric,
                                    eval_model=metric.evaluation_model,
                                    input=test_case.input,
                                    actual_output=test_case.actual_output,
                                    retrieval_context=test_case.retrieval_context,
                                    score=metric.score,
                                    reason=metric.reason,
                                    # Not every metric type is guaranteed to expose these two
                                    # attributes; fall back to None rather than dropping the result.
                                    cost=getattr(metric, 'evaluation_cost', None),
                                    eval_steps=getattr(metric, 'evaluation_steps', None))
            eval_results.append(response)
        except Exception as e:
            # Best-effort harness: one failing query must not abort the whole run,
            # but record which query failed so the gap in the results is explainable.
            logger.error(f'Evaluation failed for query {query!r}: {e}')
            continue
    return eval_results

In [11]:
%%time
# eval_results = system_evaluation(random_questions[:5], client, collection_name, llm,faith)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10 µs


In [208]:
async def aget_actual_outputs(user_messages: list[str]):
    '''
    Requests one chat completion per user message, all concurrently.

    Every message is sent with `huberman_system_message` at temperature 1.0
    through the module-level `llm` object.  The completions are returned
    as a list in the same order as `user_messages`.
    '''
    pending = []
    for message in user_messages:
        pending.append(llm.achat_completion(huberman_system_message, message, temperature=1.0))
    return await asyncio.gather(*pending)

In [209]:
async def acreate_test_cases(queries: list[str],
                             client: WeaviateWCS,
                             collection_name: str,
                             llm: LLM,
                             ) -> list[LLMTestCase]:
    '''
    Creates a list of LLM Test Cases based on query retrievals.

    Retrieval and reranking run sequentially; the LLM completions are
    awaited concurrently.

    Args:
        queries: questions to build test cases for.
        client: search client; its `hybrid_search` method is used for retrieval.
        collection_name: name of the collection to search.
        llm: model under evaluation.  Its `achat_completion` coroutine is used
            for generation, so the model passed here is the one that is
            actually evaluated.

    Returns:
        One LLMTestCase per query, in the same order as `queries`.

    Note:
        Reranking uses the module-level `reranker` object, which must be
        defined before this function is called.
    '''
    results = [client.hybrid_search(query, collection_name, limit=200) for query in tqdm(queries, 'QUERIES')]
    # tqdm is the first iterable in zip so that it is the one exhausted and the bar closes cleanly.
    reranked = [reranker.rerank(result, query, top_k=3)
                for result, query in zip(tqdm(results, 'RERANKING'), queries)]
    user_messages = [generate_prompt_series(query, rerank) for query, rerank in zip(queries, reranked)]
    # Generate with the `llm` argument rather than a global so the caller's model is honoured.
    completions = [llm.achat_completion(huberman_system_message, user_message, temperature=1.0)
                   for user_message in user_messages]
    actual_outputs = await asyncio.gather(*completions)
    retrieval_contexts = [create_context_blocks(rerank) for rerank in reranked]
    test_cases = [LLMTestCase(input=query, actual_output=output, retrieval_context=context)
                  for query, output, context in zip(queries, actual_outputs, retrieval_contexts)]
    return test_cases

In [206]:
def create_test_cases(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      ) -> list[LLMTestCase]:
    '''
    Builds one LLMTestCase per query, running every stage synchronously.

    Stages, each completed for all queries before the next one starts:
    hybrid search -> rerank (top 3) -> prompt construction -> LLM completion
    -> retrieval-context rendering.  Reranking relies on the module-level
    `reranker` object.
    '''
    search_hits = []
    for query in tqdm(queries, 'QUERIES'):
        search_hits.append(client.hybrid_search(query, collection_name, limit=200))

    top_ranked = []
    for idx, hits in enumerate(tqdm(search_hits, 'RERANKING')):
        top_ranked.append(reranker.rerank(hits, queries[idx], top_k=3))

    prompts = []
    for idx, ranked in enumerate(top_ranked):
        prompts.append(generate_prompt_series(queries[idx], ranked))

    answers = []
    for prompt in tqdm(prompts, 'LLM Calls'):
        answers.append(llm.chat_completion(huberman_system_message, prompt, temperature=1.0))

    contexts = []
    for ranked in top_ranked:
        contexts.append(create_context_blocks(ranked))

    built_cases = []
    for question, answer, context in zip(queries, answers, contexts):
        built_cases.append(LLMTestCase(input=question, actual_output=answer, retrieval_context=context))
    return built_cases

In [207]:
%%time
questions = sample(queries, k=5)
test_cases = create_test_cases(questions, client, collection_name, llm)

QUERIES: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.28it/s]
RERANKING: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.70it/s]
LLM Calls: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.69it/s]

CPU times: user 1.85 s, sys: 0 ns, total: 1.85 s
Wall time: 5.55 s





In [213]:
atest_cases = await acreate_test_cases(questions, client, collection_name, llm)

QUERIES: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.35it/s]
RERANKING: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.66it/s]


In [218]:
async def asingle_eval_call(test_case: LLMTestCase,
                            model: DeepEvalBaseLLM,
                            metric: FaithfulnessMetric | AnswerCorrectnessMetric,
                            threshold: float=None
                           ) -> EvalResponse:
    '''
    Evaluates a single test case with one metric and wraps the outcome.

    Args:
        test_case: the test case to score.
        model: evaluation (judge) model handed to the metric constructor.
        metric: either one of the metric classes FaithfulnessMetric,
            AnswerCorrectnessMetric or LatencyMetric (a fresh instance is
            built here), or an already-built metric instance (used as-is).
        threshold: FaithfulnessMetric pass threshold (0.5 when omitted), or
            `max_seconds` for LatencyMetric.  Ignored for AnswerCorrectnessMetric.

    Returns:
        EvalResponse carrying the measured metric, the test case fields and
        the metric's score and reason.
    '''
    if metric is FaithfulnessMetric:
        # `is None` rather than truthiness, so an explicit threshold of 0.0 is respected.
        metric = FaithfulnessMetric(model=model, threshold=0.5 if threshold is None else threshold)
    elif metric is AnswerCorrectnessMetric:
        metric = AnswerCorrectnessMetric(model=model, strict=False).get_metric()
    elif metric is LatencyMetric:
        metric = LatencyMetric(max_seconds=threshold)

    # Every metric must be measured before its score/reason are read.
    if isinstance(metric, LatencyMetric):
        # Kept on the synchronous path, exactly as the original code measured it.
        metric.measure(test_case)
    else:
        await metric.a_measure(test_case)

    response = EvalResponse(metric=metric,
                            # Label the response with the model passed in; a wrapper object
                            # exposes its name as `.model`, a plain string is already the name.
                            model=getattr(model, 'model', model),
                            input=test_case.input,
                            actual_output=test_case.actual_output,
                            retrieval_context=test_case.retrieval_context,
                            score=metric.score,
                            reason=metric.reason)
    return response

In [168]:
AnswerCorrectnessMetric(model='gpt-4-turbo', strict=False).get_metric()

<deepeval.metrics.g_eval.g_eval.GEval at 0x7f9f317ce200>

In [169]:
LatencyMetric

evaluation.custom_eval_models.LatencyMetric

In [170]:
async def asystem_evaluation(
                             test_cases: list[LLMTestCase],
                             model: DeepEvalBaseLLM,
                             metric: FaithfulnessMetric | AnswerCorrectnessMetric,
                             threshold: float=None
                            ):
    '''
    Evaluates every test case concurrently with the same metric.

    Args:
        test_cases: test cases to score.
        model: evaluation (judge) model forwarded to each evaluation call.
        metric: metric class or instance forwarded to each evaluation call.
        threshold: optional threshold forwarded to each evaluation call.

    Returns:
        List with one evaluation response per test case, in the same order
        as `test_cases`.
    '''
    # `asingle_eval_call` is the per-case coroutine defined in this notebook.
    tasks = [asingle_eval_call(case, model, metric, threshold) for case in test_cases]
    responses = await asyncio.gather(*tasks)
    return responses

In [171]:
from evaluation.custom_eval_models import LatencyMetric

In [186]:
%%time
responses = asyncio.run(asystem_evaluation(test_cases, 'gpt-4-turbo', LatencyMetric, threshold=0.75))

CPU times: user 660 µs, sys: 0 ns, total: 660 µs
Wall time: 603 µs


In [187]:
results = [r.metric for r in responses]

In [138]:
scores = [r.score for r in responses2]

In [139]:
scores

[0.9347887500708539,
 0.8140108828118768,
 0.8073533432458813,
 0.8776524051199395,
 0.7873294138077769]

In [140]:
reasons = [r.reason for r in responses2]

In [141]:
scorereasons = list(zip(scores, reasons))

In [142]:
def print_results(data: list[tuple], max_chars: int=25):
    '''
    Prints each score alongside a short preview of its reason.

    Args:
        data: sequence of tuples whose first item is the score and whose
            second item is the reason text.
        max_chars: number of leading characters of the reason to print.
            Defaults to 25; pass None to print the full reason.
    '''
    for atuple in data:
        print(f'SCORE: {atuple[0]}')
        # A missing reason (None) is shown as an empty preview instead of raising.
        reason = (atuple[1] or '')[:max_chars]
        print(f'REASON: {reason}\n')

In [143]:
print_results(scorereasons)

SCORE: 0.9347887500708539
REASON: The actual output provide

SCORE: 0.8140108828118768
REASON: The actual output success

SCORE: 0.8073533432458813
REASON: The output accurately ref

SCORE: 0.8776524051199395
REASON: The actual output aligns 

SCORE: 0.7873294138077769
REASON: The actual output effecti



In [144]:
reasons

['The actual output provides a factual and relevant response by specifying the website and discount available for the supplements discussed on the Huberman Lab Podcast, aligning well with the retrieval context. It addresses the information requirement directly and succinctly, although it could have elaborated slightly more on the types of supplements or specific episodes that discuss them for enhanced completeness.',
 'The actual output successfully aligns with the retrieval context, particularly emphasizing the role of the anterior mid-singulate cortex and the engagement in challenging activities that are pivotal for maintaining cognitive functions akin to those of younger individuals. However, it could have included more detailed examples or specific protocols mentioned in the retrieval context to enhance comprehensiveness.',
 'The output accurately reflects the information from the retrieval context regarding the impact of metabolites and lactate pathways on brain function, specific

In [132]:
responses2[0].metric.evaluation_cost

0.0214

In [104]:
total_cost = [r.metric.evaluation_cost for r in responses]
sum(total_cost)
    

0.126

In [136]:
responses2[0].metric.evaluation_steps

['Compare the actual output with the retrieval context to verify factual accuracy.',
 'Assess if the actual output effectively addresses the specific information requirement stated in the input.',
 'Determine the comprehensiveness of the actual output in covering all key aspects mentioned in the retrieval context.',
 'Score the actual output based on the accuracy, relevance, and completeness of the information provided between 0 and 1.']

In [199]:
vars(responses2[0].metric)

{'name': 'answer_correctness',
 'evaluation_params': [<LLMTestCaseParams.INPUT: 'input'>,
  <LLMTestCaseParams.ACTUAL_OUTPUT: 'actual_output'>,
  <LLMTestCaseParams.RETRIEVAL_CONTEXT: 'retrieval_context'>],
 'criteria': None,
 'model': <deepeval.models.gpt_model.GPTModel at 0x7f9f4e7ab130>,
 'using_native_model': True,
 'evaluation_model': 'gpt-4-turbo',
 'evaluation_steps': ['Compare the actual output with the retrieval context to verify factual accuracy.',
  'Assess if the actual output effectively addresses the specific information requirement stated in the input.',
  'Determine the comprehensiveness of the actual output in covering all key aspects mentioned in the retrieval context.',
  'Score the actual output based on the accuracy, relevance, and completeness of the information provided between 0 and 1.'],
 '_threshold': 0.5,
 'strict_mode': False,
 'async_mode': True,
 'evaluation_cost': 0.02305,
 'reason': 'The actual output provides a factual and relevant response by specifyin