In [1]:
%load_ext autoreload 
%autoreload 2

import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from deepeval.models import DeepEvalBaseLLM
from evaluation.custom_eval_models import (CustomAzureOpenAI, AnswerCorrectnessMetric, 
                                            EvalResponse, TestCaseBundle)
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.preprocessor.preprocessing import FileIO
from src.reranker import ReRanker
from src.llm.prompt_templates import context_block
from src.llm.llm_interface import LLM
from src.llm.llm_utils import load_azure_openai
from src.llm.prompt_templates import huberman_system_message, question_answering_prompt_series
from app_features import generate_prompt_series

from loguru import logger
from random import sample
from tqdm import tqdm
import asyncio
import nest_asyncio
nest_asyncio.apply()

### Load Data

In [5]:
# Load the golden dataset and keep only the query strings stored under its 'queries' mapping.
data_path = '../data/golden_datasets/golden_256.json'
data = FileIO().load_json(data_path)
queries = list(data['queries'].values())

# Draw a random set of 25 questions for evaluation.
# NOTE(review): random.sample is not seeded here, so a different subset is drawn on every run.
random_questions = sample(queries, k=25)
# Guard against duplicate question text ending up in the sampled set.
assert len(random_questions) == len(set(random_questions))

### Set System Components

In [6]:
# Shared system components used by the retrieval / generation pipeline below.
client = get_weaviate_client()
collection_name = 'Huberman_minilm_256'
# NOTE: `reranker` is read as a module-level global by system_evaluation and
# create_test_cases (it is not passed in as an argument), so this cell must run first.
reranker= ReRanker()
llm = load_azure_openai()

  return self.fget.__get__(instance, owner)()


### Create Retrieval Contexts

In [7]:
def create_context_blocks(results: list[dict],
                          summary_key: str='summary',
                          guest_key: str='guest',
                          content_key: str='content'):
    '''
    Renders one context string per retrieved result by filling the
    `context_block` template with that result's summary, guest and content.

    Args:
        results: retrieved records; each must contain the three keys below.
        summary_key: key holding the value for the template's `summary` field.
        guest_key: key holding the value for the template's `guest` field.
        content_key: key holding the value for the template's `transcript` field.

    Returns:
        List of formatted context strings, in the same order as `results`.
    '''
    rendered_blocks = []
    for record in results:
        block = context_block.format(summary=record[summary_key],
                                     guest=record[guest_key],
                                     transcript=record[content_key])
        rendered_blocks.append(block)
    return rendered_blocks

### Set Eval Model and Metrics

In [8]:
# Evaluator LLM shared by all metrics. NOTE: system_evaluation also reads
# `azure_eval_model` as a module-level global when it records the evaluator model.
azure_eval_model = CustomAzureOpenAI('gpt-4')
# Ready-made metric instances: answer correctness (non-strict) and faithfulness.
acm = AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric()
faith = FaithfulnessMetric(model=azure_eval_model)
# metrics = [AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric(), FaithfulnessMetric(threshold=0.7, model=azure_eval_model)]

In [9]:
from deepeval.metrics import BaseMetric

### Create Test Case(s)

In [10]:
def system_evaluation(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      metric: BaseMetric
                      ) -> list[EvalResponse]:
    '''
    LLM Evaluation harness that given a list of queries does the following:
       1. Retrieves relevant context and reranks
       2. Generates evaluated LLM actual output
       3. Creates retrieval context for test case
       4. Creates a test case on the fly
       5. Given a metric execute metric evaluation
       6. Returns list of metric evaluations

    Args:
        queries: questions to run through the full retrieve -> rerank -> generate pipeline.
        client: Weaviate client used for the hybrid search.
        collection_name: name of the collection to search.
        llm: LLM that produces the answer being evaluated.
        metric: metric instance; the same instance is re-measured for every query.

    Returns:
        One EvalResponse per query that completed successfully. Queries that raise
        anywhere in the pipeline are logged and skipped, so the result may be
        shorter than `queries`.

    Note:
        Relies on the module-level `reranker` and `azure_eval_model` objects, which
        must be defined before this function is called.
    '''

    eval_results = []
    for query in tqdm(queries):
        try:
            result = client.hybrid_search(query, collection_name, limit=200)
            reranked = reranker.rerank(result, query, top_k=3)
            user_message = generate_prompt_series(query, reranked)
            actual_output = llm.chat_completion(huberman_system_message, user_message, temperature=1.0)
            retrieval_context = create_context_blocks(reranked)
            test_case = LLMTestCase(input=query, actual_output=actual_output, retrieval_context=retrieval_context)
            metric.measure(test_case)
            response = EvalResponse(metric=metric,
                                    model=azure_eval_model.model,
                                    input=test_case.input,
                                    actual_output=test_case.actual_output,
                                    retrieval_context=test_case.retrieval_context,
                                    score=metric.score,
                                    reason=metric.reason)
            eval_results.append(response)
        except Exception:
            # Best-effort harness: keep going, but record which query failed and the
            # full traceback instead of printing only the bare exception message.
            logger.exception('Evaluation failed for query {!r}; skipping', query)
    return eval_results

In [11]:
%%time
# eval_results = system_evaluation(random_questions[:5], client, collection_name, llm,faith)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10 µs


In [18]:
def create_test_cases(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      ) -> list[LLMTestCase]:
    '''
    Builds one LLMTestCase per query by running the pipeline stage by stage:
    hybrid search for every query, rerank every result set (top 3), build the
    prompts, call the LLM for every prompt, then assemble the test cases.

    Relies on the module-level `reranker` object.
    '''
    # Stage 1: retrieval for all queries.
    search_hits = []
    for query in tqdm(queries, 'QUERIES'):
        search_hits.append(client.hybrid_search(query, collection_name, limit=200))

    # Stage 2: rerank each result set against its own query.
    top_ranked = []
    for hits, query in zip(tqdm(search_hits, 'RERANKING'), queries):
        top_ranked.append(reranker.rerank(hits, query, top_k=3))

    # Stage 3: prompt construction and answer generation.
    prompts = [generate_prompt_series(query, ranked) for query, ranked in zip(queries, top_ranked)]
    answers = []
    for prompt in tqdm(prompts, 'LLM Calls'):
        answers.append(llm.chat_completion(huberman_system_message, prompt, temperature=1.0))

    # Stage 4: retrieval contexts and final test cases.
    contexts = [create_context_blocks(ranked) for ranked in top_ranked]
    return [LLMTestCase(input=query, actual_output=answer, retrieval_context=context)
            for query, answer, context in zip(queries, answers, contexts)]

In [55]:
%%time
# Draw a fresh (unseeded) sample of 25 queries; only the first 15 are turned into test cases.
questions = sample(queries, k=25)
test_cases = create_test_cases(questions[:15], client, collection_name, llm)

  return {
  return {
QUERIES: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.78it/s]
RERANKING: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.70it/s]
LLM Calls: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:23<00:00,  1.60s/it]

CPU times: user 6.05 s, sys: 25.8 ms, total: 6.07 s
Wall time: 30.3 s





In [98]:
async def single_faith_eval(test_case: LLMTestCase,
                            model: DeepEvalBaseLLM,
                            metric: FaithfulnessMetric | AnswerCorrectnessMetric,
                            threshold: float | None=None
                           ) -> EvalResponse:
    '''
    Asynchronously evaluates a single test case with a freshly built metric.

    Args:
        test_case: the test case to measure.
        model: evaluator LLM handed to the metric and recorded in the response.
        metric: the metric *class* to build, either FaithfulnessMetric or
            AnswerCorrectnessMetric. Any other object is used as-is and must
            expose `a_measure`, `score` and `reason`.
        threshold: threshold for FaithfulnessMetric; defaults to 0.5 when None.
            It is not applied when building AnswerCorrectnessMetric.

    Returns:
        EvalResponse holding the metric, evaluator model, test case fields,
        score and reason.
    '''
    if metric is FaithfulnessMetric:
        # Compare against None so an explicit threshold of 0.0 is honored.
        faith_threshold = 0.5 if threshold is None else threshold
        metric = FaithfulnessMetric(model=model, threshold=faith_threshold)
    elif metric is AnswerCorrectnessMetric:
        metric = AnswerCorrectnessMetric(model=model).get_metric()
    await metric.a_measure(test_case)
    response = EvalResponse(metric=metric,
                            # Record the evaluator actually passed in, not a notebook global.
                            model=model.model,
                            input=test_case.input,
                            actual_output=test_case.actual_output,
                            retrieval_context=test_case.retrieval_context,
                            score=metric.score,
                            reason=metric.reason)
    return response

In [99]:
AnswerCorrectnessMetric(model=azure_eval_model).get_metric().a_measure

<bound method GEval.a_measure of <deepeval.metrics.g_eval.g_eval.GEval object at 0x7fb6bfc7b4f0>>

In [100]:
FaithfulnessMetric, AnswerCorrectnessMetric

(deepeval.metrics.faithfulness.faithfulness.FaithfulnessMetric,
 evaluation.custom_eval_models.AnswerCorrectnessMetric)

In [102]:
async def asystem_evaluation(
                             test_cases: list[LLMTestCase],
                             model: DeepEvalBaseLLM,
                             metric: FaithfulnessMetric | AnswerCorrectnessMetric,
                             threshold: float=None
                            ):
    '''
    Evaluates all test cases concurrently.

    Creates one `single_faith_eval` coroutine per test case, awaits them
    together with asyncio.gather and returns the gathered responses
    (one per test case, in input order).
    '''
    pending = []
    for case in test_cases:
        pending.append(single_faith_eval(case, model, metric, threshold))
    return await asyncio.gather(*pending)

In [103]:
%%time
# Evaluate the first 10 test cases concurrently. The metric is passed as a class
# (not an instance) so that single_faith_eval builds a separate metric per test case.
# NOTE(review): asyncio.run inside a notebook presumably relies on the earlier nest_asyncio.apply() call.
responses = asyncio.run(asystem_evaluation(test_cases[:10], azure_eval_model, AnswerCorrectnessMetric))

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

CPU times: user 8.72 s, sys: 763 ms, total: 9.48 s
Wall time: 1min 2s


In [104]:
len(responses)

10

In [105]:
# Pull the numeric score out of every evaluation response, preserving order.
scores = [r.score for r in responses]

In [106]:
scores

[0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0]

In [107]:
# Pull the evaluator's explanation out of every evaluation response, preserving order.
reasons = [r.reason for r in responses]

In [109]:
reasons

[autoreload of sentence_transformers.util failed: Traceback (most recent call last):
  File "/anaconda/envs/openai/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/anaconda/envs/openai/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/anaconda/envs/openai/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/anaconda/envs/openai/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 309, in update_function
    setattr(old, name, getattr(new, name))
ValueError: snapshot_download() requires a code object with 0 free vars, not 3
]
[autoreload of sentence_transformers.SentenceTransformer failed: Traceback (most recent call last):
  File "/anaconda/envs/openai/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
  

['The actual output correctly identifies adequate sleep and stress management as foundational modulators for engaging tenacity and willpower, as emphasized by Dr. Huberman in the retrieval context. The output reflects the focus on these elements and accurately summarizes the content from the source material, thereby addressing the question effectively and accurately. The detailed reference to Dr. Huberman and his recommendations on sleep reinforce the direct correlation with the highlighted discussion in the podcast. No contradictory information is presented, and the actual output displays a comprehensive understanding of the material. However, a slight mark deduction is applied because the output does not mention the importance of the anterior mid-cingulate cortex and the physiological aspect of glucose availability in the brain, which are also discussed as key elements in the retrieval context.',
 'The actual output aligns with the retrieval context by accurately presenting the relat

Bad pipe message: %s [b'zT`\xb2\x990\xd2\xe3\xaf\xb8\t\x87\xea\x9b\x15Zs% 1\xe7\x7f\x19\xcd\xb2t\xf5\x10rf\xfd\xe0\x90\xe9lP\xa6\x8fD*\xc9\x11\xb0>\x0f\x1a\xc4\xa0\xc2\xe6\xd8\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06', b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xcex\xf1\xd5\x0f\x8f\xad/\x07\x13\x0b\x19\x9a\x0ejk\x81%\x90\xf0\xa9\xaa']
Bad pipe message: %s [b'U#\x9b\xe7<n\xf570H\x87\xe4\xfd\x05\xdf\xd4\x18E\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae']
Bad pipe message: %s [b"\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0

In [66]:
# Pair each score with its reason as (score, reason) tuples.
# NOTE(review): this cell's execution count (66) is lower than those of the cells that define
# `scores` and `reasons` above, so the stored pairs may come from an earlier run; re-run in order.
scorereasons = list(zip(scores, reasons))

In [77]:
def print_results(data: list[tuple]):
    for atuple in data:
        print(f'SCORE: {atuple[0]}')
        reason = atuple[1][:25]
        print(f'REASON: {reason}\n')