In [1]:
%load_ext autoreload 
%autoreload 2

import sys
# Put the parent directory on the import path; the `src`, `evaluation` and
# `app_features` imports below presumably resolve from there — confirm layout.
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); override=True is
# passed so .env values take precedence. Assigning to `_` keeps the return
# value from being echoed as cell output.
_ = load_dotenv(find_dotenv(), override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from evaluation.custom_eval_models import (CustomAzureOpenAI, AnswerCorrectnessMetric, 
                                            EvalResponse, TestCaseBundle)
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.preprocessor.preprocessing import FileIO
from src.reranker import ReRanker
from src.llm.prompt_templates import context_block
from src.llm.llm_interface import LLM
from src.llm.llm_utils import load_azure_openai
from src.llm.prompt_templates import huberman_system_message, question_answering_prompt_series
from app_features import generate_prompt_series

from random import sample
from tqdm import tqdm
import asyncio
import nest_asyncio
# Patch asyncio before any async work; presumably needed so the asyncio.run()
# call further down works inside the notebook's event loop — confirm.
nest_asyncio.apply()

### Load Data

In [4]:
# Relative path to the golden dataset JSON file
data_path = '../data/golden_datasets/golden_256.json'
data = FileIO().load_json(data_path)
# Only the values stored under the 'queries' key are kept
queries = list(data['queries'].values())

# Get a random set of questions for eval (no seed is set here, so the sample differs between runs)
random_questions = sample(queries, k=25)
# Guard against the same query appearing more than once in the sample
assert len(random_questions) == len(set(random_questions))

### Set System Components

In [5]:
# Client whose hybrid_search() is called by the evaluation functions below
client = get_weaviate_client()
# Collection name passed to every hybrid_search() call
collection_name = 'Huberman_minilm_256'
# NOTE: system_evaluation and create_test_cases read `reranker` as a
# module-level global rather than taking it as a parameter.
reranker= ReRanker()
# LLM whose chat_completion() output is the "actual output" being evaluated
llm = load_azure_openai()

  return self.fget.__get__(instance, owner)()


### Create Retrieval Contexts

In [6]:
def create_context_blocks(results: list[dict],
                          summary_key: str='summary',
                          guest_key: str='guest',
                          content_key: str='content'):
    '''
    Formats each search result into a context block using the `context_block`
    template.

    Args:
        results: Search results, each a dict holding the three keys below.
        summary_key: Key whose value fills the template's `summary` field.
        guest_key: Key whose value fills the template's `guest` field.
        content_key: Key whose value fills the template's `transcript` field.

    Returns:
        A list with one formatted block per result, in the same order as
        `results` (empty when `results` is empty).
    '''
    context_series = []
    for hit in results:
        block = context_block.format(summary=hit[summary_key],
                                     guest=hit[guest_key],
                                     transcript=hit[content_key])
        context_series.append(block)
    return context_series

### Set Eval Model and Metrics

In [7]:
# Judge model handed to the metric; 'gpt-4' is presumably the Azure
# deployment/model name — confirm against CustomAzureOpenAI.
azure_eval_model = CustomAzureOpenAI('gpt-4')
# `acm` is whatever get_metric() returns. Later cells use its measure() /
# a_measure() methods and its name, score and reason attributes.
acm = AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric()
# metrics = [AnswerCorrectnessMetric(model=azure_eval_model, strict=False).get_metric(), FaithfulnessMetric(threshold=0.7, model=azure_eval_model)]

### Create Test Case(s)

In [14]:
def system_evaluation(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      metric: AnswerCorrectnessMetric
                      ) -> list[EvalResponse]:
    '''
    LLM Evaluation harness that given a list of queries does the following:
       1. Retrieves relevant context and reranks
       2. Generates evaluated LLM actual output
       3. Creates retrieval context for test case
       4. Creates a test case on the fly
       5. Given a metric execute metric evaluation
       6. Returns list of metric evaluations

    Args:
        queries: Questions to push through retrieval, generation and scoring.
        client: Client whose hybrid_search() retrieves candidate results.
        collection_name: Collection passed to hybrid_search().
        llm: Model under evaluation; its chat_completion() output is scored.
        metric: Metric object providing measure(), name, score and reason.

    Returns:
        One EvalResponse per query that was evaluated successfully, in query
        order. Queries that raise at any step are reported and skipped.

    Notes:
        Reads the module-level globals `reranker` and `azure_eval_model`,
        so both must be defined before this function is called.
    '''

    eval_results = []
    for query in tqdm(queries):
        try:
            result = client.hybrid_search(query, collection_name, limit=200)
            reranked = reranker.rerank(result, query, top_k=3)
            user_message = generate_prompt_series(query, reranked)
            actual_output = llm.chat_completion(huberman_system_message, user_message, temperature=1.0)
            retrieval_context = create_context_blocks(reranked)
            # The test-case input must be the query text itself; a bare `input`
            # here would refer to Python's builtin function, not the question.
            test_case = LLMTestCase(input=query, actual_output=actual_output, retrieval_context=retrieval_context)
            metric.measure(test_case)
            response = EvalResponse(metric=metric.name,
                                    model=azure_eval_model.model,
                                    input=test_case.input,
                                    actual_output=test_case.actual_output,
                                    retrieval_context=test_case.retrieval_context,
                                    score=metric.score,
                                    reason=metric.reason)
            eval_results.append(response)
        except Exception as e:
            # Best-effort: one failing query should not abort the whole run,
            # but say which query failed so it can be investigated.
            print(f'Skipping query {query!r}: {e}')
    return eval_results

In [114]:
# %%time
# eval_results = system_evaluation(random_questions[:3], client, collection_name, llm, acm)

In [8]:
def create_test_cases(queries: list[str],
                      client: WeaviateWCS,
                      collection_name: str,
                      llm: LLM,
                      ) -> list[LLMTestCase]:
    '''
    Builds one LLMTestCase per query by running each pipeline stage over the
    whole batch before moving on to the next stage: hybrid search, reranking,
    prompt generation, LLM completion and context-block creation.

    Each returned test case carries the query as `input`, the LLM answer as
    `actual_output` and the formatted context blocks as `retrieval_context`.

    Reads the module-level global `reranker`.
    '''
    # Stage 1: retrieve candidates for every query
    search_hits = []
    for query in tqdm(queries, 'QUERIES'):
        search_hits.append(client.hybrid_search(query, collection_name, limit=200))

    # Stage 2: keep the top 3 reranked candidates per query
    top_hits = []
    for idx, hits in enumerate(tqdm(search_hits, 'RERANKING')):
        top_hits.append(reranker.rerank(hits, queries[idx], top_k=3))

    # Stage 3: build the user message for each query from its reranked hits
    prompts = [generate_prompt_series(question, hits)
               for question, hits in zip(queries, top_hits)]

    # Stage 4: generate the answers that will be evaluated
    answers = []
    for prompt in tqdm(prompts, 'LLM Calls'):
        answers.append(llm.chat_completion(huberman_system_message, prompt, temperature=1.0))

    # Stage 5: format the reranked hits as retrieval context
    contexts = [create_context_blocks(hits) for hits in top_hits]

    test_cases = []
    for question, answer, context in zip(queries, answers, contexts):
        test_cases.append(LLMTestCase(input=question, actual_output=answer, retrieval_context=context))
    return test_cases

In [9]:
%%time
# Draw a new random sample of 25 queries; this is separate from
# `random_questions` above and is likewise unseeded in this cell.
questions = sample(queries, k=25)
# Run search, reranking and LLM generation for every sampled question
test_cases = create_test_cases(questions, client, collection_name, llm)

QUERIES: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:05<00:00,  4.45it/s]
RERANKING: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:05<00:00,  4.62it/s]
LLM Calls: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:35<00:00,  1.42s/it]

CPU times: user 10.7 s, sys: 949 ms, total: 11.7 s
Wall time: 46.4 s





In [10]:
async def single_eval_call(metric: AnswerCorrectnessMetric,
                           test_case: LLMTestCase
                           ) -> EvalResponse:
    '''
    Scores a single test case asynchronously and packages the outcome.

    Awaits metric.a_measure(test_case), then reads the metric's name, score
    and reason together with the test case fields into an EvalResponse.
    The `model` field comes from the module-level `azure_eval_model`.
    '''
    await metric.a_measure(test_case)
    # Collect the fields first, in the same order they were originally read
    fields = dict(metric=metric.name,
                  model=azure_eval_model.model,
                  input=test_case.input,
                  actual_output=test_case.actual_output,
                  retrieval_context=test_case.retrieval_context,
                  score=metric.score,
                  reason=metric.reason)
    return EvalResponse(**fields)

In [11]:
async def asystem_evaluation(metric: AnswerCorrectnessMetric,
                             test_cases: list[LLMTestCase]
                            ):
    '''
    Evaluates all test cases concurrently with the given metric.

    Creates one single_eval_call coroutine per test case and awaits them
    together with asyncio.gather, returning the results in the same order
    as `test_cases`.

    NOTE(review): every coroutine shares the same `metric` object — confirm
    that concurrent a_measure() calls on one instance are safe.
    '''
    pending = []
    for case in test_cases:
        pending.append(single_eval_call(metric, case))
    return await asyncio.gather(*pending)

In [15]:
%%time
# Score every test case concurrently with the `acm` metric. asyncio.run() is
# invoked from within the notebook; presumably this relies on the earlier
# nest_asyncio.apply() call — confirm.
responses = asyncio.run(asystem_evaluation(acm, test_cases))

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

CPU times: user 23 s, sys: 3.06 s, total: 26.1 s
Wall time: 44.3 s


In [16]:
# Sanity check: expect one response per test case (25 questions were sampled)
len(responses)

25

In [17]:
# Metric score of each response, in response order
scores = [r.score for r in responses]

In [18]:
# Mean metric score across all responses
sum(r.score for r in responses)/len(responses)

0.784

In [19]:
# Metric reason text of each response, in response order
reasons = [r.reason for r in responses]

In [22]:
# Spot-check the metric's reason for the response at index 2
reasons[2]

"The actual output provided discusses the melanocortin system's response to sunlight exposure on the skin and mentions effects such as tanning and potential impacts on mood and libido. However, the retrieval context primarily focuses on the discussion of different peptides and their effects on health, with only a brief overlapping mention of how the melanocortin system is stimulated by UV light. Significant parts of the actual output, such as the eye's role in signaling the hypothalamus and pituitary gland, as well as the specific stimuli of mood and libido, are not corroborated by the retrieval context provided. Crucial information from the retrieval context that should inform the answer is missing or ignored, such as the role of peptides like melanotan in mimicking melanocyte-stimulating hormone and their varying effects depending on receptor type. The actual output also fails to address the breadth and specificity of the retrieval context in terms of the peptides discussed, their pu

In [25]:
# The LLM answer that was scored for the response at index 2
responses[2].actual_output

'The melanocortin system responds to sunlight on the skin by being stimulated. When sunlight, specifically ultraviolet B light, hits the skin, it triggers the melanocortin system. This stimulation begins with the eyes receiving the sunlight, which then signals the hypothalamus, and subsequently the pituitary gland. As a result, melanocyte-stimulating hormone is released into the bloodstream, leading to the pigmentation of the skin, commonly known as tanning. The melanocortin system also plays a role in various other physiological responses such as mood, libido, and even breeding behaviors in animals. Therefore, the melanocortin system is activated by sunlight exposure to the skin, leading to a cascade of physiological effects.'

In [26]:
# The context blocks supplied with the response at index 2
responses[2].retrieval_context

["\nShow Summary: In this Huberman Lab podcast episode, Andrew Huberman discusses the benefits and risks of peptide therapeutics for physical and mental health. Peptides are small proteins made up of chains of amino acids, and they play various roles in the body. For tissue healing and repair, BPC-157 is a synthetic peptide that promotes angiogenesis and fibroblast migration. It can be used to accelerate the healing of injuries, but caution should be taken as it may promote tumor growth. Another peptide called thymosin beta-4 (TB-500) is also used for tissue rejuvenation and repair. It increases the growth of various cell types and stimulates the growth of new blood vessels to the injury site. However, like BPC-157, TB-500 may raise concerns regarding tumor growth. There are also peptides that stimulate the release of growth hormone, which can enhance metabolism and muscle growth. Category one peptides, such as sermorelin and tesamorelin, are FDA approved and function by mimicking the 