In [36]:
# Enable IPython's autoreload extension so edited project modules are re-imported
# before each cell runs (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

# Load environment variables from the nearest .env file; override=True lets values
# from the file replace variables that are already set in the process environment.
from dotenv import load_dotenv, find_dotenv
envs = load_dotenv(find_dotenv(), override=True)

import time
import sys
import os
# Put the parent directory on the import path.
# NOTE(review): presumably needed so `src.*` and `app_features` resolve — confirm against repo layout.
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset


from src.database.database_utils import get_weaviate_client
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count
from src.llm.prompt_templates import question_answering_prompt_series, huberman_system_prompt
from app_features import generate_prompt_series

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datasets import Dataset
from litellm import ModelResponse

In [4]:
# Smoke test: score one hand-written question/answer pair with deepeval's
# AnswerRelevancyMetric, using gpt-4 as the evaluation model.
# NOTE(review): the recorded run output reports "threshold: 1, strict: True", so the
# threshold=0.7 passed here appears to be overridden by strict_mode=True — confirm
# and drop whichever argument is not intended.
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.7, model='gpt-4', strict_mode=True)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Replace this with the actual output from your LLM application
    actual_output="We offer a 30-day full refund at no extra costs.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
)
# Evaluate synchronously and do not ignore metric errors; the returned list of
# TestResult objects is the cell's displayed value.
evaluate([test_case], [answer_relevancy_metric], run_async=False, ignore_errors=False)

Output()

Evaluating test cases...




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 1, strict: True, evaluation model: gpt-4, reason: The score is 1.00 because the response perfectly addressed the concern raised in the input without any irrelevant statements., error: None)

For test case:

  - input: What if these shoes don't fit?
  - actual output: We offer a 30-day full refund at no extra costs.
  - expected output: None
  - context: None
  - retrieval context: ['All customers are eligible for a 30 day full refund at no extra costs.']

----------------------------------------------------------------------


[TestResult(success=True, metrics=[<deepeval.metrics.answer_relevancy.answer_relevancy.AnswerRelevancyMetric object at 0x7f40b86ea920>], input="What if these shoes don't fit?", actual_output='We offer a 30-day full refund at no extra costs.', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra costs.'])]

In [28]:
# Evaluation queries sent through retrieval + answer generation.
# Spelling matters here: the queries feed a hybrid (keyword + vector) search, so a
# misspelled term weakens the keyword match for that test item.
questions = ["Give a brief explanation of how brain neuroplasticity works",
             "What is the role of dopamine in the body",
             "What is a catecholamine",
             "What does Jocko have to say about leadership",
             "What does Fridman think about the evolution of AI",
             "Who is the host of the Huberman Labs podcast",
             "Why do people make self-destructive decisions",
             "Provide better sleep protocol in list format",
             "What are the topics that Lex Fridman discusses",
             "Is there a generally positive outlook on the future of AI",
            ]

In [29]:
# Shared handles used by the cells below: the Weaviate client, the LLM whose answers
# are being evaluated, and the name of the collection to search.
client = get_weaviate_client()
turbo = LLM(model_name='gpt-3.5-turbo-0125')
collection_name = 'Huberman_minilm_512'

In [30]:
def get_answer_bundle(query: str,
                      client: WeaviateWCS,
                      collection_name: str,
                      answer_llm: LLM,
                      ground_truth_llm: LLM=None
                     ) -> tuple:
    '''
    Retrieves context for a single query and generates an answer from that context.

    Args:
        query: Question to search for and answer.
        client: Weaviate client used for the hybrid search.
        collection_name: Name of the collection to search.
        answer_llm: Model being evaluated; produces the answer.
        ground_truth_llm: Optional second model. When provided, it is given the
            same system and assistant prompts and its reply is returned as the
            ground truth.

    Returns:
        (query, contexts, answer) when no ground_truth_llm is given, otherwise
        (query, contexts, answer, ground_truth). `contexts` is a flat list holding
        the 'content' field of each retrieved result (at most 3).
    '''
    def format_llm_response(response: ModelResponse) -> str:
        # Pull the text of the first choice out of the completion response.
        return response.choices[0].message.content

    #1st-stage retrieval (get contexts)
    search_results = client.hybrid_search(query, collection_name,
                                          query_properties=['content', 'title', 'short_description'],
                                          limit=3,
                                          return_properties=['content', 'guest', 'short_description'])
    #create contexts from content field
    contexts = [result['content'] for result in search_results]

    #generate assistant message prompt (built from the full search results, not just the content strings)
    assist_message = generate_prompt_series(query, search_results, summary_key='short_description')

    #generate answers from model being evaluated
    answer = format_llm_response(answer_llm.chat_completion(huberman_system_prompt, assist_message))

    #create ground truth answers
    if ground_truth_llm:
        ground_truth = format_llm_response(ground_truth_llm.chat_completion(huberman_system_prompt, assist_message))
        return query, contexts, answer, ground_truth
    return query, contexts, answer

In [38]:
from math import ceil
from time import sleep

def create_test_dataset(questions: list[str],
                          client: WeaviateWCS,
                          collection_name: str,
                          answer_llm: LLM,
                          ground_truth_llm: LLM=None,
                          batch_size: int=5,
                          disable_internal_tqdm: bool=False,
                          sleep_seconds: float=0) -> tuple:
    '''
    Runs get_answer_bundle over every question, in concurrent batches, and
    returns the results as parallel lists.

    Args:
        questions: Queries to answer.
        client: Weaviate client forwarded to get_answer_bundle.
        collection_name: Collection to search, forwarded to get_answer_bundle.
        answer_llm: Model being evaluated, forwarded to get_answer_bundle.
        ground_truth_llm: Optional model used to produce ground-truth answers,
            forwarded to get_answer_bundle.
        batch_size: Number of questions submitted to the thread pool at a time.
        disable_internal_tqdm: If True, the progress bar is not shown.
        sleep_seconds: Pause between batches (e.g. to stay under API rate
            limits). The default of 0 means no pause.

    Returns:
        (queries, contexts, answers), plus a fourth list of ground truths when
        the bundles contain them. Items are in completion order, which may
        differ from the order of `questions`; the lists stay aligned with each
        other by index.

    Raises:
        ValueError: if batch_size is less than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    total = len(questions)
    batches = ceil(total / batch_size)
    # os.cpu_count() can return None when the count is undeterminable.
    max_workers = (os.cpu_count() or 1) * 2
    data = []
    # 'Queries' is the bar label (desc), not the iterable; the context manager closes the bar.
    with tqdm(total=total, desc='Queries', disable=disable_internal_tqdm) as progress:
        for i in range(batches):
            batch = questions[i*batch_size:(i+1)*batch_size]
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(get_answer_bundle, query, client, collection_name,
                                           answer_llm, ground_truth_llm)
                           for query in batch]
                for future in as_completed(futures):
                    # result() re-raises any exception from the worker thread.
                    data.append(future.result())
                    progress.update(1)
            if sleep_seconds > 0 and i < batches - 1:
                print(f"Finished with batch {i+1}, taking a break...")
                sleep(sleep_seconds)
            else:
                print(f"Finished with batch {i+1}")

    queries = [d[0] for d in data]
    contexts = [d[1] for d in data]
    answers = [d[2] for d in data]
    # Decide on the arity of each bundle, not on how many bundles were collected.
    if data and len(data[0]) == 4:
        ground_truths = [d[3] for d in data]
        return queries, contexts, answers, ground_truths
    return queries, contexts, answers

In [None]:
# Build the evaluation inputs for every question, using `turbo` as the answer model.
# No ground-truth LLM is passed, so a (queries, contexts, answers) tuple is expected.
data = create_test_dataset(questions, client, collection_name, turbo)


  0%|                                                                                                                            | 0/10 [00:00<?, ?it/s][A