In [1]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment;
# override=True lets values from the file replace variables that are already set.
envs = load_dotenv(find_dotenv(), override=True)

import time
import sys
import os
# Put the parent directory on the import path -- presumably so the project-local
# `src` package and `app_features` module imported later can be resolved.
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset

from src.database.database_utils import get_weaviate_client
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count, load_azure_openai
from src.llm.prompt_templates import question_answering_prompt_series, huberman_system_message
from app_features import generate_prompt_series

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datasets import Dataset
from litellm import ModelResponse

import asyncio
import nest_asyncio
nest_asyncio.apply()
from rich import print

In [6]:
# answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.7, model='gpt-4', strict_mode=True)
# test_case = LLMTestCase(
#     input="What if these shoes don't fit?",
#     # Replace this with the actual output from your LLM application
#     actual_output="We offer a 30-day full refund at no extra costs.",
#     retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
# )
# evaluate([test_case], [answer_relevancy_metric], run_async=False, ignore_errors=False)

In [46]:
#Queries that are fed to create_test_dataset to build the evaluation data.
#NOTE(review): three queries contain misspellings ("catecholimine", "podcst",
#"topcis") -- confirm whether that is intentional before correcting them, since
#the recorded outputs further down were produced with these exact strings.
questions = [
    "Give a brief explanation of how brain neuroplasticity works",
    "What is the role of dopamine in the body",
    "What is a catecholimine",
    "What does Jocko Willink have to say about leadership",
    "What does Lex Fridman think about the evolution of AI",
    "How can I support the Huberman Lab podcst",
    "Why do people make self-destructive decisions",
    "Provide a better sleep protocol in list format",
    "What are the topcis that Lex Fridman discusses",
    "Is there a generally positive outlook on the future of AI",
]

In [47]:
#name of the Weaviate collection that the retrieval calls search
collection_name = 'Huberman_minilm_128'

#Weaviate client used for first-stage retrieval
client = get_weaviate_client()

#turbo is later passed as the model being evaluated (answer_llm),
#azure as the model that writes the ground-truth answers (ground_truth_llm)
turbo = LLM(model_name='gpt-3.5-turbo-0125')
azure = load_azure_openai(model_name='gpt-4')

  client = get_weaviate_client()


In [48]:
def get_answer_bundle(query: str,
                      client: WeaviateWCS,
                      collection_name: str,
                      answer_llm: LLM,
                      ground_truth_llm: LLM=None
                     ) -> 'tuple[str, list[str], str] | tuple[str, list[str], str, str]':
    '''
    Retrieves context for a single query and generates an answer from it
    (synchronous variant).

    Args:
        query: question to retrieve context for and to answer.
        client: Weaviate client; its hybrid_search method performs the retrieval.
        collection_name: name of the collection to search.
        answer_llm: model being evaluated; its chat_completion produces the answer.
        ground_truth_llm: optional second model. When given (truthy), it receives
            the same system message and prompt, and its reply is returned as the
            ground truth.

    Returns:
        (query, contexts, answer) when no ground_truth_llm is given, otherwise
        (query, contexts, answer, ground_truth). contexts is the flat list of
        'content' values taken from the retrieved results.
    '''
    def format_llm_response(response: ModelResponse) -> str:
        #pull the message text out of the first completion choice
        return response.choices[0].message.content

    #1st-stage retrieval (get contexts); limit=3 keeps the top three hits
    context = client.hybrid_search(query, collection_name,
                                   query_properties=['content', 'title', 'summary'],
                                   limit=3,
                                   return_properties=['content', 'guest', 'summary'])
    #create contexts from content field
    contexts = [d['content'] for d in context]

    #generate assistant message prompt
    #NOTE(review): aget_answer_bundle passes an extra third argument (2) to
    #generate_prompt_series -- confirm which of the two calls is intended.
    assist_message = generate_prompt_series(query, context)

    #generate answers from model being evaluated
    answer = format_llm_response(answer_llm.chat_completion(huberman_system_message, assist_message))
    bundle = (query, contexts, answer)

    #create ground truth answers from the very same prompt
    if ground_truth_llm:
        ground_truth = format_llm_response(ground_truth_llm.chat_completion(huberman_system_message, assist_message))
        bundle += (ground_truth,)
    return bundle

In [64]:
from math import ceil
from time import sleep

async def create_test_dataset(questions: list[str], 
                              client: WeaviateWCS,
                              collection_name: str,
                              answer_llm: LLM,
                              ground_truth_llm: LLM=None, 
                              batch_size: int=5, 
                              async_mode: bool=True,
                              disable_internal_tqdm: bool=False
                             ) -> dict[str, list]:
    '''
    Builds an evaluation dataset by running every question through retrieval
    and answer generation, one batch at a time.

    Args:
        questions: queries to process.
        client: Weaviate client forwarded to the answer-bundle functions.
        collection_name: name of the collection to search.
        answer_llm: model being evaluated.
        ground_truth_llm: optional model that produces ground-truth answers.
        batch_size: number of questions processed concurrently per batch.
        async_mode: if True each batch is awaited through aget_answer_bundle,
            otherwise get_answer_bundle is run on a thread pool.
        disable_internal_tqdm: if True the progress bar is not shown.

    Returns:
        dict with the keys 'queries', 'contexts' and 'answers', plus
        'ground_truths' when ground_truth_llm is given. In async mode the rows
        follow the order of questions; in thread mode they follow completion
        order (each row still carries its own query).

    Raises:
        ValueError: if batch_size is smaller than 1, or if an async batch
            comes back without any results.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer.')

    total = len(questions)
    batches = ceil(total/batch_size)
    data = []
    #desc has to be passed by keyword (tqdm's first positional argument is the
    #iterable); the context manager closes the bar even if a batch fails
    with tqdm(desc='Queries', total=total, disable=disable_internal_tqdm) as progress:
        for i in range(batches):
            batch = questions[i*batch_size:(i+1)*batch_size]
            if async_mode:
                results = await asyncio.gather(*[aget_answer_bundle(query, 
                                                                    client, 
                                                                    collection_name, 
                                                                    answer_llm,
                                                                    ground_truth_llm) for query in batch])
                if not any(results):
                    #a bare string cannot be raised in Python 3, so use a real exception
                    raise ValueError("No results returned for initial batch, double-check your inputs.")
                data.extend(results)
                progress.update(len(results))
            else:
                #os.cpu_count() may return None when the count cannot be determined
                workers = (os.cpu_count() or 1) * 2
                with ThreadPoolExecutor(max_workers=workers) as executor:
                    futures = [executor.submit(get_answer_bundle, query, client, collection_name, answer_llm, ground_truth_llm) for query in batch]
                    for future in as_completed(futures):
                        data.append(future.result())
                        progress.update(1)
            #NOTE(review): no pause is actually performed between batches
            print(f"Finished with batch {i+1}, taking a break...")

    dataset = {'queries': [d[0] for d in data],
               'contexts': [d[1] for d in data],
               'answers': [d[2] for d in data]}
    #bundles carry a fourth element exactly when a ground-truth model was given;
    #testing the argument (not data[0]) also keeps empty input from failing
    if ground_truth_llm:
        dataset.update(ground_truths=[d[3] for d in data])
    return dataset

In [65]:
async def aget_answer_bundle(query: str,
                             client: WeaviateWCS,
                             collection_name: str,
                             answer_llm: LLM,
                             ground_truth_llm: LLM=None
                            ) -> 'tuple[str, list[str], str] | tuple[str, list[str], str, str]':
    '''
    Retrieves context for a single query and generates an answer from it
    (async variant of get_answer_bundle).

    Args:
        query: question to retrieve context for and to answer.
        client: Weaviate client; its hybrid_search method performs the retrieval.
        collection_name: name of the collection to search.
        answer_llm: model being evaluated; its achat_completion is awaited for
            the answer.
        ground_truth_llm: optional second model. When given (truthy), it receives
            the same system message and prompt, and its reply is returned as the
            ground truth.

    Returns:
        (query, contexts, answer) when no ground_truth_llm is given, otherwise
        (query, contexts, answer, ground_truth). contexts is the flat list of
        'content' values taken from the retrieved results.
    '''
    #1st-stage retrieval (get contexts); this call is not awaited, so it runs
    #to completion before control returns to the event loop
    context = client.hybrid_search(query, collection_name,
                                   query_properties=['content', 'title', 'summary'],
                                   limit=3,
                                   return_properties=['content', 'guest', 'summary'])

    #create contexts from content field
    contexts = [d['content'] for d in context]

    #generate assistant message prompt
    #NOTE(review): get_answer_bundle calls generate_prompt_series without the
    #third argument (2) -- confirm which of the two calls is intended.
    assist_message = generate_prompt_series(query, context, 2)

    #generate answers from model being evaluated
    #NOTE(review): unlike get_answer_bundle, the reply is not unpacked through
    #choices[0].message.content -- presumably achat_completion already returns
    #the text; verify against the LLM interface.
    answer = await answer_llm.achat_completion(huberman_system_message, assist_message)
    bundle = (query, contexts, answer)

    #create ground truth answers from the very same prompt
    if ground_truth_llm:
        ground_truth = await ground_truth_llm.achat_completion(huberman_system_message, assist_message)
        bundle += (ground_truth,)
    return bundle

In [66]:
data = await create_test_dataset(questions, client, collection_name, turbo, azure)






  async def wrapped(*args: P.args, **kwargs: P.kwargs) -> LegacyAPIResponse[R]:
  async def wrapped(*args: P.args, **kwargs: P.kwargs) -> LegacyAPIResponse[R]:


  0%|                                                                                                                                          | 0/10 [00:37<?, ?it/s]


In [67]:
#display the dict returned by create_test_dataset for a manual sanity check
data

{'queries': ['Give a brief explanation of how brain neuroplasticity works',
  'What is the role of dopamine in the body',
  'What is a catecholimine',
  'What does Jocko Willink have to say about leadership',
  'What does Lex Fridman think about the evolution of AI',
  'How can I support the Huberman Lab podcst',
  'Why do people make self-destructive decisions',
  'Provide a better sleep protocol in list format',
  'What are the topcis that Lex Fridman discusses',
  'Is there a generally positive outlook on the future of AI'],
 'contexts': [["And I promise you, I'm not going to just list off a bunch of different brain areas that are active during meditation. That wouldn't be useful to you. In fact, I don't believe in throwing out a lot of nomenclature without also giving some mechanistic explanation as to what different brain areas do. And you could say, well, what good is it knowing what different brain areas do and their names if I can't actually manipulate those brain areas? But th

Bad pipe message: %s [b'fa5\xa1y\xb5=\x1by\x82C\x98\x89>M\xcf\xde\x0e m_\xad\xb9\xa8\xe8\xac#6\xf8\xef\xda\x17G\xa1t\xbc\xff\xc9M\xf2\x9b$\xbd(\xce\xa5\x1d\x85 \xd3{\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00', b'\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 e\xcd\xa6\x1dY\xeb\xddR\x05\x9f\xced\xd8H=|?cA\xccDrPk\x01\t\xca\xe8\xce:\xf5\x16']
Bad pipe message: %s [b'\x86\x1e/\x8d$fDI@k3\x97\x1d6\xec_+\xe9 g-\x17\x81(U_:7\xde\x0f\xfbF\xd1\xc8\xc4\x8a7\xfe\xbb\x0e\xa4P\xa4\x05\xcd\x03\xb3\xee\x0c\xe5s\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00

In [47]:
def create_eval_dataset(questions: list[str],
                        contexts: list[list[str]],
                        answers: list[str]
                       ) -> EvaluationDataset:
    '''
    Packs parallel lists of questions, retrieved contexts and answers into an
    EvaluationDataset holding one LLMTestCase per position.

    Raises AssertionError when the three inputs differ in length.
    '''
    assert len(questions) == len(contexts) == len(answers), 'Mismatched lengths in input values, retry after correcting'
    #walk the three lists in lockstep instead of indexing into each of them
    cases = [LLMTestCase(input=question,
                         actual_output=answer,
                         retrieval_context=context)
             for question, context, answer in zip(questions, contexts, answers)]
    return EvaluationDataset(alias='Initial test', test_cases=cases)