In [1]:
%load_ext autoreload 
%autoreload 2

import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from tqdm import tqdm
from warnings import filterwarnings
filterwarnings('ignore')

from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.preprocessor.preprocessing import FileIO
from src.reranker import ReRanker
from src.evaluation.llm_evaluation import AnswerCorrectnessMetric, EvalResponse, CustomAzureOpenAI, CustomAnthropic
from src.llm.llm_interface import LLM
from src.llm.llm_utils import load_azure_openai
from src.llm.prompt_templates import (huberman_system_message, question_answering_prompt_series,
                                      create_context_blocks, generate_prompt_series)
from deepeval.test_case import LLMTestCase
from deepeval.models import DeepEvalBaseLLM
from deepeval import evaluate

import aiometer
import nest_asyncio
import asyncio 
nest_asyncio.apply()

### Load Data

In [5]:
# Location of the golden dataset, relative to the notebook directory.
data_path = '../data/golden_datasets/golden_256.json'
data = FileIO().load_json(data_path)
# Only the values stored under the 'queries' key are kept.
queries = [*data['queries'].values()]

### Set Components

In [6]:
LLM.valid_models

{'cohere': ['command-r', 'command-r-plus'],
 'anthropic': ['claude-3-haiku-20240307',
  'claude-3-sonnet-2024022',
  'claude-3-opus-20240229'],
 'openai': ['gpt-4-turbo-preview',
  'gpt-4-0125-preview',
  'gpt-4-1106-preview',
  'gpt-4',
  'gpt-4-0613',
  'gpt-3.5-turbo',
  'gpt-3.5-turbo-1106',
  'gpt-3.5-turbo-0125']}

In [7]:
# Shared components used by the cells below: vector-store client, target
# collection, reranker and the answer-generating LLM.
client = get_weaviate_client()
collection_name = 'Huberman_minilm_256'
reranker= ReRanker()
llm = load_azure_openai()
# NOTE(review): eval_gpt4 is not referenced by any other visible cell — confirm it is still needed.
eval_gpt4 = 'gpt-4-0125-preview'

# Models available for judging answers. Only azure_eval_model is used by the
# visible evaluation cells; the other two are defined but not referenced.
turbo_eval_model = CustomAzureOpenAI('gpt-35-turbo')
azure_eval_model = CustomAzureOpenAI('gpt-4')
anthro_eval_model = CustomAnthropic(model='claude-3-sonnet-20240229')

### Define TestCase acquisition functions

In [8]:
async def aget_actual_outputs(user_messages: list[str]):
    '''
    Requests one chat completion per user message, all awaited concurrently.

    Relies on the notebook-level `llm` and `huberman_system_message`; every
    request is sent with temperature=1.0.

    Returns the list of responses, in the same order as `user_messages`.
    '''
    pending = []
    for message in user_messages:
        pending.append(llm.achat_completion(huberman_system_message, message, temperature=1.0))
    return await asyncio.gather(*pending)

In [9]:
async def acreate_test_cases(queries: list[str],
                             client: WeaviateWCS,
                             collection_name: str,
                             llm: LLM,
                             ) -> list[LLMTestCase]:
    '''
    Creates a list of LLM Test Cases based on query retrievals.

    Pipeline per query: hybrid search (limit=200) -> rerank (top_k=3) ->
    build the user prompt -> generate an answer with `llm` (temperature=1.0).

    Args:
        queries: questions to turn into test cases.
        client: search client; its `hybrid_search` is called once per query.
        collection_name: collection passed to every `hybrid_search` call.
        llm: model whose `achat_completion` generates the actual outputs.

    Returns:
        One LLMTestCase per query, in the order of `queries`.

    Note:
        Reranking uses the notebook-level `reranker`.
    '''
    results = [client.hybrid_search(query, collection_name, limit=200) for query in tqdm(queries, 'QUERIES')]
    # tqdm wraps the first zip argument so the progress bar is fully consumed.
    reranked = [reranker.rerank(result, query, top_k=3)
                for result, query in zip(tqdm(results, 'RERANKING'), queries)]
    user_messages = [generate_prompt_series(query, rerank, 1) for query, rerank in zip(queries, reranked)]
    # Call the `llm` argument directly: routing through aget_actual_outputs
    # would use the notebook-level `llm` global and silently ignore this parameter.
    completions = [llm.achat_completion(huberman_system_message, message, temperature=1.0)
                   for message in user_messages]
    actual_outputs = await asyncio.gather(*completions)
    retrieval_contexts = [create_context_blocks(rerank) for rerank in reranked]
    return [LLMTestCase(input=query, actual_output=output, retrieval_context=context)
            for query, output, context in zip(queries, actual_outputs, retrieval_contexts)]

### Get Test Cases

In [79]:
# NOTE(review): presumably questions the corpus cannot answer — confirm.
fake_queries = [
    'Who is the Grand High Poo-baa',
    'Is Samuel L. Jackson a Christian, and if so which denomination',
    "What is Andy Huberman's middle name",
]
# Single-question batch.
fridman = ['Who is Lex Fridman']

In [13]:
# Build test cases for the fake queries (search, rerank and one LLM answer per query).
fake_test_cases = await acreate_test_cases(fake_queries, client, collection_name, llm)

QUERIES: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.38it/s]
RERANKING: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.53it/s]


In [80]:
# `fridman` holds a single query, so `test_cases` has exactly one element after this cell.
# NOTE(review): later cells slice test_cases[:25]; confirm which query set they are meant to run on.
test_cases = await acreate_test_cases(fridman, client, collection_name, llm)

QUERIES: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
RERANKING: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.24it/s]


In [81]:
test_cases[0].actual_output

'Lex Fridman is an expert in electrical and computer engineering, artificial intelligence, and robotics. He hosts the Lex Fridman podcast, which covers a wide range of topics including technology, science, sports such as Brazilian Jiu-Jitsu, and mental health. He is known for engaging with various guests, spanning from scientists and engineers to comedians and friends, showcasing a diverse range of subjects and perspectives on his podcast. Additionally, Lex Fridman has been described as a thought leader who seeks to share the human experience and broaden knowledge for the benefit of all.'

In [12]:
# Turn every test case into a plain dict of its attributes.
test_case_list = [vars(case) for case in test_cases]

In [13]:
# Save results to disk.
FileIO.save_as_json('../data/test_cases_50_256.json', test_case_list)
# To reload saved test cases instead of regenerating them, uncomment the lines below.
# NOTE(review): the reload path (test_cases_256.json) differs from the file written
# above (test_cases_50_256.json) — confirm which one is intended.
# test_cases = FileIO.load_json('../data/test_cases_256.json')
# test_cases = [LLMTestCase(input=tc['input'], actual_output=tc['actual_output'], retrieval_context=tc['retrieval_context'])
#               for tc in test_cases]

[32m2024-05-01 00:28:06.071[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m111[0m - [1mData saved as json file here: ../data/test_cases_50_256.json[0m


### Launch Testing

In [17]:
from src.evaluation.llm_evaluation import load_eval_response

In [55]:
async def asingle_eval_call(test_case: LLMTestCase,
                            model: str | DeepEvalBaseLLM,
                            metric: AnswerCorrectnessMetric,
                            return_context_data: bool=True
                            ) -> EvalResponse:
    '''
    Measures a single test case and returns the result of load_eval_response.

    `metric` may be a ready-made metric instance or the AnswerCorrectnessMetric
    class itself. When the class is passed, a new instance is built around
    `model` for this call; otherwise `model` is not used.
    '''
    active_metric = AnswerCorrectnessMetric(model) if metric == AnswerCorrectnessMetric else metric
    await active_metric.a_measure(test_case)
    return load_eval_response(active_metric, test_case, return_context_data)

In [57]:
async def asystem_evaluation(test_cases: list[LLMTestCase],
                             model: DeepEvalBaseLLM,
                             metric: AnswerCorrectnessMetric,
                             return_context_data: bool=True
                             ):
    '''
    Evaluates every test case concurrently through asingle_eval_call.

    All calls are awaited together with asyncio.gather (no concurrency limit);
    the returned list follows the order of `test_cases`.
    '''
    pending = (asingle_eval_call(case, model, metric, return_context_data) for case in test_cases)
    return await asyncio.gather(*pending)

In [58]:
# The AnswerCorrectnessMetric class (not an instance) is passed on purpose:
# asingle_eval_call then builds a separate metric object for each test case.
azure_evaluations_extra25 = await asystem_evaluation(test_cases, azure_eval_model, AnswerCorrectnessMetric)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [23]:
%%time
# `acm` is not defined by any visible cell, so this cell would raise NameError
# on a fresh kernel; it is built here explicitly.
# NOTE(review): assumed to be an AnswerCorrectnessMetric judged by
# azure_eval_model (matching the result name) — confirm.
acm = AnswerCorrectnessMetric(azure_eval_model)
azure_evaluations_first25 = evaluate(test_cases[:25], [acm], print_results=False)

Output()

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

CPU times: user 6.44 s, sys: 572 ms, total: 7.01 s
Wall time: 3min 7s


In [124]:
# cost = [eval.metrics[0].evaluation_cost for eval in azure_evaluations]
# sum(cost)

In [59]:
# Mean score over the extra-25 evaluations.
scores = [evaluation.score for evaluation in azure_evaluations_extra25]
sum(scores) / len(scores)

0.8360000000000001

In [34]:
# Keep first-batch results whose first metric scored at least 0.9
# (these results expose the score via metrics[0].score).
goldens_first25 = [result for result in azure_evaluations_first25 if result.metrics[0].score >= 0.9]

In [52]:
# Keep last-batch evaluations that scored at least 0.9.
# NOTE(review): `azure_evaluations_last25` is not assigned in any visible cell —
# confirm where it comes from before a clean re-run.
goldens_last25 = [eval for eval in azure_evaluations_last25 if eval.score >= 0.9]

In [62]:
# Keep the first five extra-batch evaluations that scored at least 0.9.
goldens_extra25 = [evaluation for evaluation in azure_evaluations_extra25 if evaluation.score >= 0.9][:5]

In [63]:
len(goldens_first25), len(goldens_last25), len(goldens_extra25)

(7, 13, 5)

In [65]:
# Merge the extra goldens into the first batch. Only objects not already in the
# list (compared by identity) are appended, so re-running this cell does not
# duplicate them.
# NOTE(review): goldens_last25 is never merged in the visible cells — confirm that is intentional.
_already_merged = {id(item) for item in goldens_first25}
goldens_first25.extend(item for item in goldens_extra25 if id(item) not in _already_merged)

In [70]:
# Attribute dict of every selected golden evaluation.
golden_dicts = [vars(golden) for golden in goldens_first25]

In [72]:
# Strip each record down to the three fields needed to rebuild a test case.
golden_dicts = [
    {field: value for field, value in record.items()
     if field in ['input', 'actual_output', 'retrieval_context']}
    for record in golden_dicts
]

In [77]:
import json
# Persist the trimmed golden records as JSON.
with open('../data/golden_datasets/llm_eval_testcases_initial.json', 'w') as out_file:
    out_file.write(json.dumps(golden_dicts))

Bad pipe message: %s [b'\x01+W\x0bP\x8c\xfbjP\x1f0\xdeWG\x0b\x0c\x12\xd9 Bj\xfd\xbc\x88\xb3\x95\xda\xcdS_\x12IZ\xd2\xb3 8k/A\x7f\xeegc\xb2\xcf\xd5<A7\x15\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 @\x9fx\x7f\x08\x9c%\xba\xa8\xacH\x10K2t\xd8\xe5\xd1[\x0e']
Bad pipe message: %s [b'\xb9\x1c\xc0\xb7\xfa6{H\x00']
Bad pipe message: %s [b'"\xbf==*\xcf\xab\x9d\xb4)TN\xb3\x18\xd6\xd0\xe7\\\x00\x00>\xc0']
Bad pipe message: %s [b'\n\x009\x008\x007\x006\xc0\x0f\xc0\x05\x005\xc0\x13\xc0\t']
Bad pipe message: %s [b'6nL\x95fX\x83\xf0\xfc\xd2/\xf4\xb5\x9dN\x0c=\x8d\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x0

In [127]:
# Flatten every evaluation for JSON export: its `metrics` entry is replaced by
# the attribute dict of its first metric.
# `adict` is the evaluation's live __dict__, so the replacement also changes the
# evaluation object itself; the low-score printout cell relies on that
# (it reads res.metrics['score']).
# NOTE(review): `azure_evaluations` is not assigned in any visible cell — confirm its source.
something = []
for evaluation in azure_evaluations:  # not named `eval`: a top-level loop variable would rebind the builtin
    adict = evaluation.__dict__
    metrics = adict['metrics']
    # On a re-run `metrics` has already been flattened to a dict; indexing it
    # with [0] again would raise, so only flatten the original list form.
    if not isinstance(metrics, dict):
        metrics = metrics[0].__dict__
        adict['metrics'] = metrics
    something.append(adict)

In [128]:
# Drop the 'evaluation_params' entry from every flattened metrics dict.
for d in something:
    # pop() with a default instead of `del`: the key is already gone when this
    # cell is re-run, and `del` would then raise KeyError.
    d['metrics'].pop('evaluation_params', None)

In [129]:
# Write the flattened evaluation dicts next to the notebook (overwrite=True is passed to save_as_json).
FileIO.save_as_json('./azure_eval_results25.json', something,overwrite=True)

[32m2024-04-30 04:20:03.722[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m111[0m - [1mData saved as json file here: ./azure_eval_results25.json[0m


In [134]:
from rich import print

# Print every evaluation that scored below 0.9 for manual inspection.
# res.metrics is indexed by 'score', which only works once the flattening cell
# has replaced each evaluation's metrics list with a dict.
for res in azure_evaluations:
    if not (res.metrics['score'] < 0.9):
        continue
    print(res.metrics)
    print(res.input)
    print(res.actual_output)
    print(res.retrieval_context)
    # visual separator between evaluations
    print('\n')
    print('*'*100)
    print('\n')

In [9]:
azure_evaluations_first25

NameError: name 'azure_evaluations_first25' is not defined