In [266]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv, find_dotenv
envs = load_dotenv(find_dotenv(), override=True)

import time
import sys
sys.path.append('../')
import os
from rich import print
from tqdm import tqdm
from pathlib import Path
from openai import OpenAI
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, multi_context_question_prompt
from ragas.metrics import (
context_relevancy,
answer_correctness,
answer_relevancy,
faithfulness,
context_recall,
context_precision)
from ragas import evaluate

from litellm import ModelResponse
from src.database.database_utils import get_weaviate_client
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count
from src.llm.prompt_templates import question_answering_prompt_series, huberman_system_prompt
from app_features import generate_prompt_series

from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset
litellm.set_verbose = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [267]:
api_key = os.environ['OPENAI_API_KEY']

### Test Set Generation

In [268]:
client = get_weaviate_client()
turbo = LLM(model_name='gpt-3.5-turbo-0125')
claude = LLM('claude-3-opus-20240229', os.environ['ANTHROPIC_API_KEY'])
gpt4 = LLM(model_name='gpt-4')

In [269]:
questions = ["Give a brief explanation of how brain neuroplasticity works",
             "What is the role of dopamine in the body",
             "What is a catecholimine",
             "What does Jocko have to say about leadership",
             "What does Fridman think about the evolution of AI", 
             "Who is the host of the Huberman Labs podcast",
             "Why do people make self-destructive decisions",
             "Provide better sleep protocol in list format",
             "What are the topcis that Lex Fridman discusses",
             "Is there a generally positive outlook on the future of AI",
            ]

In [204]:
collection_names = client.show_all_collections()
collection_names[2]

'Huberman_minilm_512'

In [205]:
def get_answer_bundle(client, answer_llm, ground_truth_llm, collection_name, query):
    '''
    Returns answer, ground truth and associated context from a single query.
    '''
    def format_llm_response(response: ModelResponse) -> str:
        return response.choices[0].message.content

    #1st-stage retrieval (get contexts)
    context = client.hybrid_search(query, collection_name, 
                                   query_properties=['content', 'title', 'short_description'],
                                   limit=3, 
                                   return_properties=['content', 'guest', 'short_description'])
    #create contexts from content field
    contexts = [d['content'] for d in context]
    
    #generate assistant message prompt
    assist_message = generate_prompt_series(query, context, summary_key='short_description')

    #generate answers from model being evaluated
    answer = format_llm_response(answer_llm.chat_completion(huberman_system_prompt, assist_message))

    #create ground truth answers
    ground_truth = format_llm_response(ground_truth_llm.chat_completion(huberman_system_prompt, assist_message))
        
    return query, contexts, answer, ground_truth

In [206]:
def capture_data(client,
                 answer_llm: LLM,
                 ground_truth_llm: LLM,
                 collection_name: str, 
                 questions: list[str]
                 ) -> tuple[list[list[str]], list[str], list[str]]:
    contexts = []
    answers = []
    ground_truths = []

    def format_llm_reponse(response: ModelResponse) -> str:
        return response.choices[0].message.content
        
    for i, q in enumerate(tqdm(questions)):
        context = client.hybrid_search(q, collection_name, 
                                       query_properties=['content', 'title', 'short_description'],
                                       limit=3, 
                                       return_properties=['content', 'guest', 'short_description'])
        #create contexts from content field
        contexts.append([d['content'] for d in context])
        
        #generate assistant message prompt
        assist_message = generate_prompt_series(q, context, summary_key='short_description')

        #generate answers from model being evaluated
        print('Making Answer LLM call...')
        answer = answer_llm.chat_completion(huberman_system_prompt, assist_message)
        answers.append(format_llm_reponse(answer))

        #create ground truth answers
        print('Making Ground Truth LLM call...')
        ground_truth = ground_truth_llm.chat_completion(huberman_system_prompt, assist_message)
        ground_truths.append(format_llm_reponse(ground_truth))
        
    return contexts, answers, ground_truths

In [230]:
from math import ceil
from time import sleep

def capture_data_threaded(questions: list[str], 
                          client, 
                          answer_llm, 
                          ground_truth_llm, 
                          collection_name, 
                          batch_size: int=5, 
                          disable_internal_tqdm: bool=False):
    total = len(questions)
    progress = tqdm('Queries', total=total, disable=disable_internal_tqdm)
    data = []
    batches = ceil(total/batch_size)
    for i in range(batches):
        batch = questions[i*batch_size:(i+1)*batch_size]
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
            futures = [executor.submit(get_answer_bundle, client, answer_llm, ground_truth_llm, collection_name, query) for query in batch]
            for future in as_completed(futures):
                progress.update(1)
                data.append(future.result())
        print(f"Finished with batch {i+1}, taking a break...")
    queries = [d[0] for d in data]
    contexts = [d[1] for d in data]
    answers = [d[2] for d in data]
    ground_truths = [d[3] for d in data]
    return queries, contexts, answers, ground_truths

In [174]:
twenty_questions = questions * 2

In [208]:
data = capture_data_threaded(questions, client, turbo, gpt4, collection_names[2])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  1.66s/it]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.33s/it]


In [134]:
%%time
contexts, answers, ground_truths = capture_data(client, turbo, gpt4, collection_name, questions)

  0%|                                                                                                                             | 0/5 [00:00<?, ?it/s]

 20%|███████████████████████▍                                                                                             | 1/5 [00:05<00:23,  5.99s/it]

 40%|██████████████████████████████████████████████▊                                                                      | 2/5 [00:14<00:22,  7.58s/it]

 60%|██████████████████████████████████████████████████████████████████████▏                                              | 3/5 [00:22<00:15,  7.60s/it]

 80%|█████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4/5 [00:29<00:07,  7.38s/it]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:37<00:00,  7.50s/it]

CPU times: user 535 ms, sys: 12.6 ms, total: 548 ms
Wall time: 37.5 s





In [215]:
def format_dataset(questions: list[str],
                   contexts: list[list[str]],
                   answers: list[str], 
                   groundtruth: list[str]
                   ):
    data_samples = {
                    'question': questions,
                    'contexts': contexts,
                    'answer': answers,
                    'ground_truth': groundtruth
                    }
    return Dataset.from_dict(data_samples)

In [209]:
data[1]

[["Catecholamines are things like dopamine, epinephrine, norepinephrine. These are chemicals in your nervous system and body that promote states of alertness. Dopamine, of course, part of the reward and motivation pathways. They explored the levels of these molecules in blood, in plasma, during and after this breathing protocol. And it was interesting, as I mentioned before, epinephrine showed robust increases compared to the control group. Norepinephrine, significant increases occurred in the breathing group, but in the cyclic hyperventilation retention breathing group, of course, but less so. And dopamine levels actually dropped somewhat. But this is very interesting because there's a new and emerging literature, largely from ISA, A-Y-S-A, Roll's lab in Israel. What her laboratory has shown is that motivational state and mindset has a powerful impact on various aspects of the immune system that were thought to be independent of the brain and mind and thinking. So this brings us back 

In [210]:
dataset = format_dataset(*data)

In [211]:
dataset.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truth
0,What is a catecholimine,"[Catecholamines are things like dopamine, epin...",A catecholamine is a type of neurochemical tha...,A catecholamine is a chemical in your nervous ...
1,What is the role of dopamine in the body,"[The whole brain acts as a buffer, and the fro...",The role of dopamine in the body is multifacet...,"Dopamine plays multiple roles in the body, pri..."
2,Give a brief explanation of how brain neuropla...,"[And in order to understand that process, we r...",Brain neuroplasticity is the brain and nervous...,Brain neuroplasticity refers to the brain and ...
3,What does Jocko have to say about leadership,"[Welcome to the Huberman Lab Podcast, where we...",Jocko emphasizes the importance of detachment ...,Jocko Willink discusses several key aspects of...
4,What does Fridman think about the evolution of AI,[Pamela McCordick said AI was the ancient wish...,Fridman believes that the evolution of AI invo...,Fridman views the evolution of AI as multiface...


In [80]:
# for i, q in enumerate(questions):
#     print(f'Questions {i}: {q}')
#     print(f'Context: {contexts[i][0]}')
#     print(f'{contexts[i][1]}')
#     print(f'{contexts[i][2]}')
#     print()
#     print('-'*100)
#     print()

In [234]:
def evaluation_harness(indexes: list[str], questions: list[str], batch_size: int=5):
    start = time.perf_counter()
    results = {}
    metrics = [answer_correctness, answer_relevancy, context_relevancy, faithfulness]
    for index in tqdm(indexes):
        data = capture_data_threaded(questions, client, turbo, gpt4, index, batch_size, True)
        dataset = format_dataset(*data)
        evaluation = evaluate(dataset, metrics)
        results[index] = evaluation
    end = time.perf_counter() - start
    print(f'Total Time for Evaluation: {round(end/60, 2)} minutes')
    return results

In [235]:
len(questions)

10

In [236]:
results = evaluation_harness(client.show_all_collections(), questions)

  0%|                                                                                                                             | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

No statements were generated from the answer.
 33%|███████████████████████████████████████                                                                              | 1/3 [00:35<01:10, 35.48s/it]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

 67%|██████████████████████████████████████████████████████████████████████████████                                       | 2/3 [01:09<00:34, 34.91s/it]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:45<00:00, 35.07s/it]


In [237]:
for k, v in sorted(results.items()):
    print(k)
    print(v)
    print('-'*100)

Bad pipe message: %s [b'\xa2\x00\x97C\xac\xf2b\xa4j\x7f?@>n\xd0\xf6V\x11 \x15A\x17K\xb5\xcd\xa7B/\xcf!y\xc3\xb6L)I(\r\xc8_\x187\x9a\xa4\xf9\xb4\xc2\x1e.B\x19\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00']
Bad pipe message: %s [b'\xf0\x8b\x91.&\x07\x11\xf3\x12\x7fE\xbb\x9a/\xe4Z\xe5\xcf \xd7f\xa0\xac"\x8d\t\xdeF->%\x18k\xce\xa7\xde\xec\xee\xa8\xa1\xb0\xdc\xba\x8b\xa3\x13', b'sXo\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00']
Bad pipe message: %s [b' \xae\xdd\x99\x11\xd8\xa7(\xe8\x1e|v\x92\xe2\x1e\xf4\xc6\x00<\xf5k\xd0\x9f\xb8\xc3\x05s\xd2']
Bad pipe message: %s [b'\xd7\xa5\xa5\xdeo\x96$\x9

In [216]:
metrics = [answer_correctness, answer_relevancy, context_relevancy, faithfulness]

In [213]:
results_512 = evaluate(dataset, metrics)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

In [214]:
results_512

{'answer_correctness': 0.7425, 'answer_relevancy': 0.9382, 'context_relevancy': 0.2418, 'faithfulness': 0.9600}

In [203]:
results_128

{'answer_correctness': 0.5783, 'answer_relevancy': 0.9028, 'context_relevancy': 0.3534, 'faithfulness': 0.9100}

In [202]:
threaded_results

{'answer_correctness': 0.6315, 'answer_relevancy': 0.9385, 'context_relevancy': 0.1621, 'faithfulness': 1.0000}

In [140]:
turbo_results

{'answer_correctness': 0.6432, 'answer_relevancy': 0.9236, 'context_relevancy': 0.1621, 'faithfulness': 1.0000}

In [114]:
turbo_results

{'answer_correctness': 0.6332, 'answer_relevancy': 0.9271, 'context_relevancy': 0.1621, 'faithfulness': 1.0000}

In [239]:
q = 'What does Huberman recommend for a morning routine'

In [242]:
collection_name = 'Huberman_minilm_256'

In [246]:
counts = []
for q in questions:
    context = client.hybrid_search(q, collection_name, 
                                   query_properties=['content', 'title', 'short_description'],
                                   limit=3, 
                                   return_properties=['content', 'guest', 'short_description'])
    #generate assistant message prompt
    assist_message = generate_prompt_series(q, context, summary_key='short_description')
    combined = huberman_system_prompt + ' ' + assist_message
    counts.append(get_token_count(combined))
    

In [254]:
total_tokens = 5000 * 100
total_tokens

500000

In [256]:
output, input = 50000, 450000

In [257]:
output_price = (1.5/1000000) * output
output_price

0.075

In [258]:
input_price = (0.50/1000000) * input
input_price

0.22499999999999998

In [259]:
total_gpt35 = output_price + input_price

In [261]:
turbo4_output_price = (30/1000000) * output
turbo4_input_price = (10/1000000) * input
total_gpt4_price = turbo4_input_price + turbo4_output_price
total_gpt4_price

6.0

In [263]:
hybrid_price = input_price + turbo4_output_price
hybrid_price

1.725