In [6]:
%load_ext autoreload
%autoreload 2

from warnings import filterwarnings
filterwarnings('ignore')

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('../../')

#standard libraries
import json
import os
import time
import re

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.llm.llm_interface import LLM
from src.llm.prompt_templates import huberman_system_message
from app_features import generate_prompt_series

from concurrent.futures import ThreadPoolExecutor, as_completed
from src.reranker import ReRanker



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [82]:
# Shared handles and settings used by every stage below.
client = get_weaviate_client()

# Collection that every hybrid_search call in this notebook targets.
collection_name = 'Huberman_minilm_128'
llm = LLM('gpt-3.5-turbo')
reranker = ReRanker()
# NOTE(review): the query contains misspellings ("comeone", "homosysteine");
# presumably deliberate, to see whether query rewriting recovers from them -- confirm.
initial_query = 'Why would comeone get a homosysteine blood draw'

In [83]:
# Inspect the client's `return_properties` attribute (the output below lists property names).
client.return_properties

['guest',
 'title',
 'summary',
 'content',
 'video_id',
 'doc_id',
 'episode_url',
 'thumbnail_url']

#### Prompt Message

In [84]:
# Query-rewrite prompt: asks the model for n alternative phrasings of the user
# question, one per line. n is fixed at 3 here. The "\n\n" escape inside this
# non-raw string becomes a real blank line (visible in the printed prompt).
system_msg = """
You are an AI language model assistant. Your task is to generate {n}
different versions of the user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.\n\nOriginal question: {question}
""".format(n=3, question=initial_query)

print(system_msg)


You are an AI language model assistant. Your task is to generate 3
different versions of the user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.

Original question: Why would comeone get a homosysteine blood draw



### Stage 1 - Query Rewrite

In [85]:
# Ask the LLM for the alternative questions. The result is split on newlines
# in the following cells, so it is expected to be a plain string
# (presumably what raw_response=False selects -- confirm in LLM.chat_completion).
response = llm.chat_completion(system_msg, temperature=1.0, raw_response=False)

In [86]:
# Preview the raw response line by line; list prefixes such as "1. " are still present.
response.split('\n')

['1. What are the reasons for someone to undergo a homocysteine blood test?',
 '2. What medical conditions or risk factors prompt individuals to have their homocysteine levels measured via a blood draw?',
 '3. In what situations is it advisable for individuals to get their homocysteine levels checked through a blood test?']

In [87]:
#parse questions
# Strip a leading list marker ("1.", "2)", "-") from every response line.
# Lines are trimmed first so an indented marker is still removed, and blank
# lines are dropped so no empty query is ever sent to the retriever.
cleaned_lines = (
    re.sub(r"^[-\d]+[\).\s]", "", line.strip()).strip() for line in response.split('\n')
)
questions = [question for question in cleaned_lines if question]
# Always search with the original wording as well.
questions.append(initial_query)
questions

['What are the reasons for someone to undergo a homocysteine blood test?',
 'What medical conditions or risk factors prompt individuals to have their homocysteine levels measured via a blood draw?',
 'In what situations is it advisable for individuals to get their homocysteine levels checked through a blood test?',
 'Why would comeone get a homosysteine blood draw']

### Stage 2 - Run One Retriever Call per Query

In [88]:
def multi_query(queries: list[str], 
                client: "WeaviateWCS", 
                limit: int=3, 
                threaded: bool=False,
                collection: "str | None"=None,
                close_client: bool=True
               ) -> list[list[dict]]:
    """
    Run one hybrid search per query and return the per-query result lists.

    Args:
        queries: Query strings; one `client.hybrid_search` call is made for each.
        client: Object exposing `hybrid_search(query, collection, limit=...)` and
            a `_client` attribute that has a `close()` method.
        limit: Forwarded to every `hybrid_search` call as `limit`.
        threaded: When True, the searches are submitted to a thread pool with
            one worker per query.
        collection: Collection to search. When omitted, the notebook-level
            `collection_name` is used.
        close_client: When True (the default, matching the earlier behaviour),
            `client._client.close()` is called before returning, including
            when a search raised.

    Returns:
        One entry per query, in the same order as `queries`; each entry is
        whatever `hybrid_search` returned for that query.
    """
    target = collection_name if collection is None else collection
    try:
        if not queries:
            # ThreadPoolExecutor rejects max_workers=0, so short-circuit here.
            return []
        if threaded:
            with ThreadPoolExecutor(max_workers=len(queries)) as executor:
                futures = [executor.submit(client.hybrid_search, q, target, limit=limit) for q in queries]
                # Collect in submission order (not completion order) so that
                # results[i] always belongs to queries[i].
                return [future.result() for future in futures]
        return [client.hybrid_search(q, target, limit=limit) for q in queries]
    finally:
        if close_client:
            client._client.close()

In [89]:
%%time
# Baseline: a single hybrid search with the original query (limit=200).
single_query_docs = client.hybrid_search(initial_query, collection_name, limit=200)
# Multi-query: one search per entry in `questions` (limit=100 each).
# NOTE: multi_query calls client._client.close() before it returns.
docs = multi_query(questions, client, limit=100, threaded=False)

CPU times: user 84.4 ms, sys: 640 µs, total: 85.1 ms
Wall time: 1.14 s


### Stage 3 - Deduplicate Docs

In [90]:
def dedupe_docs(results: list[list[dict]]) -> list[dict]:
    '''
    Flattens the per-query result lists and returns one document per "doc_id".

    No sorting is applied: documents appear in the order in which their doc_id
    was first seen, and when a doc_id occurs more than once the last
    occurrence is the dict that is kept.
    '''
    latest_by_id = {doc['doc_id']: doc for doc_list in results for doc in doc_list}
    return list(latest_by_id.values())

In [91]:
# Report how many candidates there are before and after removing duplicate doc_ids.
print(f'Before: {sum(map(len, docs))}')
unique_docs = dedupe_docs(docs)
print(f'After: {len(unique_docs)}')

Before: 400
After: 280


In [92]:
# Rerank both candidate pools against the original query, keeping the same
# number of results from each so the next cell can compare them pairwise.
ranked = reranker.rerank(unique_docs, query=initial_query, top_k=5)
# NOTE(review): this call was cut off after "top_k="; 5 mirrors the line above -- confirm.
single_ranked = reranker.rerank(single_query_docs, initial_query, top_k=5)

In [103]:
# Print each multi-query result next to the single-query result of the same rank.
divider = '-'*100
for multi_doc, single_doc in zip(ranked, single_ranked):
    print(f'CONTENT: {multi_doc["content"]}')
    print(f'SCORE: {multi_doc["cross_score"]}')
    print('\n')
    print(f'CONTENT: {single_doc["content"]}')
    print(f'SCORE: {single_doc["cross_score"]}')
    print(divider)
    print(divider)

CONTENT: So if somebody is already taking creatine and likes it in response to it, I'll raise my hand, such as myself, would adding betaine help or is it redundant with creatine? Only if their homocysteine is persistently elevated. And homocysteine is kind of like an inflammatory marker that can build up if you're not converting enough of it downstream. How would I know? Just a blood test. Or if you knew your MTHFR polymorphism, which is basically how you add methyl groups to many things in the body. Great.
SCORE: 0.03956398367881775


CONTENT: These people actually were willing to have blood draws taken while watching pornography, they had increases in testosterone that were very modest of about 10%. Whereas when people participated in sex, they actually did this study where people had blood draws and they had real sex with their partners and they had 70% increases in testosterone. So there are increases in testosterone that are quite significant during the physical act of sex and far

### Stage 4 - ReRank results

In [104]:
def reciprocal_rank_fusion(results: list[list[dict]], k: int=1, top_k: int=5) -> list[dict]:
    """
    Fuse several ranked document lists into one ranking with Reciprocal Rank Fusion.

    Every document earns 1 / (rank + k) for each list it appears in, where rank
    is its 1-based position in that list; contributions are summed per "doc_id".

    Args:
        results: Ranked lists of document dicts; every dict needs a "doc_id" key.
        k: Non-negative constant added to the rank in the RRF formula.
        top_k: Number of fused documents to return.

    Returns:
        At most `top_k` document dicts in descending order of fused score; ties
        keep first-seen order. The fused score is stored on every fused
        document under "reranked_score" -- the input dicts are annotated in
        place, not copied. When a doc_id appears in several lists, the dict
        from its first appearance is the one annotated and returned.

    Raises:
        ValueError: If `k` is negative (rank + k could then be zero or negative).
    """
    if k < 0:
        raise ValueError(f'k must be non-negative, got {k}')

    # doc_id -> {'score': accumulated RRF score, 'doc': first dict seen for that id}
    fused_scores = {}
    for doc_series in results:
        for rank, doc in enumerate(doc_series, start=1):
            entry = fused_scores.setdefault(doc['doc_id'], {'score': 0, 'doc': doc})
            entry['score'] += 1 / (rank + k)

    # Expose the fused score on every document, not only the ones returned.
    for entry in fused_scores.values():
        entry['doc']['reranked_score'] = entry['score']

    # sorted() is stable, so equal scores stay in first-seen order.
    ordered = sorted(fused_scores.values(), key=lambda entry: entry['score'], reverse=True)
    return [entry['doc'] for entry in ordered[:top_k]]

In [105]:
# Baseline: rerank the single-query candidates against the original query.
reranked_initial = reranker.rerank(single_query_docs, initial_query, top_k=10)
# Multi-query: fuse the per-query rankings with reciprocal rank fusion.
# Note this consumes the raw per-query lists in `docs`, not the deduplicated `unique_docs`.
reranked_multi = reciprocal_rank_fusion(docs, top_k=10)

In [106]:
# Show the baseline and fused rankings pairwise. The two "Score" values are
# read from different keys ("cross_score" vs "reranked_score").
for initial_doc, multi_doc in zip(reranked_initial, reranked_multi):
    print(f"Initial:\nScore: {initial_doc['cross_score']}\nTitle: {initial_doc['title']}\nContent: {initial_doc['content']}\n")
    print(f"Multi:\nScore: {multi_doc['reranked_score']}\nTitle: {multi_doc['title']}\nContent: {multi_doc['content']}")
    print('-'*100)
    print('-'*100)

Initial:
Score: 0.0002630468807183206
Title: The Science of How to Optimize Testosterone & Estrogen
Content: These people actually were willing to have blood draws taken while watching pornography, they had increases in testosterone that were very modest of about 10%. Whereas when people participated in sex, they actually did this study where people had blood draws and they had real sex with their partners and they had 70% increases in testosterone. So there are increases in testosterone that are quite significant during the physical act of sex and far less so during observing sex.

Multi:
Score: 1.1666666666666665
Title: Dr. Kyle Gillett: Tools for Hormone Optimization in Males | Huberman Lab Podcast 102
Content: So if somebody is already taking creatine and likes it in response to it, I'll raise my hand, such as myself, would adding betaine help or is it redundant with creatine? Only if their homocysteine is persistently elevated. And homocysteine is kind of like an inflammatory mark

### Stage 5 - Submit Context to LLM

In [107]:
# Keep only the text before the first blank line of each summary, for every
# document in both result sets. The document dicts are modified in place.
for doc_list in (reranked_multi, reranked_initial):
    for d in doc_list:
        first_paragraph = d['summary'].split('\n\n')[0]
        d['summary'] = first_paragraph

In [108]:
# Sanity check: number of documents returned by the baseline rerank.
len(reranked_initial)

10

In [109]:
# Build the user message for the baseline answer from the first 5 baseline documents.
initial_assist_message = generate_prompt_series(initial_query, reranked_initial[:5], summary_key='summary')

In [112]:
# Generate the baseline answer from the single-query context.
# Same system message and temperature as the multi-query completion cell,
# so only the retrieved context differs between the two answers.
initial_response = llm.chat_completion(system_message=huberman_system_message, 
                               user_message=initial_assist_message,
                               temperature=1.25,
                               raw_response=False)
print(initial_response)

Someone may choose to get a homocysteine blood draw as a marker for cardiovascular health. Homocysteine is an amino acid that is linked to an increased risk of cardiovascular disease when found at high levels in the blood. Elevated homocysteine levels have been associated with increased risk of heart attacks, strokes, and other cardiovascular issues. Therefore, getting a homocysteine blood draw allows individuals and healthcare providers to assess this specific marker and potentially take preventive or corrective actions to lower homocysteine levels and reduce the risk of cardiovascular complications.


In [113]:
# Build the user message for the multi-query answer from the first 5 fused documents.
multi_assist_message = generate_prompt_series(initial_query, reranked_multi[:5], summary_key='summary')

In [114]:
# Generate the answer from the multi-query (rank-fused) context.
# Same system message and temperature as the baseline completion cell.
multi_response = llm.chat_completion(system_message=huberman_system_message, 
                               user_message=multi_assist_message,
                               temperature=1.25,
                               raw_response=False)
print(multi_response)

Someone would get a homocysteine blood draw to monitor their homocysteine levels. Elevated homocysteine levels can be an inflammatory marker signaling potential issues with the conversion of homocysteine downstream in the body. By getting a homocysteine blood draw, individuals can assess if their levels are persistently elevated, which could indicate a need to address potential inflammatory markers through appropriate interventions or supplements like betaine if necessary, based on the guest's discussion in the given transcript.


Bad pipe message: %s [b'/t\xfb(\x11s\x00;\x19\xbdC\x1f-\xc06\xe2\x90\xcf\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R']
Bad pipe message: %s [b"\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x003\x002\x00\x9a\x00\x99\x00E\x00D\xc0\x07\xc0\x11\xc0\x08\xc0\x12\x00\x16\x00\x13\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00\xc0\x00<\x00\xba\x005\x00\x84\x00/\x00\x96\x00A\x00\x05\x00\n\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08"]
Bad pipe message: %s [b'\x05\x08\x06']
Bad pipe message: %s [b'\x05\x0