In [3]:
%load_ext autoreload
%autoreload 2

from warnings import filterwarnings
filterwarnings('ignore')

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
import re

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateWCS
from src.database.database_utils import get_weaviate_client
from src.llm.llm_interface import LLM
from src.llm.prompt_templates import huberman_system_prompt
from app_features import generate_prompt_series

from concurrent.futures import ThreadPoolExecutor, as_completed
from src.reranker import ReRanker

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [16]:
# Shared objects used by every stage below.
client = get_weaviate_client()
# NOTE(review): the underlying connection is closed right after creation; presumably the
# wrapper reconnects on each search call — confirm against WeaviateWCS.
client._client.close()
# Also return 'short_description'; it is used later as the summary_key when building prompts.
client.return_properties = client.return_properties + ['short_description']
collection_name = 'Huberman_minilm_128'
llm = LLM('gpt-3.5-turbo')
reranker = ReRanker()
# Seed question that is rewritten, searched, and finally answered below.
initial_query = 'How can advancements in AI improve human health'

In [17]:
# Display the properties that each search hit will carry, including the appended 'short_description'.
client.return_properties

['title', 'video_id', 'content', 'guest', 'doc_id', 'short_description']

#### Prompt Message

In [18]:
# Query-rewrite prompt: asks the model for n=3 alternative phrasings of initial_query,
# separated by newlines so the reply can be split line by line.
system_msg = """
You are an AI language model assistant. Your task is to generate {n}
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.\n\nOriginal question: {question}
""".format(n=3, question=initial_query)

print(system_msg)


You are an AI language model assistant. Your task is to generate 3
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.

Original question: How can advancements in AI improve human health



### Stage 1 - Query Rewrite

In [19]:
# Ask the LLM for the rewritten questions; the prompt is passed as the first positional argument.
# NOTE(review): which message role that argument maps to depends on LLM.chat_completion — confirm.
response = llm.chat_completion(system_msg, temperature=1.0, raw_response=False)

In [20]:
# Preview the raw rewrites, one per line as the prompt requested.
response.split('\n')

['What impact do AI advancements have on enhancing human health outcomes?',
 'How are AI innovations contributing to the improvement of human health?',
 'In what ways can AI progress positively influence human health and well-being?']

In [21]:
#parse questions: strip a leading list marker ("1." / "2)" / "- ") from each line and
#skip lines that end up empty (e.g. blank separator lines in the LLM reply), which would
#otherwise be sent to the retriever as empty queries
questions = [
    question
    for question in (re.sub(r"^[-\d]+[\).\s]", "", line).strip() for line in response.split('\n'))
    if question
]
#always search with the user's original wording as well
questions.append(initial_query)
questions

['What impact do AI advancements have on enhancing human health outcomes?',
 'How are AI innovations contributing to the improvement of human health?',
 'In what ways can AI progress positively influence human health and well-being?',
 'How can advancements in AI improve human health']

### Stage 2 - Run N Retriever Calls (One per Query)

In [72]:
def multi_query(queries: list[str], 
                client: WeaviateWCS, 
                limit: int=3, 
                threaded: bool=False
               ) -> list[list[dict]]:
    '''
    Runs one hybrid search per query and returns the per-query result lists.

    Args:
        queries: Query strings to search with.
        client: Client whose hybrid_search method is called once per query.
        limit: Value forwarded as the `limit` keyword of each hybrid_search call.
        threaded: If True, the searches are submitted to a thread pool (one worker per query).

    Returns:
        A list with one entry per query, in the same order as `queries`
        (results[i] is whatever hybrid_search returned for queries[i]).

    Notes:
        - Searches run against the notebook-level `collection_name`.
        - `client._client` is closed before returning, including when a search raises.
    '''
    try:
        # An empty query list skips the pool: ThreadPoolExecutor rejects max_workers=0.
        if threaded and queries:
            with ThreadPoolExecutor(max_workers=len(queries)) as executor:
                futures = [executor.submit(client.hybrid_search, q, collection_name, limit=limit) for q in queries]
                # Collect in submission order (not completion order) so the output lines up with `queries`.
                return [future.result() for future in futures]
        return [client.hybrid_search(q, collection_name, limit=limit) for q in queries]
    finally:
        client._client.close()

In [73]:
%%time
# Baseline: a single hybrid search on the original query with limit=40.
single_query_docs = client.hybrid_search(initial_query, collection_name, limit=40)
# Multi-query: limit=10 per entry in `questions`; with the 4 questions shown above
# that is the same 40-hit budget as the baseline.
docs = multi_query(questions, client, limit=10)

CPU times: user 128 ms, sys: 11 ms, total: 139 ms
Wall time: 1.84 s


### Stage 3 - Deduplicate Docs

In [74]:
def dedupe_docs(results: list[list[dict]]) -> list[dict]:
    '''
    Returns a list of unique documents sorted by "score" value, highest first.

    Documents are identified by their "doc_id" key. When the same doc_id appears
    in several result lists, the copy with the highest "score" is kept, so the
    outcome does not depend on the order in which the queries were run.

    Args:
        results: One list of document dicts per query; every dict must have
            "doc_id" and "score" keys.

    Returns:
        A flat list with one dict per distinct doc_id, in descending "score" order.
    '''
    best_by_id: dict = {}
    for doc_list in results:
        for doc in doc_list:
            doc_id = doc['doc_id']
            current = best_by_id.get(doc_id)
            # Replace only on a strictly better score; the first copy wins ties.
            if current is None or doc['score'] > current['score']:
                best_by_id[doc_id] = doc
    return sorted(best_by_id.values(), key=lambda doc: doc['score'], reverse=True)

In [76]:
# Report the raw hit count across all queries, then the number of distinct documents.
print(f'Before: {sum(len(hits) for hits in docs)}')
unique_docs = dedupe_docs(docs)
print(f'After: {len(unique_docs)}')

Before: 40
After: 19


### Stage 4 - ReRank results

In [107]:
def reciprocal_rank_fusion(results: list[list[dict]], k: int=1, top_k: int=5) -> list[dict]:
    """ 
    Fuses several ranked document lists into one ranking with Reciprocal Rank Fusion (RRF).

    Each appearance of a document at 1-based position `rank` in a list adds
    1 / (rank + k) to that document's fused score; documents are identified by
    their "doc_id" key.

    Args:
        results: One ranked list of document dicts per query; every dict must
            have a "doc_id" key.
        k: Constant added to the rank in the RRF denominator.
        top_k: Maximum number of documents to return.

    Returns:
        Up to `top_k` document dicts in descending fused-score order. For a doc_id
        that occurs several times, the dict returned is its first occurrence.

    Side effects:
        The fused score is written into each kept document dict under the
        "reranked_score" key (the input dicts are modified in place).
    """
    # doc_id -> {'score': running fused score, 'doc': first dict seen for that id}
    fused_scores: dict = {}

    for doc_series in results:
        # rank is the 1-based position of the document within its own list
        for rank, doc in enumerate(doc_series, start=1):
            entry = fused_scores.setdefault(doc['doc_id'], {'score': 0, 'doc': doc})
            entry['score'] += 1 / (rank + k)

    # Expose the fused score on the documents themselves so callers can display it
    for entry in fused_scores.values():
        entry['doc']['reranked_score'] = entry['score']

    # Highest fused score first; sorted() is stable, so ties keep first-seen order
    ranked_entries = sorted(fused_scores.values(), key=lambda entry: entry['score'], reverse=True)
    return [entry['doc'] for entry in ranked_entries][:top_k]

In [108]:
# Baseline: rerank the single-query hits against the original question and keep the top 10.
# NOTE(review): the 'cross_score' key printed in the comparison cell presumably comes from ReRanker.rerank — confirm.
reranked_initial = reranker.rerank(single_query_docs, initial_query, top_k=10)
# Multi-query: fuse the per-question result lists with reciprocal rank fusion and keep the top 10.
reranked_multi = reciprocal_rank_fusion(docs, top_k=10)

In [109]:
# Print the two rankings side by side, position by position.
for initial_doc, multi_doc in zip(reranked_initial, reranked_multi):
    print(f"Initial:\nScore: {initial_doc['cross_score']}\nTitle: {initial_doc['title']}\nContent: {initial_doc['content']}\n")
    print(f"Multi:\nScore: {multi_doc['reranked_score']}\nTitle: {multi_doc['title']}\nContent: {multi_doc['content']}")
    # Double rule between positions
    for _ in range(2):
        print('-'*100)

Initial:
Score: 0.6812052726745605
Title: Marc Andreessen: How Risk Taking, Innovation & Artificial Intelligence Transform Human Experience
Content: In doing so, Mark provides a stark counter-argument for those that argue that AI is going to diminish human experience. So if you're hearing about and or concerned about the ways that AI is likely to destroy us, today you are going to hear about the many different ways that AI technologies now in development are likely to enhance our human experience at every level. What you'll soon find is that while today's discussion does center around technology and technology development, it is really a discussion about human beings and human psychology.

Multi:
Score: 1.4166666666666667
Title: Marc Andreessen: How Risk Taking, Innovation & Artificial Intelligence Transform Human Experience
Content: In doing so, Mark provides a stark counter-argument for those that argue that AI is going to diminish human experience. So if you're hearing about and or 

### Stage 5 - Submit Context to LLM

In [110]:
# Keep only the text before the first blank line of each short description,
# for the multi-query documents first and then the baseline documents.
for d in (*reranked_multi, *reranked_initial):
    d['short_description'] = d['short_description'].split('\n\n')[0]

In [111]:
# Sanity check: number of reranked baseline documents (top_k=10 was requested).
len(reranked_initial)

10

In [112]:
# Build the assistant message from the top 5 baseline documents, using 'short_description' as the summary field.
initial_assist_message = generate_prompt_series(initial_query, reranked_initial[:5], summary_key='short_description')

In [116]:
# Answer the original question using the baseline (single-query) context.
# NOTE(review): temperature=1.25 is high; the recorded output below ends with a stray
# "Alger-" fragment. Consider a lower value if the two answers are meant to be compared.
initial_response = llm.chat_completion(system_message=huberman_system_prompt, 
                               assistant_message=initial_assist_message,
                               temperature=1.25,
                               raw_response=False)
print(initial_response)

Advancements in AI can improve human health by serving as personalized coaches and guides for making decisions about health, relationships, finances, and more. AI technologies, as discussed in the transcripts, are aiming to enhance the human experience at every level. These advancements may include AI assisting with personalized health decisions, suggesting workout plans, aiding in recovery strategies, providing travel ideas, and recommending dietary choices. By utilizing AI in these ways, individuals can potentially optimize their health outcomes, receive tailored recommendations, and have access to support systems that cater to their specific needs and preferences. Alger-



In [117]:
# Build the assistant message from the top 5 rank-fused (multi-query) documents, using 'short_description' as the summary field.
multi_assist_message = generate_prompt_series(initial_query, reranked_multi[:5], summary_key='short_description')

In [118]:
# Answer the original question using the multi-query (rank-fused) context.
# NOTE(review): temperature=1.25 is high, so run-to-run variation may mask differences
# caused by the retrieval strategy — consider a lower value for a fair comparison.
multi_response = llm.chat_completion(system_message=huberman_system_prompt, 
                               assistant_message=multi_assist_message,
                               temperature=1.25,
                               raw_response=False)
print(multi_response)

Advancements in AI can significantly improve human health by serving as personalized coaches and guides for decision-making in areas such as health, psychology, relationships, and finances. AI assistants are likely to provide highly informed health and psychological advice, as well as assist individuals in making daily decisions. With the use of large language models, AI can help analyze complex biological data, gain insights, and identify trends related to health, thus potentially transforming the understanding and management of various health conditions. Additionally, AI can be utilized to build defenses against the creation of harmful pathogens, contributing to bolstering biosecurity measures.
