In [1]:
#visualization tool for displaying long load/processing times
#!pip install tqdm --quiet
#workhorse for converting text into embeddings/vectors
#!pip install sentence-transformers==3.3.1 --quiet
#data framework for LLM applications
#!pip install llama-index==0.11.22 --quiet
#logging output
#!pip install loguru --quiet
#convenient pretty printing library
#!pip install rich --quiet
#openai Tokenizer library
#!pip install tiktoken --quiet

In [2]:
#!curl -o preprocessing.py https://raw.githubusercontent.com/americanthinker/rag-applications/main/src/preprocessor/preprocessing.py

In [3]:
#!curl -o unitesting_utils.py https://raw.githubusercontent.com/americanthinker/rag-applications/main/unitesting_utils.py

In [4]:
#!curl -o huberman_labs.json https://raw.githubusercontent.com/americanthinker/rag-applications/main/data/huberman_labs.json

In [5]:
%load_ext autoreload
%autoreload 2

#standard libraries
import sys
sys.path.append('../')

import os
import time ,json
from typing import List, Tuple, Any
from math import ceil

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

#external libraries
import pandas as pd
import numpy as np
from rich import print
from rich.pretty import pprint #nifty library for pretty printing
from torch import cuda
from tqdm import tqdm

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateIndexer, WeaviateWCS




#external files
try:
    from preprocessing import FileIO
except ModuleNotFoundError:
    from src.preprocessor.preprocessing import FileIO

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [6]:
#!pip install weaviate-client 

In [7]:
#read Weaviate credentials from environment variables (load_dotenv in the imports cell loads the local .env file)
#os.environ[...] raises KeyError if either variable is not set
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']
#model_path = 'sentence-transformers/all-MiniLM-L6-v2'

#instantiate client
#client = WeaviateWCS(endpoint=url, api_key=api_key, model_name_or_path=model_path)

#example of using the private _client attribute
#client._client.is_connected()

In [8]:
#root folder on Google Colab is: /content/
#build the relative path to the raw transcripts JSON file
root_folder = '../data/' 
data_file = 'huberman_labs.json'
data_path = os.path.join(root_folder, data_file)
#NOTE: data_path is reassigned in later cells (parquet file, golden dataset); re-run this cell before reloading the raw JSON
data_path

'../data/huberman_labs.json'

In [9]:
#load the raw corpus from data_path and report how many top-level entries (episodes) it holds
data = FileIO.load_json(data_path)
print(f'Total # of episodes: {len(data)}')

In [10]:
import tiktoken #tokenizer library for use with OpenAI LLMs

gpt_model = 'gpt-4o-mini'

#instantiate tokenizer for use with our selected gpt model
encoding = tiktoken.encoding_for_model(gpt_model)

In [11]:
from llama_index.core.text_splitter import SentenceSplitter #sentence-aware text splitter

#set chunk size and instantiate your SentenceSplitter
#the tiktoken encoder (encoding.encode) is passed as the splitter's tokenizer; chunk_overlap=0 requests no overlap between chunks
chunk_size = 512
gpt_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=0)

In [12]:
def split_contents(corpus: list[dict],
                   text_splitter: SentenceSplitter,
                   content_field: str='content'
                   ) -> list[list[str]]:
    '''
    Split the text of every document in the corpus into chunks.

    Args:
    -----
    corpus: list[dict]
        Documents to split; each one is read with .get(content_field, ''), so a 
        document without that key is split as an empty string.
    text_splitter: SentenceSplitter
        Splitter whose split_text method is called once per document.
    content_field: str='content'
        Key under which each document stores its text.

    Returns:
    --------
    One list of chunks per document, in the same order as corpus.
    '''
    return [text_splitter.split_text(doc.get(content_field, '')) for doc in corpus]

In [13]:
#content_splits = split_contents(data, gpt_txt_splitter)

In [14]:
from sentence_transformers import SentenceTransformer

#define the model you want to use
#model_name = 'sentence-transformers/all-MiniLM-L6-v2'
#model_name = "Qwen/Qwen3-Embedding-0.6B"
#model_name = "intfloat/e5-large-v2"
model_name = "BAAI/bge-large-en-v1.5"
#instantiate the embedding model; the repr displayed below reports max_seq_length 512 and 1024-dimensional embeddings
model = SentenceTransformer(model_name)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [15]:
def encode_content_splits(content_splits: list[list[str]],
                          model: SentenceTransformer,
                          device: str='cuda:0' if cuda.is_available() else 'cpu'
                          ) -> list[list[tuple[str, list[float]]]]:
    '''
    Encode every text chunk of every episode and pair each chunk with its vector.

    Args:
    -----
    content_splits: list[list[str]]
        One list of text chunks per episode.
    model: SentenceTransformer
        Embedding model; model.encode is called once per episode.
    device: str
        NOTE(review): accepted for interface compatibility but not forwarded to 
        model.encode, so encoding runs on whatever device the model already uses.
        The default resolves to 'cpu' whenever CUDA is unavailable, so forwarding it 
        would change the device used by existing callers - confirm before wiring it in.

    Returns:
    --------
    One list per episode of (chunk_text, vector) tuples, where each vector is a 
    plain Python list of floats.
    '''
    text_vector_tuples = []
    for episode in tqdm(content_splits):
        vectors = model.encode(episode)
        # .tolist() yields built-in floats; list(vector) would keep numpy scalar
        # elements, which do not match the annotated list[float]
        episode_tuples = [(text, vector.tolist()) for text, vector in zip(episode, vectors)]
        text_vector_tuples.append(episode_tuples)
    return text_vector_tuples

In [16]:
import torch

def encode_content_splits_gpu(content_splits: list[list[str]],
                          model: SentenceTransformer,
                          device: str = None,
                          batch_size: int = 32
                          ) -> list[list[tuple[str, list[float]]]]:
    '''
    Encode every text chunk of every episode on an explicitly chosen device.

    Args:
    -----
    content_splits: list[list[str]]
        One list of text chunks per episode.
    model: SentenceTransformer
        Embedding model; it is moved to the chosen device with model.to(device).
    device: str = None
        Device to encode on. When None, 'mps' is used if available, otherwise 
        'cuda' if available, otherwise 'cpu'.
    batch_size: int = 32
        Batch size passed to model.encode.

    Returns:
    --------
    One list per episode of (chunk_text, vector) tuples, where each vector has 
    been converted with .tolist().
    '''
    # No device requested: probe MPS first, then CUDA, and fall back to CPU
    if device is None:
        if torch.backends.mps.is_available():
            device = 'mps'
        else:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print(f"Using device: {device}")
    model = model.to(device)

    # The same encode settings apply to every episode
    encode_options = dict(device=device,
                          batch_size=batch_size,
                          show_progress_bar=False,
                          convert_to_numpy=True)

    encoded_episodes = []
    for chunks in tqdm(content_splits, desc="Encoding episodes"):
        # One encode call covers all chunks of the episode
        embeddings = model.encode(chunks, **encode_options)
        pairs = []
        for chunk, embedding in zip(chunks, embeddings):
            pairs.append((chunk, embedding.tolist()))
        encoded_episodes.append(pairs)

    return encoded_episodes

In [17]:
#text_vector_tuples = encode_content_splits(content_splits, model)

In [18]:
def join_metadata(corpus: list[dict], 
                  text_vector_tuples: list[list[tuple[str, list]]],
                  unique_id_field: str='video_id',
                  content_field: str='content',
                  embedding_field: str='content_embedding'
                  ) -> list[dict]:
    '''
    Expand each episode into one document per chunk, carrying the episode metadata along.

    Args:
    -----
    corpus: list[dict]
        Episode-level metadata dicts; paired positionally (zip) with text_vector_tuples.
    text_vector_tuples: list[list[tuple[str, list]]]
        For each episode, its (chunk_text, chunk_vector) tuples.
    unique_id_field: str='video_id'
        Key of the episode identifier used to build each doc_id.
    content_field: str='content'
        Key that receives the chunk text (replacing the episode's full text).
    embedding_field: str='content_embedding'
        Key that receives the chunk vector.

    Returns:
    --------
    Flat list of chunk-level dicts. Each is a shallow copy of its episode's metadata 
    with content_field, embedding_field and 'doc_id' ("<id>_<chunk index>") set.
    '''
    joined_documents = []
    for metadata, chunk_pairs in zip(corpus, text_vector_tuples):
        # looked up once per episode, so a missing id field fails before any chunk is processed
        parent_id = metadata[unique_id_field]
        for position, pair in enumerate(chunk_pairs):
            text, vector = pair
            # shallow copy keeps the source metadata dict untouched
            chunk_doc = metadata.copy()
            chunk_doc.update({content_field: text,
                              embedding_field: vector,
                              'doc_id': f'{parent_id}_{position}'})
            joined_documents.append(chunk_doc)
    return joined_documents

In [19]:
#docs = join_metadata(data, text_vector_tuples)

In [20]:
def create_dataset(corpus: list[dict],
                   embedding_model: SentenceTransformer,
                   text_splitter: SentenceSplitter,
                   save_to_disk: bool,
                   file_outpath: str=None,
                   unique_id_field: str='video_id',
                   content_field: str='content',
                   embedding_field: str='content_embedding',
                   device: str='cuda:0' if cuda.is_available() else 'cpu'
                   ) -> list[dict]:
    
    '''
    Given a raw corpus of data, this function creates a new dataset where each dataset 
    doc contains episode metadata and its associated text chunk and vector representation. 
    The result is written to disk as parquet only when save_to_disk is True.

    Args:
    -----
    corpus: list[dict]
        Raw episode-level documents.
    embedding_model: SentenceTransformer
        Model used to embed the text chunks.
    text_splitter: SentenceSplitter
        Splitter used to chunk each document; its chunk_size is reported in the log output.
    save_to_disk: bool
        Whether to save the joined documents as a parquet file.
    file_outpath: str=None
        Output path of the parquet file; required when save_to_disk is True.
    unique_id_field: str='video_id'
        Key of the episode identifier, forwarded to join_metadata.
    content_field: str='content'
        Key of the document text, forwarded to split_contents and join_metadata.
    embedding_field: str='content_embedding'
        Key under which each chunk vector is stored, forwarded to join_metadata.
    device: str
        Passed through to encode_content_splits.

    Returns:
    --------
    List of chunk-level documents as produced by join_metadata.

    Raises:
    -------
    ValueError
        If save_to_disk is True but file_outpath is empty or None.
    '''
    if save_to_disk and not file_outpath:
        raise ValueError(f'Saving to disk is enabled but file_outpath was left as a None value.\n\
            Enter a valid file_outpath or mark save_to_disk as False')

    chunk_size = text_splitter.chunk_size
    print(f'Creating dataset using chunk_size: {chunk_size}')
    start = time.perf_counter()

    # Use the splitter, model and field names passed in by the caller instead of 
    # notebook-level globals, so the arguments actually control the result
    content_splits = split_contents(corpus, text_splitter, content_field=content_field)
    text_vector_tuples = encode_content_splits(content_splits, embedding_model, device=device)
    joined_docs = join_metadata(corpus,
                                text_vector_tuples,
                                unique_id_field=unique_id_field,
                                content_field=content_field,
                                embedding_field=embedding_field)

    if save_to_disk:
        # The DataFrame is only needed for the parquet writer, so build it only when saving
        df = pd.DataFrame(joined_docs)
        io = FileIO()
        io.save_as_parquet(file_path=file_outpath, data=df, overwrite=True)

    end = time.perf_counter() - start
    print(f'Total Time to process dataset of chunk_size ({chunk_size}): {round(end/60, 2)} minutes')
    return joined_docs

In [21]:
#chunk, embed and join the full corpus, then save it as parquet (the progress bar below shows roughly 20 minutes for 193 episodes)
dataset = create_dataset(data,model,gpt_txt_splitter,save_to_disk=True,file_outpath="../data/bge-large-en-512.parquet")

100%|████████████████████████████████████████████████████████████████████████████████████████| 193/193 [19:42<00:00,  6.13s/it]
[32m2025-07-21 15:39:15.091[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m40[0m - [1mDataFrame saved as parquet file here: ../data/bge-large-en-512.parquet[0m


In [22]:
#number of chunk-level documents returned by create_dataset
len(dataset)

11238

In [23]:
#data_path = "../data/huberman-minilmL6-256.parquet"
#data_path = "../data/Qwen3-Embedding-0.6B-2.parquet"
#NOTE: data_path and data are both rebound here; from this cell on, data is the loaded parquet dataset, not the raw JSON corpus
data_path = "../data/bge-large-en-512.parquet"
data = FileIO.load_parquet(data_path)

Shape of data: (11238, 13)
Memory Usage: 1.11+ MB


In [24]:
from src.database.properties_template import properties

#print(properties)

In [25]:
# collection_name = 'Huberman_minilm_256'
# collection_name = 'Huberman_Qwen3_256'
collection_name = 'Huberman_bge_large_en_512'
model_path = "BAAI/bge-large-en-v1.5"  # same model name as model_name used to embed the dataset

# Instantiate the client with the embedding model named in model_path
client = WeaviateWCS(endpoint=url, api_key=api_key, model_name_or_path=model_path)

This method is deprecated and will be removed in a future release. Use :func:`connect_to_weaviate_cloud` instead.

  self._client = weaviate.connect_to_wcs(
  return function(*args, **kwargs)


In [26]:
#create the collection on the Weaviate host using the imported properties schema
client.create_collection(collection_name=collection_name, properties=properties, description='Huberman Labs: 193 full-length transcripts bge_large_en Embedding 512 chunck')

Collection "Huberman_bge_large_en_512" created


In [27]:
#print the configuration reported by the client for this collection
print(client.show_collection_config(collection_name))

Huberman_bge_large_en_512
{'Huberman_Qwen3_256': _CollectionConfig(name='Huberman_Qwen3_256', description='Huberman Labs: 193 full-length transcripts Qwen3 Embedding', generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='video_id', description='Unique identifier of the video episode', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={}), _Property(name='title', description='Title of the video episode', data_type=<DataT

In [28]:
#batch-index the dataset loaded from parquet (data) into the collection; the returned batch object is kept for inspection
indexer = WeaviateIndexer(client)
batch_object = indexer.batch_index_data(data, collection_name)

100%|███████████████████████████████████████████████████████████████████████████████████| 11238/11238 [00:16<00:00, 701.72it/s]


Processing finished in 0.41 minutes.
Batch job completed with zero errors.


In [29]:
#ask the client for the document count of the collection (stored in total_docs, not displayed)
total_docs = client.get_doc_count(collection_name)

In [30]:
#get the names of the properties that are part of the collection
display_properties = [prop.name for prop in client.show_collection_properties(collection_name)]

# for this example we don't want to see the 
# summary or keywords so remove them
# (list.remove raises ValueError if the collection has no such property)
display_properties.remove('summary')
display_properties.remove('keywords')
display_properties

['video_id',
 'title',
 'length_seconds',
 'thumbnail_url',
 'view_count',
 'episode_num',
 'doc_id',
 'content',
 'guest',
 'episode_url',
 'expanded_content']

In [31]:
#alternative example queries are kept commented out; only the last assignment is active
#query="What can I do to increase both my healthspan and lifespan"
#query="How does narrowing visual attention impact engagement during exercise?"
query="How can deep rest states help reset dopamine levels in the brain's basal ganglia?"

In [32]:
#keyword search for the query over the title, guest and content properties, limited to 5 results
response = client.keyword_search(request=query,
                                 collection_name=collection_name,
                                 query_properties=['title', 'guest', 'content'],  # change these or remove one or two and see how the results change
                                 limit=5,
                                 filter=None,       # filtering is discussed as an optional final part of this notebook
                                 return_properties=display_properties,
                                 return_raw=False)  # turn this flag on and off to see how the responses are being reformatted

print(response)

In [33]:
#vector search for the same query, limited to 5 results; device='cpu' is passed explicitly to the client
vector_response = client.vector_search(request=query,
                                       collection_name=collection_name,
                                       limit=5, 
                                       return_properties=display_properties,
                                       filter=None,
                                       return_raw=False,
                                       device='cpu')
print(vector_response)

In [34]:
#collect the doc_ids returned by each search so the two result sets can be compared side by side
keyword_ids = [doc['doc_id'] for doc in response]
vector_ids = [doc['doc_id'] for doc in vector_response]

print(f'Keyword IDs: {keyword_ids}')
print(f'Vector IDs: {vector_ids}')

In [35]:
from weaviate.classes.query import Filter

In [36]:
#filter that matches documents whose guest property equals 'Jocko Willink'
guest_filter = Filter.by_property('guest').equal('Jocko Willink')

In [37]:
#NOTE: query and response are rebound here, replacing the values from the earlier search cells
query = 'why is trust so important within the context of leadership'
response = client.vector_search(query, collection_name, limit=10, filter=guest_filter)

#verify that the filter works as promised
show_guest = [resp['guest'] for resp in response]
#print(show_guest)
#print(response)

In [38]:
#combine two conditions with &: title matches the pattern '*mental health*' AND view_count is greater than 500000
multi_filter = Filter.by_property('title').like('*mental health*') & \
               Filter.by_property('view_count').greater_than(500000)

In [39]:
#run the search with multi_filter applied; without this call, response still holds
#the guest-filtered results from the previous search and the check below is meaningless
response = client.vector_search(query, collection_name, limit=10, filter=multi_filter)
titles = [resp['title'] for resp in response]

# All titles should contain the words "mental" and "health" and all response view_counts should be > 500K. 
#print(titles)
#print(response)

In [40]:
#load the golden dataset used for retrieval evaluation (data_path is rebound once more here)
data_path = '../data/golden_datasets/golden_512.json'
golden_dataset = FileIO.load_json(data_path)
#NOTE(review): this builds a second WeaviateWCS instance with the same arguments as client - confirm a separate retriever is needed
retriever = WeaviateWCS(endpoint=url, api_key=api_key, model_name_or_path=model_path)
#collection_name = 'Huberman_minilm_256'
#collection_name = 'Huberman_Qwen3_256'
print(f'Num queries in Golden Dataset: {len(golden_dataset["queries"])}')

In [41]:
from src.evaluation.retrieval_evaluation import calc_hit_rate_scores, calc_mrr_scores, record_results

def retrieval_evaluation(dataset: dict, 
                         collection_name: str, 
                         retriever: WeaviateWCS,
                         retrieve_limit: int=5,
                         chunk_size: int=512,
                         query_properties: list[str]=['content'],
                         return_properties: list[str]=['doc_id', 'content'],
                         dir_outpath: str='./eval_results',
                         include_miss_info: bool=False
                         ) -> dict[str, str|int|float] | tuple[dict[str, str|int|float], list[dict]]:
    '''
    Given a dataset and a retriever, evaluate the performance of the retriever with both 
    keyword and vector search. Raw hit counts and reciprocal-rank sums are accumulated per 
    query and then passed to calc_hit_rate_scores and calc_mrr_scores for final scoring. 
    Results are handed to record_results for saving under dir_outpath.

    Args:
    -----
    dataset: dict
        Dataset to be used for evaluation. Must contain a 'queries' mapping of 
        query_id -> query text and a 'relevant_docs' mapping of query_id -> doc_id.
    collection_name: str
        Name of Collection on Weaviate host to be used for retrieval
    retriever: WeaviateWCS
        WeaviateWCS object to be used for retrieval 
    retrieve_limit: int=5
        Number of documents to retrieve from Weaviate host, increasing this value too high 
        will artificially inflate the hit rate score of your retriever.
    chunk_size: int=512
        Number of tokens used to chunk text. This value is purely for results 
        recording purposes and does not affect results. 
    query_properties: list[str] = ['content']
        List of properties over which keyword search will query.  Can add multiple properties
        to this list. 
    return_properties: list[str]=['doc_id', 'content']
        list of properties to be returned from Weaviate host for display in response
    dir_outpath: str='./eval_results'
        Directory path for saving results, passed to record_results.
    include_miss_info: bool=False
        Option to include queries and their associated kw and vector response values
        for queries that are "total misses"

    Returns:
    --------
    results_dict when include_miss_info is False; otherwise the tuple 
    (results_dict, miss_info), where miss_info lists the query and both responses 
    for every query that neither search answered.

    Notes:
    ------
    A query whose retrieval raises an exception is reported and skipped. It still counts 
    toward 'total_questions' but not toward hits or 'total_misses'.
    '''

    results_dict = {'n':retrieve_limit, 
                    'Retriever': retriever.model_name_or_path, 
                    'chunk_size': chunk_size,
                    # copy so the recorded results never alias the shared default list
                    'query_props': list(query_properties),
                    'kw_raw_hits': 0,
                    'vector_raw_hits': 0,
                    'kw_mrr': 0,
                    'vector_mrr': 0,
                    'total_misses': 0,
                    'total_questions':0
                    }
    
    start = time.perf_counter()
    miss_info = []
    failed_queries = 0
    for query_id, q in tqdm(dataset['queries'].items(), 'Queries'):
        results_dict['total_questions'] += 1
        hit = False
        
        try:
            kw_response = retriever.keyword_search(request=q, collection_name=collection_name, query_properties=query_properties,
                                                   limit=retrieve_limit, return_properties=return_properties)
            vector_response = retriever.vector_search(request=q, collection_name=collection_name, 
                                                   limit=retrieve_limit, return_properties=return_properties)
            
            #map each returned doc_id to its 1-based rank in the response
            kw_doc_ids = {result['doc_id']:i for i, result in enumerate(kw_response, 1)}
            vector_doc_ids = {result['doc_id']:i for i, result in enumerate(vector_response, 1)}
            
            #extract the expected doc_id for scoring purposes
            doc_id = dataset['relevant_docs'][query_id]
 
            #increment hit counters and add the reciprocal rank of the expected doc
            if doc_id in kw_doc_ids:
                results_dict['kw_raw_hits'] += 1
                results_dict['kw_mrr'] += 1/kw_doc_ids[doc_id]
                hit = True
            if doc_id in vector_doc_ids:
                results_dict['vector_raw_hits'] += 1
                results_dict['vector_mrr'] += 1/vector_doc_ids[doc_id]
                hit = True
                
            # if neither search found the expected doc, capture the query for analysis
            if not hit:
                results_dict['total_misses'] += 1
                miss_info.append({'query': q, 'kw_response': kw_response, 'vector_response': vector_response})
        except Exception as e:
            failed_queries += 1
            print(f'Error due to {e}')
            continue
    
    #failed queries lower the scores without appearing as misses, so report them
    if failed_queries:
        print(f'{failed_queries} of {results_dict["total_questions"]} queries raised an error and were skipped')

    #use raw counts to calculate final scores
    calc_hit_rate_scores(results_dict, search_type=['kw', 'vector'])
    calc_mrr_scores(results_dict, search_type=['kw', 'vector'])
    
    end = time.perf_counter() - start
    print(f'Total Processing Time: {round(end/60, 2)} minutes')
    record_results(results_dict, chunk_size, dir_outpath=dir_outpath)
    
    if include_miss_info:
        return results_dict, miss_info
    return results_dict

/Users/hnouri/.pyenv/versions/3.10.12/lib/python3.10/site-packages/pydantic/_internal/_config.py:323: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/


In [42]:
#evaluate keyword and vector retrieval against the golden dataset; chunk_size is only recorded with the results
eval_results = retrieval_evaluation(golden_dataset, collection_name, retriever,chunk_size=512,include_miss_info=False)

Queries: 100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [01:00<00:00,  1.65it/s]


[32m2025-07-21 15:41:01.056[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m109[0m - [1mData saved as json file here: ./eval_results/retrieval_eval_512_2025-07-21-15-41-01.json[0m


In [43]:
#show the results dict returned by retrieval_evaluation
print(eval_results)

In [44]:
#list the collections reported by the client
client.show_all_collections()

['Huberman_Qwen3_256',
 'Huberman_bge_large_en_256',
 'Huberman_bge_large_en_512',
 'Huberman_minilm_256']