In [1]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
from typing import Any
from tqdm.notebook import tqdm

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
#weaviate
from weaviate.auth import AuthApiKey
from weaviate import connect_to_wcs
from weaviate.classes.config import Property
#misc
from rich import print
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [2]:
# Weaviate credentials are read from the environment (load_dotenv runs in the first cell);
# indexing os.environ directly raises KeyError when a variable is missing.
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']
# Name of the collection targeted by the search cells in this notebook.
collection_name = 'Huberman_minilm_256'

In [6]:
# Build the project's WeaviateWCS wrapper from the endpoint and API key loaded above.
client = WeaviateWCS(endpoint=url, api_key=api_key)

In [7]:
# Sanity check: ask the wrapped client whether it is connected.
# NOTE(review): this reaches into the private `_client` attribute, while the
# collection-creation cell uses `client.client` — confirm which one WeaviateWCS exposes.
client._client.is_connected()

True

In [8]:
%%time
# Timed vector search against `collection_name`, called with limit=5.
response = client.vector_search('what is Tongkat Ali', collection_name, limit=5)

CPU times: user 1.19 s, sys: 834 ms, total: 2.02 s
Wall time: 2.18 s


In [9]:
%%time
# Timed keyword search, passing 'content' as the only query property and limit=5.
bm25 = client.keyword_search('what is the gut brain axis', collection_name, query_properties=['content'], limit=5)

CPU times: user 1.48 ms, sys: 483 µs, total: 1.96 ms
Wall time: 95.5 ms


In [10]:
%%time
# Timed hybrid search relying on the wrapper's default arguments.
hybrid = client.hybrid_search('what is the role of the brain', collection_name)

CPU times: user 12.8 ms, sys: 0 ns, total: 12.8 ms
Wall time: 333 ms


In [20]:
# Load the Huberman subset from parquet with the project's FileIO helper.
# Later cells read each record's 'content_embedding' field as the vector to index.
docs = FileIO().load_parquet('../data/huberman_subset_minilm-256.parquet')

Shape of data: (4614, 19)
Memory Usage: 0.48+ MB


In [60]:
def convert_raw_data(raw_data: dict[Any, dict]) -> list[dict]:
    '''
    Converts raw YouTube json to the correct format for
    indexing on Weaviate, i.e. drops unused fields
    and coerces data types.

    Args:
        raw_data: mapping whose values are the per-video metadata dicts.
            Only the values are used; the keys are ignored.

    Returns:
        A list with one new dict per video. Each record:
          - gains 'thumbnail_url', the 'url' of the second entry in
            thumbnail['thumbnails'] (None if that entry has no 'url');
          - has 'lengthSeconds' and 'viewCount' cast to int;
          - loses 'thumbnail' and every field listed in `drops`.

    Raises:
        KeyError: a record lacks 'thumbnail', 'lengthSeconds' or 'viewCount'.
        IndexError: a record has fewer than two thumbnails.
        ValueError: 'lengthSeconds' or 'viewCount' is not an integer string.

    The input records are copied rather than edited in place, so the caller's
    raw data stays intact and the function can be called again on the same
    input (in-place edits would make a second call fail on the missing
    'thumbnail' key).
    '''
    drops = ('channelId', 'isOwnerViewing', 'isCrawlable', 'allowRatings',
             'author', 'isPrivate', 'isUnpluggedCorpus', 'isLiveContent')
    converted = []
    for raw in raw_data.values():
        # Shallow copy: only top-level keys are added/removed/replaced below.
        record = dict(raw)
        record['thumbnail_url'] = record['thumbnail']['thumbnails'][1].get('url')
        record['lengthSeconds'] = int(record['lengthSeconds'])
        record['viewCount'] = int(record['viewCount'])
        del record['thumbnail']
        for field in drops:
            # A field that is already absent needs no dropping; don't fail on it.
            record.pop(field, None)
        converted.append(record)
    return converted

In [23]:
# Show the URL of the second thumbnail entry on the first record.
# NOTE(review): this looks like it was run against raw YouTube JSON rather than the
# parquet records loaded above — confirm `docs` still carries a 'thumbnail' field.
docs[0]['thumbnail']['thumbnails'][1]['url']

'https://i.ytimg.com/vi/oL3SkPV1_Ik/mqdefault.jpg'

In [25]:
# List every field of the first record together with the Python type of its value.
first_doc = docs[0]
for key in first_doc:
    value_type = type(first_doc[key])
    print(f'({key} --> {value_type})')

### Create Properties

In [11]:
from src.database.huberman_properties import properties_template
# from weaviate.classes.config import Tokenization, Property

In [12]:
# Display the schema template imported from src.database.huberman_properties.
print(properties_template)

In [77]:
# Create a scratch collection through the collections API, using the
# schema imported from src.database.huberman_properties.
# NOTE(review): this cell reaches the client via `client.client`, whereas the
# connectivity check uses `client._client` — confirm which attribute WeaviateWCS exposes.
collection = client.client.collections.create(
    name='delme_index',
    description='Collection of 189 episodes of the Huberman Labs podcast',
    properties=properties_template)

In [79]:
# Stream every record into the collection through the batch context manager.
with collection.batch.dynamic() as batch:
    # Iterate `docs` directly: the enumerate() index was never used, and wrapping
    # enumerate() hides the length from tqdm, so the bar could not show a total.
    for doc in tqdm(docs):
        # Everything except the '*embedding' fields is stored as an object property;
        # the content embedding is supplied separately as the object's vector.
        properties = {k: v for k, v in doc.items() if not k.endswith('embedding')}
        batch.add_object(properties=properties, vector=doc['content_embedding'])

0it [00:00, ?it/s]

In [13]:
# Collection name used by the WeaviateIndexer cells that follow.
name = 'Demo_collection'

In [14]:
# Wrap the existing client in the project's WeaviateIndexer, used below to
# create and populate the collection.
indexer = WeaviateIndexer(client)

In [51]:
# Create the `name` collection from the shared schema template.
# NOTE(review): the recorded output for this cell reports a different, already-existing
# class name, so it appears stale relative to `name` — re-run on a fresh kernel to confirm.
indexer.create_collection(name, properties_template, description='test index to delete')

Error creating collection, due to: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Suck_test" already exists'}]}.


In [56]:
# Call connect() on the indexer's client, then check that the collection exists.
# NOTE(review): both calls go through the private `_client` attribute — confirm
# whether WeaviateIndexer offers a public equivalent.
indexer._client.connect()
indexer._client.collections.exists(name)

True

In [57]:
# Index every record in `docs` into the `name` collection; the returned
# summary (batch errors / failed objects / failed references) is displayed below.
indexer.batch_index_data(docs, name)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4614/4614 [00:07<00:00, 638.44it/s]


Batch job completed in 0.8 minutes.


{'batch_errors': 0, 'failed_objects': [], 'failed_references': []}

In [8]:
# Fixed set of 20 benchmark queries used by the sequential and multithreaded
# timing cells below.
questions = [
    "How can I optimize my sleep to improve cognitive function and overall well-being?",
    "What are some effective strategies for managing stress and anxiety based on neuroscience?",
    "Can you explain the science behind meditation and its benefits for mental health?",
    "How does nutrition impact brain health and cognitive performance?",
    "What are the best methods for enhancing focus and concentration?",
    "Can you discuss the relationship between exercise and brain function?",
    "How does exposure to natural light affect sleep quality and circadian rhythms?",
    "What are the most effective techniques for improving memory and learning?",
    "Can you explain the role of neurotransmitters in regulating mood and behavior?",
    "How does chronic stress impact brain structure and function?",
    "What are the implications of neuroplasticity for personal growth and development?",
    "How can we use breathing techniques to modulate our nervous system and reduce stress?",
    "Can you discuss the science of motivation and goal-setting from a neuroscience perspective?",
    "What are some effective ways to optimize brain health as we age?",
    "How does technology use affect brain function and mental well-being?",
    "Can you explain the effects of different types of music on the brain and mood?",
    "What are the benefits of exposure to nature for mental health and cognitive function?",
    "How can we cultivate resilience and adaptability in the face of challenges?",
    "Can you discuss the relationship between gut health and mental health?",
    "What are some practical strategies for improving emotional regulation and self-control?"
  ]

### Single Query: v4

In [9]:
# Time one hybrid search. `client` is the WeaviateWCS instance created near the top of
# the notebook; the previous `client_v4` name is never defined here, so this cell
# raised NameError on a fresh kernel.
start = time.perf_counter()
answer_v4 = client.hybrid_search("What is serotonin good for", collection_name, return_properties=['content', 'title'])
end = time.perf_counter() - start  # elapsed seconds, despite the name
print(f'Python v4 client --> single query (n=1): {round(end,2)} seconds')

### Multi-Query (n=20) : v4

In [10]:
# Time the benchmark questions one after another on a single thread.
# `client` is the WeaviateWCS instance created near the top of the notebook; the
# previous `client_v4` name is never defined here, so this cell raised NameError
# on a fresh kernel.
start = time.perf_counter()
answers_v4 = [
    client.hybrid_search(q, collection_name, return_properties=['content'])
    for q in questions
]
end = time.perf_counter() - start  # elapsed seconds, despite the name
# Report the real number of queries instead of a hard-coded 20.
print(f'Python v4 client --> sequential queries (n={len(questions)}): {round(end,2)} seconds')

In [12]:
def main(query: str, collection_name: str, query_fields: list[str]=['content', 'title']):
    '''
    Runs one hybrid search; this is the unit of work submitted to the thread pool.

    Args:
        query: the search text.
        collection_name: name of the collection to search.
        query_fields: forwarded as the third positional argument of
            `hybrid_search`.
            NOTE(review): other cells pass `return_properties=` by keyword;
            confirm which parameter the third position maps to.

    Returns:
        Whatever `client.hybrid_search` returns.
    '''
    # `client` is the WeaviateWCS instance created near the top of the notebook; the
    # previous `client_v4` name is never defined here and raised NameError on a fresh kernel.
    # A copy is forwarded so the shared default list can never be mutated downstream.
    return client.hybrid_search(query, collection_name, list(query_fields))

### Python Multithreading (n=20) : v4   --> Does not execute

In [13]:
# Run the benchmark questions concurrently and time the whole batch.
from warnings import filterwarnings
filterwarnings('ignore')  # silences ALL warnings for the rest of the kernel session

# os.cpu_count() may return None when the count cannot be determined; fall back to 1
# so the multiplication cannot fail.
max_workers = (os.cpu_count() or 1) * 2
answers = []
start = time.perf_counter()
try:
    # Context managers guarantee the bar is closed and the pool is shut down even when
    # a query fails. A later progress.close() is then redundant but harmless.
    with tqdm(unit=": Queries", total=len(questions)) as progress:
        # `executor` rather than `exec`, which would shadow the builtin.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(main, q, collection_name) for q in questions]
            for future in as_completed(futures):
                # Keep only the first element of each result, collected in completion
                # order (not submission order); result() re-raises any worker exception.
                answers.append(future.result()[0])
                progress.update(1)
    end = time.perf_counter() - start  # elapsed seconds, despite the name
    # Report the real number of queries instead of a hard-coded 20.
    print(f'Python v4 client --> multithreading queries (n={len(questions)}): {round(end,2)} seconds')

except Exception as e:
    print(f'ThreadPool did not execute due to {e}')

  0%|          | 0/20 [00:00<?, ?: Queries/s]

In [17]:
# Close the progress bar created by the multithreading cell.
progress.close()

In [22]:
from llama_index.vector_stores import ChromaVectorStore