In [14]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
from typing import Any
from tqdm.notebook import tqdm

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
#weaviate
from weaviate.auth import AuthApiKey
from weaviate import connect_to_wcs
from weaviate.classes.config import Property
#misc
from rich import print
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [2]:
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']
collection_name = 'Huberman_minilm_256'

In [3]:
client_v4 = WeaviateWCS(endpoint=url, api_key=api_key)

In [4]:
client._client.is_connected()

NameError: name 'client' is not defined

In [17]:
%%time
response = client.vector_search('what is Tongkat Ali', collection_name, limit=5)

CPU times: user 1.4 s, sys: 38.9 ms, total: 1.44 s
Wall time: 1.31 s


In [18]:
%%time
bm25 = client.keyword_search('what is the gut brain axis', collection_name, query_properties=['content'], limit=5)

CPU times: user 68.7 ms, sys: 5.42 ms, total: 74.2 ms
Wall time: 1.51 s


In [22]:
%%time
hybrid = client.hybrid_search('what is the role of the brain', collection_name)

CPU times: user 1.32 s, sys: 75.5 ms, total: 1.4 s
Wall time: 1.64 s


In [20]:
docs = FileIO().load_parquet('../data/huberman_subset_minilm-256.parquet')

Shape of data: (4614, 19)
Memory Usage: 0.48+ MB


In [60]:
def convert_raw_data(raw_data: list[dict]) -> list[dict]:
    '''
    Converts raw YouTube json to correct format for 
    indexing on Weaviate. i.e. drops unused fields, 
    and coerces data types. 
    '''
    drops = ['channelId', 'isOwnerViewing', 'isCrawlable', 'allowRatings', \
             'author', 'isPrivate', 'isUnpluggedCorpus', 'isLiveContent']
    data = list(raw_data.values())
    for d in data:
        d['thumbnail_url'] = d['thumbnail']['thumbnails'][1].get('url')
        d['lengthSeconds'] = int(d['lengthSeconds'])
        d['viewCount'] = int(d['viewCount'])
        del d['thumbnail']
        for field in drops:
            del d[field]
    return data

In [23]:
docs[0]['thumbnail']['thumbnails'][1]['url']

'https://i.ytimg.com/vi/oL3SkPV1_Ik/mqdefault.jpg'

In [25]:
for k in docs[0]:
    print(f'({k} --> {type(docs[0][k])})')

### Create Properties

In [26]:
from src.database.huberman_properties import properties_template
# from weaviate.classes.config import Tokenization, Property

In [37]:
def assign_DataType(dict_value: Any):
    if isinstance(dict_value, str):
        return DataType.TEXT
    elif isinstance(dict_value, int):
        return DataType.INT
    elif isinstance(dict_value, float):
        return DataType.NUMBER
    elif isinstance(dict_value, bool):
        return DataType.BOOL
    elif isinstance(dict_value, list):
        if isinstance(dict_value[0], str):
            return DataType.TEXT_ARRAY
        elif isinstance(dict_value[0], float):
            return DataType.NUMBER_ARRAY
    else:
        raise TypeError(f'Type <{type(dict_value)}> is not an acceptable data type')

In [77]:
collection = client.client.collections.create(
    name='delme_index',
    description='Collection of 189 episodes of the Huberman Labs podcast',
    properties=properties_template)

In [79]:
with collection.batch.dynamic() as batch:
    for i, doc in tqdm(enumerate(docs)):
        batch.add_object(properties={k:v for k,v in doc.items() if not k.endswith('embedding')},
                         vector=doc['content_embedding'])

0it [00:00, ?it/s]

In [44]:
name = 'Suck_test'

In [54]:
indexer = WeaviateIndexer(client)

In [51]:
indexer.create_collection(name, properties_template, description='test index to delete')

Error creating collection, due to: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Suck_test" already exists'}]}.


In [56]:
indexer._client.connect()
indexer._client.collections.exists(name)

True

In [57]:
indexer.batch_index_data(docs, name)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4614/4614 [00:07<00:00, 638.44it/s]


Batch job completed in 0.8 minutes.


{'batch_errors': 0, 'failed_objects': [], 'failed_references': []}

In [9]:
questions = [
    "How can I optimize my sleep to improve cognitive function and overall well-being?",
    "What are some effective strategies for managing stress and anxiety based on neuroscience?",
    "Can you explain the science behind meditation and its benefits for mental health?",
    "How does nutrition impact brain health and cognitive performance?",
    "What are the best methods for enhancing focus and concentration?",
    "Can you discuss the relationship between exercise and brain function?",
    "How does exposure to natural light affect sleep quality and circadian rhythms?",
    "What are the most effective techniques for improving memory and learning?",
    "Can you explain the role of neurotransmitters in regulating mood and behavior?",
    "How does chronic stress impact brain structure and function?",
    "What are the implications of neuroplasticity for personal growth and development?",
    "How can we use breathing techniques to modulate our nervous system and reduce stress?",
    "Can you discuss the science of motivation and goal-setting from a neuroscience perspective?",
    "What are some effective ways to optimize brain health as we age?",
    "How does technology use affect brain function and mental well-being?",
    "Can you explain the effects of different types of music on the brain and mood?",
    "What are the benefits of exposure to nature for mental health and cognitive function?",
    "How can we cultivate resilience and adaptability in the face of challenges?",
    "Can you discuss the relationship between gut health and mental health?",
    "What are some practical strategies for improving emotional regulation and self-control?"
  ]

### Single Query: v4

In [19]:
%%time
answer_v4 = client_v4.hybrid_search("What is serotonin good for", collection_name, return_properties=['content', 'title'])

CPU times: user 1.37 s, sys: 30.5 ms, total: 1.4 s
Wall time: 1.27 s


### Multi-Query (n=20) : v4

In [11]:
%%time
answers_v4 = []
for q in questions:
    answers_v4.append(client_v4.hybrid_search(q, collection_name, return_properties=['content', 'title']))

CPU times: user 27.3 s, sys: 632 ms, total: 28 s
Wall time: 23.6 s


In [20]:
def main(query: str, collection_name: str, query_fields: list[str]=['content', 'title']):
    return client_v4.hybrid_search(query, collection_name, query_fields)

### Python Multithreading (n=20) : v4   --> Does not execute

In [18]:
%%time
from warnings import filterwarnings
filterwarnings('ignore')

progress = tqdm(unit=": Queries", total=len(questions))
answers = []
with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as exec:
    futures = [exec.submit(main, q, collection_name) for q in questions]
    for future in as_completed(futures):
        answers.append(future.result()[0])
        progress.update(1)

  0%|          | 0/20 [00:00<?, ?: Queries/s]

WeaviateQueryError: Query call with protocol GRPC search failed with message Channel closed!.

In [17]:
progress.close()

In [22]:
from llama_index.vector_stores import ChromaVectorStore