In [32]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
from typing import Any
from tqdm.notebook import tqdm

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_v4 import WeaviateWCS, WeaviateIndexer
#weaviate
from weaviate import connect_to_wcs
from weaviate.auth import AuthApiKey
from weaviate.classes.config import DataType, Property
#misc
from rich import print
from sentence_transformers import SentenceTransformer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [33]:
# Connection settings are read from the environment (populated from the local
# .env file by load_dotenv above); a missing variable raises KeyError here.
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']
# Name of the collection queried by the search cells below.
collection_name = 'HubermanLabs_minilm_256'

In [34]:
# Build the project's Weaviate wrapper from the endpoint URL and API key.
client = WeaviateWCS(url, api_key)

In [35]:
# Inspect the `model` attribute of the client wrapper (displayed as cell output).
client.model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [64]:
%%time
# Time `client.vector_search` for a sample query with limit=5.
response = client.vector_search('what is Tongkat Ali', collection_name, limit=5)

CPU times: user 1.36 s, sys: 30 ms, total: 1.39 s
Wall time: 1.37 s


In [66]:
%%time
# Time `client.keyword_search` for a sample query, searching only the 'content' property, limit=5.
bm25 = client.keyword_search('what is the gut brain axis', collection_name, query_properties=['content'], limit=5)

CPU times: user 68.1 ms, sys: 4.77 ms, total: 72.9 ms
Wall time: 1.5 s


In [10]:
%%time
# Time `client.hybrid_search` for a sample query over the 'content' and 'title' properties.
hybrid = client.hybrid_search('what is the role of the brain', 
                     collection_name, 
                     query_properties=['content', 'title']
                    )

CPU times: user 1.41 s, sys: 30.7 ms, total: 1.44 s
Wall time: 1.52 s


In [69]:
# Open the underlying client connection, then delete the collection named 'nothing'.
# NOTE(review): deletion is destructive; 'nothing' is presumably a placeholder — confirm before re-running.
client.client.connect()
client.client.collections.delete(name='nothing')

In [57]:
# Print the properties of the target collection (`print` here is rich.print, imported above).
print(client.show_collection_properties(collection_name))

In [20]:
# Load the parquet dataset through the project's FileIO helper; the recorded
# output below reports the shape and memory usage of the loaded data.
docs = FileIO().load_parquet('../data/huberman_subset_minilm-256.parquet')

Shape of data: (4614, 19)
Memory Usage: 0.48+ MB


In [60]:
def convert_raw_data(raw_data: 'dict[str, dict] | list[dict]') -> list[dict]:
    '''
    Converts raw YouTube json to correct format for 
    indexing on Weaviate. i.e. drops unused fields, 
    and coerces data types. 

    Args:
        raw_data: either a mapping whose values are the per-video records
            (the shape the original implementation required, since it called
            `.values()`), or a plain list of such records.

    Returns:
        A new list of new record dicts in which the nested `thumbnail` entry
        is replaced by a flat `thumbnail_url`, `lengthSeconds` and `viewCount`
        are ints, and the unused fields are removed.

    Raises:
        KeyError: a record lacks `thumbnail`, `lengthSeconds` or `viewCount`.
        IndexError: a record has fewer than two thumbnails.

    The input is never mutated, so the function can be re-run on the same
    data (the in-place version failed on a second run because `thumbnail`
    had already been deleted).
    '''
    drops = {'channelId', 'isOwnerViewing', 'isCrawlable', 'allowRatings',
             'author', 'isPrivate', 'isUnpluggedCorpus', 'isLiveContent',
             'thumbnail'}
    records = raw_data.values() if isinstance(raw_data, dict) else raw_data
    converted = []
    for record in records:
        # Copy only the fields we keep; fields listed in `drops` that are
        # absent from a record are simply ignored instead of raising.
        clean = {key: value for key, value in record.items() if key not in drops}
        # Index 1 is the thumbnail variant the original code selected.
        clean['thumbnail_url'] = record['thumbnail']['thumbnails'][1].get('url')
        clean['lengthSeconds'] = int(record['lengthSeconds'])
        clean['viewCount'] = int(record['viewCount'])
        converted.append(clean)
    return converted

In [23]:
# Peek at the URL of the second thumbnail entry of the first doc — the same
# path that convert_raw_data reads for `thumbnail_url`.
docs[0]['thumbnail']['thumbnails'][1]['url']

'https://i.ytimg.com/vi/oL3SkPV1_Ik/mqdefault.jpg'

In [25]:
# Show the Python type of every field in the first doc.
for key, value in docs[0].items():
    print(f'({key} --> {type(value)})')

### Create Properties

In [26]:
from src.database.huberman_properties import properties_template
# from weaviate.classes.config import Tokenization, Property

In [37]:
def assign_DataType(dict_value: Any):
    '''
    Maps a Python value to the Weaviate `DataType` member used to declare
    a property holding values of that kind.

    Args:
        dict_value: a sample value taken from a document dict. Supported:
            bool, str, int, float, and non-empty lists whose first element
            is one of those (the first element decides the array type).

    Returns:
        The matching `DataType` member.

    Raises:
        ValueError: `dict_value` is an empty list, so no element type can
            be inferred.
        TypeError: the value (or the first list element) has no mapping.
    '''
    # bool must be tested before int: bool is a subclass of int, so
    # isinstance(True, int) is True and BOOL would otherwise be unreachable.
    if isinstance(dict_value, bool):
        return DataType.BOOL
    if isinstance(dict_value, str):
        return DataType.TEXT
    if isinstance(dict_value, int):
        return DataType.INT
    if isinstance(dict_value, float):
        return DataType.NUMBER
    if isinstance(dict_value, list):
        if not dict_value:
            raise ValueError('Cannot infer an array data type from an empty list')
        first = dict_value[0]
        if isinstance(first, bool):
            return DataType.BOOL_ARRAY
        if isinstance(first, str):
            return DataType.TEXT_ARRAY
        if isinstance(first, int):
            return DataType.INT_ARRAY
        if isinstance(first, float):
            return DataType.NUMBER_ARRAY
        # Previously this case fell through and silently returned None.
        raise TypeError(f'List of <{type(first)}> is not an acceptable data type')
    raise TypeError(f'Type <{type(dict_value)}> is not an acceptable data type')

In [77]:
# Create a new collection named 'delme_index' using the imported property schema.
# NOTE(review): the name suggests a scratch collection — presumably meant to be deleted afterwards; confirm.
collection = client.client.collections.create(
    name='delme_index',
    description='Collection of 189 episodes of the Huberman Labs podcast',
    properties=properties_template)

In [79]:
# Index every doc into `collection`, sending the pre-computed content embedding
# as the object's vector and every non-embedding field as a property.
with collection.batch.dynamic() as batch:
    # Iterate `docs` directly: wrapping it in enumerate() hid its length from
    # tqdm, so the progress bar had no total, and the index was never used.
    for doc in tqdm(docs):
        properties = {k: v for k, v in doc.items() if not k.endswith('embedding')}
        batch.add_object(properties=properties, vector=doc['content_embedding'])

0it [00:00, ?it/s]

In [44]:
# Name of the test collection used by the indexer cells below.
name = 'Suck_test'

In [54]:
# Wrap the client in the project's WeaviateIndexer.
indexer = WeaviateIndexer(client)

In [51]:
# Create the test collection from the shared property schema. The recorded output
# below is a 422 because a collection with this name already existed at run time.
indexer.create_collection(name, properties_template, description='test index to delete')

Error creating collection, due to: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Suck_test" already exists'}]}.


In [56]:
# Connect the indexer's underlying client and check whether the test collection exists.
# NOTE(review): reaches into the private `_client` attribute — fine for debugging, brittle otherwise.
indexer._client.connect()
indexer._client.collections.exists(name)

True

In [57]:
# Index all docs into the test collection; the displayed return value is a dict
# summarizing batch errors and failed objects/references.
indexer.batch_index_data(docs, name)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4614/4614 [00:07<00:00, 638.44it/s]


Batch job completed in 0.8 minutes.


{'batch_errors': 0, 'failed_objects': [], 'failed_references': []}

In [51]:
# `huberman` was never bound in this notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumed to be a handle on the collection named by `collection_name` — confirm.
huberman = client.client.collections.get(collection_name)

# Keyword (BM25) query returning only the 'content' property plus all metadata fields.
response = huberman.query.bm25(
    query='Who is Huberman',
    limit=5,
    return_metadata=['creation_time', 'last_update_time', 'distance', 'certainty',
                     'score', 'explain_score', 'is_consistent'],
    return_properties=['content'],
)

In [55]:
# Load the sentence-transformers model used below to encode queries; the bare
# `model` expression displays its module layout as the cell output.
model = SentenceTransformer('sentence-transformers/all-miniLM-L6-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [64]:
# Encode the query text with the local model and convert the result to a plain
# Python list so it can be passed as `near_vector` in the search below.
query = 'interplay of microbes in the gut'
query_vector = model.encode(query).tolist()

In [73]:
from weaviate.classes.query import MetadataQuery
from weaviate.classes.data import DataObject

In [83]:
# Vector search with the pre-computed query vector: top 3 objects, returning the
# 'content' and 'title' properties plus distance / certainty / explain_score metadata.
# NOTE(review): `huberman` must already be bound to a collection handle — it is not defined in the cells shown here.
semantic = huberman.query.near_vector(near_vector=query_vector,
                                      limit=3,
                                      return_metadata=MetadataQuery(distance=True, 
                                                                    explain_score=True,
                                                                    certainty=True),
                                      return_properties=['content', 'title'])

In [88]:
from weaviate.collections.classes.internal import MetadataReturn, QueryReturn

weaviate.collections.classes.internal.QueryReturn

In [84]:
def _get_meta(metadata: 'MetadataReturn') -> dict:
    '''
    Returns the populated metadata fields of a query result as a dict.

    Args:
        metadata: the `.metadata` attribute of a returned object (any object
            exposing its fields through `__dict__` works).

    Returns:
        Mapping of field name to value for every field that was actually
        returned. Fields that are None or an empty string are omitted;
        legitimate falsy values such as a distance/score of 0.0 or a False
        flag are kept (a plain truthiness test used to discard them).
    '''
    return {
        field: value
        for field, value in vars(metadata).items()
        if value is not None and value != ''
    }

In [93]:
# Inspect the first returned object (its uuid, metadata and properties are shown in the output).
semantic.objects[0]

Object(uuid=_WeaviateUUIDInt('68939d5c-34d3-48a8-8043-5895eccef6aa'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.378218412399292, certainty=0.810890793800354, score=None, explain_score='', is_consistent=None, rerank_score=None), properties={'content': "We've done episodes on stress and how to combat stress. And we've done episodes on the so-called gut microbiome. But right now, I just want to mention that the gut microbiome, which are the trillions of little micro bacteria that interact heavily with the immune system and help support the immune system, you want to keep the gut microbiome healthy. So you'll notice that we include some tools related to the gut microbiome here in a moment. And, and this is very important, keep in mind that the microbiome doesn't just exist in the gut. So often these days we hear about the gut microbiome and I'm oh so happy that the gut microbiome is getting the attention that it deserves in the context of mental health, 

In [87]:
# Dump each hit: its properties, its populated metadata, a separator rule, then its uuid.
for hit in semantic.objects:
    separator = '-'*80
    print(hit.properties)
    print(_get_meta(hit.metadata))
    print(separator)
    print(hit.uuid)