In [43]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
from rich import print
from typing import Any
from src.preprocessor.preprocessing import FileIO
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
from weaviate.auth import AuthApiKey
from weaviate import connect_to_wcs
from src.database.weaviate_v4 import WeaviateWCS
# Weaviate API key and endpoint are read from the environment (load_dotenv is called in the first cell);
# os.environ[...] raises KeyError if either variable is unset.
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']
# Name of the Weaviate collection passed to the search, create and ingest cells.
collection_name = 'HubermanLabs_minilm_256'

In [45]:
# Build the project's WeaviateWCS client wrapper from the endpoint URL and API key.
client = WeaviateWCS(url, api_key)

  client = WeaviateWCS(url, api_key)


In [25]:
# Call the wrapper's vector_search with a sample question against `collection_name`, passing limit=5.
response = client.vector_search('what is Tongkat Ali', collection_name, limit=5)

In [50]:
# Call the wrapper's keyword_search with query_properties=['content'] and limit=5.
# NOTE(review): presumably a BM25 search judging by the variable name -- confirm in WeaviateWCS.
bm25 = client.keyword_search('what is the gut brain axis', collection_name, query_properties=['content'], limit=5)

In [49]:
# Hybrid search passing the 'content' and 'title' properties as query_properties;
# the call is the cell's last expression, so its return value is displayed.
client.hybrid_search(
    'what is the role of the mid cingulate cortex',
    collection_name,
    query_properties=['content', 'title'],
)

[{'content': "Every brain area is operating in the context of neural circuits, other brain areas that it receives inputs from and gives inputs to and so on. But this one particular brain area really does seem to underlie what we call tenacity and willpower. And we know that through several lines of evidence. First of all, I'll tell you the name of the brain area, although the name itself isn't going to tell you much unless you're a neuroscientist or anatomist. So I'll give a little bit of background about it. The name of the brain area is the anterior mid cingulate cortex. The anterior mid cingulate cortex is part of a larger brain area called the cingulate cortex. And in humans versus animals, it goes by slightly different names, unfortunately. It's just one of the consequences of different researchers in different labs calling the same thing different things. It'd be really frustrating, but we'll make it very simple because today we will refer to this area as the anterior mid-singula

In [8]:
# Load the parquet subset through the project's FileIO helper; the cells below iterate
# `docs` as a sequence of per-document dicts.
docs = FileIO().load_parquet('../data/huberman_subset_minilm-256.parquet')

Shape of data: (4614, 19)
Memory Usage: 0.48+ MB


In [9]:
# Metadata fields that are removed from every document before ingestion.
drops = [
    'channelId',
    'isOwnerViewing',
    'isCrawlable',
    'allowRatings',
    'author',
    'isPrivate',
    'isUnpluggedCorpus',
    'isLiveContent',
]

In [10]:
# Remove every field named in `drops` from each document, in place.
# dict.pop(field, None) is used instead of `del` so the cell is idempotent:
# re-running it (or meeting a document that already lacks a field) no longer
# raises KeyError.
for d in docs:
    for field in drops:
        d.pop(field, None)

In [11]:
# Peek at the URL of the second thumbnail entry (index 1) of the first document.
docs[0]['thumbnail']['thumbnails'][1]['url']

'https://i.ytimg.com/vi/oL3SkPV1_Ik/mqdefault.jpg'

In [12]:
# Flatten each document in place:
#   * replace the nested `thumbnail` structure with a single `thumbnail_url`
#     taken from the second thumbnail entry (index 1);
#   * cast `lengthSeconds` and `viewCount` to int.
# The nested structure is popped first and only processed when present, so
# re-running the cell no longer raises KeyError on the already-removed key.
for d in docs:
    thumbnail = d.pop('thumbnail', None)
    if thumbnail is not None:
        # .get() leaves thumbnail_url as None when the entry has no 'url' key.
        d['thumbnail_url'] = thumbnail['thumbnails'][1].get('url')
    d['lengthSeconds'] = int(d['lengthSeconds'])
    d['viewCount'] = int(d['viewCount'])

In [13]:
# Show every field of the first document next to the Python type of its value.
first_doc = docs[0]
for key, value in first_doc.items():
    print(f'({key} --> {type(value)})')

(videoId --> <class 'str'>)
(title --> <class 'str'>)
(lengthSeconds --> <class 'int'>)
(keywords --> <class 'list'>)
(shortDescription --> <class 'str'>)
(viewCount --> <class 'int'>)
(episode_num --> <class 'int'>)
(doc_id --> <class 'str'>)
(content --> <class 'str'>)
(content_embedding --> <class 'list'>)
(thumbnail_url --> <class 'str'>)


### Create Properties

In [14]:
from src.database.huberman_properties import properties_template
# from weaviate.classes.config import Tokenization, Property

In [37]:
def assign_DataType(dict_value: Any):
    """Map a Python value to the Weaviate ``DataType`` member for its property.

    Args:
        dict_value: A sample value taken from a document dict. Supported types
            are ``bool``, ``str``, ``int``, ``float`` and non-empty lists whose
            first element is one of those types. Only the first list element is
            inspected, so lists are assumed to be homogeneous.

    Returns:
        The matching ``DataType`` member: ``BOOL``, ``TEXT``, ``INT``,
        ``NUMBER`` or the corresponding ``*_ARRAY`` member for lists.

    Raises:
        TypeError: If the value (or the first element of a list) has an
            unsupported type, or if the list is empty and its element type
            therefore cannot be inferred.
    """
    # Imported locally because no visible cell brings DataType into scope.
    from weaviate.classes.config import DataType

    # bool must be tested before int: bool is a subclass of int, so
    # isinstance(True, int) is True and booleans would be reported as INT.
    if isinstance(dict_value, bool):
        return DataType.BOOL
    if isinstance(dict_value, str):
        return DataType.TEXT
    if isinstance(dict_value, int):
        return DataType.INT
    if isinstance(dict_value, float):
        return DataType.NUMBER
    if isinstance(dict_value, list):
        if not dict_value:
            raise TypeError('Cannot infer an array data type from an empty list')
        first = dict_value[0]
        # Same bool-before-int ordering applies to the element type.
        if isinstance(first, bool):
            return DataType.BOOL_ARRAY
        if isinstance(first, str):
            return DataType.TEXT_ARRAY
        if isinstance(first, int):
            return DataType.INT_ARRAY
        if isinstance(first, float):
            return DataType.NUMBER_ARRAY
        # Fail loudly instead of silently returning None for unknown elements.
        raise TypeError(f'List elements of type <{type(first)}> are not an acceptable data type')
    raise TypeError(f'Type <{type(dict_value)}> is not an acceptable data type')

In [16]:
# Create the collection from the shared `collection_name` (previously a duplicated
# string literal) using the imported property template.
# The exists() guard makes the cell safe to re-run: calling create() for a
# collection that already exists is rejected by the server.
# The client is intentionally NOT closed here, because the cells that follow
# (collection handle + batch import) still need an open connection.
if not client.collections.exists(collection_name):
    client.collections.create(
        name=collection_name,
        description='Collection of 189 episodes of the Huberman Labs podcast',
        properties=properties_template)

In [32]:
# Get a handle to the collection named by `collection_name`; used for the batch import below.
collection = client.collections.get(collection_name)

In [33]:
# Import every document with its precomputed vector.
# Keys ending in 'embedding' are excluded from the stored properties because the
# embedding is supplied separately through `vector=`.
# tqdm wraps `docs` directly (the enumerate index was unused), which lets it
# pick up the total from len(docs) when available and render a real progress bar.
with collection.batch.dynamic() as batch:
    for doc in tqdm(docs):
        batch.add_object(properties={k: v for k, v in doc.items() if not k.endswith('embedding')},
                         vector=doc['content_embedding'])

# Batch errors are collected rather than raised, so surface them explicitly.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f'{len(failed_objects)} objects failed to import; first failure: {failed_objects[0]}')

4614it [00:08, 576.46it/s] 


In [35]:
# (Re)open the client connection before running the queries below.
client.connect()

In [37]:
# Handle to the collection named by `collection_name`, used for the direct query calls below.
huberman = client.collections.get(collection_name)

In [51]:
# BM25 query straight on the collection handle, requesting every metadata field
# by name and returning only the 'content' property.
response = huberman.query.bm25(
    query='Who is Huberman',
    limit=5,
    return_metadata=[
        'creation_time',
        'last_update_time',
        'distance',
        'certainty',
        'score',
        'explain_score',
        'is_consistent',
    ],
    return_properties=['content'],
)

In [55]:
# Load the sentence-transformers model used to encode queries below;
# the bare `model` expression displays its module layout.
model = SentenceTransformer('sentence-transformers/all-miniLM-L6-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [64]:
# Encode a sample query with the model and convert the result to a plain list via .tolist().
query = 'interplay of microbes in the gut'
query_vector = model.encode(query).tolist()

In [73]:
from weaviate.classes.query import MetadataQuery
from weaviate.classes.data import DataObject

In [83]:
# Nearest-vector query using the encoded query vector; asks for distance,
# explain_score and certainty metadata plus the 'content' and 'title' properties.
semantic = huberman.query.near_vector(
    near_vector=query_vector,
    limit=3,
    return_metadata=MetadataQuery(distance=True, explain_score=True, certainty=True),
    return_properties=['content', 'title'],
)

In [88]:
from weaviate.collections.classes.internal import MetadataReturn, QueryReturn

weaviate.collections.classes.internal.QueryReturn

In [84]:
def _get_meta(metadata: MetadataQuery):
    temp_dict = metadata.__dict__
    return {k:v for k,v in temp_dict.items() if v}

In [93]:
# Inspect the first returned object (uuid, metadata and properties) from the near_vector query.
semantic.objects[0]

Object(uuid=_WeaviateUUIDInt('68939d5c-34d3-48a8-8043-5895eccef6aa'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.378218412399292, certainty=0.810890793800354, score=None, explain_score='', is_consistent=None, rerank_score=None), properties={'content': "We've done episodes on stress and how to combat stress. And we've done episodes on the so-called gut microbiome. But right now, I just want to mention that the gut microbiome, which are the trillions of little micro bacteria that interact heavily with the immune system and help support the immune system, you want to keep the gut microbiome healthy. So you'll notice that we include some tools related to the gut microbiome here in a moment. And, and this is very important, keep in mind that the microbiome doesn't just exist in the gut. So often these days we hear about the gut microbiome and I'm oh so happy that the gut microbiome is getting the attention that it deserves in the context of mental health, 

In [87]:
# For each returned object print, in order: its properties, its populated
# metadata, an 80-character dashed rule, and finally its uuid.
separator = '-' * 80
for obj in semantic.objects:
    for item in (obj.properties, _get_meta(obj.metadata), separator, obj.uuid):
        print(item)