In [3]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('../../')

#standard libraries
import json
import os
import time
from math import ceil
from typing import List, Optional

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm import tqdm
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market
from sentence_transformers import SentenceTransformer

#external files
from src.preprocessor.preprocessing import FileIO
from src.database.weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
from src.database.properties_template import properties
from src.pipelines.pipeline import (chunk_data, create_vectors, join_docs, 
                                    create_dataset, groupby_episode, create_parent_chunks,
                                    convert_raw_data)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Constants

In [5]:
chunk_size = 512

#tokenizer
encoding = tiktoken.get_encoding(encoding_name='cl100k_base')
#text_splitter (no overlap between neighbouring chunks; lengths are measured with the tokenizer above)
splitter = SentenceSplitter(chunk_overlap=0, chunk_size=chunk_size, tokenizer=encoding.encode)
#model: use the first GPU when one is visible, otherwise fall back to CPU so this cell still runs
device = 'cuda:0' if cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
#corpus
data = FileIO.load_json('../../data/huberman_labs.json')
# data = convert_raw_data(raw)

### Create Dataset

In [11]:
%%time
# Chunk the corpus, embed the chunks with `model`, and save the result under the `outpath` prefix
# (the log below shows the parquet file written with a "-512" suffix).
# NOTE(review): the corpus is read from '../../data' but this writes to '../data' — confirm the path is intended.
outpath = '../data/huberman_minilm'
docs = create_dataset(data, model, splitter, file_outpath_prefix=outpath, overwrite_existing=True)

CHUNKING:   0%|          | 0/193 [00:00<?, ?it/s]

VECTORS:   0%|          | 0/193 [00:00<?, ?it/s]

[32m2024-04-27 17:45:19.254[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_parquet[0m:[36m42[0m - [1mDataFrame saved as parquet file here: ../data/huberman_minilm-512.parquet[0m


CPU times: user 50 s, sys: 495 ms, total: 50.5 s
Wall time: 38.2 s


In [20]:
def coerce_to_int(data: list[dict], 
                  fields: list[str]=['length_seconds', 'view_count']
                 ) -> None:
    '''
    Casts the named fields of every record to int, in place.

    Args:
        data: records to modify; each one must contain every key in fields.
        fields: keys whose values are replaced by int(value).

    Returns:
        None. The dicts inside data are mutated directly.
    '''
    # Pass 1: overwrite each listed value with its integer conversion.
    for record in data:
        for name in fields:
            record[name] = int(record[name])
    # Pass 2: sanity check that every converted value really is an int.
    assert all(isinstance(record[name], int) for record in data for name in fields)

In [21]:
# Cast the default fields ('length_seconds', 'view_count') of every doc to int, in place
coerce_to_int(docs)

### Create Expanded Content property 

In [None]:
# grouped = groupby_episode(docs, key_field='video_id')
# pchunks = create_parent_chunks(grouped, window_size=1)

In [47]:
# for i, chunk in enumerate(pchunks):
#     doc_id = list(chunk.keys())[0]
#     assert doc_id == docs[i]['doc_id'], f'failed at line {i}\t{k}'
#     docs[i]['expanded_content'] = chunk[doc_id]

### Create Weaviate Client

In [5]:
#read env vars from local .env file
#(os.environ[...] raises KeyError if either variable is missing)
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']

#instantiate client
client = WeaviateWCS(url, api_key=api_key)

#check if WCS instance is live and ready
#NOTE(review): this reaches into the private `_client` attribute of the wrapper — confirm there is no public equivalent
print(client._client.is_live(), client._client.is_ready())

#indexer wraps the same client and is used below to create the collection and upload docs
indexer = WeaviateIndexer(client)

### Load data from disk?

In [4]:
# docs = FileIO().load_parquet('../data/huberman_minilm-256.parquet')



Shape of data: (23905, 13)
Memory Usage: 2.37+ MB


### Create Schema and Index Docs

In [12]:
# Name of the collection that the following cells delete, recreate and fill
collection_name = 'Huberman_minilm_512'
# List the collections currently present on the instance
client.show_all_collections()

['Huberman_subset_minilm_test',
 'Huberman_minilm_128',
 'Huberman_minilm_512',
 'Huberman_minilm_256']

In [13]:
# Destructive: removes the existing collection so it can be rebuilt from scratch in the next cells
client.delete_collection(collection_name)

Collection "Huberman_minilm_512" deleted


In [14]:
# Recreate the collection using the schema from src.database.properties_template
indexer.create_collection(collection_name, properties, description='Full index of 193 Huberman Labs episodes as of April 5, 2024')

Collection "Huberman_minilm_512" created


In [15]:
# Upload every doc into the collection; the run summary is printed below
batch = indexer.batch_index_data(docs, collection_name, properties=properties)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11602/11602 [00:13<00:00, 873.49it/s]


Processing finished in 0.53 minutes.
Batch job completed with zero errors.


Bad pipe message: %s [b"\xb9\x8f\x88Vc\xe5\xe2\xdb\x08\xfa\x84\xf3<*\xdfI\xbe\x1a\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0", b'3\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0']
Bad pipe message: %s [b'=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00']
Bad pipe message: %s [b"\xde\x1d\x13\xe8}\xe2\xca\xc8\x1a\x9e&\xf9\x0bj\xc3\x8d\xca\\\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00

## Free HF Inference API

In [17]:
import requests
# Hugging Face Inference API feature-extraction endpoint.
# NOTE(review): this targets all-mpnet-base-v2, whereas the index above was embedded with all-MiniLM-L6-v2 — confirm which model is intended.
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-mpnet-base-v2"
# Bearer token is read from the HF_TOKEN environment variable (KeyError if unset)
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

def query(payload, timeout: float=60.0):
    '''
    POSTs payload as JSON to API_URL using the notebook-level auth headers.

    Args:
        payload: JSON-serialisable request body.
        timeout: seconds to wait on the server before requests raises a
                 Timeout. Without a timeout a stalled connection would
                 block the kernel indefinitely.

    Returns:
        The response body decoded from JSON. The HTTP status is not checked,
        so an error payload from the API is returned as-is.
    '''
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

## Small-to-Big Retrieval

In [3]:
# NOTE: this rebinds `data` (previously the Huberman JSON corpus) to the Impact Theory parquet dataset
data = FileIO().load_parquet('../impact-theory-new-ft-model-256.parquet')

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


### Remove Embeddings

In [4]:
# Rebuild every record without its 'content_embedding' entry; all other fields are kept
data = [
    {field: value for field, value in record.items() if field != 'content_embedding'}
    for record in data
]

## Breakout Episodes

In [5]:
def break_into_episodes(data: List[dict], expected_episodes: Optional[int]=384) -> List[list]:
    '''
    Separates an entire corpus into individual lists of discrete episodes.

    Consecutive records that share the same 'video_id' form one episode;
    the input is not sorted, so records of one video must already be adjacent.

    Args:
        data: flat list of chunk dicts, each with a 'video_id' key.
        expected_episodes: number of episodes the corpus should split into.
            Defaults to 384, the value this function originally asserted.
            Pass None to skip the check when working with another corpus.

    Returns:
        List of episodes, each a list of the original dicts in input order.
        Empty input gives an empty list.

    Raises:
        AssertionError: if expected_episodes is set and the count differs.
    '''
    # Local import: itertools is only imported in a later cell of this notebook.
    from itertools import groupby

    # groupby compares keys by equality, so an empty/falsy video_id is treated
    # like any other id instead of being mistaken for "no current video".
    all_episodes = [list(chunks) for _, chunks in groupby(data, key=lambda d: d['video_id'])]
    if expected_episodes is not None:
        assert len(all_episodes) == expected_episodes, (
            f'expected {expected_episodes} episodes, found {len(all_episodes)}')
    return all_episodes

In [29]:
from itertools import groupby

def groupby_episode(data: List[dict], key_field: str='video_id') -> List[List[dict]]:
    '''
    Splits a flat list of chunk dicts into one list per episode.

    A new episode starts every time the value under key_field changes from
    one record to the next, so records of the same episode must be adjacent
    in data. Records keep their original order.
    '''
    return [list(run) for _, run in groupby(data, lambda record: record[key_field])]

In [30]:
# One list of chunks per run of consecutive records with the same video_id (the default key_field)
all_episodes = groupby_episode(data)

### Combine episode chunks into parent chunks, one for each doc_id

In [78]:
def create_parent_chunks(episode_list: List[list], window_size: int=2) -> List[dict]:
    '''
    Builds one expanded ("parent") text per chunk for small-to-big retrieval.

    For every chunk the parent text is the chunk's own 'content' joined by
    single spaces with the content of up to window_size chunks before it and
    up to window_size chunks after it, all from the same episode. A
    window_size of 2 therefore joins at most five chunks; fewer near the
    start or end of an episode. Chunk order is taken from the input lists.

    Args:
        episode_list: list of episodes, each a list of dicts that carry
                      'doc_id' and 'content' keys.
        window_size: number of neighbouring chunks to include on each side.

    Returns:
        A flat list of single-entry dicts {doc_id: parent_text}, in the same
        order as the chunks appear in episode_list.
    '''
    expanded = []
    for chunks in episode_list:
        texts = [chunk['content'] for chunk in chunks]
        for position, chunk in enumerate(chunks):
            key = chunk['doc_id']
            # Clamp the left edge at the start of the episode; slicing
            # already clamps the right edge.
            lower = position - window_size
            if lower < 0:
                lower = 0
            upper = position + window_size + 1
            expanded.append({key: ' '.join(texts[lower:upper])})
    return expanded

In [88]:
# window_size=3: each parent chunk joins up to 3 neighbours on either side of the original chunk
pchunks = create_parent_chunks(all_episodes, window_size=3)

In [89]:
def create_parent_chunk_cache(parent_chunks: List[dict]) -> dict:
    '''
    Creates a simple in-memory cache for quick parent chunk lookup.
    Used for small-to-big retrieval in a RAG system.

    Args:
        parent_chunks: list of dicts mapping doc_id -> parent chunk text.

    Returns:
        A single dict merging all entries of parent_chunks. If a doc_id
        occurs more than once, the last occurrence wins.
    '''
    content_cache = {}
    # Iterate the argument, not the notebook-level `pchunks`, so the function
    # works for whatever list the caller passes in.
    for chunk in parent_chunks:
        content_cache.update(chunk)
    return content_cache

In [90]:
cache = create_parent_chunk_cache(pchunks)
# Later cells look parent chunks up under the name `content_cache`; bind it here
# so those cells work on a fresh kernel instead of relying on leftover state.
content_cache = cache

In [91]:
# Concatenate every cached parent chunk into one space-separated string
alltext = ' '.join(cache.values())

In [92]:
import tiktoken

In [93]:
# NOTE: rebinds `encoding` (set via get_encoding('cl100k_base') in the constants cell) to the encoding tiktoken maps to gpt-4
encoding = tiktoken.encoding_for_model('gpt-4')

In [94]:
# Total number of tokens across all parent chunks combined
len(encoding.encode(alltext))

40777582

# ------------------- BREAK ---------------------------

# Small-to-Big Retrieval

In [47]:
# Compare the number of records with the number of cached parent chunks.
# Relies on `content_cache` already being bound in the kernel.
print(f'Size of original data: {len(data)}')
print(f'Size of cached content: {len(content_cache)}')

In [13]:
# NOTE(review): `class_name` is not referenced by any later cell shown here — confirm it is still needed
class_name =  'Fine_tuned_on_300'

### Hybrid Search call

In [10]:
# NOTE: this rebinds `query`, shadowing the HF Inference API helper function of the same name defined earlier
query = 'does this show discuss the use of generative ai'
# Pass the variable instead of a second copy of the literal so the search and the later rerank always see the same text
response = client.hybrid_search(query, 
                                collection_name = 'Huberman_minilm_128',
                                query_properties=['content', 'short_description', 'guest'], alpha=0.45)
# response

In [7]:
from src.reranker import ReRanker

In [8]:
# Instantiate the project's ReRanker with its default settings
reranker = ReRanker()

In [13]:
# Rerank the hybrid-search hits against the query text; the output below shows a 'cross_score' on each hit.
# presumably top_k=10 caps the number of results returned — verify against ReRanker.rerank
reranker.rerank(response, query, apply_sigmoid=True, top_k=10)

[{'guest': 'Dr. Paul Conti',
  'content': "And by now in this episode, I'm sure people are well on board the understanding that the generative drive is not just about going out and doing things. It's about doing things in service to and in a way that supports learning, knowing, creating, not just of others and in the world, but inside. Yes, yes. I love the map imagery because you can almost see the map changing, right? As a person, I imagine the person is busying away in the cupboards, right?",
  'title': 'Dr. Paul Conti: How to Build and Maintain Healthy Relationships | Huberman Lab Guest Series',
  'video_id': 'eMqWH3LYiII',
  'score': 0.44999998807907104,
  'cross_score': 0.27966332},
 {'guest': 'Marc Andreessen',
  'content': "In doing so, Mark provides a stark counter-argument for those that argue that AI is going to diminish human experience. So if you're hearing about and or concerned about the ways that AI is likely to destroy us, today you are going to hear about the many diff

Bad pipe message: %s [b'\xfdR\xbe8@\x93\\\xd3\x15\xd2\xc2\x1a\xc2\xec\x07\xc97\x05 _j\x82\xb6U0\xb2\x012\xb4\xb7]', b"\xef\x1af\xf4,\xe4\xce{Q\x1b\n\xcb\xed'QuV\xce\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04"]
Bad pipe message: %s [b'\x01\x02']
Bad pipe message: %s [b'\t\x88\xac\xa9\xd5\xaf@\xf2\xaaId\xb1\xfd\x0e\x9d%WE &\xc2;\xc3{\xba?ZS\xf4g7\xf7\xfe\xb9\xae\x88(J\xdb\x8aR|\xbe)\x84\xc5\x17\xdd\xe7\xaeR\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f']
Bad pipe message: %s [b'\x15\xf4b\xb0\x15\xb01\x1d\x12\x16\xdc\xf3\xac\x1a\xc8K\xf32\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0', b"\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07

### View larger context from response

In [82]:
# Look up the expanded parent chunk for a single doc_id
content_cache['zm0QVutAkYg_3']

"So your chief AI officer is scanning the horizon, understanding it, and then advising members of your team. So every part of your team, right? There's going to be AI supporting sales, and marketing, and engineering, and HR. We're all going to have, in the near term, an AI co-pilot, right? This is an AI that helps you do your job better, because we are so limited as carbon life forms. But ultimately is going to be able to operate and do a number of the things repetitively, because we do a lot of repetitive tasks, and AIs are much better at that. I think if you've got, we've got, say, a 30 person company, every single person needs to be trained in AI, and using these chatbot auto GPT tools, and absolutely augment themselves 10, 20, 100x. I have said to my company, okay, everybody here needs to figure out in your department, what are the tools that exist in AI? And how can you immediately implement them? But even that's pretty vague. Like I'm just sort of dumping it on them. Where do peo

### Extract top-n results from response

In [85]:
# Number of top results to compare; also used when slicing `response` in a later cell
top_n = 3

def get_top_n(response: List[dict], top_n: int=3, cache: Optional[dict]=None):
    '''
    Returns the cached parent chunks for the first top_n search results.

    Args:
        response: search results, each a dict with a 'doc_id' key.
        top_n: how many results, counted from the start of response, to expand.
        cache: mapping of doc_id -> parent chunk text. When omitted, the
               notebook-level `content_cache` is used, which keeps existing
               calls working.

    Returns:
        List of parent chunk texts in the same order as the results.

    Raises:
        KeyError: if a result has no 'doc_id' or its doc_id is not in the cache.
    '''
    # Explicit `is None` check so that an intentionally empty cache is respected.
    lookup = content_cache if cache is None else cache
    return [lookup[d['doc_id']] for d in response[:top_n]]

In [86]:
# Expanded parent chunks for the first 3 results (the default top_n)
get_top_n(response)

["So your chief AI officer is scanning the horizon, understanding it, and then advising members of your team. So every part of your team, right? There's going to be AI supporting sales, and marketing, and engineering, and HR. We're all going to have, in the near term, an AI co-pilot, right? This is an AI that helps you do your job better, because we are so limited as carbon life forms. But ultimately is going to be able to operate and do a number of the things repetitively, because we do a lot of repetitive tasks, and AIs are much better at that. I think if you've got, we've got, say, a 30 person company, every single person needs to be trained in AI, and using these chatbot auto GPT tools, and absolutely augment themselves 10, 20, 100x. I have said to my company, okay, everybody here needs to figure out in your department, what are the tools that exist in AI? And how can you immediately implement them? But even that's pretty vague. Like I'm just sort of dumping it on them. Where do pe

### Compare with original response content

In [42]:
# Original (un-expanded) chunk text of the same top results, for side-by-side comparison
response_content = [d['content'] for d in response[:top_n]]
response_content

In [87]:
import tiktoken

In [88]:
# Same tiktoken lookup as the earlier `encoding = tiktoken.encoding_for_model('gpt-4')` cell, bound to a second name
encoder = tiktoken.encoding_for_model('gpt-4')

In [89]:
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere without editing it
data_path = '/home/elastic/notebooks/datasets/acled_reports/'

In [96]:
# Sorted full paths of the regular files directly inside data_path (sub-directories are skipped)
paths = sorted(
    candidate
    for candidate in (os.path.join(data_path, name) for name in os.listdir(data_path))
    if os.path.isfile(candidate)
)

In [103]:
# Read every report file fully into memory, in the sorted order of `paths`
strings = []

for path in paths:
    # NOTE(review): no encoding is passed, so open() uses the platform default — confirm the files decode correctly with it
    with open(path) as f:
        string = f.read()
        strings.append(string)

In [110]:
import pandas as pd

# One row per report: the number of tokens the encoder produces for it
df = pd.DataFrame([len(tokens) for tokens in encoder.encode_batch(strings)], columns=['lens'])

In [113]:
# Total token count across all reports
df.sum()

lens    21279
dtype: int64