In [4]:
%load_ext autoreload
%autoreload 2

#load from local .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

import sys
sys.path.append('..')

#standard libraries
import json
import os
import time
from typing import List
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm.notebook import tqdm
from dotenv import load_dotenv
env = load_dotenv('.env', override=True)

#external files
from src.preprocessor.preprocessing import FileIO
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market
from sentence_transformers import SentenceTransformer
from src.weaviate_stuff.weaviate_interface import WeaviateClient, WeaviateIndexer
from src.weaviate_stuff.class_templates import impact_theory_class_properties
from src.pipelines.pipeline import chunk_data, create_vectors, join_docs, create_dataset, groupby_episode, create_parent_chunks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
files.append(("files", open(path, 'rb')))

In [11]:
# Collect ("files", file-handle) tuples for the test file
# (presumably the shape a multipart upload expects — TODO confirm against the consumer).
files = []
path = '../data/unicode_test.txt'
with open(path, 'rb') as f:
    # reads the entire file, leaving the handle positioned at EOF
    data = f.read()
    # NOTE(review): `f` is closed as soon as this `with` block exits, so the handle
    # stored in `files` cannot be read from afterwards.
    files.append(("files", f))
files

[('files', <_io.BufferedReader name='../data/unicode_test.txt'>)]

In [28]:
files[0][1].__class__.__name__

'BufferedReader'

In [30]:
# Round-trip the file contents through str: decoding with errors='strict'
# raises UnicodeDecodeError if the bytes are not valid UTF-8.
with open('../data/unicode_test.txt', 'rb') as f:
    mystring = f.read().decode(encoding='utf-8', errors='strict')
    # re-encode with the default codec (UTF-8); `mystring` ends up as bytes, not str
    mystring = mystring.encode()
type(mystring)

bytes

In [39]:
BytesIO(mystring).__class__

_io.BytesIO

Bad pipe message: %s [b'\xdfMo\x86>\x0b`\xbc\xe1G\x9a\xab\xe2BH\x14r\x90 \xf8X\x12\x88u\xfd\x81\x10)', b'\x0f\x07K\xafJ\xb2\x03\x93\x0b\x9e\xb4\xcf\xa5\xa2\xe0\xc7\xd3\n\xfe\xa4\xf0\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02']
Bad pipe message: %s [b'\x00-']
Bad pipe message: %s [b'[\xec\xcd\x9a\xe2\x10\x87.\xbd\xb8\x8d\xa5\x15\xbe\xcd\xb36\xb2 ^&\xa4zG\xb3!\xc7A\xca\xca\x18:\x8d1\xdb:\xac?\x14\xc5\x1f\xb9\x1e]\xc4\xfb\xeeD4\xe7y\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x1

In [29]:
from io import BytesIO, BufferedReader

In [20]:
# `mystring` may be either str or bytes depending on which cells ran before this one
# (the decode/encode cell leaves it as bytes, and bytes has no .encode()),
# so only encode when it is still text.
filelike = BytesIO(mystring if isinstance(mystring, bytes) else mystring.encode())
filelike.read()

b"a little bit of code, 'eh? \xc2\xac\xcb\x9a\xe2\x88\x86\xc2\xa9\xcb\x99\xe2\x88\x9e\xc2\xa7\xc2\xb6\xe2\x80\xa2\xc2\xaa"

### Set Constants

In [2]:
# Chunk size handed to the text splitter below (presumably measured in tokens,
# since the splitter is given the tiktoken encoder as its tokenizer — TODO confirm).
chunk_size = 128

# tokenizer: tiktoken encoding associated with the given OpenAI model name
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
# text splitter: no overlap between chunks, uses the tokenizer above
splitter = SentenceSplitter(chunk_overlap=0, chunk_size=chunk_size, tokenizer=encoding.encode)
# embedding model, loaded from a local directory
# NOTE(review): this path and the corpus path are relative to './', while other
# cells in this notebook use '../' paths — confirm the intended working directory.
model = SentenceTransformer('./models/finetuned-all-MiniLM-L6-v2-300/')
# corpus: raw data loaded from JSON (rebinds the name `data`)
data = FileIO().load_json('./data/impact_theory_data.json')

### Create Dataset

In [4]:
# Output path prefix for the saved dataset; per the log line below this cell, the
# file written was '<prefix>-128.parquet'.
outpath = '../vsa_practice/practice_data/impact-theory-finetuneminilm'
# Runs the chunking and vector-creation stages (see the CHUNKING / VECTORS progress bars)
docs = create_dataset(data, model, splitter, file_outpath_prefix=outpath)

CHUNKING:   0%|          | 0/384 [00:00<?, ?it/s]

VECTORS:   0%|          | 0/384 [00:00<?, ?it/s]

[32m2023-12-20 13:24:06.325[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m41[0m - [1mDataFrame saved as parquet file here: ../vsa_practice/practice_data/impact-theory-finetuneminilm-128.parquet[0m


### Create Expanded Content property 

In [10]:
grouped = groupby_episode(docs)

In [12]:
pchunks = create_parent_chunks(grouped, window_size=1)

In [47]:
# Attach each parent chunk to its source document as `expanded_content`.
# `pchunks` and `docs` are matched by position, so the doc_id of every pair is
# verified before writing.
for i, chunk in enumerate(pchunks):
    # each parent chunk is expected to be a one-entry {doc_id: text} dict;
    # its first key is taken as the doc_id
    doc_id = list(chunk.keys())[0]
    # the message previously referenced an undefined name (`k`), which turned a
    # mismatch into a NameError instead of a readable AssertionError
    assert doc_id == docs[i]['doc_id'], f"failed at line {i}\t{doc_id} != {docs[i]['doc_id']}"
    docs[i]['expanded_content'] = chunk[doc_id]

### Create Weaviate Client

In [44]:
#read env vars from local .env file (raises KeyError if either variable is missing)
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']

#instantiate client
client = WeaviateClient(api_key, url, model_name_or_path='./models/finetuned-all-MiniLM-L6-v2-300/')

#check if WCS instance is live and ready
#(a bare expression in the middle of a cell is never displayed, so print the result explicitly)
print(f'live: {client.is_live()} | ready: {client.is_ready()}')

#indexer used further down to batch-upload the docs
indexer = WeaviateIndexer(client, batch_size=200, num_workers=2)

In [45]:
client.show_classes()

['Fine_tuned_minilm_256']

### Load data from disk?

In [6]:
# docs = FileIO().load_parquet('../vsa_practice/practice_data/impact-theory-finetuneminilm-256.parquet')

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


### Configure Index

In [54]:
class_name = 'Fine_tuned_minilm_128'

# Schema payload describing a single class, passed to `client.schema.create`.
class_config = {
    'classes': [
        {
            "class": class_name,
            "description": "Episodes of Impact Theory up to Nov 2023",
            "vectorIndexType": "hnsw",
            # Vector index specific settings
            "vectorIndexConfig": {
                "ef": 60,
                "efConstruction": 500,
                "maxConnections": 128,
            },
            # "none": presumably vectors are supplied with each object at index
            # time rather than computed by Weaviate — confirm against the indexer
            "vectorizer": "none",
            # pre-defined property mappings
            "properties": impact_theory_class_properties,
        }
    ]
}

In [61]:
client.delete_class(class_name)

'Class "Fine_tuned_minilm_128" deleted'

### Create Schema and Index Docs

In [62]:
client.schema.create(class_config)
# client.schema.update_config(class_name, config={'vectorIndexConfig': {'ef': 500}})

In [63]:
# print(client.show_class_config(class_name))

In [64]:
indexer.batch_index_data(docs, class_name)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 55537/55537 [09:23<00:00, 98.60it/s]


Batch job completed in 9.4 minutes.
{'class': 'Fine_tuned_minilm_128', 'name': 'nQa6XGARE5nY', 'objectCount': 55537, 'vectorIndexingStatus': 'READY', 'vectorQueueLength': 0}


## Small-to-Big Retrieval

In [3]:
data = FileIO().load_parquet('../impact-theory-new-ft-model-256.parquet')

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


### Remove Embeddings

In [4]:
data = [{k:v for k,v in d.items() if k != 'content_embedding'} for d in data]

## Breakout Episodes

In [5]:
def break_into_episodes(data: List[dict], expected_episodes: int=None) -> List[list]:
    '''
    Separates an entire corpus into individual lists of discrete episodes.

    Consecutive records that share the same 'video_id' are collected into one
    episode, so `data` must already be ordered with each episode's records
    adjacent to one another.

    Args:
        data: records to split; each must contain a 'video_id' key.
        expected_episodes: optional sanity check. When given, the number of
            episodes found must equal this value. (Earlier versions always
            asserted exactly 384 episodes; pass expected_episodes=384 to
            keep that check.)

    Returns:
        List of episodes in order of first appearance; each episode is a list
        of the original record dicts. Empty input returns an empty list.

    Raises:
        KeyError: a record has no 'video_id' key.
        AssertionError: `expected_episodes` was given and does not match.
    '''
    all_episodes = []
    for d in data:
        video_id = d['video_id']
        # Compare against the episode currently being built instead of a separate
        # "current id" variable, so falsy ids ('' / 0 / None) are not merged into
        # the following episode.
        if all_episodes and all_episodes[-1][0]['video_id'] == video_id:
            all_episodes[-1].append(d)
        else:
            all_episodes.append([d])
    if expected_episodes is not None:
        assert len(all_episodes) == expected_episodes, \
            f'expected {expected_episodes} episodes, found {len(all_episodes)}'
    return all_episodes

In [29]:
from itertools import groupby

def groupby_episode(data: List[dict], key_field: str='video_id') -> List[List[dict]]:
    '''
    Splits the corpus into per-episode lists of chunks.

    Each run of consecutive records sharing the same value under `key_field`
    becomes one episode; input order is preserved. Records with the same key
    that are not adjacent in `data` end up in separate lists.
    '''
    return [list(members) for _, members in groupby(data, key=lambda record: record[key_field])]

In [30]:
all_episodes = groupby_episode(data)

### Combine episode chunks into parent chunks, one for each doc_id

In [78]:
def create_parent_chunks(episode_list: List[list], window_size: int=2) -> List[dict]:
    '''
    Builds a "parent chunk" for every chunk of every episode, for use with
    small-to-big retrieval.

    A parent chunk is the chunk's own 'content' joined (with single spaces) to
    the content of up to `window_size` chunks before it and up to `window_size`
    chunks after it within the same episode. With window_size=2 that is at most
    five chunks; the window is simply shorter near the start or end of an episode.
    Neighbours are taken by position in the episode list.

    Returns one single-entry dict {doc_id: joined_text} per input chunk, in
    input order.
    '''
    parent_chunks = []
    for episode in episode_list:
        texts = [chunk['content'] for chunk in episode]
        parent_chunks.extend(
            # slicing past the end of `texts` is safe; only the lower bound needs clamping
            {chunk['doc_id']: ' '.join(texts[max(0, position - window_size):position + window_size + 1])}
            for position, chunk in enumerate(episode)
        )
    return parent_chunks

In [88]:
pchunks = create_parent_chunks(all_episodes, window_size=3)

In [89]:
def create_parent_chunk_cache(parent_chunks: List[dict]) -> dict:
    '''
    Creates a simple in-memory cache for quick parent chunk lookup.
    Used for small-to-big retrieval in a RAG system.

    Args:
        parent_chunks: list of dicts whose entries are merged into the cache
            (here: single-entry {doc_id: parent_text} dicts).

    Returns:
        One flat dict containing every key/value pair from `parent_chunks`.
        If a key occurs more than once, the last occurrence wins.
    '''
    content_cache = {}
    # iterate the argument, not the notebook-level `pchunks` variable
    for chunk in parent_chunks:
        content_cache.update(chunk)
    return content_cache

In [90]:
cache = create_parent_chunk_cache(pchunks)
# later cells look the cache up under the name `content_cache`, which is not
# assigned anywhere else in this notebook; bind both names to the same dict
content_cache = cache

In [91]:
alltext = ' '.join(list(cache.values()))

In [92]:
import tiktoken

In [93]:
encoding = tiktoken.encoding_for_model('gpt-4')

In [94]:
len(encoding.encode(alltext))

40777582

# ------------------- BREAK ---------------------------

# Small to Big Retrieval

In [47]:
print(f'Size of original data: {len(data)}')
print(f'Size of cached content: {len(content_cache)}')

In [13]:
class_name =  'Fine_tuned_on_300'

### Hybrid Search call

In [84]:
response = client.hybrid_search('does this show discuss the use of generative ai', class_name, properties=['content', 'summary', 'guest'], alpha=0.45)
# response

### View larger context from response

In [82]:
content_cache['zm0QVutAkYg_3']

"So your chief AI officer is scanning the horizon, understanding it, and then advising members of your team. So every part of your team, right? There's going to be AI supporting sales, and marketing, and engineering, and HR. We're all going to have, in the near term, an AI co-pilot, right? This is an AI that helps you do your job better, because we are so limited as carbon life forms. But ultimately is going to be able to operate and do a number of the things repetitively, because we do a lot of repetitive tasks, and AIs are much better at that. I think if you've got, we've got, say, a 30 person company, every single person needs to be trained in AI, and using these chatbot auto GPT tools, and absolutely augment themselves 10, 20, 100x. I have said to my company, okay, everybody here needs to figure out in your department, what are the tools that exist in AI? And how can you immediately implement them? But even that's pretty vague. Like I'm just sort of dumping it on them. Where do peo

### Extract top-n results from response

In [85]:
top_n = 3

def get_top_n(response: List[dict], top_n: int=3, cache: dict=None) -> list:
    '''
    Returns the cached parent-chunk content for the first `top_n` search results.

    Args:
        response: search results in ranked order; each must contain a 'doc_id' key.
        top_n: number of leading results to expand.
        cache: mapping of doc_id -> parent content. When omitted, the
            notebook-level `content_cache` is used, as before.

    Returns:
        List of cached values, in the same order as the results.

    Raises:
        KeyError: a result has no 'doc_id', or its doc_id is missing from the cache.
    '''
    if cache is None:
        # backward-compatible fallback to the global cache
        cache = content_cache
    return [cache[d['doc_id']] for d in response[:top_n]]

In [86]:
get_top_n(response)

["So your chief AI officer is scanning the horizon, understanding it, and then advising members of your team. So every part of your team, right? There's going to be AI supporting sales, and marketing, and engineering, and HR. We're all going to have, in the near term, an AI co-pilot, right? This is an AI that helps you do your job better, because we are so limited as carbon life forms. But ultimately is going to be able to operate and do a number of the things repetitively, because we do a lot of repetitive tasks, and AIs are much better at that. I think if you've got, we've got, say, a 30 person company, every single person needs to be trained in AI, and using these chatbot auto GPT tools, and absolutely augment themselves 10, 20, 100x. I have said to my company, okay, everybody here needs to figure out in your department, what are the tools that exist in AI? And how can you immediately implement them? But even that's pretty vague. Like I'm just sort of dumping it on them. Where do pe

### Compare with original response content

In [42]:
response_content = [d['content'] for d in response[:top_n]]
response_content

In [87]:
import tiktoken

In [88]:
encoder = tiktoken.encoding_for_model('gpt-4')

In [89]:
data_path = '/home/elastic/notebooks/datasets/acled_reports/'

In [96]:
paths = sorted([os.path.join(data_path, file) for file in os.listdir(data_path) if os.path.isfile(os.path.join(data_path, file))])

In [103]:
# Read every report into memory as text, one string per file, in `paths` order.
strings = []

for path in paths:
    # explicit encoding so the result does not depend on the machine's locale default
    # NOTE(review): assumes the reports are UTF-8 — confirm against the dataset
    with open(path, encoding='utf-8') as f:
        string = f.read()
        strings.append(string)

In [110]:
import pandas as pd

df = pd.DataFrame(list(map(len, encoder.encode_batch(strings))), columns=['lens'])

In [113]:
df.sum()

lens    21279
dtype: int64