In [148]:
import json
import re
import time
from math import ceil
from typing import List, Any, Dict, Tuple

import openai
from llama_index.evaluation import generate_question_context_pairs, generate_qa_embedding_pairs
from tqdm import tqdm

from index_templates import youtube_body
from openai_interface import GPT_Turbo
from opensearch_interface import OpenSearchClient
from preprocessing import FileIO
from reranker import ReRanker

### Ingest data

In [102]:
# Load the corpus that every later cell samples from and searches against.
# NOTE(review): downstream helpers index each record by 'doc_id' and 'content',
# so `data` is presumably a list of dicts -- confirm against FileIO.load_parquet.
data_path = './data/impact_theory_gte_196.parquet'
data = FileIO().load_parquet(data_path)

Shape of data: (37007, 16)
Memory Usage: 4.27+ MB


### Randomly select 100 chunks for Q/A pairs

In [103]:
import random

In [104]:
def sample_data(data: List[dict], sample_size: int, seed: int=None) -> List[tuple]:
    """
    Draws `sample_size` distinct records from `data` without replacement and
    returns a (doc_id, content) tuple for each drawn record.

    Args:
        data: records to sample from; each must have 'doc_id' and 'content' keys.
        sample_size: number of records to draw.
        seed: optional seed. When given, a dedicated random.Random(seed) is used
            so the same sample is drawn on every run (and the global random
            state is left untouched). When None, the module-level generator
            is used.

    Returns:
        List of (doc_id, content) tuples in the order they were drawn.

    Raises:
        ValueError: if sample_size is larger than len(data) or negative
            (raised by random.sample).
    """
    rng = random.Random(seed) if seed is not None else random
    sample = rng.sample(data, sample_size)
    return [(d['doc_id'], d['content']) for d in sample]

In [105]:
def get_meta(sample: List[dict], key: str="doc_id") -> List[Any]:
    """
    Pulls a single field out of every record in `sample`.

    Args:
        sample: records to read from.
        key: name of the field to extract from each record.

    Returns:
        The values stored under `key`, in the same order as `sample`.

    Raises:
        KeyError: if any record does not contain `key`.
    """
    values = []
    for record in sample:
        values.append(record[key])
    return values

In [106]:
def get_sample(doc_id: str, corpus: List[dict], full_dict: bool=False):
    """
    Looks up the first record in `corpus` whose 'doc_id' equals `doc_id`.

    Args:
        doc_id: identifier of the record to find.
        corpus: records to search; each is indexed by 'doc_id'.
        full_dict: when True return the whole record, otherwise only its
            'content' value.

    Returns:
        The matching record (full_dict=True) or its 'content' value.

    Raises:
        IndexError: if no record has the requested doc_id.
    """
    # next() stops at the first match instead of filtering the entire corpus
    result = next((d for d in corpus if d['doc_id'] == doc_id), None)
    if result is None:
        # IndexError is kept as the exception type so callers that already
        # handle a failed lookup keep working; the message now names the id.
        raise IndexError(f'doc_id "{doc_id}" not found in corpus')
    return result if full_dict else result['content']

In [107]:
# Spot-check: display the content stored for a single doc_id
get_sample('kE3yryW-FiE_33', data)

"that would be nice if we all thought that way But of course, there's this primitive mind that we talked about which is this part of your brain that is not wired for truth it's wired for survival in 50,000 BC and what that often meant was agreeing with The sacred beliefs of your tribe and believing them and the people who could believe what the tribe believed With full conviction they survived. Well, they were you know, they were on the in-group they fit in and that's what was needed What's up, guys?"

In [108]:
def strip_numbers(query: str) -> str:
    """
    Removes a leading list marker such as "1. ", "12. " or "3) " from a
    generated question and trims surrounding whitespace.

    Text without such a marker is returned unchanged (apart from trimming),
    so multi-digit numbering and unnumbered lines are no longer truncated.
    A marker is only recognised when the digits are followed by "." or ")"
    and then whitespace or end of string, so text like "3.5 million ..." is
    left intact.

    Args:
        query: one line of generated question text.

    Returns:
        The question text without its enumeration prefix.
    """
    return re.sub(r'^\s*\d+[.)](?:\s+|$)', '', query).strip()

In [109]:
def process_questions(question_tuples: List[tuple]) -> Dict[str, List[str]]:
    """
    Converts (doc_id, raw_question_text) tuples into a doc_id -> questions mapping.

    The raw text (second tuple element) is split on newlines, each line is
    cleaned with strip_numbers, and lines that are empty after cleaning are
    dropped so blank lines never become empty-string questions.

    Args:
        question_tuples: tuples whose first element is a doc_id and whose
            second element is newline-separated question text.

    Returns:
        Dict keyed by doc_id holding the list of cleaned questions. If a
        doc_id occurs more than once, the last occurrence wins.
    """
    question_dict = {}
    for tup in question_tuples:
        doc_id, raw_text = tup[0], tup[1]
        cleaned = (strip_numbers(line) for line in raw_text.split('\n'))
        # skip blank lines (e.g. a trailing newline or a blank separator line)
        question_dict[doc_id] = [question for question in cleaned if question]
    return question_dict

In [112]:
def generate_dataset(data: List[dict], 
                     num_questions: int=100, 
                     batch_size: int=50,
                     filepath: str='./data/100_questions.txt',
                     pause_seconds: int=60
                    ) -> Dict[str, List[str]]:
    """
    Samples `num_questions` records from `data`, asks GPT_Turbo to generate
    questions for them batch by batch, and returns the processed result.

    Args:
        data: records to sample from (passed to sample_data).
        num_questions: number of records to sample.
        batch_size: number of sampled records sent per request; must be
            between 1 and 50.
        filepath: forwarded to batch_generate_question_context_pairs as its
            `filepath` argument.
        pause_seconds: seconds to sleep between consecutive batches.

    Returns:
        The output of process_questions for all generated
        (doc_id, questions) tuples.

    Raises:
        ValueError: if batch_size is greater than 50 or less than 1.
    """
    # Validate arguments before constructing the client so that bad input
    # fails fast and without side effects.
    if batch_size > 50:
        raise ValueError('Due to OpenAI rate limits, batch_size cannot be greater than 50')
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    gpt = GPT_Turbo()
    sample = sample_data(data, num_questions)
    batches = ceil(num_questions/batch_size)
    all_questions = []
    for n in range(batches):
        batch = sample[n*batch_size:(n+1)*batch_size]
        questions = gpt.batch_generate_question_context_pairs(batch, filepath=filepath)
        all_questions.extend(questions)
        # no pause is needed after the final batch
        if n < batches - 1:
            print(f'Pausing for {pause_seconds} seconds due to OpenAI rate limits...')
            time.sleep(pause_seconds)
    return process_questions(all_questions)

In [113]:
# Build the evaluation set with generate_dataset's defaults (100 sampled
# records, batches of 50). This calls the GPT_Turbo client and sleeps between
# batches, so the cell is slow to re-run.
dataset = generate_dataset(data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.13Generated Questions/s]


Pausing for 60 seconds due to OpenAI rate limits...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.53Generated Questions/s]


Pausing for 60 seconds due to OpenAI rate limits...


In [129]:
# Search client and reranker shared (as notebook globals) by every cell below.
# NOTE(review): model_name_or_path is an absolute, machine-specific path --
# consider a configurable/relative location so the notebook runs elsewhere.
osclient = OpenSearchClient(model_name_or_path='/home/elastic/notebooks/vector_search_applications/models/gte-base/')
reranker = ReRanker()

In [117]:
# Example query plus the two index names (keyword and vector) used for the
# manual search comparison in the following cells
query = 'how to define success'
kw_index = 'kw-impact-theory-196'
vec_index = 'semantic-impact-theory-196'

In [118]:
# Run the same query through keyword, vector and hybrid search, then show how
# many hits the hybrid search returned.
kw = osclient.keyword_search(query, kw_index, 25)
vec = osclient.vector_search(query, vec_index, 25)
hybrid = osclient.hybrid_search(query, kw_index, vec_index)
len(hybrid)

[32m2023-10-13 18:33:49.922[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m319[0m - [1mDuplicate Hit: y8bwEgCcOXs_50 on index semantic-impact-theory-196[0m
[32m2023-10-13 18:33:49.923[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m319[0m - [1mDuplicate Hit: 5m81Qsw0gLw_4 on index semantic-impact-theory-196[0m
[32m2023-10-13 18:33:49.923[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m319[0m - [1mDuplicate Hit: govmnjHxMUc_13 on index semantic-impact-theory-196[0m
[32m2023-10-13 18:33:49.924[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m319[0m - [1mDuplicate Hit: JNbUb6FOEKw_0 on index semantic-impact-theory-196[0m
[32m2023-10-13 18:33:49.924[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m319[0m - [1mDuplicate Hit: Vs7KtoS0eKI_90 on index semantic-impact-th

44

In [119]:
# Rerank the hybrid hits for the query and show how many results come back.
# NOTE(review): the recorded output shows 29 results although top_k=5 was
# passed -- confirm how ReRanker.rerank combines top_k with threshold.
ranked_results = reranker.rerank(hybrid, query, top_k=5, threshold=0)
len(ranked_results)

29

In [161]:
def get_results(dataset: Dict[str, List[str]], 
                kw_index_name: str, 
                vector_index_name: str, 
                response_size: int, 
                top_k: int=None,
                record_result: bool=True,
                results_path: str='./data/retrieval_evalution_196.txt'
               ) -> Tuple[int, int, int, int, int]:
    """
    Counts, for every question in `dataset`, whether the document the question
    was generated from is retrieved by keyword search, vector search, hybrid
    search, and reranked hybrid search.

    Searches go through the notebook-level `osclient`; reranking goes through
    the notebook-level `reranker`. A hit is counted when the question's source
    doc_id appears among the '_source.doc_id' values of the returned results.

    Keyword and vector results are truncated to the first `top_k` hits before
    checking. Hybrid results are checked in full (not truncated), and reranked
    results are used exactly as returned by reranker.rerank(..., top_k=top_k).
    NOTE(review): this makes the hybrid count a candidate-pool figure rather
    than a top_k figure -- confirm that is the intended comparison.

    Args:
        dataset: mapping of source doc_id -> list of questions.
        kw_index_name: index queried by keyword search (and hybrid search).
        vector_index_name: index queried by vector search (and hybrid search).
        response_size: `size` passed to keyword/vector search and
            `kw_size`/`vec_size` passed to hybrid search.
        top_k: cut-off applied as described above; when falsy (None or 0) it
            defaults to response_size + 1.
        record_result: when True, append a one-line summary to `results_path`
            and print it.
        results_path: file the summary line is appended to.

    Returns:
        Tuple of (kw_recall, vector_recall, hybrid_recall, ranked_recall,
        total_questions) where the first four are hit counts.
    """
    top_k = top_k if top_k else response_size + 1

    # hit counters per retrieval method
    kw_recall = 0
    vector_recall = 0
    hybrid_recall = 0
    ranked_recall = 0
    total_questions = 0

    def _ratio(hits: int) -> float:
        # guard against an empty dataset (0 questions) instead of dividing by zero
        return round(hits/total_questions, 2) if total_questions else 0.0

    for doc_id, questions in tqdm(dataset.items(), 'Questions'):
        for q in questions:
            total_questions += 1

            # query the search host three ways: keyword, vector, and hybrid
            kw_response = osclient.keyword_search(query=q, index=kw_index_name, size=response_size)
            vector_response = osclient.vector_search(query=q, index=vector_index_name, size=response_size)
            hybrid_response = osclient.hybrid_search(q, kw_index_name, vector_index_name, kw_size=response_size, vec_size=response_size)

            # collect doc_ids to check for a match; only keyword and vector
            # results are cut to top_k here
            kw_doc_ids = [res['_source']['doc_id'] for res in kw_response][:top_k]
            vector_doc_ids = [res['_source']['doc_id'] for res in vector_response][:top_k]
            hybrid_doc_ids = [res['_source']['doc_id'] for res in hybrid_response]

            # rerank the hybrid results
            ranked_results = reranker.rerank(hybrid_response, q, top_k=top_k)
            ranked_doc_ids = [res['_source']['doc_id'] for res in ranked_results]

            # increment hit counters as appropriate
            if doc_id in kw_doc_ids:
                kw_recall += 1
            if doc_id in vector_doc_ids:
                vector_recall += 1
            if doc_id in hybrid_doc_ids:
                hybrid_recall += 1
            if doc_id in ranked_doc_ids:
                ranked_recall += 1

    # append a summary line to the results file
    if record_result:
        # 12 spaces before each tab-separated field keep the line identical to
        # the format of lines already recorded in the results file
        pad = ' ' * 12
        statement = (
            f'n={response_size} -->'
            f'{pad}\tKeyword: {kw_recall}/{total_questions} = {_ratio(kw_recall)}'
            f'{pad}\t\tVector: {vector_recall}/{total_questions} = {_ratio(vector_recall)}'
            f'{pad}\t\tHybrid: {hybrid_recall}/{total_questions} = {_ratio(hybrid_recall)}'
            f'{pad}\t\tRanked: {ranked_recall}/{total_questions} = {_ratio(ranked_recall)}'
        )
        with open(results_path, 'a') as f:
            f.write(statement)
            f.write('\n')
        print(statement)
    return kw_recall, vector_recall, hybrid_recall, ranked_recall, total_questions
        

In [164]:
kw_index = 'kw-impact-theory-196'
sem_index = 'semantic-impact-theory-196'
chunk_size = 196

# Sweep the per-search response size from 60 to 100 in steps of 10 while
# holding top_k at 10. Every run's counts are kept in `results_by_size`
# (keyed by response size) so the sweep can be compared afterwards; `results`
# still holds the counts of the last run, as before.
results_by_size = {}
for x in range(60,110,10):
    results = get_results(dataset, kw_index, sem_index, x, top_k=10)
    results_by_size[x] = results
    

Questions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:29<00:00,  3.34it/s]


n=60 -->            	Keyword: 139/200 = 0.69            		Vector: 132/200 = 0.66            		Hybrid: 174/200 = 0.87            		Ranked: 167/200 = 0.83


Questions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.19it/s]


n=70 -->            	Keyword: 139/200 = 0.69            		Vector: 132/200 = 0.66            		Hybrid: 175/200 = 0.88            		Ranked: 168/200 = 0.84


Questions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:32<00:00,  3.04it/s]


n=80 -->            	Keyword: 139/200 = 0.69            		Vector: 132/200 = 0.66            		Hybrid: 176/200 = 0.88            		Ranked: 168/200 = 0.84


Questions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:34<00:00,  2.93it/s]


n=90 -->            	Keyword: 139/200 = 0.69            		Vector: 132/200 = 0.66            		Hybrid: 176/200 = 0.88            		Ranked: 168/200 = 0.84


Questions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:35<00:00,  2.78it/s]

n=100 -->            	Keyword: 139/200 = 0.69            		Vector: 132/200 = 0.66            		Hybrid: 176/200 = 0.88            		Ranked: 167/200 = 0.83





In [171]:
# Example questions used for manually inspecting raw search results
queries = [
    'What is the importance of having clarity in life?',
    'How specific should one be when setting goals?',
]

In [174]:
# get_sample(doc_id='9aRy7DZ0Ek4_1', corpus=data, full_dict=True)

In [175]:
# Inspect the raw keyword-search response for the first example question.
# NOTE(review): this displays the full response payload; consider showing only
# the doc_ids/scores to keep the notebook output small.
osclient.keyword_search(queries[0], kw_index)

[{'_index': 'kw-impact-theory-196',
  '_id': '_8CUJIsBbbW07Kw5qBDk',
  '_score': 14.765857,
  '_source': {'length': 12406,
   'episode_num': 372,
   'title': 'Turning Boys Into Men: How To Stop Being WEAK & Become A 1% Man  | Tom Bilyeu',
   'thumbnail_url': 'https://i.ytimg.com/vi/WhLdpjZjUrw/hq720.jpg',
   'doc_id': 'WhLdpjZjUrw_70',
   'publish_date': '06-29-2023',
   'content': "Everything will relent to your superior will. Everything will relent to your superior will. Never forget that. The reason most people fail to achieve their goals is twofold. Number one, they don't have a sufficient level of clarity. And number two, they don't want it badly enough. The statistic is that 92% of all people that set a New Year's resolution fail to stick with it. The way that you go from being in the 92% to being in the 8% is by having a freakish level of clarity and building so much desire in your life that nothing could stop you even if it tried. This is what you need to do. The clarity piece 

Bad pipe message: %s [b'.(W\x11>lW\xf8\xc0\xfd6\xf7\xd9\x1az\xccP\x0e \xf95\xe0\xd6uX6\xb7\x98\x12I\x08&\xf0\x15\xc8\xd4!\x1ao\x0e\x93/\xea\xb3\xfa\x94Y\xed\x97\xe1\xf7\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xbd\x14,\xed\xdb\xe9;<\x10\x13\xb54W!t\xff\x85\xff\x11.\xe1\x83\xa3O.']
Bad pipe message: %s [b"\x0c\x0f\xb8}\xf9\xbe\xb2d\xad\xd2\x06\xb6\xb66q\x9d\x19\xd8\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\