In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
#external files
from preprocessing import FileIO
from openai_interface import GPT_Turbo
from weaviate_interface import WeaviateClient
from retrieval_evaluation import run_evaluation, calc_hit_rate_scores
from reranker import ReRanker

#standard library imports
import json
import time
import os
from math import ceil
from datetime import datetime
from typing import List, Any, Dict, Tuple, Union

#misc
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
env = load_dotenv('./.env', override=True)

### Ingest data

In [3]:
# Load the corpus from parquet. The helpers below treat `data` as a list of
# dicts that carry at least 'doc_id' and 'content' keys.
data_path = './practice_data/impact_theory_minilm_256.parquet'
data = FileIO().load_parquet(data_path)

Shape of data: (26648, 16)
Memory Usage: 3.08+ MB


### Randomly select chunks for Q/A pairs (50 in the run below; `generate_dataset` defaults to 100)

In [4]:
import random

In [5]:
def sample_data(data: List[dict], sample_size: int, seed: Union[int, None]=None) -> List[tuple]:
    """
    Draw `sample_size` distinct records from `data` and return them as
    (doc_id, content) pairs.

    Args:
        data: Records to draw from; each must have 'doc_id' and 'content' keys.
        sample_size: Number of records to draw (sampling is without replacement).
        seed: Optional seed for a private random generator, so the same sample
            can be reproduced across runs. When None (the default) the shared
            `random` module state is used, exactly as before.

    Returns:
        List of (doc_id, content) tuples in the order the records were drawn.

    Raises:
        ValueError: If `sample_size` is negative or larger than len(data)
            (raised by the underlying `sample` call).
    """
    # A dedicated Random instance keeps a seeded draw from disturbing (or being
    # disturbed by) any other code that uses the global random state.
    sampler = random if seed is None else random.Random(seed)
    sample = sampler.sample(data, sample_size)
    return [(d['doc_id'], d['content']) for d in sample]

In [6]:
def get_meta(sample: List[dict], key: str="doc_id") -> List[Any]:
    """
    Collect the value stored under `key` from every record in `sample`.

    Args:
        sample: Records to read from; each must contain `key`.
        key: Field to extract from each record (defaults to "doc_id").

    Returns:
        The extracted values, in the same order as `sample`.
    """
    values = []
    for record in sample:
        values.append(record[key])
    return values

In [7]:
def get_sample(doc_id: str, corpus: List[dict], full_dict: bool=False):
    """
    Look up the first record in `corpus` whose 'doc_id' equals `doc_id`.

    Args:
        doc_id: Identifier to search for.
        corpus: Records to search; each inspected record must have a 'doc_id' key.
        full_dict: When True return the whole record, otherwise only its
            'content' value.

    Returns:
        The matching record (full_dict=True) or its 'content' value.

    Raises:
        IndexError: If no record in `corpus` has the requested doc_id.
    """
    # Stop at the first match instead of filtering the whole corpus into a
    # throwaway list just to take element 0.
    result = next((d for d in corpus if d['doc_id'] == doc_id), None)
    if result is None:
        # Same exception type the old `[...][0]` lookup raised, but with a
        # message that says which id was missing.
        raise IndexError(f"doc_id {doc_id!r} not found in corpus")
    return result if full_dict else result['content']

In [8]:
# Spot-check: display the content of a single chunk, looked up by its doc_id.
get_sample('kE3yryW-FiE_33', data)

"So I'll follow up on that in a minute but I Think you're bang on especially with the idea of identity people end up getting tied up in that but I've heard you say that when it comes to Being right when it comes to people trying to get to the truth that some people are saying no no, like truth is just a power game and even even this idea that there is something right or something better than another thing is is just a structure in essence of oppression So when you said that I think everybody wants the truth Is that true? Well, you're talking about I was probably referring to a very specific ideology kind of this postmodern line of thought which says there is no such Thing as objective truth and then everyone that that everyone has their truth So that's that's a very specific kind of very kind of far-left kind of radical line of thought Which I think I think this is sorry finish sex. That sounds very interesting Oh saying I think it's I think it's perfectly interesting the concept that 

In [9]:
def strip_numbers(query: str):
    """
    Remove a leading list number such as "1. ", "12) " or "3: " from `query`.

    The previous implementation always dropped the first three characters,
    which mangled lines that were not numbered and left residue behind for
    numbers with three or more digits.

    Args:
        query: A single line of text, possibly prefixed with a list number.

    Returns:
        The text with the number prefix (if any) and surrounding whitespace
        removed. Lines without a number prefix are returned stripped but
        otherwise intact.
    """
    text = query.strip()
    remainder = text.lstrip('0123456789')
    has_number = len(remainder) < len(text)
    # Only treat the digits as a list marker when they are followed by a
    # separator and then whitespace (or nothing), so a question that merely
    # starts with a figure such as "3.5 million ..." is left untouched.
    if has_number and remainder[:1] in ('.', ')', ':'):
        if len(remainder) == 1 or remainder[1].isspace():
            return remainder[1:].strip()
    return text

In [10]:
def process_questions(question_tuples: List[tuple]) -> Dict[str, List[str]]:
    """
    Turn (doc_id, questions_text) tuples into a {doc_id: [question, ...]} mapping.

    Each tuple's second element is split on newlines and every line is passed
    through `strip_numbers`. Lines that are empty after cleaning (for example
    blank separator lines in the generated text) are dropped, so they do not
    end up in the dataset as empty questions.

    Args:
        question_tuples: Tuples whose first element is a doc_id and whose
            second element is a newline-separated string of questions.

    Returns:
        Dict mapping each doc_id to its list of cleaned, non-empty questions.
        If a doc_id occurs more than once, the last occurrence wins.
    """
    question_dict = {}
    for tup in question_tuples:
        doc_id, raw_questions = tup[0], tup[1]
        cleaned = (strip_numbers(line) for line in raw_questions.split('\n'))
        question_dict[doc_id] = [question for question in cleaned if question]
    return question_dict

In [11]:
def generate_dataset(data: List[dict], dir_path: str, num_questions: int=100, batch_size: int=50):
    """
    Sample chunks from `data`, generate questions for them with GPT_Turbo in
    rate-limited batches, and save the result as a timestamped JSON file.

    Args:
        data: Corpus records; passed to `sample_data`, which reads the
            'doc_id' and 'content' keys.
        dir_path: Directory the JSON file is written to. It is created if it
            does not exist yet.
        num_questions: Number of chunks to sample. Each sampled chunk may
            yield several questions, so this is the number of dict entries,
            not necessarily the total question count.
        batch_size: Number of chunks sent per batch; must be between 1 and 50.

    Returns:
        The dict produced by `process_questions` ({doc_id: [question, ...]}),
        which is also what is written to disk.

    Raises:
        ValueError: If `batch_size` is greater than 50 or smaller than 1.
    """
    # Validate arguments before constructing the client or making any API call.
    if batch_size > 50:
        raise ValueError('Due to OpenAI rate limits, batch_size cannot be greater than 50')
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Make sure the output location is usable up front, so a bad path fails
    # here rather than after all the generation calls have been made.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    # NOTE(review): the timestamp contains ':' characters, which are not valid
    # in Windows file names - confirm the target platforms.
    time_marker = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    filepath = os.path.join(dir_path, f"{num_questions}_questions_{time_marker}.json")

    gpt = GPT_Turbo()
    sample = sample_data(data, num_questions)
    batches = ceil(num_questions/batch_size)
    all_questions = []
    for n in range(batches):
        batch = sample[n*batch_size:(n+1)*batch_size]
        # presumably returns an iterable of (doc_id, questions_text) tuples -
        # that is the shape process_questions expects; verify against GPT_Turbo
        all_questions.extend(gpt.batch_generate_question_context_pairs(batch))
        # no need to wait after the final batch
        if n < batches - 1:
            print('Pausing for 60 seconds due to OpenAI rate limits...')
            time.sleep(60)
    processed_questions = process_questions(all_questions)
    with open(filepath, 'w') as f:
        json.dump(processed_questions, f, indent=4)
    return processed_questions

In [12]:
# Build the evaluation set from 50 sampled chunks; the result is also saved
# as a timestamped JSON file under ./practice_data/.
dataset = generate_dataset(data=data, dir_path='./practice_data/', num_questions=50)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 11.50Generated Questions/s]


In [13]:
# Credentials come from the environment (loaded from ./.env in the imports cell);
# a missing variable raises KeyError here.
client = WeaviateClient(os.environ['WEAVIATE_API_KEY'], os.environ['WEAVIATE_ENDPOINT'])
# ReRanker is constructed with its default arguments.
reranker = ReRanker()
# intfloat = ReRanker(model_name='intfloat/simlm-msmarco-reranker')

In [14]:
# Display the first class name reported by the Weaviate client; it should match
# the `index` value used for the evaluation below.
client.show_classes()[0]

'Impact_theory_minilm_256'

In [15]:
# Ad-hoc example query (not used by the evaluation cells visible in this notebook).
query = "How did the United States respond to the Soviet Union's advancements in space?"
# Weaviate class name passed to execute_evaluation as `class_name`.
index = 'Impact_theory_minilm_256'

In [34]:
from retrieval_evaluation import calc_hit_rate_scores

In [52]:
def execute_evaluation( dataset: Dict[str, List[str]], 
                        retriever: WeaviateClient,
                        reranker: ReRanker,
                        class_name: str, 
                        alpha: float=0.5,
                        limit: int=10,
                        top_k: int=5,
                        chunk_size: int=256,
                        rerank_all_responses: bool=False,
                        ) -> Dict[str, Any]:
    """
    Count how often the source chunk of each question is retrieved by keyword,
    vector, Weaviate-hybrid and combined (keyword + vector) search.

    For every question in `dataset` the three searches are run against
    `class_name`; a "hit" is recorded for a search type when the question's
    doc_id appears among the first `top_k` returned doc_ids.

    Args:
        dataset: Mapping of doc_id -> list of questions generated from that chunk.
        retriever: Client exposing keyword_search, vector_search, hybrid_search
            and a model_name_or_path attribute.
        reranker: Object exposing rerank(results, query, top_k=...) and a
            model_name attribute; only used when `rerank_all_responses` is True.
        class_name: Weaviate class to query.
        alpha: Passed through to hybrid_search.
        limit: Number of results requested from each search.
        top_k: Number of leading results checked for a hit; a falsy value
            (0 or None) means "use `limit`".
        chunk_size: Recorded in the results for bookkeeping only.
        rerank_all_responses: When True every result list is reranked before
            the top_k check.

    Returns:
        The results dict: run settings, one hit counter per search type and
        'total_questions', after being passed through calc_hit_rate_scores.
    """
    # This name is not imported in the notebook's import cell; without the
    # import the `except` clause below raises NameError on the first failed
    # request instead of handling it.
    from weaviate.exceptions import UnexpectedStatusCodeException

    top_k = top_k if top_k else limit
    reranker_name = reranker.model_name if rerank_all_responses else "None"
    
    results_dict = {'n':limit, 
                    'top_k': top_k, 
                    'alpha': alpha,
                    'Retriever': retriever.model_name_or_path, 
                    'Ranker': reranker_name,
                    'chunk_size': chunk_size,
                    'kw_hit_rate': 0,
                    'vector_hit_rate': 0,
                    'hybrid_hit_rate':0,
                    'combined_hit_rate': 0,
                    'total_questions':0
                    }
    for doc_id, questions in tqdm(dataset.items(), 'Questions'):
        for q in questions:
            # counted even if the request fails below, so a failed question
            # lowers every hit rate
            results_dict['total_questions'] += 1
            
            try:
                #make calls to Weaviate host: Keyword, Vector, and Hybrid
                kw_response = retriever.keyword_search(query=q, class_name=class_name, limit=limit)
                vector_response = retriever.vector_search(query=q, class_name=class_name, limit=limit)
                weaviate_hybrid_response = retriever.hybrid_search(query=q, class_name=class_name, alpha=alpha, limit=limit)

                # counter name -> result list; the combined list is de-duplicated
                # so a chunk found by both searches cannot occupy two top_k slots.
                # Without reranking, the combined list starts with the keyword
                # results, so its top_k mostly mirrors the keyword top_k.
                responses = {
                    'kw_hit_rate': kw_response,
                    'vector_hit_rate': vector_response,
                    'hybrid_hit_rate': weaviate_hybrid_response,
                    'combined_hit_rate': _dedupe_by_doc_id(kw_response + vector_response),
                }

                hits = []
                for counter, response in responses.items():
                    #rerank returned responses if rerank_all is True
                    if rerank_all_responses:
                        response = reranker.rerank(response, q, top_k=top_k)
                    #collect doc_ids to check for document matches (top_k only)
                    top_doc_ids = [res['doc_id'] for res in response][:top_k]
                    if doc_id in top_doc_ids:
                        hits.append(counter)

            except UnexpectedStatusCodeException as e:
                print(e)
                continue

            # counters are only updated once every step for this question
            # succeeded, so a failure never leaves a partial update behind
            for counter in hits:
                results_dict[counter] += 1

    #use raw counts to calculate final scores
    # (presumably updates results_dict in place - its return value is not used)
    calc_hit_rate_scores(results_dict)
    
    return results_dict


def _dedupe_by_doc_id(responses: List[dict]) -> List[dict]:
    """Return `responses` without later repeats of an already-seen doc_id, order preserved."""
    seen = set()
    unique = []
    for res in responses:
        if res['doc_id'] not in seen:
            seen.add(res['doc_id'])
            unique.append(res)
    return unique

In [53]:
%%time

# Sweep the hybrid-search alpha over 0, 0.25 and 0.5, requesting 100 results
# per search and reranking every result list; one results dict is collected per alpha.
results = []
for alpha in np.linspace(0,0.5,num=3):
    result = execute_evaluation(dataset, client, reranker, index, alpha=alpha, limit=100, rerank_all_responses=True)
    results.append(result)

Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:19<00:00,  2.78s/it]
Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:18<00:00,  2.76s/it]
Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:18<00:00,  2.78s/it]

CPU times: user 4min 16s, sys: 3.32 s, total: 4min 20s
Wall time: 6min 55s





In [54]:
from rich import print

In [55]:
# `print` here is rich.print (imported in the previous cell), which shadows the builtin.
print(results)

Bad pipe message: %s [b"&ec\xce\xae\xc6o\xc2\x19\x11\xc2\xc5\xeb\xd3\xf9\xcb\xa0D\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04", b'\x08\x06\x04\x01\x05\x01\x06', b'', b'\x03\x03']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'', b'\x02']
Bad pipe message: %s [b'\x05\x02\x06']
Bad pipe message: %s [b'$\x86\x15}<\xf7bDfn\xa3\x1a\xef\x0e\xd7\xbd\xca\t\