In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
#external files
from preprocessing import FileIO
from openai_interface import GPT_Turbo
from weaviate_interface import WeaviateClient
from retrieval_evaluation import (execute_evaluation, calc_hit_rate_scores, calc_mrr_scores)
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from reranker import ReRanker

#standard library imports
import json
import time
import os
from math import ceil
from datetime import datetime
from typing import List, Any, Dict, Tuple, Union

#misc
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
env = load_dotenv('./.env', override=True)

### Ingest data

In [3]:
data_path = '../impact_theory_minilm_256.parquet'
data = FileIO().load_parquet(data_path)

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


### Instantiate Weaviate client

In [46]:
client = WeaviateClient(os.environ['WEAVIATE_API_KEY'], os.environ['WEAVIATE_ENDPOINT'])

#check if WCS instance is live and ready
client.is_live(), client.is_ready()

(True, True)

In [48]:
client.show_classes()

['Impact_theory_minilm_256', 'ImpactTest4', 'ImpactTest']

In [10]:
class_name = 'Impact_theory_minilm_256'
train_dataset = EmbeddingQAFinetuneDataset.from_json("./data/training_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [116]:
def execute_evaluation( dataset: Dict[str, List[str]], 
                        class_name: str, 
                        retriever: WeaviateClient,
                        reranker: ReRanker=None,
                        alpha: float=0.5,
                        retrieve_limit: int=5,
                        results_top_k: int=5,
                        rerank_top_k: int=5,
                        chunk_size: int=256,
                        display_properties: List[str]=['doc_id', 'content']
                        ) -> Tuple[int, int, int]:

    if results_top_k > retrieve_limit:  # we don't want to retrieve less results than the top_k that we want to see returned
        retrieve_limit = results_top_k
        
    reranker_name = reranker.model_name if reranker else "None"
    
    results_dict = {'n':retrieve_limit, 
                    'top_k': results_top_k,
                    'alpha': alpha,
                    'Retriever': retriever.model_name_or_path, 
                    'Ranker': reranker_name,
                    'chunk_size': chunk_size,
                    'kw_hit_rate': 0,
                    'kw_mrr': 0,
                    'vector_hit_rate': 0,
                    'vector_mrr': 0,
                    'hybrid_hit_rate':0,
                    'hybrid_mrr': 0,
                    'total_misses': 0,
                    'total_questions':0
                    }
    if reranker:
        results_dict['rerank_top_k'] = rerank_top_k  # have to build the results_dict before we can add this information
    
    for query_id, q in tqdm(dataset.queries.items(), 'Queries'):
        results_dict['total_questions'] += 1
        
        #make Keyword, Vector, and Hybrid calls to Weaviate host
        try:
            kw_response = retriever.keyword_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
            vector_response = retriever.vector_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
            hybrid_response = retriever._hybrid_search(request=q, class_name=class_name, alpha=alpha, limit=retrieve_limit, display_properties=display_properties)           
    
            #rerank returned responses if reranker is provided
            if reranker:
                kw_response = reranker.rerank(kw_response, q, top_k=rerank_top_k)
                vector_response = reranker.rerank(vector_response, q, top_k=rerank_top_k)
                hybrid_response = reranker.rerank(hybrid_response, q, top_k=rerank_top_k)
            
            #collect doc_ids to check for document matches (include only results_top_k)
            kw_doc_ids = {result['doc_id']:i for i, result in enumerate(kw_response[:results_top_k], 1)}
            vector_doc_ids = {result['doc_id']:i for i, result in enumerate(vector_response[:results_top_k], 1)}
            hybrid_doc_ids = {result['doc_id']:i for i, result in enumerate(hybrid_response[:results_top_k], 1)}
            
            #extract doc_id for scoring purposes
            doc_id = dataset.relevant_docs[query_id][0]

            #increment hit_rate counters and mrr scores
            if doc_id in kw_doc_ids:
                results_dict['kw_hit_rate'] += 1
                results_dict['kw_mrr'] += 1/kw_doc_ids[doc_id]
            if doc_id in vector_doc_ids:
                results_dict['vector_hit_rate'] += 1
                results_dict['vector_mrr'] += 1/vector_doc_ids[doc_id]
            if doc_id in hybrid_doc_ids:
                results_dict['hybrid_hit_rate'] += 1
                results_dict['hybrid_mrr'] += 1/hybrid_doc_ids[doc_id]

            # if no hits, let's capture that
            else:
                results_dict['total_misses'] += 1
                
        except Exception as e:
            print(e)
            continue

    #use raw counts to calculate final scores
    calc_hit_rate_scores(results_dict)
    calc_mrr_scores(results_dict)
    
    return results_dict

In [117]:
type(list(train_dataset.queries.items())[0][1])

str

In [118]:
%%time
execute_evaluation(train_dataset, class_name, client, alpha=0.3, retrieve_limit=5, results_top_k=5)

Queries: 100%|█████████████████████████████████████████████████████████████████| 250/250 [01:38<00:00,  2.53it/s]

CPU times: user 6.42 s, sys: 196 ms, total: 6.62 s
Wall time: 1min 38s





{'n': 5,
 'top_k': 5,
 'alpha': 0.3,
 'Retriever': 'sentence-transformers/all-MiniLM-L6-v2',
 'Ranker': 'None',
 'chunk_size': 256,
 'kw_hit_rate': 0.68,
 'kw_mrr': 0.55,
 'vector_hit_rate': 0.39,
 'vector_mrr': 0.29,
 'hybrid_hit_rate': 0.71,
 'hybrid_mrr': 0.56,
 'total_misses': 73,
 'total_questions': 250}

Bad pipe message: %s [b'\xb0\x0c\xc5(^/<Pc\x8c\xd8w\xb1\xba)j\xf19 "oZ[\xb2\xfe\x0c\xca\x14_s\x9505\xc9XlG\x1a\x03\x0bi\x82\x97\xb5\x9e\xde(F\x1f\x12\xd3\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00', b'\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00']
Bad pipe message: %s [b'\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01']
Bad pipe message: %s [b'\xe7\n>\t\x17s`\xcaL\x8b\xeea\xa9\xfce\xba\x9aN \xa6^\x04@u@\xae/\x84\xf6fS\xf2?\xd7\x8c\x96\xbfR\xeb\x85\x91\xc3\x1c\xc3UM\x89nF\xb5\x86\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.']
Bad pipe message: %s [b'\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\

In [53]:
%%time

results = []
for alpha in np.linspace(0,0.5,num=3):
    result = execute_evaluation(dataset, client, reranker, index, alpha=alpha, limit=100, rerank_all_responses=True)
    results.append(result)

Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:19<00:00,  2.78s/it]
Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:18<00:00,  2.76s/it]
Questions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:18<00:00,  2.78s/it]

CPU times: user 4min 16s, sys: 3.32 s, total: 4min 20s
Wall time: 6min 55s





In [54]:
from rich import print

In [40]:
results = {'kw_hit_rate': 150, 'vector_hit_rate': 100}

In [41]:
for prefix in ['kw', 'vector']:
    results[f'{prefix}_hit_rate'] = round(results[f'{prefix}_hit_rate']/200,2)

In [42]:
results

{'kw_hit_rate': 0.75, 'vector_hit_rate': 0.5}

In [None]:
{'n': 5,
 'rerank_top_k': 5,
 'alpha': 0,
 'Retriever': 'sentence-transformers/all-MiniLM-L6-v2',
 'Ranker': 'None',
 'chunk_size': 256,
 'kw_hit_rate': 0.68,
 'kw_mrr': 0.55,
 'vector_hit_rate': 0.39,
 'vector_mrr': 0.29,
 'hybrid_hit_rate': 0.68,
 'hybrid_mrr': 0.55,
 'total_questions': 250}