In [1]:
import sys
import os

# Make the project root importable so that the `src.*` packages resolve.
# The root is taken to be the parent of the current working directory, i.e. this
# assumes the kernel was started inside the "notebooks/" folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: without it every re-run of this cell adds another duplicate
# entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [15]:
import logging
# Raise the threshold of the 'search' logger to WARNING so that lower-severity
# records from it are dropped.
# NOTE(review): the search modules are imported as `src.search.*`; confirm they
# log under the name 'search' (and not under their `__name__`), otherwise this
# setting does not reach them.
search_logger = logging.getLogger('search')
search_logger.setLevel(logging.WARNING)

In [3]:
from src.metrics import ndcg, mrr, recall
from src.search.embed_index import EmbeddingIndex
from src.search.hybrid_index import HybridIndex
from src.search.bm25_index import BM25Index
from src.search.reranker import ReRanker
import json
import pandas as pd

In [4]:
# Read the relevance judgements from disk. Later cells iterate this mapping as
# query -> per-query labels.
labels_path = r'../data/amzn/relevance_labels.json'
with open(labels_path, mode='r', encoding="utf-8") as labels_file:
    relevant_labels = json.load(labels_file)

In [10]:
def results_to_relevance(results, relevant_labels):
    """Translate a ranked result list into a list of relevance grades.

    Args:
        results: iterable of mappings, each exposing an ``'id'`` key.
        relevant_labels: mapping from id to a mapping that holds the grade
            under the ``'relevance'`` key.

    Returns:
        A list with one entry per result, in the same order: the labelled
        grade when the result's id is present in ``relevant_labels``,
        otherwise ``0``.
    """
    return [
        relevant_labels[doc_id]['relevance'] if doc_id in relevant_labels else 0
        for doc_id in (hit['id'] for hit in results)
    ]

In [6]:
# Instantiate the three retrievers with the same 'amzn' key
# (presumably the dataset name, matching the ../data/amzn folder — confirm).
dataset = 'amzn'
embedding_index = EmbeddingIndex(dataset)
bm25_index = BM25Index(dataset)
hybrid_index = HybridIndex(dataset)

In [16]:
# Run every labelled query through each retriever and store, per retriever, one
# list of relevance grades per query (in the iteration order of `relevant_labels`).
top_n = 10          # number of hits requested from every index
hybrid_alpha = 0.6  # blending weight handed to HybridIndex.search
                    # (in the recorded sweep output, 0.0 matches Embedding and 1.0 matches BM25)

embedding_results = []
bm25_results = []
hybrid_results = []
# Use the unpacked per-query labels directly instead of re-indexing
# `relevant_labels[query]` three times per iteration.
for query, query_labels in relevant_labels.items():
    embedding_results.append(
        results_to_relevance(embedding_index.search(query, top_n), query_labels)
    )
    bm25_results.append(
        results_to_relevance(bm25_index.search(query, top_n), query_labels)
    )
    hybrid_results.append(
        results_to_relevance(hybrid_index.search(query, top_n, alpha=hybrid_alpha), query_labels)
    )

In [12]:
# Summarise each retriever with NDCG / MRR / Recall at a common cutoff.
top_k = 5  # cutoff handed to every metric function

metric_functions = [ndcg, mrr, recall]
metric_columns = [f'NDCG@{top_k}', f'MRR@{top_k}', f'Recall@{top_k}']
retriever_names = ['BM25', 'Embedding', 'Hybrid']
retriever_results = [bm25_results, embedding_results, hybrid_results]

# Compute all values first and build the frame in one go. Filling an empty
# DataFrame cell by cell via .iloc leaves every column with object dtype.
metric_rows = [
    [metric(results, top_k) for metric in metric_functions]
    for results in retriever_results
]
metrics_result = pd.DataFrame(
    metric_rows,
    index=retriever_names,
    columns=metric_columns,
    dtype=float,
)

In [13]:
# Display the per-retriever metrics table (rows: BM25 / Embedding / Hybrid).
metrics_result

Unnamed: 0,NDCG@5,MRR@5,Recall@5
BM25,0.885952,1.0,0.636111
Embedding,0.938175,0.9,0.691667
Hybrid,0.922871,1.0,0.622222


In [15]:
# Same display as the preceding cell; the recorded output is identical, so this
# cell is redundant (candidate for removal).
metrics_result

Unnamed: 0,NDCG@5,MRR@5,Recall@5
BM25,0.885952,1.0,0.636111
Embedding,0.938175,0.9,0.691667
Hybrid,0.922871,1.0,0.622222


In [17]:
# Sweep the hybrid blending weight over 0.0, 0.2, ..., 1.0 and record the NDCG
# at `top_k` for each setting.
alpha_grid = [step / 5 for step in range(6)]
ndcg_per_alpha = []
for alpha in alpha_grid:
    # Keep the sweep in its own list: rebinding the global `hybrid_results` here
    # would silently change what the metrics-table cell reports when re-run.
    sweep_results = []
    for query, query_labels in relevant_labels.items():
        sweep_results.append(
            results_to_relevance(hybrid_index.search(query, top_n, alpha=alpha), query_labels)
        )
    ndcg_per_alpha.append(ndcg(sweep_results, top_k))

# Build the Series from the collected values with an explicit dtype rather than
# filling a data-less Series, whose default dtype depends on the pandas version.
hybrid_ndcg_by_alpha = pd.Series(ndcg_per_alpha, index=alpha_grid, dtype=float)


In [18]:
# Display NDCG of the hybrid index for each alpha value in the sweep.
hybrid_ndcg_by_alpha

0.0    0.938175
0.2    0.936860
0.4    0.940337
0.6    0.922871
0.8    0.906055
1.0    0.885952
dtype: float64

In [19]:
# Inspect the raw BM25 hits for one query. Bind `query` explicitly to the last
# labelled query (the value the earlier for-loops leave behind) so this cell does
# not depend on a leaked loop variable.
query = list(relevant_labels)[-1]
bm25_result = bm25_index.search(query, top_n)