In [1]:
import pandas as pd
import json
import numpy as np
import requests
from sklearn.metrics import confusion_matrix

In [2]:
def get_maxs(l):
    """Return the indices of every element of ``l`` equal to its maximum.

    Indices are returned in ascending order. An empty ``l`` raises
    ``ValueError`` (propagated from ``max``).
    """
    peak = max(l)
    return [idx for idx, value in enumerate(l) if value == peak]


def get_mins(l):
    """Return the indices of every element of ``l`` equal to its minimum.

    Indices are returned in ascending order. An empty ``l`` raises
    ``ValueError`` (propagated from ``min``).
    """
    floor = min(l)
    return [idx for idx, value in enumerate(l) if value == floor]

def _dcg(relevance):
    """Discounted cumulative gain of ``relevance`` using the 2**rel - 1 gain."""
    return sum((2**rel - 1) / np.log2(rank + 1)
               for rank, rel in enumerate(relevance, start=1))


def ndcg_per_query(ranked_relevance, k):
    """Compute NDCG@k for a single ranked list of relevance grades.

    Parameters
    ----------
    ranked_relevance : sequence of numbers (list or 1-D array)
        Relevance grade of each retrieved document, in ranked order.
    k : int
        Cut-off rank. Shorter lists are padded with zeros (non-relevant),
        longer lists are truncated.

    Returns
    -------
    float or int
        DCG@k / IDCG@k, or 0 when no document in the top k is relevant.

    Notes
    -----
    The ideal ordering is derived from the retrieved top-k grades only, not
    from the full set of labelled documents for the query.
    """
    # list() accepts both plain lists and arrays, so callers need not
    # pre-convert their input.
    relevance = list(ranked_relevance)[:k]
    # Pad up to k entries; a negative count yields an empty list, so lists
    # that are already long enough are left untouched.
    relevance += [0] * (k - len(relevance))
    relevance = np.array(relevance)

    dcg = _dcg(relevance)
    # Ideal DCG: the same grades sorted from most to least relevant.
    idcg = _dcg(np.sort(relevance)[::-1])

    if idcg == 0:
        return 0
    return dcg / idcg

def ndcg(ranked_relevance_list, k):
    """Return the mean NDCG@k over a collection of ranked relevance lists.

    Parameters
    ----------
    ranked_relevance_list : iterable of sequences
        One ranked relevance list per query. Rows may have different lengths.
    k : int
        Cut-off rank passed to ``ndcg_per_query``.

    Returns
    -------
    numpy.float64
        Arithmetic mean of the per-query scores (``np.mean`` of an empty list
        yields ``nan``).
    """
    # Convert row by row instead of calling np.array() on the whole nested
    # list: rows of unequal length cannot form a rectangular array and recent
    # NumPy versions raise on such input.
    per_query_scores = [
        ndcg_per_query(np.array(ranked_relevance), k)
        for ranked_relevance in ranked_relevance_list
    ]
    return np.mean(per_query_scores)

def mrr(ranked_relevance_list, k):
    """Return the mean reciprocal rank at cut-off ``k``.

    Parameters
    ----------
    ranked_relevance_list : non-empty sequence of sequences
        One ranked relevance list per query. Rows may have different lengths.
    k : int
        Only the first ``k`` positions of each row are considered.

    Returns
    -------
    float or int
        Mean of 1/rank of the first relevant (> 0) document, averaged over the
        queries that have at least one relevant document in their top k.
        Queries without a hit are skipped rather than counted as 0. Returns 0
        when no query has a hit.

    Raises
    ------
    AssertionError
        If ``ranked_relevance_list`` is empty.
    """
    assert len(ranked_relevance_list) > 0
    reciprocal_sum = 0
    valid = 0
    # Iterate the rows directly instead of building one np.array from the
    # nested list: rows of unequal length cannot form a rectangular array.
    for ranked_relevance in ranked_relevance_list:
        top_k = list(ranked_relevance)[:k]
        # 1-based rank of the first relevant document, or None if there is none.
        first_hit = next(
            (rank for rank, rel in enumerate(top_k, start=1) if rel > 0),
            None,
        )
        if first_hit is None:
            continue
        valid += 1
        reciprocal_sum += 1 / first_hit
    return reciprocal_sum / valid if valid > 0 else 0

In [3]:
# Service endpoints used throughout the notebook.
SEARCH_URL = 'http://127.0.0.1:8000/search'
ROUTE_URL = 'http://127.0.0.1:8002/batch_route'

# Shared HTTP session; trust_env=False makes requests ignore environment
# settings such as proxy variables (presumably so the localhost services are
# reached directly - confirm).
session = requests.Session()
session.trust_env = False

# Relevance labels, indexed as relevance_label[query][doc_id] in later cells.
with open('data/relevance_label.json', 'r', encoding='utf-8') as label_file:
    relevance_label = json.load(label_file)

# Test queries and their labelled target retrieval method.
queries = pd.read_parquet('data/X_test.parquet').squeeze().tolist()
target = pd.read_parquet('data/y_test.parquet').squeeze().tolist()

# Sample query kept for inspecting a single result.
query = queries[0]

For each query:

   - Run BM25, Dense, and Hybrid retrieval

     - Call vector-search service endpoints

     - Record retrieved doc IDs and latency

     - Convert retrieved docs into a relevance array, e.g. [1, 0, 0, 0, 0]
    
   - Select method that ranks the ground-truth answer highest

   - Break ties by latency

In [4]:
# Run every retrieval mode for every query, score each run with MRR@TOP_N and
# label the query with the best mode (highest MRR, ties broken by lowest latency).
MODES = ['bm25', 'dense', 'hybrid']
TOP_N = 10

run_result = {}
for query in queries:
    query_result = {}
    for mode in MODES:
        payload = {'query': query,
                   'corpus': 'stackoverflow',
                   'top_n': TOP_N,
                   'mode': mode,
                   'rerank': False
                   }
        response = session.post(SEARCH_URL, json=payload)
        # Fail fast on an HTTP error instead of parsing an error body as results.
        response.raise_for_status()
        search_result = response.json()
        # Relevance grade of each returned document; unlabelled documents count as 0.
        search_relevance = [relevance_label[query].get(r['id'], 0)
                            for r in search_result['results']]
        query_result[mode] = {
            'search_relevance': search_relevance,
            'latency': search_result['latency_seconds'],
            'mrr': mrr([search_relevance], TOP_N),
        }

    metrics = [query_result[mode]['mrr'] for mode in MODES]
    if max(metrics) == 0:
        print(f'NO HIT Query: {query}')
        print(f'{query_result}')
        # No mode retrieved a relevant document, so there is no best method.
        query_result['target_method'] = None
    else:
        latencies = [query_result[mode]['latency'] for mode in MODES]
        # Among the modes tied for the best MRR, keep the fastest one. The
        # latency lookup uses the mode's own position in MODES, so only the
        # tied candidates compete. The stored value is an index into MODES.
        query_result['target_method'] = min(get_maxs(metrics),
                                            key=lambda pos: latencies[pos])
    # Store every query (including no-hit ones) because later cells index
    # run_result with every entry of `queries`.
    run_result[query] = query_result

In [14]:
# Example run result for a single query.
# NOTE(review): `query` is rebound by the `for query in queries` loops in the
# cells above, so this shows the last query iterated rather than queries[0].
run_result[query] 

{'bm25': {'search_relevance': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  'latency': 0.002997159957885742,
  'mrr': 0.3333333333333333},
 'dense': {'search_relevance': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'latency': 0.03401041030883789,
  'mrr': 0},
 'hybrid': {'search_relevance': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'latency': 0.04207277297973633,
  'mrr': 1.0},
 'target_method': 2}

evaluation for bm25, dense and hybrid

In [6]:
# Aggregate the stored per-query results by retrieval mode, then compute the
# mean latency, MRR@10 and NDCG@10 of each mode.
relevance_by_mode, latencies_by_mode = {}, {}
eval_by_mode, eval_latencies = {}, {}
for mode in ['bm25', 'dense', 'hybrid']:
    mode_relevance, mode_latency = [], []
    for query in queries:
        stored = run_result[query][mode]
        mode_relevance.append(stored['search_relevance'])
        mode_latency.append(stored['latency'])
    relevance_by_mode[mode] = mode_relevance
    latencies_by_mode[mode] = mode_latency
    eval_latencies[mode] = sum(mode_latency) / len(mode_latency)
    eval_by_mode[mode] = {
        'mrr': mrr(mode_relevance, 10),
        'ndcg': ndcg(mode_relevance, 10),
    }

get the routing result by calling the Router Service

In [7]:
# Ask the router service to choose a retrieval mode for all queries in one batch call.
result = session.post(ROUTE_URL, json={'queries': queries})
# Fail fast on an HTTP error instead of parsing an error body as a routing result.
result.raise_for_status()
route_result = result.json()

In [15]:
# Accuracy: share of queries whose routed mode equals the labelled target mode.
routed_modes = np.array(route_result['route'])
target_modes = np.array(target)
accuracy = (routed_modes == target_modes).sum() / len(target)
print(f'accuracy: {accuracy:0.2f}')

accuracy: 0.51


In [16]:
# Confusion matrix of the labelled target mode against the routed mode.
# Following scikit-learn's convention, rows are the true labels (`target`) and
# columns are the predictions (`route_result['route']`), ordered bm25, dense, hybrid.
# Observation: the router tends to over-predict 'hybrid'.
confusion_matrix(target,route_result['route'],labels=['bm25','dense','hybrid'])

array([[51, 24, 25],
       [19, 39, 42],
       [ 4, 32, 64]], dtype=int64)

evaluation for router service

In [10]:
# Score the router: for each query, reuse the stored result of the mode the
# router selected, then aggregate MRR@10, NDCG@10 and mean latency.
route_relevance, route_latency = [], []
for i, query in enumerate(queries):
    route = route_result['route'][i]
    chosen = run_result[query][route]
    route_relevance.append(chosen['search_relevance'])
    route_latency.append(chosen['latency'])
relevance_by_mode['route'] = route_relevance
latencies_by_mode['route'] = route_latency
eval_by_mode['route'] = {
    'mrr': mrr(route_relevance, 10),
    'ndcg': ndcg(route_relevance, 10),
}
eval_latencies['route'] = sum(route_latency) / len(route_latency)

evaluation for the oracle (always using the labelled target method from `target` for each query)

In [11]:
# Score the oracle: for each query, reuse the stored result of the labelled
# target mode, then aggregate MRR@10, NDCG@10 and mean latency.
oracle_relevance, oracle_latency = [], []
for i, query in enumerate(queries):
    route = target[i]
    chosen = run_result[query][route]
    oracle_relevance.append(chosen['search_relevance'])
    oracle_latency.append(chosen['latency'])
relevance_by_mode['oracle'] = oracle_relevance
latencies_by_mode['oracle'] = oracle_latency
eval_by_mode['oracle'] = {
    'mrr': mrr(oracle_relevance, 10),
    'ndcg': ndcg(oracle_relevance, 10),
}
eval_latencies['oracle'] = sum(oracle_latency) / len(oracle_latency)

In [12]:
# Retrieval quality summary: one column per method, one row per metric (mrr, ndcg).
pd.DataFrame(eval_by_mode)

Unnamed: 0,bm25,dense,hybrid,route,oracle
mrr,0.558785,0.632364,0.722049,0.681678,0.783455
ndcg,0.531766,0.652385,0.774623,0.713863,0.836487


In [13]:
# Mean latency per method, averaged over all queries; values come from the
# search service's `latency_seconds` field.
pd.Series(eval_latencies)

bm25      0.006670
dense     0.040926
hybrid    0.047576
route     0.038084
oracle    0.034011
dtype: float64