In [1]:
from pathlib import Path
import json

import pandas as pd
import numpy as np

from statsmodels.stats.multitest import multipletests
from scipy.stats import permutation_test

In [2]:
run_base = Path('../output/20250317-email/')
runs = [entry for entry in run_base.iterdir() if entry.is_dir()]
runs

[PosixPath('../output/20250317-email/generation_gemma-3-27b-it_oracle'),
 PosixPath('../output/20250317-email/generation_gemma-3-4b-it'),
 PosixPath('../output/20250317-email/generation_gemma-3-27b-it_oracle.bak'),
 PosixPath('../output/20250317-email/generation_llama-3.1-70b_w8a8'),
 PosixPath('../output/20250317-email/bm25_faq_msmarco_gpu'),
 PosixPath('../output/20250317-email/bm25_dense_rerank_minilm-l6'),
 PosixPath('../output/20250317-email/bm25_faq_minilm'),
 PosixPath('../output/20250317-email/bm25_faq_minilm_gpu'),
 PosixPath('../output/20250317-email/generation_llama-3.1-8b'),
 PosixPath('../output/20250317-email/bm25_dense_msmarco_gpu'),
 PosixPath('../output/20250317-email/bm25_dense_minilm'),
 PosixPath('../output/20250317-email/bm25_faq_msmarco'),
 PosixPath('../output/20250317-email/bm25_hyde_gemma-1b'),
 PosixPath('../output/20250317-email/bm25_dense_msmarco'),
 PosixPath('../output/20250317-email/bm25_dense_rerank_minilm-12'),
 PosixPath('../output/20250317-email/bm25_

In [3]:
all_metrics = []
for run_path in runs:
    run_path = Path(run_path)
    with open(run_path / 'metrics.json') as fin:
        metrics = json.load(fin)
        metrics['run_id'] = run_path.name

    with open(run_path / 'output.json') as fin:
        outputs = json.load(fin)
        metrics['secs_per_query'] = np.mean([sample['duration'] for sample in outputs])
        
    all_metrics.append(metrics)

In [4]:
df_metrics = pd.DataFrame(all_metrics)
df_metrics = df_metrics[['run_id', 'MeanReciprocalRank', 'r@1', 'r@5', 'r@50', 'secs_per_query']]
df_metrics['secs_per_query_gpu'] = df_metrics['secs_per_query']
df_metrics = df_metrics.set_index('run_id')
df_metrics.loc['bm25_faq_msmarco', 'secs_per_query_gpu'] = df_metrics.loc['bm25_faq_msmarco_gpu', 'secs_per_query']
df_metrics.loc['bm25_faq_minilm', 'secs_per_query_gpu'] = df_metrics.loc['bm25_faq_minilm_gpu', 'secs_per_query']
df_metrics.loc['bm25_dense_msmarco', 'secs_per_query_gpu'] = df_metrics.loc['bm25_dense_msmarco_gpu', 'secs_per_query']
df_metrics.loc['bm25_dense_minilm', 'secs_per_query_gpu'] = df_metrics.loc['bm25_dense_minilm_gpu', 'secs_per_query']

# set timing to nan for runs which only ran on gpu
df_metrics.loc['bm25', 'secs_per_query_gpu'] = np.nan
df_metrics.loc['bm25_hyde_gemma-1b', 'secs_per_query'] = np.nan
df_metrics.loc['bm25_hyde_gemma-4b', 'secs_per_query'] = np.nan
df_metrics.loc['bm25_hyde_gemma-27b', 'secs_per_query'] = np.nan
df_metrics.loc['bm25_dense_rerank', 'secs_per_query'] = np.nan
df_metrics.loc['bm25_hyde_rerank', 'secs_per_query'] = np.nan
df_metrics.loc['bm25_faq_rerank', 'secs_per_query'] = np.nan

In [5]:
rename_metrics = {
    'MeanReciprocalRank': 'MRR',
    'secs_per_query': 'sec/query',
    'secs_per_query_gpu': 'sec/query$^\dagger$',
    'r@1': 'R@1',
    'r@5': 'R@5',
    'r@50': 'R@50',
}

## Evaluate Reranker models

In [6]:
model_meta = [
    { "run_id": "bm25_dense_msmarco", "run_name": "BM25 + Dense (MS MARCO)", "Params": np.nan },
    { "run_id": "bm25_dense_rerank_minilm-l6", "run_name": "+ Rerank (\\texttt{ms-marco-MiniLM-L6-v2})", "Params": '22.7M' },
    { "run_id": "bm25_dense_rerank_minilm-12", "run_name": "+ Rerank (\\texttt{ms-marco-MiniLM-L12-v2})", "Params": "33.4M" },
    { "run_id": "bm25_dense_rerank_jina-tiny", "run_name": "+ Rerank (\\texttt{jina-reranker-v1-tiny-en})", "Params": "33M" },
    { "run_id": "bm25_dense_rerank_jina-turbo", "run_name": "+ Rerank (\\texttt{jina-reranker-v1-turbo-en})", "Params": "37.8M" },
    { "run_id": "bm25_dense_rerank_mxbai-xsmall", "run_name": "+ Rerank (\\texttt{mxbai-rerank-xsmall-v1})", "Params": "70.8M" },
    { "run_id": "bm25_dense_rerank_mxbai-base", "run_name": "+ Rerank (\\texttt{mxbai-rerank-base-v1})", "Params": "184M" },
    { "run_id": "bm25_dense_rerank_mxbai-large", "run_name": "+ Rerank (\\texttt{mxbai-rerank-large-v1})", "Params": "435M" },
]
rename_run = {model['run_id']: model['run_name'] for model in model_meta}

df_meta = pd.DataFrame(model_meta)
df_meta = df_meta.set_index('run_id')
df_meta = df_meta.rename_axis(None, axis=0)
df_meta = df_meta.drop('run_name', axis=1)

df = df_metrics.loc[rename_run.keys()]
df = df.rename(rename_metrics, axis=1)
df = df.rename_axis(None, axis=0)
df = df.round(2)

# calculate delta
baseline_name = 'bm25_dense_msmarco'
baseline = df.loc[baseline_name]
delta = (df - baseline)/baseline
# delta = (delta * 100)
delta = delta.add_prefix('$\Delta$')
delta.loc[baseline_name] = np.nan
df = pd.concat([df, delta], axis=1)
df = pd.concat([df, df_meta], axis=1)

# bring baseline to top, and have the rest still sorted by MRR
baseline = df.loc[baseline_name]
df = df.drop(baseline_name)
df = df.sort_values('MRR')
df = pd.concat([baseline.to_frame().T, df])
df = df[['MRR', '$\Delta$MRR', 'R@1', '$\Delta$R@1', 'R@5', '$\Delta$R@5', 'sec/query$^\dagger$', '$\Delta$sec/query$^\dagger$', 'Params']]
df = df.rename(rename_run)

display(df)

Unnamed: 0,MRR,$\Delta$MRR,R@1,$\Delta$R@1,R@5,$\Delta$R@5,sec/query$^\dagger$,$\Delta$sec/query$^\dagger$,Params
BM25 + Dense (MS MARCO),0.4,,0.21,,0.5,,0.06,,
+ Rerank (\texttt{ms-marco-MiniLM-L12-v2}),0.23,-0.425,0.08,-0.619048,0.35,-0.3,0.19,2.166667,33.4M
+ Rerank (\texttt{jina-reranker-v1-turbo-en}),0.26,-0.35,0.1,-0.52381,0.4,-0.2,0.17,1.833333,37.8M
+ Rerank (\texttt{ms-marco-MiniLM-L6-v2}),0.27,-0.325,0.11,-0.47619,0.36,-0.28,0.14,1.333333,22.7M
+ Rerank (\texttt{jina-reranker-v1-tiny-en}),0.27,-0.325,0.12,-0.428571,0.37,-0.26,0.14,1.333333,33M
+ Rerank (\texttt{mxbai-rerank-large-v1}),0.37,-0.075,0.2,-0.047619,0.45,-0.1,1.59,25.5,435M
+ Rerank (\texttt{mxbai-rerank-xsmall-v1}),0.42,0.05,0.3,0.428571,0.46,-0.08,0.3,4.0,70.8M
+ Rerank (\texttt{mxbai-rerank-base-v1}),0.46,0.15,0.33,0.571429,0.48,-0.04,0.59,8.833333,184M


In [7]:
tex = df.style \
    .format(na_rep='\color{gray}{---}') \
    .format(subset=['MRR', 'R@1', 'R@5', 'sec/query$^\dagger$','$\Delta$sec/query$^\dagger$'], precision=2, na_rep='\color{gray} ---') \
    .format(subset=['$\Delta$MRR', '$\Delta$R@1', '$\Delta$R@5', '$\Delta$sec/query$^\dagger$'], precision=0, formatter='{:.0%}', na_rep='\color{gray} ---') \
    .format(subset=['$\Delta$MRR', '$\Delta$R@1', '$\Delta$R@5', '$\Delta$sec/query$^\dagger$'], precision=0, formatter='{:.0%}', na_rep='\color{gray} ---') \
    .highlight_max(subset=['MRR', 'R@1', 'R@5', '$\Delta$MRR', '$\Delta$R@1', '$\Delta$R@5'], props='font-weight:bold') \
    .highlight_min(subset=['sec/query$^\dagger$', '$\Delta$sec/query$^\dagger$'], props='font-weight:bold') \
    .to_latex(convert_css=True, caption='', label='', position='t', hrules=True)
tex = tex.replace('%', '\\%')
print(tex)

\begin{table}[t]
\begin{tabular}{llllllllll}
\toprule
 & MRR & $\Delta$MRR & R@1 & $\Delta$R@1 & R@5 & $\Delta$R@5 & sec/query$^\dagger$ & $\Delta$sec/query$^\dagger$ & Params \\
\midrule
BM25 + Dense (MS MARCO) & 0.40 & \color{gray} --- & 0.21 & \color{gray} --- & \bfseries 0.50 & \color{gray} --- & \bfseries 0.06 & \color{gray} --- & \color{gray}{---} \\
+ Rerank (\texttt{ms-marco-MiniLM-L12-v2}) & 0.23 & -42\% & 0.08 & -62\% & 0.35 & -30\% & 0.19 & 217\% & 33.4M \\
+ Rerank (\texttt{jina-reranker-v1-turbo-en}) & 0.26 & -35\% & 0.10 & -52\% & 0.40 & -20\% & 0.17 & 183\% & 37.8M \\
+ Rerank (\texttt{ms-marco-MiniLM-L6-v2}) & 0.27 & -32\% & 0.11 & -48\% & 0.36 & -28\% & 0.14 & \bfseries 133\% & 22.7M \\
+ Rerank (\texttt{jina-reranker-v1-tiny-en}) & 0.27 & -32\% & 0.12 & -43\% & 0.37 & -26\% & 0.14 & \bfseries 133\% & 33M \\
+ Rerank (\texttt{mxbai-rerank-large-v1}) & 0.37 & -8\% & 0.20 & -5\% & 0.45 & -10\% & 1.59 & 2550\% & 435M \\
+ Rerank (\texttt{mxbai-rerank-xsmall-v1}) & 0.42 & 

## Evaluate all models (with best reranker)

In [8]:
rename_runs = {
    'bm25': 'BM25',
    'bm25_dense_minilm': '+ Dense (MINI)',
    'bm25_dense_msmarco': '+ Dense (MARCO)',

    'bm25_hyde_gemma-1b': '+ HyDE (G-1b)',
    'bm25_hyde_gemma-4b': '+ HyDE (G-4b)',
    'bm25_hyde_gemma-27b': '+ HyDE (G-27b)',
    
    'bm25_faq_minilm': '+ FAQ (MINI)',
    'bm25_faq_msmarco': '+ FAQ (MARCO)',

    "bm25_dense_rerank": "+ Dense + Rerank",
    "bm25_hyde_rerank": "+ HyDE + Rerank",
    "bm25_faq_rerank": "+ FAQ + Rerank",
}
df = df_metrics.loc[rename_runs.keys()]
df = df.rename(rename_runs, axis=0)
df = df.rename(rename_metrics, axis=1)
df = df.rename_axis(None, axis=0)
display(df)

tex = df.style \
    .format(precision=2, na_rep='\\color{gray} \\text{---}') \
    .highlight_max(['MRR', 'R@1', 'R@5', 'R@50'], props='font-weight:bold') \
    .highlight_min(['sec/query', 'sec/query$^\dagger$'], props='font-weight:bold') \
    .to_latex(convert_css=True, caption='', label='', position='t', hrules=True)
print(tex)

Unnamed: 0,MRR,R@1,R@5,R@50,sec/query,sec/query$^\dagger$
BM25,0.257616,0.092105,0.370614,0.736842,0.021883,
+ Dense (MINI),0.314677,0.153509,0.436404,0.817982,0.057966,0.037454
+ Dense (MARCO),0.401657,0.20614,0.495614,0.778509,0.233335,0.062761
+ HyDE (G-1b),0.283733,0.133772,0.335526,0.730263,,4.809617
+ HyDE (G-4b),0.375017,0.245614,0.414474,0.824561,,5.219841
+ HyDE (G-27b),0.443001,0.291667,0.486842,0.842105,,20.918692
+ FAQ (MINI),0.701433,0.572368,0.765351,0.842105,0.0528,0.03061
+ FAQ (MARCO),0.605194,0.467105,0.633772,0.796053,0.187107,0.034071
+ Dense + Rerank,0.462562,0.33114,0.475877,0.778509,,0.589226
+ HyDE + Rerank,0.526428,0.377193,0.546053,0.828947,,21.409786


\begin{table}[t]
\begin{tabular}{lrrrrrr}
\toprule
 & MRR & R@1 & R@5 & R@50 & sec/query & sec/query$^\dagger$ \\
\midrule
BM25 & 0.26 & 0.09 & 0.37 & 0.74 & \bfseries 0.02 & \color{gray} \text{---} \\
+ Dense (MINI) & 0.31 & 0.15 & 0.44 & 0.82 & 0.06 & 0.04 \\
+ Dense (MARCO) & 0.40 & 0.21 & 0.50 & 0.78 & 0.23 & 0.06 \\
+ HyDE (G-1b) & 0.28 & 0.13 & 0.34 & 0.73 & \color{gray} \text{---} & 4.81 \\
+ HyDE (G-4b) & 0.38 & 0.25 & 0.41 & 0.82 & \color{gray} \text{---} & 5.22 \\
+ HyDE (G-27b) & 0.44 & 0.29 & 0.49 & \bfseries 0.84 & \color{gray} \text{---} & 20.92 \\
+ FAQ (MINI) & \bfseries 0.70 & \bfseries 0.57 & \bfseries 0.77 & \bfseries 0.84 & 0.05 & \bfseries 0.03 \\
+ FAQ (MARCO) & 0.61 & 0.47 & 0.63 & 0.80 & 0.19 & 0.03 \\
+ Dense + Rerank & 0.46 & 0.33 & 0.48 & 0.78 & \color{gray} \text{---} & 0.59 \\
+ HyDE + Rerank & 0.53 & 0.38 & 0.55 & 0.83 & \color{gray} \text{---} & 21.41 \\
+ FAQ + Rerank & 0.48 & 0.34 & 0.49 & \bfseries 0.84 & \color{gray} \text{---} & 0.56 \\
\bottomrule
\

### Significance test

In [14]:
def test_significance(x, y):
    res = permutation_test(
        (x, y),
        statistic=lambda a, b, axis: np.mean(a, axis=axis) - np.mean(b, axis=axis),
        permutation_type='samples',
        alternative='two-sided',  # assume x is better
        n_resamples=10000,
        random_state=0
    )
    return res

In [15]:
metrics = ['MeanReciprocalRank', 'r@1', 'r@5', 'r@50']
system_pairs = [
    ('bm25', 'bm25_dense_msmarco'),
    ('bm25_dense_minilm', 'bm25_dense_msmarco'),
    ('bm25_hyde_gemma-1b', 'bm25_dense_msmarco'),
    ('bm25_hyde_gemma-4b', 'bm25_dense_msmarco'),
    ('bm25_hyde_gemma-27b', 'bm25_dense_msmarco'),
    ('bm25_faq_minilm', 'bm25_dense_msmarco'),
    ('bm25_faq_msmarco', 'bm25_dense_msmarco'),
    ('bm25_dense_rerank', 'bm25_dense_msmarco'),
    ('bm25_hyde_rerank', 'bm25_dense_msmarco'),
    ('bm25_faq_rerank', 'bm25_dense_msmarco'),
]

In [16]:
p_values = []
for system_x, system_y in system_pairs:
    metrics_x = pd.read_json(f'../output/20250317-email/{system_x}/metrics_by_sample.json')
    metrics_y = pd.read_json(f'../output/20250317-email/{system_y}/metrics_by_sample.json')
    for metric in metrics:
        x = metrics_x[metric]
        y = metrics_y[metric]
        res = test_significance(x, y)
        p_values.append((system_x, system_y, metric, res.pvalue))

In [17]:
alpha = 0.01
df_p = pd.DataFrame(p_values, columns=['system_a', 'system_b', 'metric', 'p_value'])
sig, corrected, _, alphacBonf = multipletests(df_p['p_value'], alpha=alpha, method='bonferroni', maxiter=1, is_sorted=False, returnsorted=False)
print(f'alpha: {alpha}')
print(f'corrected alpha: {alphacBonf:.5f}')
print(f'n tests: {len(df_p)}')

df_p['p_value_corrected'] = corrected
df_p['is_significant'] = sig
df_p

alpha: 0.01
corrected alpha: 0.00025
n tests: 40


Unnamed: 0,system_a,system_b,metric,p_value,p_value_corrected,is_significant
0,bm25,bm25_dense_msmarco,MeanReciprocalRank,0.0014,0.055994,False
1,bm25,bm25_dense_msmarco,r@1,0.007999,0.319968,False
2,bm25,bm25_dense_msmarco,r@5,0.029997,1.0,False
3,bm25,bm25_dense_msmarco,r@50,0.10299,1.0,False
4,bm25_dense_minilm,bm25_dense_msmarco,MeanReciprocalRank,0.041796,1.0,False
5,bm25_dense_minilm,bm25_dense_msmarco,r@1,0.288371,1.0,False
6,bm25_dense_minilm,bm25_dense_msmarco,r@5,0.328767,1.0,False
7,bm25_dense_minilm,bm25_dense_msmarco,r@50,0.291571,1.0,False
8,bm25_hyde_gemma-1b,bm25_dense_msmarco,MeanReciprocalRank,0.003,0.119988,False
9,bm25_hyde_gemma-1b,bm25_dense_msmarco,r@1,0.160384,1.0,False


In [18]:
df_sig = df_p.pivot_table(index='system_a', columns='metric', values='is_significant')
# df_sig = df_sig.rename(rename_run, axis=0)
df_sig = df_sig.astype(bool)
df_sig = df_sig.map(lambda x: '*' if x else '')
df_sig.reindex(rename_run.values())
df_sig.index.name = ''
df_sig.columns.name = ''
df_sig

Unnamed: 0,MeanReciprocalRank,r@1,r@5,r@50
,,,,
bm25,,,,
bm25_dense_minilm,,,,
bm25_dense_rerank,,,,
bm25_faq_minilm,*,*,*,
bm25_faq_msmarco,*,*,,
bm25_faq_rerank,,,,
bm25_hyde_gemma-1b,,,,
bm25_hyde_gemma-27b,,,,
bm25_hyde_gemma-4b,,,,
