In [6]:
import pandas as pd
import numpy as np
import glob
import os
from io import StringIO
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon

def parse_evaluation_file(file_content):
    """Parse the per-query table that opens an evaluation report.

    Lines are collected up to (not including) the first line containing
    "==== Overall Evaluation Metrics"; the overall metrics that follow are
    ignored because the analysis works on per-query differences. The
    collected text is parsed as a whitespace-delimited table.

    Args:
        file_content: Full text of one evaluation file.

    Returns:
        A pandas DataFrame built by ``pd.read_csv`` from the table text.

    Raises:
        ValueError: If there is no table text to parse.
    """
    table_lines = []
    for line in file_content.splitlines():
        if "==== Overall Evaluation Metrics" in line:
            break
        table_lines.append(line)
    table_str = "\n".join(table_lines)
    try:
        # Raw string: '\s' inside a plain literal is an invalid escape
        # sequence and makes the interpreter emit a warning.
        df = pd.read_csv(StringIO(table_str), sep=r'\s+')
    except pd.errors.EmptyDataError as err:
        raise ValueError("No table data found in the file content.") from err
    return df

# Location of the evaluation reports, one .txt file per retrieval run.
results_folder = "../results"
file_pattern = os.path.join(results_folder, "*.txt")
# glob returns paths in arbitrary order; sorting keeps the row order of the
# collected records (and the summation order of the averages) reproducible.
file_paths = sorted(glob.glob(file_pattern))

# Substrings looked up in the lower-cased file name to classify a run.
baseline_keywords = ["bm25", "splade", "colbert", "dpr"]
hybrid_markers = ["fusion", "sequential", "union"]

# Read every result file and tag it as a single-retriever baseline run or a
# hybrid run, keeping its per-query scores for the statistical comparison.
records = []
for file_path in file_paths:
    with open(file_path, 'r') as f:
        content = f.read()

    df = parse_evaluation_file(content)

    # Values are offset in the parsed table: the column labelled 'Recall'
    # holds average precision and the column labelled 'Precision' holds
    # recall, so 'Recall' is the one read here.
    # NOTE(review): presumably the data rows carry more whitespace-separated
    # fields than the header, which shifts the labels - confirm against a file.
    values = df['Recall'].astype(float).values

    fname = os.path.basename(file_path).lower()
    is_hybrid = any(marker in fname for marker in hybrid_markers)
    # Retriever keywords found in the file name, in baseline_keywords order.
    baselines_included = [kw for kw in baseline_keywords if kw in fname]

    if is_hybrid:
        group = "hybrid"
    elif baselines_included:
        # A non-hybrid file is filed under the first keyword it matches.
        group = baselines_included[0]
    else:
        group = "other"

    records.append({
        "file": os.path.basename(file_path),
        "is_hybrid": is_hybrid,
        "group": group,
        "baselines_included": baselines_included,
        "values": values
    })

# An empty record list would produce a DataFrame without columns and a
# confusing KeyError further down; fail here with the actual cause instead.
if not records:
    raise FileNotFoundError(f"No result files matched {file_pattern!r}.")

df_files = pd.DataFrame(records)

# For each retriever, compare its baseline run(s) against the hybrid runs that
# include it, using a paired Wilcoxon signed-rank test over per-query scores.
results = []
for baseline in baseline_keywords:
    baseline_subset = df_files[~df_files['is_hybrid'] & (df_files['group'] == baseline)]
    hybrid_subset = df_files[df_files['is_hybrid'] &
                             df_files['baselines_included'].apply(lambda lst: baseline in lst)]

    # np.vstack raises ValueError on an empty sequence, so a retriever that
    # lacks a baseline run or a hybrid run has to be skipped explicitly.
    if baseline_subset.empty or hybrid_subset.empty:
        print(f"Skipping baseline {baseline}: found {len(baseline_subset)} baseline "
              f"file(s) and {len(hybrid_subset)} hybrid file(s).")
        continue

    baseline_arrays = list(baseline_subset['values'])
    hybrid_arrays = list(hybrid_subset['values'])

    # Stacking and pairing both need one common number of queries across
    # every run on either side.
    query_counts = {len(arr) for arr in baseline_arrays + hybrid_arrays}
    if len(query_counts) != 1:
        print(f"Skipping baseline {baseline} due to mismatched number of queries.")
        continue

    # Element-wise mean across runs: one averaged score per query position.
    # NOTE(review): assumes every file lists its queries in the same order -
    # confirm, since the test below pairs scores by position.
    baseline_avg = np.mean(np.vstack(baseline_arrays), axis=0)
    hybrid_avg = np.mean(np.vstack(hybrid_arrays), axis=0)

    stat, p_val = wilcoxon(baseline_avg, hybrid_avg)

    results.append({
        "Baseline": baseline,
        "Num Baseline Files": len(baseline_subset),
        "Num Hybrid Files": len(hybrid_subset),
        "Baseline Mean (avg over queries)": np.mean(baseline_avg),
        "Hybrid Mean (avg over queries)": np.mean(hybrid_avg),
        "Wilcoxon Statistic": stat,
        "p-value": p_val,
        "Significant": "Yes" if p_val < 0.05 else "No"
    })

df_wilcoxon = pd.DataFrame(results)
# Applies to every float pandas prints from here on, not just this table.
pd.options.display.float_format = '{:.20f}'.format
print("\nWilcoxon Signed-Rank Test Comparison Table:")
print(df_wilcoxon)

# Take the first configured retriever as the example and split the result
# files into its own baseline run(s) and the hybrid runs that include it.
example_baseline = baseline_keywords[0]
baseline_subset = df_files.loc[
    (df_files['group'] == example_baseline) & (df_files['is_hybrid'] == False)
]
hybrid_subset = df_files.loc[
    df_files['baselines_included'].apply(lambda lst: example_baseline in lst)
    & (df_files['is_hybrid'] == True)
]




Wilcoxon Signed-Rank Test Comparison Table:
  Baseline  Num Baseline Files  Num Hybrid Files  \
0     bm25                   1                28   
1   splade                   1                28   
2  colbert                   1                26   
3      dpr                   1                27   

   Baseline Mean (avg over queries)  Hybrid Mean (avg over queries)  \
0            0.06576315789473682893          0.07060855263157896078   
1            0.05525000000000000050          0.06488815789473684204   
2            0.11272368421052633702          0.08045445344129556475   
3            0.02589473684210526552          0.06056871345029241072   

        Wilcoxon Statistic                p-value Significant  
0 887.00000000000000000000 0.01083060733195395295         Yes  
1 841.00000000000000000000 0.00509437624606131137         Yes  
2 643.00000000000000000000 0.00010044232506261697         Yes  
3  57.00000000000000000000 0.00000000000115131103         Yes  


  df = pd.read_csv(StringIO(table_str), sep='\s+')
