# MS MARCO Retrieval Pipeline with Permutation Self-Consistency

This notebook demonstrates the full retrieval pipeline:
1. Initial retrieval using BM25 or SPLADE++
2. Optional LLM reranking with permutation self-consistency
3. Evaluation on TREC DL19/20 datasets using NDCG and MRR metrics

In [1]:
import os
import sys
from pathlib import Path

# Set JAVA_HOME for Pyserini (required for BM25 retrieval)
# This ensures Java 11+ is used even if Jupyter doesn't inherit shell environment
if 'JAVA_HOME' not in os.environ or '1.8' in os.popen('java -version 2>&1').read():
    # Try to find Java 11+ via Homebrew
    java_home_candidates = [
        '/usr/local/Cellar/openjdk@11/11.0.29/libexec/openjdk.jdk/Contents/Home',
        '/usr/local/Cellar/openjdk@17/17.0.13/libexec/openjdk.jdk/Contents/Home',
        '/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home',
        '/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home',
    ]
    
    for candidate in java_home_candidates:
        if os.path.exists(candidate):
            os.environ['JAVA_HOME'] = candidate
            os.environ['PATH'] = f"{candidate}/bin:{os.environ.get('PATH', '')}"
            print(f"Set JAVA_HOME to: {candidate}")
            break
    else:
        # Fallback: try to use java_home utility
        try:
            import subprocess
            java_home = subprocess.check_output(['/usr/libexec/java_home', '-v', '11+']).decode().strip()
            os.environ['JAVA_HOME'] = java_home
            os.environ['PATH'] = f"{java_home}/bin:{os.environ.get('PATH', '')}"
            print(f"Set JAVA_HOME to: {java_home}")
        except:
            print("Warning: Could not set JAVA_HOME automatically. Pyserini may not work.")
            print("Please ensure Java 11+ is installed and JAVA_HOME is set correctly.")

# Verify Java version
if 'JAVA_HOME' in os.environ:
    java_version = os.popen(f"{os.environ['JAVA_HOME']}/bin/java -version 2>&1").read()
    print(f"Java version: {java_version.split(chr(10))[0]}")

sys.path.insert(0, str(Path.cwd().parent))

from permsc.retrieval import (
    MSMarcoQueries, MSMarcoCollection, TRECQrels,
    BM25Retriever, SpladeRetriever, RetrievalPipeline,
    evaluate_retrieval
)
from permsc.llm.openai_pool import OpenAIConfig

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

Set JAVA_HOME to: /usr/local/Cellar/openjdk@11/11.0.29/libexec/openjdk.jdk/Contents/Home
Java version: openjdk version "11.0.29" 2025-10-21



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/Cellar/python@3.10/3.10.19_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.10/3.10.19_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/arul/Documents/diff-perm-sc/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users

## Configuration

Set up paths and configuration. Update these paths to match your setup.

In [2]:
DATA_DIR = Path("../data")
MSMARCO_COLLECTION = DATA_DIR / "msmarco/collection.tsv"
TREC_DL19_QUERIES = DATA_DIR / "trec-dl19/msmarco-test2019-queries.tsv"
TREC_DL19_QRELS = DATA_DIR / "trec-dl19/2019qrels-pass.txt"
TREC_DL20_QUERIES = DATA_DIR / "trec-dl20/msmarco-test2020-queries.tsv"
TREC_DL20_QRELS = DATA_DIR / "trec-dl20/2020qrels-pass.txt"

# BM25 Index Configuration
# Option 1: Use prebuilt index (downloads automatically - recommended)
USE_PREBUILT_INDEX = True
PREBUILT_INDEX_NAME = "msmarco-v1-passage"

# Option 2: Use local index path (if you have downloaded it manually)
# USE_PREBUILT_INDEX = False
# BM25_INDEX_PATH = "indexes/msmarco-passage/lucene-index.msmarco-v1-passage.20221004.252b5e"

# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_API_KEY = ""

print(f"Data directory: {DATA_DIR}")
if USE_PREBUILT_INDEX:
    print(f"Using prebuilt index: {PREBUILT_INDEX_NAME} (will download if needed)")
else:
    print(f"BM25 index path: {BM25_INDEX_PATH}")
print(f"OpenAI API key set: {bool(OPENAI_API_KEY)}")

Data directory: ../data
Using prebuilt index: msmarco-v1-passage (will download if needed)
OpenAI API key set: False


## Load Datasets

In [3]:
print("Loading MS MARCO collection...")
collection = MSMarcoCollection(str(MSMARCO_COLLECTION))
print(f"Collection loaded: {len(collection)} passages")

print("\nLoading TREC DL19 queries and qrels...")
dl19_queries = MSMarcoQueries(str(TREC_DL19_QUERIES))
dl19_qrels = TRECQrels(str(TREC_DL19_QRELS))
print(f"DL19: {len(dl19_queries)} queries, {len(dl19_qrels)} queries with qrels")

print("\nLoading TREC DL20 queries and qrels...")
dl20_queries = MSMarcoQueries(str(TREC_DL20_QUERIES))
dl20_qrels = TRECQrels(str(TREC_DL20_QRELS))
print(f"DL20: {len(dl20_queries)} queries, {len(dl20_qrels)} queries with qrels")

Loading MS MARCO collection...
Collection loaded: 8841823 passages

Loading TREC DL19 queries and qrels...
DL19: 200 queries, 43 queries with qrels

Loading TREC DL20 queries and qrels...
DL20: 200 queries, 54 queries with qrels


## Initialize Retrievers

Choose which retriever to use: BM25 or SPLADE++

In [4]:
RETRIEVER_TYPE = "bm25"  # or "splade"

if RETRIEVER_TYPE == "bm25":
    if USE_PREBUILT_INDEX:
        print(f"Initializing BM25 retriever with prebuilt index: {PREBUILT_INDEX_NAME}")
        retriever = BM25Retriever(prebuilt_index=PREBUILT_INDEX_NAME)
    else:
        print(f"Initializing BM25 retriever with local index: {BM25_INDEX_PATH}")
        retriever = BM25Retriever(index_path=BM25_INDEX_PATH)
elif RETRIEVER_TYPE == "splade":
    print("Initializing SPLADE++ retriever...")
    retriever = SpladeRetriever(str(MSMARCO_COLLECTION))
else:
    raise ValueError(f"Unknown retriever type: {RETRIEVER_TYPE}")

print("Retriever initialized successfully")

Initializing BM25 retriever with prebuilt index: msmarco-v1-passage
Retriever initialized successfully


## Setup LLM Reranking (Optional)

If API key is provided, LLM reranking with permutation self-consistency will be enabled.

In [None]:
llm_config = None
if OPENAI_API_KEY:
    llm_config = OpenAIConfig(
        api_key=OPENAI_API_KEY,
        model_name="gpt-3.5-turbo",
        api_type="openai"
    )
    print("LLM reranking enabled")
else:
    print("No API key provided. LLM reranking disabled.")

pipeline = RetrievalPipeline(
    retriever=retriever,
    collection=collection,
    llm_config=llm_config,
    num_permutations=5,
    aggregator="kemeny"
)

No API key provided. LLM reranking disabled.
No LLM config provided or API key missing. LLM reranking will be disabled.


: 

## Run Retrieval and Evaluation

Run the pipeline on TREC DL19/20 queries and evaluate performance.

In [None]:
def run_evaluation(queries, qrels, dataset_name, max_queries=None):
    """Run retrieval pipeline and evaluate on a dataset."""
    query_ids = list(queries.get_all_queries().keys())
    if max_queries:
        query_ids = query_ids[:max_queries]
    
    results = {}
    
    print(f"\nRunning retrieval on {dataset_name} ({len(query_ids)} queries)...")
    for query_id in tqdm(query_ids):
        query_text = queries.get_query(query_id)
        if not query_text:
            continue
        
        ranking_example = pipeline.run(query_text, top_k=1000, rerank_depth=100)
        results[query_id] = ranking_example
    
    print(f"\nEvaluating {dataset_name}...")
    metrics = evaluate_retrieval(results, qrels.get_all_qrels(), k_values=[10, 100])
    
    return metrics, results

metrics_dl19, results_dl19 = run_evaluation(dl19_queries, dl19_qrels, "DL19", max_queries=50)
metrics_dl20, results_dl20 = run_evaluation(dl20_queries, dl20_qrels, "DL20", max_queries=50)


Running retrieval on DL19 (50 queries)...


  0%|          | 0/50 [00:00<?, ?it/s]

## Results Summary

In [None]:
results_df = pd.DataFrame({
    'Dataset': ['DL19', 'DL20'],
    'NDCG@10': [metrics_dl19['ndcg@10'], metrics_dl20['ndcg@10']],
    'NDCG@100': [metrics_dl19['ndcg@100'], metrics_dl20['ndcg@100']],
    'MRR': [metrics_dl19['mrr'], metrics_dl20['mrr']]
})

print("Evaluation Results:")
print("=" * 60)
print(results_df.to_string(index=False))
print("=" * 60)

print(f"\nRetriever: {RETRIEVER_TYPE.upper()}")
print(f"LLM Reranking: {'Enabled' if llm_config else 'Disabled'}")
if llm_config:
    print(f"Permutations: {pipeline.num_permutations}")
    print(f"Aggregator: {pipeline.aggregator_name}")

## Visualization

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics_to_plot = ['NDCG@10', 'NDCG@100', 'MRR']
for idx, metric in enumerate(metrics_to_plot):
    ax = axes[idx]
    values = results_df[metric].values
    ax.bar(results_df['Dataset'], values, color=['#3498db', '#e74c3c'])
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} by Dataset')
    ax.set_ylim(0, max(values) * 1.2)
    
    for i, v in enumerate(values):
        ax.text(i, v + max(values) * 0.02, f'{v:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Compare Different Configurations

To compare BM25 vs SPLADE++ or with/without LLM reranking, run the cells above with different configurations and compare the results.