In [5]:
#!/usr/bin/env python3
import json
import csv
from pathlib import Path
from rank_bm25 import BM25Okapi
import sys
import re

# Input/output file locations, relative to the current working directory
# (adjust names if needed).
QUERIES_TSV = "./queries/custom-queries.tsv"      # input, two tab-separated columns: id<TAB>query_text
DOCS_JSONL = "./data/custom_docs_passages.jsonl"        # input, one JSON object per line: {"doc_id": "...", "passages": {"passage_1": "...", ...}}
OUTPUT_TSV = "./data/custom_bm25_results.tsv"  # output, tab-separated ranking rows (one row per query/document pair)

# Minimal tokenizer: a token is any maximal run of word characters.
_tokenize_re = re.compile(r"\w+")


def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    Anything that is not a word character (punctuation, whitespace, ...)
    acts as a separator and is dropped. An input without word characters
    yields an empty list.
    """
    lowered = text.lower()
    return [match.group(0) for match in _tokenize_re.finditer(lowered)]

# Load docs into two parallel lists: docs[i] is the concatenated passage text
# of the document whose id is doc_ids[i].
docs = []
doc_ids = []
with open(DOCS_JSONL, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        obj = json.loads(line)
        if not isinstance(obj, dict):
            # A JSON value that is not an object has neither doc_id nor passages.
            print(f"Skipping non-object record at {DOCS_JSONL}:{line_no}", file=sys.stderr)
            continue
        doc_id = obj.get("doc_id")
        # `or {}` also covers an explicit `"passages": null`, which a .get()
        # default would not (the default only applies when the key is absent).
        passages = obj.get("passages") or {}
        if isinstance(passages, dict):
            passage_texts = passages.values()
        elif isinstance(passages, (list, tuple)):
            passage_texts = passages
        elif isinstance(passages, str):
            passage_texts = [passages]
        else:
            passage_texts = []
        # concatenate all passage texts, ignoring non-string entries
        text = " ".join(p for p in passage_texts if isinstance(p, str))
        docs.append(text)
        doc_ids.append(doc_id)

if not docs:
    print("No docs loaded from", DOCS_JSONL, file=sys.stderr)
    sys.exit(1)

# Tokenize corpus and build BM25
tokenized_corpus = [tokenize(d) for d in docs]
if not any(tokenized_corpus):
    # Every document tokenized to nothing: an index without any terms cannot
    # rank anything, so stop here with a clear message instead of failing
    # inside the BM25 constructor.
    print("No indexable text found in", DOCS_JSONL, file=sys.stderr)
    sys.exit(1)
bm25 = BM25Okapi(tokenized_corpus)

In [6]:
# Read queries as (query_id, query_text) pairs.
# Expected line format is "id<TAB>query_text"; columns after the second are
# ignored. A line without a tab is treated as query text only.
queries = []
with open(QUERIES_TSV, "r", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        # Skip blank and whitespace-only lines (same rule as the docs loader).
        if not line.strip():
            continue
        parts = line.split("\t")
        if len(parts) >= 2:
            qid = parts[0]
            qtext = parts[1]
        else:
            # Single-column line: the only field is the query text, so assign
            # an incremental id (1-based position among the loaded queries).
            # NOTE(review): in a file mixing both formats this id could
            # collide with an explicit one.
            qid = str(len(queries) + 1)
            qtext = parts[0]
        queries.append((qid, qtext))

if not queries:
    print("No queries loaded from", QUERIES_TSV, file=sys.stderr)
    sys.exit(1)

# Rank per query and write TREC-style TSV:
# query_id, Q0, doc_id, rank, score, Anserini
# NOTE(review): the last column is the literal run tag "Anserini" although the
# scores come from rank_bm25's BM25Okapi; kept unchanged in case downstream
# tooling expects it.
with open(OUTPUT_TSV, "w", encoding="utf-8", newline="") as out:
    writer = csv.writer(out, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    for qid, qtext in queries:
        q_tokens = tokenize(qtext)
        # One score per corpus document, paired positionally with doc_ids.
        scores = bm25.get_scores(q_tokens)
        # Sort by descending score; every document is written, rank starts at 1.
        ranked = sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)
        for rank_idx, (doc_id, score) in enumerate(ranked, start=1):
            writer.writerow([qid, "Q0", doc_id, str(rank_idx), f"{score:.6f}", "Anserini"])