# Hybrid Information Retrieval Optimization: Fusing Lexical (BM25) and Semantic (SciBERT) Scores



In [72]:
import os
import subprocess

# --- 1. Python Path Setup (Optional, but often helpful) ---
# Ensure local packages are on the path, especially if using a virtual environment
import sys
if not os.path.dirname(os.getcwd()) in sys.path:
    sys.path.append(os.path.dirname(os.getcwd()))

# --- 2. Java Environment Configuration (CRITICAL for Pyserini) ---
print("--- Configuring local Java environment for Pyserini ---")
# This block attempts to find your Java installation. 
# You might need to adjust the path based on where JDK 21 is installed on your OS.

# Common check for Mac/Linux
try:
    if sys.platform.startswith(('linux', 'darwin')):
        # Attempt to find the Java path via the system's 'java' command
        java_bin_path = subprocess.check_output(['readlink', '-f', '/usr/bin/java']).decode('utf-8').strip()
        java_home = os.path.dirname(os.path.dirname(java_bin_path))
        os.environ['JAVA_HOME'] = java_home
        
        # Set JVM_PATH to the libjvm.so file Pyserini needs (might need manual adjustment)
        libjvm_path = os.path.join(java_home, 'lib/server/libjvm.so')
        if not os.path.exists(libjvm_path):
            # Fallback for alternative JDK structures
            libjvm_path = os.path.join(java_home, 'jre/lib/amd64/server/libjvm.so')
        os.environ['JVM_PATH'] = libjvm_path
        print(f"Set JAVA_HOME to: {os.environ.get('JAVA_HOME')}")
        print(f"Set JVM_PATH to: {os.environ.get('JVM_PATH')}")
    elif sys.platform.startswith('win'):
        # On Windows, you typically set JAVA_HOME in your system environment variables.
        # If set, pyjnius will usually find it. If not, uncomment and set manually:
        # os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-21' 
        print("Assuming JAVA_HOME is set in Windows environment variables.")

except Exception as e:
    print(f"Warning: Automatic Java path detection failed: {e}. Ensure JAVA_HOME is set.")

--- Configuring local Java environment for Pyserini ---
Set JAVA_HOME to: /usr
Set JVM_PATH to: /usr/jre/lib/amd64/server/libjvm.so


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [73]:
import os
import subprocess
import sys

print("--- Step 1: Installing Packages ---")
# Force reinstall/check packages in the notebook environment
print(subprocess.run(['pip', 'install', '-q', 'pyserini', 'pyjnius', 'transformers>=4.40', 'sentence-transformers>=2.7', 'torch'], capture_output=True, text=True).stdout)


print("--- Step 2: Manual JDK 21 Configuration (The Final Path) ---")

# The confirmed JDK 21 folder on your system
JAVA_HOME_PATH = '/Library/Java/JavaVirtualMachines/temurin-21.jdk/Contents/Home'
os.environ['JAVA_HOME'] = JAVA_HOME_PATH

# We use the final, most likely path: /lib/server/libjvm.dylib
JVM_PATH_ABS = os.path.join(JAVA_HOME_PATH, 'lib/server/libjvm.dylib')
os.environ['JVM_PATH'] = JVM_PATH_ABS


# Verification Check
if os.path.exists(os.environ['JVM_PATH']):
    print(f"✅ Success! JVM_PATH set to: {os.environ['JVM_PATH']}")
    print(f"JAVA_HOME set to: {os.environ['JAVA_HOME']}")
    # Optional: Verify Java version (can be tricky on Mac)
    try:
        os.environ['PATH'] = os.path.join(JAVA_HOME_PATH, 'bin') + ':' + os.environ['PATH']
        java_version_output = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT).decode().split('\n')[0]
        print('java -version ->', java_version_output)
    except Exception:
        print("Warning: Could not run 'java -version', but JVM path is confirmed.")
    print("--- Java setup complete. Proceeding to imports... ---")
else:
    # If this STILL fails, the file extension or the jre path is the issue.
    # We will try the next most likely path (with the 'jre' folder)
    JVM_PATH_ABS_JRC = os.path.join(JAVA_HOME_PATH, 'jre/lib/server/libjvm.dylib')
    if os.path.exists(JVM_PATH_ABS_JRC):
        os.environ['JVM_PATH'] = JVM_PATH_ABS_JRC
        print(f"✅ Success! JVM_PATH corrected to: {os.environ['JVM_PATH']}")
    else:
        raise RuntimeError(
            f"CRITICAL ERROR: The file was not found at ANY standard path. "
            f"Checked: {JVM_PATH_ABS} and {JVM_PATH_ABS_JRC}. "
            "The internal structure of your Temurin installation is highly customized."
        )

# --- This section is where the import would happen in the next cell ---
print("\n--- Step 3: Importing Libraries ---")
from pyserini.search.lucene import LuceneSearcher
from sentence_transformers.cross_encoder import CrossEncoder
print("✅ Libraries imported successfully.")

--- Step 1: Installing Packages ---


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



--- Step 2: Manual JDK 21 Configuration (The Final Path) ---
✅ Success! JVM_PATH set to: /Library/Java/JavaVirtualMachines/temurin-21.jdk/Contents/Home/lib/server/libjvm.dylib
JAVA_HOME set to: /Library/Java/JavaVirtualMachines/temurin-21.jdk/Contents/Home
java -version -> openjdk version "21.0.2" 2024-01-16 LTS
--- Java setup complete. Proceeding to imports... ---

--- Step 3: Importing Libraries ---
✅ Libraries imported successfully.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [74]:
from pyserini.search.lucene import LuceneSearcher

In [75]:
import importlib.metadata as md
import pyserini, transformers, sentence_transformers, torch

def show(mod, pkg_name=None):
    pkg = pkg_name or mod.__name__.replace('_','-')
    try:
        ver = mod.__version__
    except Exception:
        try:
            ver = md.version(pkg)
        except Exception:
            ver = 'installed'
    print(f"{mod.__name__}:", ver)

show(pyserini, 'pyserini')
show(transformers, 'transformers')
show(sentence_transformers, 'sentence-transformers')
show(torch, 'torch')

# Import Lucene-backed searcher now that JVM is configured
from pyserini.search.lucene import LuceneSearcher


pyserini: 1.2.0
transformers: 4.56.2
sentence_transformers: 5.1.1
torch: 2.8.0


## Create a tiny demo corpus



In [76]:
import json, os

# Define a local, relative path (this will create a 'data' folder next to your notebook)
CORPUS_DIR = './data/corpus-jsonl' 
os.makedirs(CORPUS_DIR, exist_ok=True)

docs = [
  {"id": "D1", "title": "ACE2 expression in smokers", "contents": "Several studies report differential ACE2 receptor expression in smokers versus non-smokers, which could influence viral entry dynamics."},
  {"id": "D2", "title": "Long COVID symptoms in adolescents", "contents": "Adolescents may experience fatigue, headaches, sleep disturbance, and concentration difficulties as part of long COVID."},
  {"id": "D3", "title": "mRNA vaccine efficacy in elderly", "contents": "Effectiveness of mRNA vaccines against severe disease remains high in older adults, though waning immunity is a factor."},
  {"id": "D4", "title": "T-cell response to Omicron variant", "contents": "Studies indicate that prior infection and vaccination generate T-cells that cross-react with the Omicron spike protein, offering protection against severe outcomes."},
  {"id": "D5", "title": "Aerosol transmission dynamics", "contents": "Transmission risk is strongly influenced by ventilation, occupancy, and exposure time, supporting aerosol spread as a key factor."},
  {"id": "D6", "title": "Monoclonal antibody resistance", "contents": "The continuous evolution of the virus, particularly new variants, has led to resistance to several therapeutic monoclonal antibodies."},
  {"id": "D7", "title": "Vitamin D levels and severity", "contents": "Low Vitamin D levels are consistently associated with increased risk of severe COVID-19 and mortality."},
  {"id": "D8", "title": "Smoking and respiratory epithelium", "contents": "Chronic smoking alters ciliary beat frequency and epithelial integrity in the respiratory tract."}
]

# Write the documents to the new local path
with open(os.path.join(CORPUS_DIR, 'corpus.jsonl'), 'w', encoding='utf-8') as f:
    for doc in docs:
        json_doc = json.dumps(doc)
        f.write(json_doc + '\n')
        
print(f"✅ Corpus successfully created at {os.path.join(CORPUS_DIR, 'corpus.jsonl')}")

✅ Corpus successfully created at ./data/corpus-jsonl/corpus.jsonl


## Build a BM25 index with Pyserini (Lucene)



In [77]:
%%bash

CORPUS_DIR="./data/corpus-jsonl"
INDEX_DIR="./data/index"

python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input ${CORPUS_DIR} \
  --index ${INDEX_DIR} \
  --generator DefaultLuceneDocumentGenerator \
  --threads 2 --storePositions --storeDocvectors --storeRaw

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-09-28 23:24:43,574 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-09-28 23:24:43,577 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-09-28 23:24:43,577 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: ./data/corpus-jsonl
2025-09-28 23:24:43,577 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-09-28 23:24:43,577 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: ./data/index
2025-09-28 23:24:43,577 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 2
2025-09-28 23:24:43,578 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false


Sep 28, 2025 11:24:43 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


2025-09-28 23:24:43,608 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyzer
2025-09-28 23:24:43,608 INFO  [main] index.IndexCollection (IndexCollection.java:247) - Stemmer: porter
2025-09-28 23:24:43,608 INFO  [main] index.IndexCollection (IndexCollection.java:248) - Keep stopwords? false
2025-09-28 23:24:43,609 INFO  [main] index.IndexCollection (IndexCollection.java:249) - Stopwords file: null
2025-09-28 23:24:43,720 INFO  [main] index.IndexCollection (IndexCollection.java:197) - IndexCollection settings:
2025-09-28 23:24:43,720 INFO  [main] index.IndexCollection (IndexCollection.java:198) -  + Generator: DefaultLuceneDocumentGenerator
2025-09-28 23:24:43,720 INFO  [main] index.IndexCollection (IndexCollection.java:199) -  + Language: en
2025-09-28 23:24:43,720 INFO  [main] index.IndexCollection (IndexCollection.java:200) -  + Stemmer: porter
2025-09-28 23:24:43,720 INFO  [main] index.IndexCollection (IndexCollection.java:201) -  + Keep stopwo

## Hybrid scoring utilities (BM25 → Cross‑Encoder → Fusion)
We define: `bm25_search`, `cross_encoder_rerank`, normalizers, **RRF**, and a `fuse_scores` helper.


In [78]:
# 5. Define the hybrid scoring functions (from 'hybrid-code')
# This must be the absolute first executable line for Python 3.x
from __future__ import annotations

print("--- Defining hybrid scoring functions ---")

from dataclasses import dataclass
from typing import Dict, List, Tuple
import json, math
import numpy as np

# Pyserini and Hugging Face imports
from pyserini.search.lucene import LuceneSearcher
from sentence_transformers.cross_encoder import CrossEncoder


@dataclass
class RetrievedDoc:
    docid: str
    score: float
    title: str | None
    contents: str

def bm25_search(index_dir: str, query: str, k: int) -> List[RetrievedDoc]:
    searcher = LuceneSearcher(index_dir)
    hits = searcher.search(query, k=k)
    out: List[RetrievedDoc] = []
    for h in hits:
        raw = searcher.doc(h.docid).raw()
        title, contents = None, None
        try:
            obj = json.loads(raw)
            contents = obj.get("contents") or obj.get("text") or raw
            title = obj.get("title")
        except Exception:
            contents = raw
        out.append(RetrievedDoc(docid=h.docid, score=h.score, title=title, contents=contents))
    return out

def cross_encoder_rerank(model_name: str, query: str, docs: List[RetrievedDoc], batch_size: int = 32) -> Dict[str, float]:
    ce = CrossEncoder(model_name, max_length=512)
    pairs = [[query, d.contents if isinstance(d.contents, str) else str(d.contents)] for d in docs]
    # The original notebook runs with no_grad, which is typical for inference, but the library handles it.
    scores = ce.predict(pairs, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=False)
    return {d.docid: float(s) for d, s in zip(docs, scores)}

def _zscore(x: np.ndarray) -> np.ndarray:
    mu, sd = x.mean(), x.std()
    return (x - mu) / (sd + 1e-8)

def _minmax(x: np.ndarray) -> np.ndarray:
    lo, hi = x.min(), x.max()
    if math.isclose(hi, lo):
        return np.zeros_like(x)
    return (x - lo) / (hi - lo)

def normalize_scores(scores: Dict[str, float], method: str) -> Dict[str, float]:
    keys = list(scores.keys())
    arr = np.array([scores[k] for k in keys], dtype=float)
    if method == 'zscore':
        arr = _zscore(arr)
    elif method == 'minmax':
        arr = _minmax(arr)
    else:
        raise ValueError(f'Unknown normalization method: {method}')
    return {k: float(v) for k, v in zip(keys, arr)}

def reciprocal_rank_fusion(rankings: List[List[str]], k: int = 60) -> Dict[str, float]:
    fused: Dict[str, float] = {}
    for ranking in rankings:
        for rank, docid in enumerate(ranking):
            fused[docid] = fused.get(docid, 0.0) + 1.0 / (k + rank + 1)
    return fused

def fuse_scores(bm25_scores: Dict[str, float], ce_scores: Dict[str, float], *, method: str = 'weighted', norm: str = 'zscore', alpha: float = 0.5) -> Dict[str, float]:
    if method == 'rrf':
        bm25_ranking = [d for d, _ in sorted(bm25_scores.items(), key=lambda x: -x[1])]
        ce_ranking = [d for d, _ in sorted(ce_scores.items(), key=lambda x: -x[1])]\
        # For RRF, we only fuse the documents that are retrieved by BM25 (the input to the CE re-ranker)
        # However, the current implementation uses *all* documents from bm25_scores/ce_scores, which should be the same set.
        return reciprocal_rank_fusion([bm25_ranking, ce_ranking])

    # weighted fusion (normalize + sum)
    all_ids = list(set(bm25_scores) | set(ce_scores))
    bm = {d: bm25_scores.get(d, float('-inf')) for d in all_ids}
    ce = {d: ce_scores.get(d, float('-inf')) for d in all_ids}
    def safe_replace_missing(s: Dict[str, float]) -> Dict[str, float]:
        vals = [v for v in s.values() if v != float('-inf')]
        if not vals:
            return {k: 0.0 for k in s}
        floor = min(vals) - 1.0
        return {k: (floor if v == float('-inf') else v) for k, v in s.items()}
    bm = safe_replace_missing(bm)
    ce = safe_replace_missing(ce)
    bm_norm = normalize_scores(bm, norm)
    ce_norm = normalize_scores(ce, norm)
    return {d: alpha * bm_norm[d] + (1.0 - alpha) * ce_norm[d] for d in all_ids}

def pretty_print(name: str, ranking: List[Tuple[str, float]], lookup: Dict[str, RetrievedDoc], topn: int = 5):
    print(f"\n=== {name} Top-{topn} ===")
    for i, (docid, score) in enumerate(ranking[:topn], start=1):
        rec = lookup[docid]
        title = f"{rec.title} — " if rec.title else ""
        snippet = (rec.contents or '').replace('\n', ' ')
        if len(snippet) > 220:
            snippet = snippet[:220] + '…'
        print(f"{i:>2}. {docid}  (score={score:.4f})")
        print(f"    {title}{snippet}")

--- Defining hybrid scoring functions ---


## Run the hybrid scorer



In [79]:
# 6. Execute the final run logic (This cell will now use SciBERT and compare WSS vs RRF)

print("--- Running final comparative hybrid scorer ---")
# 🛑 CORRECTED PATH: Use the local index created in the Bash cell 🛑
INDEX_DIR = './data/index' 
QUERY = 'long COVID symptoms in adolescents'
K = 8 # Total documents retrieved
TOPN = 5 # Number of top results to print

# *** USE YOUR INTENDED SciBERT MODEL NAME HERE ***
# This model will now download successfully because you ran 'huggingface-cli login'
CE_MODEL = 'allenai/scibert_scivocab_uncased'


FUSION = 'weighted'
NORM = 'zscore'
ALPHA = 0.5

# Stage 1 & 2 Execution
bm25_docs = bm25_search(INDEX_DIR, QUERY, k=K)
id2doc = {d.docid: d for d in bm25_docs}
bm25_scores = {d.docid: d.score for d in bm25_docs}
ce_scores = cross_encoder_rerank(CE_MODEL, QUERY, bm25_docs, batch_size=16)

# Rankings for the report
bm25_ranking = sorted(bm25_scores.items(), key=lambda x: -x[1])
ce_ranking = sorted(ce_scores.items(), key=lambda x: -x[1])

# 1. Weighted Sum Fusion (WSS) - Your primary method
fused_scores = fuse_scores(bm25_scores, ce_scores, method=FUSION, norm=NORM, alpha=ALPHA)
fused_ranking = sorted(fused_scores.items(), key=lambda x: -x[1])

# Print All Rankings
pretty_print('BM25 (Lexical)', bm25_ranking, id2doc, topn=TOPN)
pretty_print('Cross-Encoder (SciBERT)', ce_ranking, id2doc, topn=TOPN)
pretty_print(f'WSS FUSED (Weighted Sum, alpha={ALPHA})', fused_ranking, id2doc, topn=TOPN)

# 2. Reciprocal Rank Fusion (RRF) - Your comparative baseline
print("\n--- Comparative Analysis: Reciprocal Rank Fusion (RRF) ---")
rrf_scores = fuse_scores(bm25_scores, ce_scores, method='rrf')
rrf_ranking = sorted(rrf_scores.items(), key=lambda x: -x[1])
pretty_print('RRF FUSION (k=60)', rrf_ranking, id2doc, topn=TOPN)

--- Running final comparative hybrid scorer ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== BM25 (Lexical) Top-5 ===
 1. D2  (score=2.6194)
    Long COVID symptoms in adolescents — Adolescents may experience fatigue, headaches, sleep disturbance, and concentration difficulties as part of long COVID.
 2. D7  (score=0.6898)
    Vitamin D levels and severity — Low Vitamin D levels are consistently associated with increased risk of severe COVID-19 and mortality.

=== Cross-Encoder (SciBERT) Top-5 ===
 1. D2  (score=0.4665)
    Long COVID symptoms in adolescents — Adolescents may experience fatigue, headaches, sleep disturbance, and concentration difficulties as part of long COVID.
 2. D7  (score=0.4150)
    Vitamin D levels and severity — Low Vitamin D levels are consistently associated with increased risk of severe COVID-19 and mortality.

=== WSS FUSED (Weighted Sum, alpha=0.5) Top-5 ===
 1. D2  (score=1.0000)
    Long COVID symptoms in adolescents — Adolescents may experience fatigue, headaches, sleep disturbance, and concentration difficulties as part of long COVID.
 2. 