In [1]:
# Hybrid retrieval (BM25 + Qdrant dense) + Reciprocal Rank Fusion (RRF)
# Updated to match YOUR current setup:
#   - Qdrant SERVER (localhost:6333) is used (no local path locks)
#   - point id == integer qid (zlib.crc32-based int)
#   - payload contains {"text": ..., plus metadata columns}
#   - dense retrieval works across multiple qdrant-client API versions
#     (search OR query_points fallbacks)

import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from rank_bm25 import BM25Okapi

# -----------------------
# 0) Config
# -----------------------
# CHUNKS_CSV  : CSV read into `df`; it must provide a 'text' column.
# COLLECTION  : Qdrant collection used for dense search and payload lookup.
# EMBED_MODEL : SentenceTransformer model name used to embed the query.
#               NOTE(review): presumably the same model that produced the
#               vectors stored in the collection — confirm against the
#               embedding/upsert script.
CHUNKS_CSV   = "PubMed_chunks.csv"
COLLECTION   = "pubmed_chunks"
EMBED_MODEL  = "all-MiniLM-L6-v2"

# K_BM25 / K_DENSE are the default candidate-list sizes of bm25_retrieve() and
# dense_retrieve(); TOP_K is the default number of fused results; RRF_K is the
# constant added to each rank in the RRF denominator, 1 / (RRF_K + rank).
QDRANT_URL   = "http://localhost:6333"   # Qdrant server
K_BM25       = 50                        # BM25 candidates
K_DENSE      = 50                        # dense candidates
TOP_K        = 10                        # final context after fusion
RRF_K        = 60                        # standard RRF constant

# -----------------------
# 1) Load chunks (BM25 source-of-truth)
# -----------------------
import zlib

df = pd.read_csv(CHUNKS_CSV)

# Validate with an explicit exception rather than `assert`, which is stripped
# when Python runs with -O and would let a malformed CSV through.
if "text" not in df.columns:
    raise ValueError("PubMed_chunks.csv must include a 'text' column")

# Normalise text to str and drop empty / whitespace-only chunks. The index is
# reset so that row position and index label coincide, which keeps BM25 score
# positions aligned with df rows.
df["text"] = df["text"].fillna("").astype(str)
df = df[df["text"].str.strip().ne("")].reset_index(drop=True)

# --- Recompute qid exactly like embedding/upsert script ---
# NOTE(review): the upsert script is not visible here; the key format below
# must stay in sync with it or BM25 hits will not map to Qdrant point ids.
if "chunk_id" in df.columns:
    # Preferred: qid is the CRC32 of the chunk_id rendered as a string.
    df["qid"] = df["chunk_id"].astype(str).apply(lambda s: zlib.crc32(s.encode("utf-8")))
else:
    # Fallback: derive the key from PMID / chunk_index / text; a column that
    # is absent contributes an empty string to the key.
    pmid_col = "PMID" if "PMID" in df.columns else None
    chunk_idx_col = "chunk_index" if "chunk_index" in df.columns else None

    def make_key(row):
        """Build the '<PMID>::<chunk_index>::<text>' string hashed into qid."""
        pmid_part = str(row[pmid_col]) if pmid_col else ""
        idx_part  = str(row[chunk_idx_col]) if chunk_idx_col else ""
        text_part = str(row["text"])
        return f"{pmid_part}::{idx_part}::{text_part}"

    df["qid"] = df.apply(lambda r: zlib.crc32(make_key(r).encode("utf-8")), axis=1)

# NOTE: BM25 doesn't require qid. We only use qid to match Qdrant point IDs.
# The embedding/indexing script stored qid in payload, and used id=qid.
#
# CRC32 values span 0 .. 2**32 - 1, so they do not fit a signed 32-bit integer.
# Plain `int` can resolve to a 32-bit dtype on some platforms / NumPy builds;
# request int64 explicitly so the ids are never truncated or rejected.
df["qid"] = df["qid"].astype("int64")

# -----------------------
# 2) Init embedding + Qdrant client (SERVER)
# -----------------------
# Query encoder and Qdrant client, shared by the retrieval functions below.
embedder = SentenceTransformer(EMBED_MODEL)
client = QdrantClient(url=QDRANT_URL)

# Optional sanity check: fail fast with an actionable message when the server
# is unreachable or the collection has not been created yet.
try:
    client.get_collection(COLLECTION)
except Exception as e:
    # Chain explicitly so the underlying client error stays in the traceback
    # as the direct cause of this RuntimeError.
    raise RuntimeError(
        f"Could not access collection '{COLLECTION}' on {QDRANT_URL}. "
        f"Confirm Qdrant is running and the collection was created. Original error: {e}"
    ) from e


# -----------------------
# 3) BM25 setup
# -----------------------
# Runs of ASCII letters/digits; every other character acts as a separator.
_token_re = re.compile(r"[A-Za-z0-9]+")


def tokenize(s: str):
    """Return the lower-cased ASCII alphanumeric tokens of *s*, in order.

    The input is coerced with ``str()`` first, so non-string values are
    tokenized by their string representation.
    """
    lowered = str(s).lower()
    return [match.group(0) for match in _token_re.finditer(lowered)]

# Tokenize every chunk once, in df row order, and build the BM25 index over
# the resulting token lists.
docs_tokens = list(map(tokenize, df["text"]))
bm25 = BM25Okapi(docs_tokens)

def bm25_retrieve(query: str, k: int = K_BM25):
    """Return up to *k* lexical matches for *query* as ``[(qid, score), ...]``.

    The list is ordered by descending BM25 score. Documents whose score is
    exactly zero are left out, so fewer than *k* pairs (possibly none) are
    returned when few documents match; this keeps arbitrary non-matching rows
    from receiving rank credit during fusion.
    """
    q_tokens = tokenize(query)
    if not q_tokens or k <= 0:
        # Nothing to match on (or nothing requested): every score would be 0.
        return []

    scores = bm25.get_scores(q_tokens)  # aligned to df rows
    order = np.argsort(scores)[::-1]
    # Filter before truncating so zero-score rows never occupy a top-k slot.
    order = order[scores[order] != 0][:k]

    # One positional array lookup instead of a df.loc call per hit.
    qids = df["qid"].to_numpy()
    return [(int(qids[i]), float(scores[i])) for i in order]

# -----------------------
# 4) Dense retrieval from Qdrant (API-compatible)
# -----------------------
def dense_retrieve(query: str, k: int = K_DENSE):
    """Return the *k* nearest points to *query* as ``[(qid, score), ...]``.

    The query is embedded with ``embedder`` (normalized) and searched in
    ``COLLECTION``. Results keep the order returned by Qdrant.

    Raises:
        RuntimeError: if the installed qdrant-client offers neither
            ``query_points()`` nor ``search()``.
        Exception: errors raised by the client call itself (connection,
            bad request, ...) propagate unchanged so the real cause is visible.
    """
    q_vec = embedder.encode(query, normalize_embeddings=True)
    q_vec = q_vec.tolist() if hasattr(q_vec, "tolist") else list(q_vec)

    # A) Query API (current clients). A plain list of floats is accepted as
    #    the query, so no version-specific qmodels wrapper is needed.
    #    Only ids and scores are used here, so payloads are not transferred.
    if hasattr(client, "query_points"):
        res = client.query_points(
            collection_name=COLLECTION,
            query=q_vec,
            limit=k,
            with_payload=False,
        )
        return [(int(h.id), float(h.score)) for h in res.points]

    # B) Legacy API for older clients that predate query_points()
    if hasattr(client, "search"):
        hits = client.search(
            collection_name=COLLECTION,
            query_vector=q_vec,
            limit=k,
            with_payload=False,
        )
        return [(int(h.id), float(h.score)) for h in hits]

    # C) If neither exists, your client is too old / odd
    raise RuntimeError(
        "Your qdrant-client is missing both search() and query_points(). "
        "Upgrade in notebook with: %pip install -U qdrant-client"
    )

# -----------------------
# 5) Reciprocal Rank Fusion (RRF)
# -----------------------
def rrf_fuse(bm25_list, dense_list, rrf_k: int = RRF_K, top_k: int = TOP_K):
    """
    Merge two ranked candidate lists with Reciprocal Rank Fusion.

    bm25_list  = [(qid, bm25_score), ...]   (rank is list position)
    dense_list = [(qid, dense_score), ...]  (rank is list position)

    Each appearance of a qid at 1-based rank r adds 1 / (rrf_k + r) to its
    fused score; the raw retriever scores are ignored.

    returns list of qid in fused rank order (at most top_k entries)
    """
    fused_score = {}

    # BM25 list first, dense list second: same accumulation order per qid.
    for ranked in (bm25_list, dense_list):
        position = 0
        for qid, _score in ranked:
            position += 1
            fused_score[qid] = fused_score.get(qid, 0.0) + 1.0 / (rrf_k + position)

    # Stable sort: equal fused scores keep first-seen order.
    ordered_qids = sorted(fused_score, key=fused_score.get, reverse=True)
    return ordered_qids[:top_k]

# -----------------------
# 6) End-to-end hybrid retrieval
# -----------------------
def hybrid_retrieve(query: str,
                    k_bm25: int = K_BM25,
                    k_dense: int = K_DENSE,
                    top_k: int = TOP_K,
                    rrf_k: int = RRF_K):
    """Run BM25 and dense retrieval, fuse with RRF, and attach payloads.

    Args:
        query: free-text query.
        k_bm25: number of BM25 candidates requested.
        k_dense: number of dense candidates requested.
        top_k: number of fused results to return.
        rrf_k: RRF constant passed to ``rrf_fuse`` (defaults to ``RRF_K``).

    Returns:
        A list of dicts in fused rank order. Each dict holds ``"qid"``,
        ``"text"`` and every other payload field of the matching Qdrant
        point. A payload field named ``"qid"`` overrides the fused id in
        the dict. A qid with no point in Qdrant yields ``"text": ""`` and no
        metadata.
    """
    bm25_hits  = bm25_retrieve(query, k=k_bm25)
    dense_hits = dense_retrieve(query, k=k_dense)

    fused_qids = rrf_fuse(bm25_hits, dense_hits, rrf_k=rrf_k, top_k=top_k)
    if not fused_qids:
        # Nothing to look up; skip the round trip to Qdrant.
        return []

    # Fetch payloads from Qdrant by point ids (qid)
    points = client.retrieve(
        collection_name=COLLECTION,
        ids=fused_qids,
        with_payload=True,
        with_vectors=False
    )

    payload_by_qid = {int(p.id): (p.payload or {}) for p in points}

    # Return results in fused order; ensure text + provenance is present
    results = []
    for qid in fused_qids:
        payload = payload_by_qid.get(int(qid), {})
        results.append({
            "qid": qid,
            "text": payload.get("text", ""),
            **{k: v for k, v in payload.items() if k != "text"}
        })
    return results

# -----------------------
# 7) Example usage
# -----------------------
query = "cemented knee arthroplasty implant fixation outcomes"
ctx = hybrid_retrieve(query)

# Metadata fields to print: (label, primary payload key, lower-case fallback key).
_FIELDS = (
    ("PMID:", "PMID", "pmid"),
    ("Title:", "Title", "title"),
    ("Journal:", "Journal", "journal"),
)

for rank, hit in enumerate(ctx, start=1):
    print(f"\n--- Context #{rank} ---")
    print("qid:", hit.get("qid"))
    for label, primary, fallback in _FIELDS:
        print(label, hit.get(primary, hit.get(fallback)))
    # Show at most the first 500 characters of the chunk text.
    body = hit.get("text", "")
    preview = body if len(body) <= 500 else body[:500] + "..."
    print("Text:", preview)


--- Context #1 ---
qid: 2586286066
PMID: 38285554
Title: Four-Year Outcomes of Cementless Versus Cemented Fixation of a Newly Introduced Total Knee Arthroplasty Design.
Journal: Orthopedics
Text: Four-Year Outcomes of Cementless Versus Cemented Fixation of a Newly Introduced Total Knee Arthroplasty Design.

--- Context #2 ---
qid: 319193668
PMID: 40571820
Title: Fixation in total knee arthroplasty: are cemented implants still mandatory in younger and obese patients?
Journal: European journal of orthopaedic surgery & traumatology : orthopedie traumatologie
Text: Although cement still represents the gold standard in total knee arthroplasty fixation, recent advances in implant design, materials, and coatings have led to a renewed interest in cementless fixation. Furthermore, a growing number of challenging patients, such as young and active or obese, are requiring a total knee arthroplasty procedure. This narrative review aims to outline technological advancements in cementless fixation 