In [18]:
!pip install ir_datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ir_datasets
  Downloading ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.6.0-py3-none-any.whl.metadata (25 kB)
Collecting lxml>=4.5.2 (from ir_datasets)
  Downloading lxml-5.3.2-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.6 kB)
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.4.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.5-py3-none-any.whl.metadata (2.2 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting zlib-state>=0.1.3 (from ir_datasets)
  Downloading zlib_state-0.1.9.tar.gz (9.5 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting r

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.cluster import MiniBatchKMeans

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# # Set dimensions
# d = 128        # Embedding dimension
# k = 1000       # Number of centroids
# num_docs = 10000
# tokens_per_doc = 20
# tokens_per_query = 8

In [11]:
# Pull the first 2000 training examples of MS MARCO (v2.1)
dataset = load_dataset("ms_marco", "v2.1", split="train[:2000]")

# One "document" per example: all of its candidate passages joined by spaces.
# The queries of the first six examples are kept as the demo query set.
queries = []
passages = []
for position, example in enumerate(dataset):
    passages.append(" ".join(example['passages']["passage_text"]))
    if position < 6:
        queries.append(example['query'])

# Tokenizer and encoder (MiniLM); the encoder is switched to eval mode
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = AutoModel.from_pretrained(model_name)
encoder = encoder.eval()




In [12]:
# Sanity check: display the first document (all passages of example 0, space-joined)
passages[0]

"The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated. The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science. Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of this project would forever change the world forever making it known that something this powerful can be manmade. The Manhattan Project was the name for a project conducted during World War II, to develop the first atomic bomb. It refers specifically to the period of the project from 194 … 2-1946 under the control of the U.S. Army Corps of Engineers, under t

In [13]:
# Token-to-vector extractor
def encode_text(texts, max_length=64, batch_size=64):
    """Encode texts into per-token embedding matrices.

    Uses the module-level ``tokenizer`` and ``encoder``.

    Args:
        texts: A list of strings, or a single string (treated as a
            one-element list).
        max_length: Maximum tokenized length passed to the tokenizer;
            longer inputs are truncated.
        batch_size: Number of texts pushed through the encoder per forward
            pass, so peak memory does not grow with ``len(texts)``.

    Returns:
        A list with one ``numpy`` array of shape ``[n_tokens, d]`` per input
        text, in input order. The first and last unmasked positions of every
        sequence are dropped (the [CLS] / [SEP] slots). An empty input yields
        an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    # A bare string would otherwise be iterated character by character below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    all_vecs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           return_tensors="pt", max_length=max_length)
        with torch.no_grad():
            outputs = encoder(**inputs)
        token_embeddings = outputs.last_hidden_state  # [batch, seq, hidden]
        attention_mask = inputs["attention_mask"].bool()
        for embeddings, mask in zip(token_embeddings, attention_mask):
            # Keep real (unpadded) positions, then strip the leading/trailing
            # special tokens. Assumes right-side padding -- TODO confirm if the
            # tokenizer configuration ever changes.
            vecs = embeddings[mask][1:-1]  # exclude [CLS], [SEP]
            all_vecs.append(vecs.cpu().numpy())
    return all_vecs  # List of [n_tokens, d]

In [14]:
# Step 3: Encode all documents (token-level vectors)
doc_vecs = encode_text(passages[:1000])  # list of [n_tokens, d]

# Flatten into a big token matrix (all tokens of all documents stacked row-wise)
flattened_tokens = np.vstack(doc_vecs)

# Step 3.1: Learn centroids (k-means)
k = 256
kmeans = MiniBatchKMeans(n_clusters=k, batch_size=1024, random_state=42)
kmeans.fit(flattened_tokens)

# L2-normalise the centroids into a NEW array. An in-place `/=` on
# `kmeans.cluster_centers_` would overwrite the fitted model's own centres,
# leaving `kmeans` inconsistent for any later `predict`/`transform` call.
centroid_norms = np.linalg.norm(kmeans.cluster_centers_, axis=1, keepdims=True)
# Guard against a degenerate all-zero centroid, which would otherwise become NaN.
centroids = kmeans.cluster_centers_ / np.clip(centroid_norms, 1e-12, None)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Step 4: Convert document tokens to centroid IDs
def assign_centroids(token_vecs, centroids):
    """Return, for every token vector, the index of its most similar centroid.

    Each row of ``token_vecs`` is scaled to unit length and compared with every
    row of ``centroids`` via a dot product; the best-scoring centroid index per
    token is returned as a 1-D array of length ``n_tokens``.
    """
    unit_tokens = token_vecs / np.linalg.norm(token_vecs, axis=1, keepdims=True)
    similarity = unit_tokens @ centroids.T  # [n_tokens, n_centroids]
    return similarity.argmax(axis=1)

# Replace every document's token matrix by the centroid ID of each token
doc_centroid_ids = []
for vecs in doc_vecs:
    doc_centroid_ids.append(assign_centroids(vecs, centroids))


In [17]:
# Token-level embeddings for every demo query (one [n_tokens, d] array each)
query_vecs = encode_text(texts=queries)

# For each query, compute centroid-based retrieval
def compute_scores_for_query(q_vec):
    """Score every indexed document against one query using centroid IDs only.

    Reads the module-level ``centroids`` and ``doc_centroid_ids``.

    Args:
        q_vec: Query token embeddings, ``[|q|, d]``. A single 1-D vector is
            accepted and treated as a one-token query.

    Returns:
        A list with one score per entry of ``doc_centroid_ids``: for every
        query token, the best similarity among the document's centroids,
        summed over query tokens. A document with no tokens scores ``-inf``
        so it always ranks last.
    """
    q_vec = np.atleast_2d(q_vec)
    q_vec = q_vec / np.linalg.norm(q_vec, axis=1, keepdims=True)
    S_cq = centroids @ q_vec.T  # [k, |q|]

    scores = []
    for doc in doc_centroid_ids:
        if len(doc) == 0:
            # np.max over zero rows raises; an empty document matches nothing.
            scores.append(float("-inf"))
            continue
        doc_scores = S_cq[doc]  # [len(doc), |q|]
        max_sim = np.max(doc_scores, axis=0)  # [|q|]
        scores.append(np.sum(max_sim))
    return scores

# Example: rank all indexed documents for the first query, best five first
scores_q0 = compute_scores_for_query(query_vecs[0])
top_docs = np.argsort(scores_q0)[::-1][:5]
print("Top passages for Q0:", [passages[doc_idx] for doc_idx in top_docs])


Top passages for Q0: ["When Australian scientist Ruben Meerman lost 30 pounds last year, one question kept bugging him: Where did the fat go? The answer might seem obvious: It was burned up, as we say — which implies that it was transformed into heat or energy. The researchers chose to follow the path of these atoms when leaving the body. They found that when 10 kg of fat were oxidized, 8.4 kg were converted and excreted as carbon dioxide (CO2) via the lungs, and 1.6 kg became water (H20). In order for 10 kg of human fat to be oxidized, the researchers calculated that 29 kg of oxygen must be inhaled. Exercise also increases the oxidation of fat, which then leaves your body via your lungs, in the form of carbon dioxide, and your bodily fluids, in the form of water. What’s not so complex however, is how to optimize your metabolism—even if you don’t understand the exact mechanisms involved. The research conducted by a team at UNSW Science in Sydney calculated exactly what happens to our f

In [19]:
import ir_datasets

# Open the MS MARCO passage-ranking dev split through ir_datasets
msmarco = ir_datasets.load("msmarco-passage/dev")

# query_id → set of doc_ids judged relevant (only positive relevance is kept)
qrels = {}
for judgment in msmarco.qrels_iter():
    if not judgment.relevance > 0:
        continue
    qrels.setdefault(judgment.query_id, set()).add(judgment.doc_id)

# Lookup tables: query_id → query text, doc_id → passage text.
# Both dicts exist before iteration starts, and are filled entry by entry.
query_texts = {}
doc_texts = {}
query_texts.update((query.query_id, query.text) for query in msmarco.queries_iter())
doc_texts.update((doc.doc_id, doc.text) for doc in msmarco.docs_iter())


[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv: [00:00] [1.20MB] [1.77MB/s]
[INFO] If you have a local copy of https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz, you can symlink it here to avoid downloading it again: /Users/luigiliu/.ir_datasets/downloads/c177b2795d5f2dcc524cf00fcd973be1
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz: [00:02] [18.9MB] [8.61MB/s]
                                                                                                   

ValueError: Insufficient disk space: /Users/luigiliu/.ir_datasets/msmarco-passage/collectionandqueries.tar.gz requires 1.1GB but only 251.6MB is available (806.1MB more needed)

In [None]:
# Reverse mapping: passage text → doc_id. This is an exact-string match, so the
# keys are stripped the same way the looked-up passages are stripped below.
text_to_doc_id = {text.strip(): doc_id for doc_id, text in doc_texts.items()}

# For one query
# NOTE(review): `top_docs` was computed for `queries[0]` of the Hugging Face
# subset, while `query_id` is the first ir_datasets dev query -- confirm these
# refer to the same query. Also, each entry of `passages` is several passages
# joined together, so an exact text match may never succeed -- verify.
query_id = next(iter(query_texts))
# Not every query necessarily has judgments; treat a missing entry as "none".
relevant_ids = qrels.get(query_id, set())

# Get PLAID top passages for this query
top_passages = [passages[i] for i in top_docs]
top_doc_ids = [text_to_doc_id.get(p.strip(), None) for p in top_passages]

# Check how many are relevant (each retrieved doc_id is counted at most once)
hits = [doc_id for doc_id in dict.fromkeys(top_doc_ids) if doc_id in relevant_ids]
# Recall is defined as 0.0 when there is nothing relevant to find.
recall_at_k = len(hits) / len(relevant_ids) if relevant_ids else 0.0

print("Relevant doc IDs:", relevant_ids)
print("Retrieved doc IDs:", top_doc_ids)
print("Correctly retrieved:", hits)
print(f"Recall@{len(top_doc_ids)} = {recall_at_k:.2f}")
