In [None]:
from functools import lru_cache

import chromadb
import ir_datasets
from ir_measures import ScoredDoc, calc_aggregate, nDCG
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from utils import persistent_cache

# Full BEIR NFCorpus dataset; its documents are what gets indexed below.
dataset = ir_datasets.load("beir/nfcorpus")
# Single embedding model shared by documents and queries.
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
# Client created with default settings (presumably in-memory only — confirm
# against the installed chromadb version).
chroma_client = chromadb.Client()

# Manual reset helpers, left disabled: uncomment to drop stale collections.
# chroma_client.delete_collection(name="beir_nfcorpus_documents")
# chroma_client.delete_collection(name="beir_nfcorpus_queries")


# Directory handed to the persistent cache that stores computed embeddings.
EMBEDDINGS_CACHE = "./.cache/embeddings/"
# Upper bound of the dimension sweep (presumably the model's native embedding
# width — confirm against the model card).
EMBED_DIM = 1024


Generate and cache the full embeddings first.

In [None]:
@persistent_cache.persistent_cache(
    directory=EMBEDDINGS_CACHE, hash_filenames=True, file_type="json"
)
def calculate_embeddings(text):
    """Embed ``text`` with the notebook-level ``model``.

    The encoder output is converted with ``.tolist()`` before being returned,
    which keeps the payload compatible with the ``file_type="json"`` cache
    configured on the decorator.

    Args:
        text: The string to embed.

    Returns:
        A dict with a single key, ``"embeddings"``, holding the full-width
        embedding as a plain Python list.
    """
    vector = model.encode(text)
    return {"embeddings": vector.tolist()}


@lru_cache(maxsize=7000)
def fetch_embeddings(text):
    """Memoised in-process front for ``calculate_embeddings``.

    Up to 7000 distinct texts are kept by the LRU cache, so repeated lookups
    for the same text skip ``calculate_embeddings`` entirely. The cached dict
    object is handed back on every hit, so callers should treat it as
    read-only.

    Args:
        text: The string to embed (must be hashable, as required by
            ``lru_cache``).

    Returns:
        The dict produced by ``calculate_embeddings(text)``.
    """
    return calculate_embeddings(text)


def create_collection(chroma_client):
    """Return the document collection, creating it when it is missing.

    The collection is requested under the name ``beir_nfcorpus_documents``
    with its ``hnsw:space`` metadata set to ``cosine``.

    Args:
        chroma_client: Client object exposing ``get_or_create_collection``.

    Returns:
        Whatever ``chroma_client.get_or_create_collection`` returns.
    """
    return chroma_client.get_or_create_collection(
        name="beir_nfcorpus_documents",
        metadata={"hnsw:space": "cosine"},
    )

In [None]:
def index_docs(retain_dim, collection=None, batch_size=256):
    """Index every document of ``dataset`` using truncated embeddings.

    Each document is embedded as ``title + " " + text`` and only the first
    ``retain_dim`` components of the embedding are stored. Documents are sent
    to the collection in batches rather than one ``add`` call per document,
    which matters because the sweep re-indexes the corpus for every dimension.

    Args:
        retain_dim: Number of leading embedding components to keep.
        collection: Collection to add to. Defaults to the notebook-level
            ``chroma_collection_documents`` (assigned by the sweep cell), so
            existing ``index_docs(retain_dim=...)`` calls keep working.
        batch_size: Maximum number of documents per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if collection is None:
        # Fall back to the global that the evaluation cell rebinds per run.
        collection = chroma_collection_documents

    docs = iter(dataset.docs_iter())
    while True:
        batch = list(islice(docs, batch_size))
        if not batch:
            break

        texts = [doc.title + " " + doc.text for doc in batch]
        collection.add(
            documents=texts,
            embeddings=[
                fetch_embeddings(text)["embeddings"][:retain_dim] for text in texts
            ],
            metadatas=[{"url": doc.url, "id": doc.doc_id} for doc in batch],
            ids=[doc.doc_id for doc in batch],
        )


def cache_all_queries(retain_dim):
    """Warm the persistent embedding cache for every query in ``dataset``.

    Each query is embedded with the search instruction prefix prepended. The
    cache is keyed on that exact prefixed string, so later lookups only hit it
    when they build the identical text.

    Args:
        retain_dim: Kept for backward compatibility and ignored. The cache
            always stores the full-width embedding; truncation is applied by
            callers after lookup.
    """
    SEARCH_PROMPT_PREFIX = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"

    for query in dataset.queries_iter():
        # Called only for its caching side effect; the result is not needed.
        calculate_embeddings(SEARCH_PROMPT_PREFIX + " " + query.text)

Now that all the documents and queries have been embedded, let's create a baseline that we can test against.

In [None]:
def fetch_query_embeddings(query, prompt_prefix=None):
    """Return the full-width embedding for a search query.

    By default the query is wrapped in the same instruction prefix (and the
    same ``prefix + " " + query`` layout) that ``cache_all_queries`` uses.
    Without it, lookups would miss the embeddings cached there, and the
    query would be embedded without the query-side instruction that the
    Qwen3-Embedding model card recommends for retrieval.

    Args:
        query: Raw query text.
        prompt_prefix: Instruction prepended to the query. ``None`` selects
            the default web-search instruction; pass ``""`` to embed the raw
            query text unchanged.

    Returns:
        The embedding as a list of floats (not truncated).
    """
    if prompt_prefix is None:
        prompt_prefix = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"

    query_text = prompt_prefix + " " + query if prompt_prefix else query
    return calculate_embeddings(query_text)["embeddings"]


def search(query_id, query_embeddings):
    """Run one query against the document collection and score the hits.

    The notebook-level ``chroma_collection_documents`` is queried with a
    single embedding, so only the first row of the response is read. Each
    reported distance is turned into a similarity as ``1 - distance``.

    Args:
        query_id: Identifier attached to every returned ``ScoredDoc``.
        query_embeddings: Embedding vector of the query.

    Returns:
        A list of ``ScoredDoc(query_id, doc_id, similarity)``, in the order
        the collection returned the hits.
    """
    response = chroma_collection_documents.query(query_embeddings=[query_embeddings])
    hit_ids = response["ids"][0]

    return [
        ScoredDoc(query_id, doc_id, 1 - response["distances"][0][position])
        for position, doc_id in enumerate(hit_ids)
    ]


In [None]:
def delete_collection(chroma_client, name="beir_nfcorpus_documents"):
    """Best-effort removal of a collection.

    Deletion failures never propagate, so the call is safe when the
    collection has not been created yet. The actual exception is printed
    rather than assuming the collection was missing, so unrelated failures
    are not mislabelled.

    Args:
        chroma_client: Client object exposing ``delete_collection``.
        name: Name of the collection to delete.
    """
    try:
        chroma_client.delete_collection(name)
    except Exception as exc:
        # Deliberately broad: a missing collection must not abort the notebook.
        print(f"collection {name!r} not deleted: {exc!r}")


# Start from a clean slate: drop any document collection left on this client.
delete_collection(chroma_client)

In [None]:
test_dataset = ir_datasets.load("beir/nfcorpus/test")
results = []
ndcg_at_10 = nDCG @ 10

# Query embeddings do not depend on the retained dimension, so fetch each one
# once at full width and slice it inside the sweep instead of re-reading it
# for every dimension.
test_query_embeddings = [
    (query.query_id, fetch_query_embeddings(query.text))
    for query in test_dataset.queries_iter()
]

for dim in tqdm(range(1, EMBED_DIM + 1), desc="Running with DIM"):
    # Rebuild the index from scratch so it only holds `dim`-wide vectors.
    # `chroma_collection_documents` is the global read by index_docs/search.
    delete_collection(chroma_client)
    chroma_collection_documents = create_collection(chroma_client)
    index_docs(retain_dim=dim)

    run_results = []
    for qid, full_embedding in test_query_embeddings:
        run_results.extend(search(qid, full_embedding[:dim]))

    metrics = calc_aggregate([ndcg_at_10], test_dataset.qrels, run_results)
    # The "dimention" key (sic) is kept as-is: it becomes the CSV column name.
    results.append({"dimention": dim, "nDCG@10": metrics[ndcg_at_10]})

In [None]:
import pandas as pd

# One row per retained dimension, with its aggregated nDCG@10.
results_df = pd.DataFrame(results)
# Persist the sweep; to_csv is called without index=False, so the row index
# is written as well.
results_df.to_csv("./analysis_results_all_dim.csv")

Ranking metric NDCG@10 for rank profile: 0.3619