In [3]:
# dependencies
!pip install --upgrade pip setuptools wheel
!pip install transformers faiss-cpu torch

import torch
import faiss
import pandas as pd

from transformers import (
    DPRQuestionEncoder,
    DPRContextEncoder,
    DPRQuestionEncoderTokenizer,
    DPRContextEncoderTokenizer,
)


Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Using cached pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Using cached setuptools-76.1.0-py3-none-any.whl.metadata (6.7 kB)
Collecting wheel
  Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Using cached pip-25.0.1-py3-none-any.whl (1.8 MB)
Using cached setuptools-76.1.0-py3-none-any.whl (1.2 MB)
Using cached wheel-0.45.1-py3-none-any.whl (72 kB)


ERROR: To modify pip, please run the following command:
C:\ProgramData\anaconda3\python.exe -m pip install --upgrade pip setuptools wheel


Defaulting to user installation because normal site-packages is not writeable


In [7]:
# function to parse CISI data
def load_data(path):
    """Parse the CISI collection stored in the directory ``path``.

    Three files are read from ``path``: ``CISI.ALL`` (documents),
    ``CISI.QRY`` (queries) and ``CISI.REL`` (relevance judgements).

    Args:
        path: Directory containing the three CISI files.

    Returns:
        A ``(doc_set, qry_set, rel_set)`` tuple of dicts:

        * ``doc_set`` maps an int document id to the text of every field
          found between its ``.I`` line and its ``.X`` line, each field
          followed by a single space.
        * ``qry_set`` maps an int query id to the text of its ``.W`` field.
        * ``rel_set`` maps an int query id to the list of int document ids
          judged relevant, in file order.

    Side effects:
        Prints the number of queries that have relevance judgements and,
        when query 1 has any, its list of relevant document ids.
    """
    import os

    def _read_records(file_path):
        """Fold a CISI file into one string per field.

        A line starting with "." opens a new record; every other line is
        appended to the current record after a single space.
        """
        records = []
        with open(file_path) as f:
            for raw in f:
                if raw.startswith("."):
                    records.append([raw.strip()])
                else:
                    if not records:
                        # text before the first field marker: keep it as its
                        # own record, with the same leading space as before
                        records.append([""])
                    records[-1].append(raw.strip())
        # joining the pieces once avoids quadratic string concatenation
        return [" ".join(pieces) for pieces in records]

    def _load_documents(file_path):
        """Build {doc_id: text} from the folded records of CISI.ALL."""
        doc_set = {}
        doc_id, doc_parts = None, []

        for record in _read_records(file_path):
            if record.startswith(".I"):
                if doc_id is not None:
                    # previous document had no ".X" terminator: keep it
                    # instead of merging its text into the next document
                    doc_set[doc_id] = "".join(doc_parts).lstrip(" ")
                    doc_parts = []
                doc_id = int(record.split()[1])
            elif record.startswith(".X"):
                # ".X" (cross references) closes the document; its own
                # content is not part of the text
                if doc_id is not None:
                    doc_set[doc_id] = "".join(doc_parts).lstrip(" ")
                doc_id, doc_parts = None, []
            else:
                # drop the 3-character field marker (e.g. ".T ")
                doc_parts.append(record.strip()[3:] + " ")

        if doc_id is not None:
            # last document in the file had no ".X" terminator
            doc_set[doc_id] = "".join(doc_parts).lstrip(" ")

        return doc_set

    def _load_queries(file_path):
        """Build {query_id: text} from the ".W" fields of CISI.QRY."""
        qry_set = {}
        qry_id = None

        for record in _read_records(file_path):
            if record.startswith(".I"):
                qry_id = int(record.split()[1])
            elif record.startswith(".W") and qry_id is not None:
                qry_set[qry_id] = record.strip()[3:]
                qry_id = None

        return qry_set

    def _load_relevance(file_path):
        """Build {query_id: [doc_id, ...]} from CISI.REL."""
        rel_set = {}

        with open(file_path) as f:
            for raw in f:
                # the ids are the first and last whitespace-separated tokens
                # before the first tab; blank lines carry no judgement
                head = raw.split("\t")[0].split()
                if not head:
                    continue
                qry_id, doc_id = int(head[0]), int(head[-1])
                rel_set.setdefault(qry_id, []).append(doc_id)

        return rel_set

    doc_set = _load_documents(os.path.join(path, "CISI.ALL"))
    qry_set = _load_queries(os.path.join(path, "CISI.QRY"))
    rel_set = _load_relevance(os.path.join(path, "CISI.REL"))

    print(f"\n\nNumber of mappings = {len(rel_set)}")
    if 1 in rel_set:
        # sanity-check sample; skipped when query 1 has no judgements
        print(rel_set[1])

    return doc_set, qry_set, rel_set

In [8]:
# load data into dataframes
# parse the three CISI files into plain {key: value} mappings first
doc_map, qry_map, rel_map = load_data('../dataset')

# then expose each mapping as a two-column frame: one row per (key, value) pair
doc_set = pd.DataFrame(
    list(doc_map.items()),
    columns=["doc_id", "text"],
)
qry_set = pd.DataFrame(
    list(qry_map.items()),
    columns=["query_id", "text"],
)
rel_set = pd.DataFrame(
    list(rel_map.items()),
    columns=["query_id", "doc_ids"],
)



Number of mappings = 76
[28, 35, 38, 42, 43, 52, 65, 76, 86, 150, 189, 192, 193, 195, 215, 269, 291, 320, 429, 465, 466, 482, 483, 510, 524, 541, 576, 582, 589, 603, 650, 680, 711, 722, 726, 783, 813, 820, 868, 869, 894, 1162, 1164, 1195, 1196, 1281]


In [9]:
# documents, queries, and relations
# materialise every frame as a list of (index, row) pairs
docs, queries, qrels = (
    list(frame.iterrows()) for frame in (doc_set, qry_set, rel_set)
)

# sizes of the three collections
for label, rows in (("Num docs:", docs), ("Num queries:", queries), ("Num qrels:", qrels)):
    print(label, len(rows))

# first entry of each collection, as a quick look at the row layout
for rows in (docs, queries, qrels):
    print(rows[0])

# corpus creation: two parallel lists, so position i in `corpus` and in
# `doc_ids` refer to the same document (ids are kept as strings)
corpus = [row.text for _, row in docs]
doc_ids = [f"{row.doc_id}" for _, row in docs]

print("Original #docs:", len(docs))

Num docs: 1460
Num queries: 112
Num qrels: 76
(0, doc_id                                                    1
text      18 Editions of the Dewey Decimal Classificatio...
Name: 0, dtype: object)
(0, query_id                                                    1
text        What problems and concerns are there in making...
Name: 0, dtype: object)
(0, query_id                                                    1
doc_ids     [28, 35, 38, 42, 43, 52, 65, 76, 86, 150, 189,...
Name: 0, dtype: object)
Original #docs: 1460


In [None]:
# load DPR pretrained encoders and tokenizers
# run on the GPU when torch can see one, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# pretrained DPR checkpoints: one for questions, one for passages (contexts)
question_checkpoint = "facebook/dpr-question_encoder-single-nq-base"
context_checkpoint = "facebook/dpr-ctx_encoder-single-nq-base"

# each encoder is paired with the tokenizer loaded from the same checkpoint
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_checkpoint)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_checkpoint)

# load the encoders and place them on the selected device
question_encoder = DPRQuestionEncoder.from_pretrained(question_checkpoint).to(device)
context_encoder = DPRContextEncoder.from_pretrained(context_checkpoint).to(device)

# encode the corpus
def encode_passages(passages, batch_size=32):
    """Embed ``passages`` with the DPR context encoder, one batch at a time.

    Args:
        passages: Sliceable sequence of passage strings.
        batch_size: Number of passages tokenized and encoded per forward pass.

    Returns:
        A NumPy array holding the encoder's ``pooler_output`` for every batch,
        concatenated along the first axis in the order of ``passages``.
    """
    def _embed(texts):
        # pad to the longest text in the batch, truncate at 512 tokens
        encoded = context_tokenizer(
            texts,
            padding=True,
            max_length=512,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        # inference only: no autograd bookkeeping
        with torch.no_grad():
            pooled = context_encoder(**encoded).pooler_output
        # move each batch to the CPU so results do not pile up on the device
        return pooled.cpu()

    starts = range(0, len(passages), batch_size)
    per_batch = [_embed(passages[s:s + batch_size]) for s in starts]
    return torch.cat(per_batch, dim=0).numpy()

# embed every document text collected in `corpus`; the resulting array is what
# gets added to the FAISS index
passage_embeddings = encode_passages(corpus)
print("Passage embeddings shape:", passage_embeddings.shape)

In [None]:
# build a FAISS index with the corpus embeddings
# IndexFlatL2 ranks neighbours by L2 distance.
# NOTE(review): DPR encoders are, as far as I know, trained with dot-product
# similarity — consider faiss.IndexFlatIP; confirm before changing, because
# retrieve() reports the returned scores under the key 'distance'.
dimension = passage_embeddings.shape[1]  # second axis of the embedding array = vector size given to FAISS
index = faiss.IndexFlatL2(dimension)
index.add(passage_embeddings)

# encode queries and retrieve
def encode_query(query: str):
    """Embed a single query string with the DPR question encoder.

    The query is tokenized (truncated at 512 tokens), moved to ``device`` and
    run through ``question_encoder`` without gradient tracking.

    Returns:
        The encoder's ``pooler_output`` as a NumPy array on the CPU.
    """
    encoded = question_tokenizer(
        query,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # inference only: no autograd bookkeeping
    with torch.no_grad():
        model_out = question_encoder(**encoded)

    return model_out.pooler_output.cpu().numpy()

def retrieve(query: str, top_k: int = 5):
    """Return up to ``top_k`` corpus passages closest to ``query`` in the FAISS index.

    Args:
        query: Query text; embedded with ``encode_query``.
        top_k: Number of neighbours requested from ``index.search``.

    Returns:
        A list of dicts ordered as returned by the index, each with the keys
        ``'rank'`` (1-based), ``'doc_id'``, ``'doc_text'`` and ``'distance'``
        (the score reported by the index, as a Python float). The list is
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    q_emb = encode_query(query)
    distances, indices = index.search(q_emb, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS fills missing neighbours with index -1; used as a Python
        # index that would wrap around and report the last document as a hit
        if idx < 0:
            continue
        results.append({
            'rank': len(results) + 1,
            'doc_id': doc_ids[idx],
            'doc_text': corpus[idx],
            'distance': float(dist)
        })
    return results

# sample query
# sample query: the row of the first (index, row) pair in `queries`
sample_row = queries[0][1]
sample_query = sample_row["text"]
print("Sample Query (QID=" + str(sample_row["query_id"]) + "):", sample_query)

# retrieve top 3 documents by nn distance
top_docs = retrieve(sample_query, top_k=3)

print("\nTop 3 Retrieved Passages:\n")
for hit in top_docs:
    # one header line per hit, then the first 300 characters of its text
    header = f"Rank {hit['rank']} | Doc ID: {hit['doc_id']} | Distance: {hit['distance']:.4f}\n"
    print(header)
    preview = hit['doc_text'][:300]
    print(preview, "...\n")

In [1]:
from retrievers.DPR import DPRRetriever

# DPR initialization

# locations of the three CISI files, relative to the notebook's working directory
doc_path = "../dataset/CISI.ALL"
qry_path = "../dataset/CISI.QRY"
rel_path = "../dataset/CISI.REL"

# DPRRetriever comes from the project module retrievers.DPR; its constructor is
# given the document, query and relevance file paths in that order.
# NOTE(review): judging by the warnings printed below this cell, construction
# loads the DPR question/context checkpoints — confirm in retrievers/DPR.py.
dpr_retriever = DPRRetriever(doc_path, qry_path, rel_path)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

In [13]:
# NOTE(review): the id printed here is taken from entry [1] of qry_set, while
# the retrieval call below searches with the text of entry [4] — the printed id
# does not identify the query being searched; confirm this is intended.
print(dpr_retriever.qry_set["query_id"][1])

# top-3 retrieval for the text of entry [4]; as the last expression of the
# cell, the returned list is what the notebook displays.
# NOTE(review): `_retrieve_dpr` is underscore-prefixed — prefer a public
# method if DPRRetriever offers one.
dpr_retriever._retrieve_dpr(dpr_retriever.qry_set["text"][4], top_k=3)

2


[{'rank': 1,
  'doc_id': '1327',
  'doc_text': "The SMART Retrieval System Experiments in Automatic Document Processing Salton, G. The automatic SMART document retrieval system was designed at Harvard University between 1961 and 1964, and has been operating of IBM 7094 and 360 equipment both at Harvard and at Cornell University for several years.  The system takes documents and search requests in the natural language, performs a fully automatic content analysis of the texts using one of several dozen programmed language analysis methods, matches analyzed documents with analyzed search requests, and retrieves for the user's attention those stored items believed to be most similar to the submitted queries. ",
  'distance': 77.95306396484375},
 {'rank': 2,
  'doc_id': '1013',
  'doc_text': 'Bibliographic and Technical Problems in Implementing a National Library Network Avram, H.D. The problems facing the planners of automated library networks are rooted in the complexities of organizing a