In [1]:
# The modules
from libs.lexical_store import LexicalStore
from libs.vector_store import VectorStore
from libs.retriever import RetrieverQA
from libs.basic_document_loader import BasicIndexer
import ir_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Cranfield collection through ir_datasets
dataset = ir_datasets.load("cranfield")


def prep_funct(x):
    """Convert one positional document record into a plain dict.

    The first five positions of ``x`` are mapped, in order, to the keys
    ``doc_id``, ``title``, ``text``, ``author`` and ``bib``.
    """
    return dict(doc_id=x[0], title=x[1], text=x[2], author=x[3], bib=x[4])


# Materialise every document as a dict so it can be handed to the stores/indexer
docs = [prep_funct(record) for record in dataset.docs_iter()]
# Map position 0 of each query record (used as the id) to position 1 (its text)
queries = {query[0]: query[1] for query in dataset.queries_iter()}


In [3]:
# Value forwarded to VectorStore as its `nlist` argument
NLIST = 25

# The two search back-ends
vector = VectorStore(nlist=NLIST)
lexical = LexicalStore()
# The document indexer
index = BasicIndexer()

In [4]:
# Feed the same document list to both stores (vector first, then lexical)
for store in (vector, lexical):
    store.add(docs)
# ...and register the documents with the indexer as well
index.add_documents(docs)

                                                                            

In [5]:
# Build the retriever on top of the indexer and both stores
retriever = RetrieverQA(
    stores=[vector, lexical],
    documents_=index,
)

In [6]:
# Iterate over all relevance judgements and group them by query id
queries_info = {}
for qrel in dataset.qrels_iter():
    # Position 0 is used as the query id; positions 1 and 2 are presumably the
    # judged doc id and its relevance grade — TODO confirm against the dataset.
    # Only entries whose position-2 value is at least 2 are kept.
    if qrel[2] >= 2:
        queries_info.setdefault(qrel[0], set()).add(qrel[1])


In [7]:
# Per-query precision and recall of the retriever against `queries_info`
precision = []
recall = []
for query_id, relevants_docs in queries_info.items():
    # NOTE(review): entries whose id has no text in `queries` are skipped
    # silently — confirm the ids used by the judgements and the queries line up.
    if query_id not in queries:
        continue

    # The text of the query
    query = queries[query_id]
    # Unique ids of the documents returned for this query
    retrieved_docs = {doc['doc_id'] for doc in retriever.search(query)}
    # Relevant and retrieved
    rr = relevants_docs.intersection(retrieved_docs)
    # Precision: an empty result set would divide by zero, so score it as 0.0
    current_precision = len(rr) / len(retrieved_docs) if retrieved_docs else 0.0
    precision.append(current_precision)
    # Recall: same guard in case an entry carries no relevant documents
    current_recall = len(rr) / len(relevants_docs) if relevants_docs else 0.0
    recall.append(current_recall)

                                                     

In [8]:
import numpy as np

# Turn both score lists into numpy arrays (same names are rebound)
precision, recall = (np.array(scores) for scores in (precision, recall))

In [9]:
# One line per metric (precision, then recall): min, median, mean, max
for scores in (precision, recall):
    print(scores.min(), np.median(scores), scores.mean(), scores.max())

0.0 0.0 0.009650349650349652 0.28
0.0 0.0 0.045072248013424486 1.0


In [12]:
# Run the query stored under id '1' and list the ids of the returned documents
query_text = queries['1']
result = retriever.search(query_text)
print(f"Query: {query_text}")
for hit in result:
    print(f"Doc: {hit['doc_id']}")

                                                     

Query: what similarity laws must be obeyed when constructing aeroelastic models
of heated high speed aircraft .
Doc: 486
Doc: 184
Doc: 13
Doc: 51
Doc: 12
Doc: 746
Doc: 875
Doc: 1268
Doc: 1361
Doc: 606
Doc: 878
Doc: 860
Doc: 792
Doc: 332
Doc: 14
Doc: 102
Doc: 1144
Doc: 172
Doc: 1362
Doc: 497
Doc: 36
Doc: 395
Doc: 747
Doc: 141
Doc: 29
Doc: 378
Doc: 195
Doc: 573
Doc: 435
Doc: 686
Doc: 1305
Doc: 1104
Doc: 374
Doc: 577
Doc: 880
Doc: 283
Doc: 695
Doc: 236
Doc: 665
Doc: 635
Doc: 914
Doc: 876
Doc: 252
Doc: 1328
Doc: 552
Doc: 142
Doc: 540
Doc: 1207
Doc: 251
Doc: 25


