# TREC CAR (2017) Dataset

Browsing all 30 million documents is far too time-consuming, so we create a smaller dataset here.

In [1]:
import dill as pickle
import ir_datasets

In [2]:
# Load the "test200" split of TREC CAR v1.5.
dataset = ir_datasets.load("car/v1.5/test200")

# Docstore: handle used further down to fetch a document by its id.
docstore = dataset.docs_store()

# Queries: peek at the first one only (fields 0, 2 and 3, tab-separated).
first_query = next(iter(dataset.queries_iter()), None)
if first_query is not None:
    print(first_query[0], first_query[2], first_query[3], sep='\t')

# Relevance judgements: peek at the first one only (fields 0, 1 and 2).
first_qrel = next(iter(dataset.qrels_iter()), None)
if first_qrel is not None:
    print(first_qrel[0], first_qrel[1], first_qrel[2], sep='\t')

Hog-dog%20rodeo/Typical%20match	Hog-dog rodeo	('Typical match',)
ASME/ASME%20codes%20and%20standards	16d8f62407d2cdd283a71735e5c83f7d7947b93a	1


In [23]:
# Containers for the reduced dataset, all driven by the relevance judgements:
#   queries_ids - ids of every query that has at least one judgement
#   docs        - judged document id -> field 1 of the stored document
#   relevances  - (query id, document id) -> field 2 of the judgement
queries_ids = set()
docs = {}
relevances = {}

for judgement in dataset.qrels_iter():
    judged_query_id = judgement[0]
    judged_doc_id = judgement[1]

    queries_ids.add(judged_query_id)
    # A document judged for several queries is simply stored again under the same key.
    docs[judged_doc_id] = docstore.get(judged_doc_id)[1]
    relevances[(judged_query_id, judged_doc_id)] = judgement[2]

# Keep only the queries that were judged, mapping id -> (field 1, field 2, field 3).
queries = {
    candidate[0]: (candidate[1], candidate[2], candidate[3])
    for candidate in dataset.queries_iter()
    if candidate[0] in queries_ids
}

In [25]:
# Summarise the size of each collection, one line per collection.
print(
    f"Number of queries: {len(queries)}",
    f"Number of documents: {len(docs)}",
    f"Number of relevance judgements: {len(relevances)}",
    sep="\n",
)

Number of queries: 1860
Number of documents: 4689
Number of relevance judgements: 4706


In [22]:
# Persist the three collections for later use.
# Each file is opened in a `with` block so it is flushed and closed as soon as
# its pickle has been written, rather than leaving an unclosed handle behind.
with open("../data/queries.pkl", "wb") as queries_file:
    pickle.dump(queries, queries_file)

with open("../data/documents.pkl", "wb") as documents_file:
    pickle.dump(docs, documents_file)

with open("../data/relevances.pkl", "wb") as relevances_file:
    pickle.dump(relevances, relevances_file)