In [1]:
import pandas as pd
from online import MatchingRanking

Create a `queries` dict (query id → query text) from the ANTIQUE test and train query files

In [2]:
# Query files to merge (ANTIQUE test and train splits). Each non-blank line is
# expected to be "<query id>\t<query text>".
query_files = [
    "Datasets/antique/queries/antique-test-queries.txt",
    "Datasets/antique/queries/antique-train-queries.txt"
]

# Maps query id (str) -> query text (str), accumulated across every file above.
# If the same id appears more than once, the last occurrence wins.
queries = {}

for f in query_files:
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, so the same file could decode differently
    # (or fail to decode) on another machine.
    with open(f, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # drop the newline and surrounding whitespace
            if line:  # skip blank lines
                # Split on the first tab only, so tabs inside the query text survive.
                # A line without a tab raises ValueError here.
                identifier, text = line.split('\t', 1)
                queries[identifier] = text

# `queries` now holds the entries from both query files listed above.
print(queries)




Create a `qrels` dict (query id → {document id → relevance label}) from the ANTIQUE train and test qrels files

In [3]:
# Relevance-judgment files to merge (ANTIQUE train and test splits).
qrels_files = [
    "Datasets/antique/qrels/antique-train.qrel",
    "Datasets/antique/qrels/antique-test.qrel"
]
# Nested mapping: query id -> {document id -> relevance label}.
# Every key and label is kept as the string read from disk.
qrels = {}

for path in qrels_files:
    with open(path, 'r') as handle:
        # Strip each line up front and drop the ones that end up empty.
        for record in filter(None, map(str.strip, handle)):
            # Exactly four whitespace-separated fields are expected; the second
            # one is read but never used.
            query_key, _unused, doc_key, label = record.split()
            qrels.setdefault(query_key, {})[doc_key] = label

# `qrels` now merges the judgments from both files listed above.
print(qrels)

{'2531329': {'2531329_0': '4', '2531329_5': '4', '2531329_4': '3', '2531329_7': '3', '2531329_6': '3', '2531329_1': '2', '2531329_3': '3', '2531329_2': '3'}, '3825668': {'3825668_0': '4', '3825668_4': '4', '3825668_2': '4', '3825668_3': '4', '3825668_1': '4'}, '2146313': {'2146313_0': '4', '2146313_23': '3', '2146313_22': '3', '2146313_21': '4', '2146313_20': '3', '2146313_27': '3', '2146313_26': '3', '2146313_25': '3', '2146313_24': '3', '2146313_29': '3', '2146313_28': '4', '2146313_8': '4', '2146313_9': '4', '2146313_1': '3', '2146313_2': '3', '2146313_3': '2', '2146313_4': '3', '2146313_5': '4', '2146313_6': '3', '2146313_7': '2', '2146313_18': '3', '2146313_19': '4', '2146313_12': '4', '2146313_13': '2', '2146313_10': '2', '2146313_11': '4', '2146313_16': '3', '2146313_17': '3', '2146313_14': '4', '2146313_15': '4'}, '4038667': {'4038667_0': '4', '4038667_26': '3', '4038667_25': '3', '4038667_24': '3', '4038667_23': '3', '4038667_22': '2', '4038667_21': '2', '4038667_20': '4', '40

In [4]:
# Number of results requested from the search engine for every query.
TOP_K = 10
# A judged document counts as relevant when its label is strictly greater than
# this value.
# NOTE(review): confirm this cut-off against the dataset's relevance scale.
RELEVANCE_CUTOFF = 1


def _score_ranking(ranked_doc_ids, judged, cutoff=RELEVANCE_CUTOFF):
    """Return ``(average_precision, reciprocal_rank)`` for one ranked result list.

    Args:
        ranked_doc_ids: Iterable of document ids in the order they were returned.
            Each id is converted with ``str()`` before the lookup.
        judged: Mapping of document id (str) -> relevance label; labels must be
            convertible with ``int()``. Unjudged documents count as non-relevant.
        cutoff: Labels strictly greater than this value are treated as relevant.

    Returns:
        A tuple ``(average_precision, reciprocal_rank)``; both are ``0.0`` when
        no relevant document appears in ``ranked_doc_ids``.

    Note:
        The average precision is normalised by the number of relevant documents
        *found in the list*, not by the number of relevant documents judged for
        the query.
        NOTE(review): confirm this normaliser is the intended metric definition.
    """
    hits = 0
    precision_sum = 0.0
    reciprocal_rank = 0.0
    for rank, doc_id in enumerate(ranked_doc_ids, start=1):
        label = judged.get(str(doc_id))
        if label is None or int(label) <= cutoff:
            continue
        hits += 1
        precision_sum += hits / rank  # precision at this rank
        if hits == 1:
            # The first relevant hit fixes the reciprocal rank.
            reciprocal_rank = 1.0 / rank
    average_precision = precision_sum / hits if hits else 0.0
    return average_precision, reciprocal_rank


ap_sum = 0.0
rr_sum = 0.0
for query_id, query in queries.items():
    query_id, query = str(query_id), str(query)

    # presumably `results` maps doc id -> document in rank order (best first);
    # verify against MatchingRanking.search
    results = MatchingRanking.search("antique", query, TOP_K)

    # A query without any judgments scores 0 instead of raising KeyError and
    # aborting the whole evaluation.
    judged = qrels.get(query_id, {})

    query_avg_precision, query_rr = _score_ranking(
        (doc_id for doc_id, _doc in results.items()), judged
    )
    ap_sum += query_avg_precision
    rr_sum += query_rr

# Guard the means so an empty `queries` dict reports 0.0 rather than dividing by zero.
num_queries = len(queries)
map_k = ap_sum / num_queries if num_queries else 0.0
mrr = rr_sum / num_queries if num_queries else 0.0
print(f"antique_MAP: {map_k}, antique_MRR: {mrr}")

antique_MAP: 0.27233776597505366, antique_MRR: 0.2959351841777584
