In [1]:
import pandas as pd
from online import MatchingRanking

Create a queries dict using all three dataset query files

In [2]:
# Query CSVs of the three wikIR1k splits, read in this order.
query_files = [
    "Datasets/wikIR1k/validation/queries.csv",
    "Datasets/wikIR1k/training/queries.csv",
    "Datasets/wikIR1k/test/queries.csv"
]

# Maps each `id_left` value to its `text_left` value, accumulated over all
# splits. If an id appears in more than one file, the later file overwrites
# the earlier entry.
queries = {}

for csv_path in query_files:
    split_frame = pd.read_csv(csv_path)

    # Index the text column by query id, then fold this split into the
    # combined mapping.
    text_by_id = split_frame.set_index('id_left')['text_left']
    queries.update(text_by_id.to_dict())

# Show the merged mapping built from all three CSV files.
print(queries)


{1402535: 'irish sea', 91198: 'phillips exeter academy', 1015979: 'president of chile', 111134: 'university of kentucky', 201459: 'johnnie to', 12823: 'togo', 73083: 'the temptations', 21040: 'leonard nimoy', 626467: 'polish resistance movement in world war ii', 8858: 'north america', 21957: 'hispanic', 838163: 'herman yau', 13163: 'truro', 340095: '2020 summer olympics', 532774: 'leeward islands cricket team', 84379: 'southern united states', 16676: 'brothers grimm', 75122: 'government of hong kong', 196465: 'university of cape town', 296184: 'cubans', 23158: 'mallorca', 203635: 'croatia national football team', 18750: 'brooklyn', 98364: 'terrestrial television', 2018: 'czech republic', 85239: 'warmian masurian voivodeship', 1502917: 'train station', 13140: 'transylvania', 178208: 'victorian legislative council', 124257: 'goldman sachs', 2574: 'chinese cuisine', 476603: 'neoclassical architecture', 529442: 'united states district court for the southern district of texas', 73541: 'ches

Create a qrels dict using all three dataset qrels files

In [3]:
# Relevance-judgement files of the three wikIR1k splits, read in this order.
qrels_files = [
    "Datasets/wikIR1k/validation/qrels",
    "Datasets/wikIR1k/training/qrels",
    "Datasets/wikIR1k/test/qrels"
]

# Nested mapping: query id -> {document id -> relevance label}.
# Ids and labels are kept as the strings read from the files.
qrels = {}

for qrels_path in qrels_files:
    with open(qrels_path, 'r') as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if not record:
                # Skip blank lines.
                continue

            # Each record has exactly four whitespace-separated fields; the
            # second one is not used here.
            query_key, _unused, doc_key, label = record.split()
            qrels.setdefault(query_key, {})[doc_key] = label

# Show the merged judgements built from all three qrels files.
print(qrels)

{'1402535': {'1402535': '2', '488489': '1', '1813456': '1', '1668563': '1', '1232652': '1', '1888565': '1', '1094677': '1', '1983801': '1', '681497': '1', '6123': '1', '1904664': '1', '1920866': '1', '267722': '1', '88235': '1', '458048': '1', '1756794': '1'}, '91198': {'91198': '2', '2280362': '1', '2285336': '1', '2337992': '1', '2291511': '1', '2264059': '1'}, '1015979': {'1015979': '2', '2226456': '1', '1514612': '1', '1119171': '1', '1053174': '1', '1186821': '1', '229754': '1'}, '111134': {'111134': '2', '523530': '1', '2122775': '1', '1449613': '1', '1891684': '1', '1416706': '1', '942105': '1', '1366368': '1', '342067': '1', '2083181': '1', '1772380': '1', '1136188': '1', '1378749': '1', '2029648': '1', '2158694': '1', '1722840': '1', '794273': '1'}, '201459': {'201459': '2', '2142274': '1', '1171309': '1', '1903934': '1', '456246': '1', '915867': '1'}, '12823': {'12823': '2', '1932715': '1', '1329804': '1', '1319882': '1', '2430562': '1', '1505063': '1', '256654': '1', '206210

In [4]:
# Evaluate MatchingRanking on every query in `queries` against the judgements
# in `qrels`, and report the mean average precision and the mean reciprocal
# rank over the top-TOP_K results of each query.
TOP_K = 10  # number of results requested from the search call per query

ap_sum = 0.0
rr_sum = 0.0
for query_id, query in queries.items():
    # `queries` keys are cast to str because `qrels` is keyed by the string
    # ids read from the qrels files.
    query_id, query = str(query_id), str(query)

    results = MatchingRanking.search("wikir", query, TOP_K)

    # A query without any judgement contributes 0 to both metrics instead of
    # aborting the whole evaluation with a KeyError.
    judged = qrels.get(query_id, {})

    hits = 0             # relevant documents seen so far in the ranking
    precision_sum = 0.0  # sum of precision@rank taken at each relevant hit
    query_rr = 0.0       # reciprocal rank of the first relevant hit

    # NOTE(review): assumes `results.items()` iterates in ranking order, best
    # match first - confirm against MatchingRanking.search.
    for rank, (doc_id, _doc) in enumerate(results.items(), start=1):
        # Unjudged documents and labels <= 0 both count as non-relevant.
        if int(judged.get(str(doc_id), 0)) <= 0:
            continue

        hits += 1
        precision_sum += hits / rank
        if hits == 1:
            query_rr = 1.0 / rank

    # NOTE(review): the sum is normalised by the number of relevant documents
    # *retrieved* in the top-TOP_K, not by the total number of relevant
    # documents judged for the query. This is kept as-is so the reported
    # figure stays comparable; confirm that this variant of AP is intended.
    query_avg_precision = (precision_sum / hits) if hits > 0 else 0.0
    ap_sum += query_avg_precision
    rr_sum += query_rr

# Guard against an empty query set so the cell reports 0.0 instead of raising
# ZeroDivisionError.
num_queries = len(queries)
map_k = ap_sum / num_queries if num_queries else 0.0
mrr = rr_sum / num_queries if num_queries else 0.0
print(f"wikir_MAP: {map_k}, wikir_MRR: {mrr}")

wikir_MAP: 0.5287332181960381, wikir_MRR: 0.589260128219983
