In [62]:
import numpy as np
import pandas as pd
from collections import defaultdict
from pyserini.search import SimpleSearcher
from pyserini.index import IndexReader

In [63]:
# Load the dev queries (tab-separated: query id, query text), keyed by query id
queries_doc = (
    pd.read_csv('dev/queries.docdev.tsv', sep='\t', names=['q_id', 'text'])
    .set_index('q_id')
)

In [64]:
# Load the relevance judgements (space-separated) and keep only the
# query id -> relevant doc id mapping, indexed by query id
qrels = (
    pd.read_csv('dev/msmarco-docdev-qrels.tsv', sep=' ', names=['q_id', '0', 'doc_id', 'rel'])
    .drop(columns=['0', 'rel'])
    .set_index('q_id')
)

In [65]:
# Build the per-query results table: read the per-query 'rr' values
# (presumably reciprocal rank -- confirm against the evaluation script),
# order by rr ascending, then attach the relevant doc id and the query text.
results = (
    pd.read_csv('dev-trec-output.tsv', sep='\t', names=['metric', 'q_id', 'rr'])
    .drop(columns=['metric'])
    .sort_values(by=['rr'])
    .set_index('q_id')
    .merge(qrels, on='q_id')
    .merge(queries_doc, on='q_id')
)
results

Unnamed: 0_level_0,rr,doc_id,text
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000000,0.0,D149214,where does real insulin come from
227637,0.0,D1156186,how does spicy food affect testosterone
225499,0.0,D1848360,how does apa cite laws internally
22231,0.0,D2792164,are eggs or grapes better to fight colds
222158,0.0,D968046,how do you find the mean on excel
...,...,...,...
882002,1.0,D11265,what normal blood pressure by age
1009994,1.0,D487092,_______ ratios measure an organization's abili...
116431,1.0,D2978107,deed restrictions
114573,1.0,D3085498,customer service number for atmos energy


In [66]:
# Basic statistics: share of queries with a non-zero rr value
num_hits = (results["rr"] > 0).sum()
num_queries = len(results)
print(f'Recall: {num_hits / num_queries}')

Recall: 0.8077812018489985


In [67]:
# Read probabilistic model output
rankings = pd.read_csv('output.trec', sep=' ', names=['q_id', 'Q0', 'doc_id', 'rank', 'score', 'run'])

# Map each query id to the list of doc ids retrieved for it, in file order.
# Only the two needed columns are iterated: this avoids building a Series per
# row (as iterrows does), does not depend on the total number of columns, and
# does not bind the notebook-level names `_` and `index`.
rankings_dict = defaultdict(list)
for q_id, doc_id in zip(rankings['q_id'], rankings['doc_id']):
    rankings_dict[q_id].append(doc_id)

In [68]:
# Show the ranked doc ids retrieved for one sample query
sample_query_id = 92542
print(rankings_dict[sample_query_id])

['D1118594', 'D2064696', 'D2064694', 'D340120', 'D1327250', 'D2115239', 'D361231', 'D3159701', 'D1606966', 'D2064695', 'D292663', 'D2942129', 'D346003', 'D412028', 'D890771', 'D774778', 'D340121', 'D2122910', 'D2615189', 'D3039616', 'D816898', 'D806075', 'D302760', 'D1048518', 'D2573182', 'D753465', 'D2893648', 'D78907', 'D3365428', 'D613528', 'D2111671', 'D2074599', 'D753464', 'D1637614', 'D3436135', 'D3034600', 'D2910354', 'D1988158', 'D2022402', 'D1993489', 'D629294', 'D2559908', 'D2216247', 'D1830709', 'D3302520', 'D129308', 'D2166235', 'D3505513', 'D1312752', 'D2022207', 'D3302364', 'D367034', 'D3303149', 'D3505512', 'D2109840', 'D598799', 'D127683', 'D1183676', 'D581663', 'D1098960', 'D2647872', 'D3034601', 'D127684', 'D2086376', 'D386614', 'D3159703', 'D1610278', 'D1364807', 'D768994', 'D1023401', 'D896444', 'D3483011', 'D457796', 'D3039618', 'D3290974', 'D2636982', 'D183127', 'D2866025', 'D1697652', 'D753466', 'D2278363', 'D2793257', 'D3298280', 'D5780', 'D2291609', 'D1322909',

In [69]:
# Open the same Lucene index twice: once for document lookup (searcher)
# and once for index-level scoring (index_reader)
index_path = 'indexes/lucene-index-msmarco-doc'
searcher = SimpleSearcher(index_path)
index_reader = IndexReader(index_path)

In [70]:
import re
# Precompiled URL pattern: scheme, dotted host, then a non-empty path/query tail.
# Written as a raw string so the regex escapes are not treated as (invalid)
# Python string escapes; the pattern itself is unchanged.
_URL_PATTERN = re.compile(
    r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
)


def findURL(s: str):
    """Return the first http/https/ftp URL found in ``s``.

    Args:
        s: Text to scan, e.g. the raw contents of an indexed document.

    Returns:
        The first matching URL as a string, or ``None`` when ``s`` is empty
        or contains no URL (previously this case raised ``TypeError``).
    """
    if not s:
        return None
    match = _URL_PATTERN.search(s)
    if match is None:
        return None
    # The pattern cannot match whitespace, so no stripping is needed.
    return match.group(0)

# For every relevant document, fetch its raw text from the index and
# extract the first URL found in it
relevant_urls = []
for relevant_doc_id in results['doc_id']:
    raw_document = searcher.doc(relevant_doc_id).raw()
    relevant_urls.append(findURL(raw_document))
results['url'] = relevant_urls
results

Unnamed: 0_level_0,rr,doc_id,text,url
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...
...,...,...,...,...
882002,1.0,D11265,what normal blood pressure by age,https://www.reference.com/health/read-blood-pr...
1009994,1.0,D487092,_______ ratios measure an organization's abili...,https://www.readyratios.com/reference/liquidity/
116431,1.0,D2978107,deed restrictions,https://definitions.uslegal.com/d/deed-restric...
114573,1.0,D3085498,customer service number for atmos energy,https://gethuman.com/phone-number/Atmos-Energy


In [71]:
# Score each (relevant document, query text) pair with the index reader.
# NOTE(review): the column name assumes the reader's scoring is BM25 -- confirm
# against the pyserini version in use.
# The two needed columns are iterated by name rather than unpacking whole rows
# positionally, so the cell keeps working when `results` has more columns
# (e.g. when it is re-run after 'bm25_score' already exists).
results['bm25_score'] = [
    index_reader.compute_query_document_score(doc_id, text)
    for doc_id, text in zip(results['doc_id'], results['text'])
]
results

Unnamed: 0_level_0,rr,doc_id,text,url,bm25_score
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin,5.795771
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...,12.297520
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations,6.465944
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...,5.092932
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...,4.904947
...,...,...,...,...,...
882002,1.0,D11265,what normal blood pressure by age,https://www.reference.com/health/read-blood-pr...,8.735644
1009994,1.0,D487092,_______ ratios measure an organization's abili...,https://www.readyratios.com/reference/liquidity/,17.273161
116431,1.0,D2978107,deed restrictions,https://definitions.uslegal.com/d/deed-restric...,7.720560
114573,1.0,D3085498,customer service number for atmos energy,https://gethuman.com/phone-number/Atmos-Energy,14.931982


In [72]:
# Inspect a single query: print its text and the relevant document's URL,
# then the URL and query-document score of every document ranked for it
query_id = 1000000
target_row = results.loc[query_id]
print(f'Query text: {target_row["text"]}')
print("===================")
print('Relevant document URL:')
print(target_row['url'])
print()
for doc_id in rankings_dict[query_id]:
    doc = searcher.doc(doc_id)
    candidate_url = findURL(doc.raw())
    print(candidate_url)
    candidate_score = index_reader.compute_query_document_score(
        doc_id, queries_doc.loc[query_id]["text"]
    )
    print(candidate_score)

Query text: where does real insulin come from
Relevant document URL:
https://en.wikipedia.org/wiki/Insulin

https://answers.yahoo.com/question/index?qid=20070712031322AAURFkK
7.947456359863281
http://www.toyboxphilosopher.com/2016/01/american-girls-lea-clark-sloth-and.html
9.036995887756348
http://www.mangomannutrition.com/causes-insulin-resistance-lipid-overload-2/
8.14870548248291
https://www.mangomannutrition.com/causes-insulin-resistance-lipid-overload-2/
8.14870548248291
https://www.dietdoctor.com/new-paradigm-insulin-resistance
8.895774841308594
http://articles.chicagotribune.com/1990-07-29/features/9003030670_1_human-insulin-recombinant-human-pancreas
6.971416473388672
http://www.answers.com/Q/Where_is_insulin_secreted
6.715585708618164
http://www.wisegeekhealth.com/what-is-insulin.htm
7.375102996826172
https://simplefill.com/novolog-prescription-assistance/
7.815303802490234
http://www.answers.com/Q/What_does_insulin_do
6.732107639312744
http://www.diabetes-info.co.uk/treating-

In [73]:
# Re-display the results table, now including the 'url' and 'bm25_score' columns
results


Unnamed: 0_level_0,rr,doc_id,text,url,bm25_score
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin,5.795771
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...,12.297520
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations,6.465944
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...,5.092932
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...,4.904947
...,...,...,...,...,...
882002,1.0,D11265,what normal blood pressure by age,https://www.reference.com/health/read-blood-pr...,8.735644
1009994,1.0,D487092,_______ ratios measure an organization's abili...,https://www.readyratios.com/reference/liquidity/,17.273161
116431,1.0,D2978107,deed restrictions,https://definitions.uslegal.com/d/deed-restric...,7.720560
114573,1.0,D3085498,customer service number for atmos energy,https://gethuman.com/phone-number/Atmos-Energy,14.931982


In [74]:
# Hand-picked query ids to look at in detail
mine = [
    1000000,
    227637,
    225499,
    22231,
    222158,
    211621,
    211468,
    210442,
    209764,
    207595,
]
results.loc[mine]

Unnamed: 0_level_0,rr,doc_id,text,url,bm25_score
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin,5.795771
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...,12.29752
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations,6.465944
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...,5.092932
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...,4.904947
211621,0.0,D730986,how close is a meter to a yard?,http://www.asknumbers.com/MetersToYardsConvers...,7.763941
211468,0.0,D2287221,how car gears work,http://auto.howstuffworks.com/transmission.htm,6.740464
210442,0.0,D2309275,how can i get more energy while pregnant,http://www.wikihow.com/Gain-Energy-During-Preg...,4.209955
209764,0.0,D376092,how breathing takes place in human,https://en.wikibooks.org/wiki/Human_Physiology...,6.516494
207595,0.0,D1156231,how are mlb players paid,http://www.businessinsider.com/major-league-ba...,8.368806


In [77]:
from urllib.parse import urlparse

# Derive the host part (netloc) of each relevant document's URL.
# Only the 'url' column is iterated, by name, instead of unpacking whole rows
# positionally, so the cell keeps working when `results` has more columns
# (e.g. when it is re-run after 'domain' already exists).
results['domain'] = [urlparse(url).netloc for url in results['url']]
results

Unnamed: 0_level_0,rr,doc_id,text,url,bm25_score,domain
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin,5.795771,en.wikipedia.org
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...,12.297520,www.dailymail.co.uk
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations,6.465944,www.wikihow.com
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...,5.092932,allrecipes.com
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...,4.904947,www.ehow.com
...,...,...,...,...,...,...
882002,1.0,D11265,what normal blood pressure by age,https://www.reference.com/health/read-blood-pr...,8.735644,www.reference.com
1009994,1.0,D487092,_______ ratios measure an organization's abili...,https://www.readyratios.com/reference/liquidity/,17.273161,www.readyratios.com
116431,1.0,D2978107,deed restrictions,https://definitions.uslegal.com/d/deed-restric...,7.720560,definitions.uslegal.com
114573,1.0,D3085498,customer service number for atmos energy,https://gethuman.com/phone-number/Atmos-Energy,14.931982,gethuman.com


In [78]:
# Number of relevant documents per domain, most frequent first
domain_counts = results.groupby(['domain']).size()
domain_counts.sort_values(ascending=False)

domain
en.wikipedia.org                   618
www.answers.com                    142
www.reference.com                   92
www.quora.com                       69
answers.yahoo.com                   62
                                  ... 
www.acceptancerate.com               1
www.abundance-and-happiness.com      1
www.abswood.com                      1
www.aboutlanguageschools.com         1
zetcode.com                          1
Length: 2461, dtype: int64