In [90]:
import numpy as np
import pandas as pd
from collections import defaultdict
from pyserini.search import SimpleSearcher
from pyserini.index import IndexReader

In [165]:
queries_doc = pd.read_csv('dev/queries.docdev.tsv', sep='\t', names=['q_id', 'text'])
queries_doc.set_index('q_id', inplace=True)

In [166]:
qrels = pd.read_csv('dev/msmarco-docdev-qrels.tsv', sep=' ', names=['q_id', '0', 'doc_id', 'rel'])
qrels.drop(['0', 'rel'], axis=1, inplace=True)
qrels.set_index('q_id', inplace=True)

In [167]:
results = pd.read_csv('dev-trec-output.tsv', sep='\t', names=['metric', 'q_id', 'rr'])
results = results.drop(['metric'], axis=1)
results = results.sort_values(by=['rr'])
results.set_index('q_id', inplace=True)
results = results.merge(qrels, on='q_id')
results = results.merge(queries_doc, on='q_id')
results

Unnamed: 0_level_0,rr,doc_id,text
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000000,0.0,D149214,where does real insulin come from
227637,0.0,D1156186,how does spicy food affect testosterone
225499,0.0,D1848360,how does apa cite laws internally
22231,0.0,D2792164,are eggs or grapes better to fight colds
222158,0.0,D968046,how do you find the mean on excel
...,...,...,...
882002,1.0,D11265,what normal blood pressure by age
1009994,1.0,D487092,_______ ratios measure an organization's abili...
116431,1.0,D2978107,deed restrictions
114573,1.0,D3085498,customer service number for atmos energy


In [29]:
# Basic statistics
print(f'Recall: {(results["rr"] > 0).sum() / len(results)}')

Recall: 0.8076626877165961


In [186]:
# Read probabilistic model output
rankings = pd.read_csv('dev-bm25-with-score.trec', sep=' ', names=['q_id', 'Q0', 'doc_id', 'rank', 'score', 'run'])
rankings_dict = defaultdict(list)
for index, (q_id, _, doc_id, _, score, _) in rankings.iterrows():
    rankings_dict[q_id].append((doc_id, score))

In [187]:
print(rankings_dict[92542])

[('D1118594', 5.74399995803833), ('D2064696', 5.612299919128418), ('D2064694', 5.465499877929688), ('D340120', 5.399099826812744), ('D1327250', 5.362299919128418), ('D2115239', 5.246099948883057), ('D361231', 5.24609899520874), ('D3159701', 5.231400012969971), ('D1606966', 5.169099807739258), ('D2064695', 5.140999794006348), ('D292663', 5.104499816894531), ('D2942129', 5.03249979019165), ('D346003', 5.003499984741211), ('D412028', 4.898099899291992), ('D890771', 4.884600162506104), ('D774778', 4.876999855041504), ('D340121', 4.858500003814697), ('D2122910', 4.844299793243408), ('D2615189', 4.8180999755859375), ('D3039616', 4.774799823760986), ('D816898', 4.774499893188477), ('D806075', 4.771599769592285), ('D302760', 4.746600151062012), ('D1048518', 4.701000213623047), ('D2573182', 4.68120002746582), ('D753465', 4.61899995803833), ('D2893648', 4.607999801635742), ('D78907', 4.589799880981445), ('D3365428', 4.5854997634887695), ('D613528', 4.580699920654297), ('D2111671', 4.537000179290

In [None]:
searcher = SimpleSearcher('indexes/lucene-index-msmarco-doc')
index_reader = IndexReader('indexes/lucene-index-msmarco-doc')

In [170]:
import re
def findURL(s: str):
    regex = '(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    urls = re.search(regex, s)
    return urls[0].strip()

results['url'] = [findURL(searcher.doc(doc_id).raw()) for doc_id in results['doc_id']]
results

Unnamed: 0_level_0,rr,doc_id,text,url
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000000,0.0,D149214,where does real insulin come from,https://en.wikipedia.org/wiki/Insulin
227637,0.0,D1156186,how does spicy food affect testosterone,http://www.dailymail.co.uk/sciencetech/article...
225499,0.0,D1848360,how does apa cite laws internally,https://www.wikihow.com/Use-Internal-Citations
22231,0.0,D2792164,are eggs or grapes better to fight colds,http://allrecipes.com/recipe/34577/cold-rice-s...
222158,0.0,D968046,how do you find the mean on excel,http://www.ehow.com/how_6776563_calculate-mean...
...,...,...,...,...
882002,1.0,D11265,what normal blood pressure by age,https://www.reference.com/health/read-blood-pr...
1009994,1.0,D487092,_______ ratios measure an organization's abili...,https://www.readyratios.com/reference/liquidity/
116431,1.0,D2978107,deed restrictions,https://definitions.uslegal.com/d/deed-restric...
114573,1.0,D3085498,customer service number for atmos energy,https://gethuman.com/phone-number/Atmos-Energy


In [183]:
results['bm25_score'] = [index_reader.compute_query_document_score(doc_id, text) for (q_id, (_, doc_id, text, _)) in results.iterrows()]
results

ValueError: too many values to unpack (expected 4)

In [280]:
results.loc[[203688, 203458, 199776, 196963, 196232, 196111, 19552, 193742, 190212, 189312]]


Unnamed: 0_level_0,rr,doc_id,text,url,bm25_score
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
203688,0.0,D351737,history of interest rates,http://www.fedprimerate.com/wall_street_journa...,4.577474
203458,0.0,D2107130,hippo denotation definition,http://whatis.techtarget.com/definition/HiPPOs...,6.966055
199776,0.0,D2039429,health benefits of eating vegetarian,http://www.nursingdegree.net/blog/19/57-health...,8.01827
196963,0.0,D3194467,green card meaning,http://www.businessdictionary.com/definition/g...,4.782698
196232,0.0,D2187416,government does do,https://www.reference.com/government-politics/...,3.386575
196111,0.0,D1074203,gorm the dissolver,http://www.reocities.com/area51/cavern/3247/go...,0.0
19552,0.0,D1751087,anxiety: definition,http://medical-dictionary.thefreedictionary.co...,4.669761
193742,0.0,D1061432,gendered phenomenon definition,https://www.merriam-webster.com/dictionary/nat...,4.752489
190212,0.0,D354561,foods to eat when diarrhea,http://www.upmc.com/patients-visitors/educatio...,8.199498
189312,0.0,D621827,foods that contains calcium,https://www.healthaliciousness.com/articles/fo...,6.769504


In [290]:
query_id = 193742
print(f'Query text: {results.loc[query_id]["text"]}')
print("===================")
print('Relevant document URL:')
print(results.loc[query_id]['url'])
print(results.loc[query_id]['bm25_score'])
print()
for (doc_id, score) in rankings_dict[query_id]:
    print("==============================")
    doc = searcher.doc(doc_id)
    print(doc_id)

    terms = index_reader.analyze(results.loc[query_id]['text'])
    tf = index_reader.get_document_vector(doc_id)
    print(f'Total number of terms: {sum(tf.values())}')
    for t in terms:
        if t in tf:
            print(f'{t}: {tf[t]}')

    print(findURL(doc.raw()))
    print(index_reader.compute_query_document_score(doc_id, queries_doc.loc[query_id]["text"]))
    print("==============================")

Query text: gendered phenomenon definition
Relevant document URL:
https://www.merriam-webster.com/dictionary/natural%20gender
4.752488613128662

D658309
Total number of terms: 504
gender: 44
phenomenon: 1
definit: 6
http://emedicine.medscape.com/article/917990-overview
7.741460800170898
D1195712
Total number of terms: 462
gender: 32
phenomenon: 1
definit: 5
http://www.definitions.net/definition/Gender
7.729641914367676
D2971601
Total number of terms: 239
phenomenon: 17
definit: 11
http://www.definitions.net/definition/natural%20phenomenon
5.9870924949646
D1355149
Total number of terms: 183
gender: 1
phenomenon: 2
definit: 8
http://www.philender.com/courses/intro/notes/opdef.html
7.348760604858398
D3551798
Total number of terms: 267
gender: 3
phenomenon: 1
definit: 10
http://www.definitions.net/definition/bishonen
7.408695220947266
D3063185
Total number of terms: 331
gender: 5
phenomenon: 4
http://www.nydailynews.com/life-style/facebook-studies-age-gap-couples-article-1.1613671
6.818539

In [289]:
doc_id = results.loc[query_id]['doc_id']
# doc_id = 'D1138842'
query_text = results.loc[query_id]['text']
query_text = query_text + " diet"
terms = index_reader.analyze(query_text)
tf = index_reader.get_document_vector(doc_id)
print(index_reader.compute_query_document_score(doc_id, query_text))
print(f'Total number of terms: {sum(tf.values())}')
for t in terms:
    if t in tf:
        print(f'{t}: {tf[t]}')
# print(index_reader.get_term_counts('mean', analyzer=None))
# index_reader.get_term_counts('hippopotamus')
# # tf[index_reader.analyze('denotation')[0]]
# # df = {term: (index_reader.get_term_counts(term, analyzer=None))[0] for term in tf.keys()}
# # df
print(index_reader.doc(doc_id).raw())

6.769504070281982
Total number of terms: 631
food: 8
contain: 3
calcium: 31
<TEXT>
https://www.healthaliciousness.com/articles/foods-high-in-calcium.php
Top 10 Foods Highest in Calcium
Top 10 Foods Highest in Calcium
Calcium is a nutrient necessary for the growth and maintenance of strong teeth and bones, nerve signaling, muscle contraction, and secretion of certain hormones and enzymes.
A deficiency in calcium can lead to numbness in fingers and toes, muscle cramps, convulsions, lethargy, loss of appetite, and abnormal heart rhythms.
Conversely, excess calcium (particularly from supplements) can lead to kidney stones, calcification of soft tissue, and increased risk of vascular diseases like stroke and heart attack.
Calcium is mostly found in dark leafy greens and dairy foods.
While there is some evidence that oxalates in greens can hinder calcium absorption, green vegetables are still a good source of calcium, and the calculated daily value (DV) already takes into account absorption 

In [None]:
searcher.search('green card definition')