In [27]:
import pandas as pd
import numpy as np
from pyserini.search import SimpleSearcher
from pyserini.index import IndexReader


In [55]:
# Number of (query, relevant document) pairs sampled for the training set.
nbr_of_queries = 1000

# Relevance judgements: space-separated, four columns per line.
train_qrels = pd.read_csv(
    'train/msmarco-doctrain-qrels.tsv',
    sep=' ',
    names=['q_id', '0', 'doc_id', '1'],
)

# Query texts: tab-separated (id, text), indexed by query id so later cells
# can look a query up with train_queries.loc[q_id].
train_queries = pd.read_csv(
    'train/queries.doctrain.tsv',
    sep='\t',
    names=['q_id', 'text'],
).set_index('q_id')
train_queries

Unnamed: 0_level_0,text
q_id,Unnamed: 1_level_1
1185869,)what was the immediate impact of the success ...
1185868,_________ justice is designed to repair the ha...
1183785,elegxo meaning
645590,what does physical medicine do
186154,feeding rice cereal how many times per day
...,...
19285,anterolisthesis definition
558837,what are fishing flies
559149,what are fsh levels during perimenopause
706678,what is a yowie


In [56]:
# Keep only the query id and the relevant document id.
train_qrels = train_qrels.drop(columns=['0', '1'])
train_qrels

Unnamed: 0,q_id,doc_id
0,3,D312959
1,5,D140227
2,12,D213890
3,15,D1033338
4,16,D508131
...,...,...
367008,1185862,D2008201
367009,1185864,D1126522
367010,1185865,D630512
367011,1185868,D59235


In [58]:
# Draw the positive (query, relevant document) pairs. The fixed seed makes the
# sample, and every table derived from it, reproducible after a kernel restart.
sampled_queries = (
    train_qrels
    .sample(n=nbr_of_queries, random_state=42)
    .assign(score=1)  # every sampled qrels row is labelled relevant
)
sampled_queries

Unnamed: 0,q_id,doc_id,score
107130,353645,D2617561,1
346067,1138100,D1653454,1
39889,129948,D1078408,1
32185,105416,D115756,1
190814,624271,D178677,1
...,...,...,...
164254,554617,D670789,1
23104,76287,D1241797,1
292315,919798,D142433,1
81367,265785,D678427,1


In [59]:
# First-stage retriever over the local Lucene index.
# NOTE(review): k1/b are non-default values, presumably tuned for this
# collection -- confirm where they come from.
bm25_params = {'k1': 4.46, 'b': 0.82}
searcher = SimpleSearcher('indexes/lucene-index-msmarco-doc')
searcher.set_bm25(**bm25_params)

In [60]:
# For every sampled positive pair, retrieve the top 100 hits for the query and
# keep up to 99 of them (skipping the known relevant document) as score-0 rows.
to_add = []
for _, row in sampled_queries.iterrows():
    q_id, doc_id, _score = row
    query_text = train_queries.loc[q_id]['text']
    ranked = searcher.search(query_text, k=100)
    negatives = [hit.docid for hit in ranked if hit.docid != doc_id]
    # 99 negatives + 1 positive = 100 rows per query.
    to_add.extend([q_id, negative_id, 0] for negative_id in negatives[:99])
to_add

[[353645, 'D2410423', 0],
 [353645, 'D1742503', 0],
 [353645, 'D2617564', 0],
 [353645, 'D3076736', 0],
 [353645, 'D1903439', 0],
 [353645, 'D1173373', 0],
 [353645, 'D2371344', 0],
 [353645, 'D1074668', 0],
 [353645, 'D1014783', 0],
 [353645, 'D810161', 0],
 [353645, 'D896511', 0],
 [353645, 'D3282819', 0],
 [353645, 'D1173376', 0],
 [353645, 'D3549110', 0],
 [353645, 'D213097', 0],
 [353645, 'D2792425', 0],
 [353645, 'D1742504', 0],
 [353645, 'D3399235', 0],
 [353645, 'D2240659', 0],
 [353645, 'D2204860', 0],
 [353645, 'D3478440', 0],
 [353645, 'D2327234', 0],
 [353645, 'D2823096', 0],
 [353645, 'D951048', 0],
 [353645, 'D2573395', 0],
 [353645, 'D177449', 0],
 [353645, 'D1249211', 0],
 [353645, 'D479858', 0],
 [353645, 'D2617563', 0],
 [353645, 'D1294776', 0],
 [353645, 'D2573397', 0],
 [353645, 'D2162500', 0],
 [353645, 'D709973', 0],
 [353645, 'D2795005', 0],
 [353645, 'D719334', 0],
 [353645, 'D2253556', 0],
 [353645, 'D2775217', 0],
 [353645, 'D3425158', 0],
 [353645, 'D913062',

In [61]:
# Stack the positive pairs and the retrieved negatives into one labelled frame,
# grouped by query id. pd.concat replaces DataFrame.append, which was deprecated
# and later removed from pandas; original row labels are kept, as before.
new_df = pd.DataFrame(to_add, columns=['q_id', 'doc_id', 'score'])
res = pd.concat([sampled_queries, new_df]).sort_values('q_id')
res

Unnamed: 0,q_id,doc_id,score
13899,320,D268942,0
13886,320,D976698,0
13887,320,D2273256,0
13888,320,D2857085,0
13889,320,D991323,0
...,...,...,...
28538,1185070,D3360642,0
28537,1185070,D3175561,0
28536,1185070,D5288,0
28546,1185070,D1224227,0


In [62]:
# Re-display of the combined frame; nothing is modified in this cell.
res

Unnamed: 0,q_id,doc_id,score
13899,320,D268942,0
13886,320,D976698,0
13887,320,D2273256,0
13888,320,D2857085,0
13889,320,D991323,0
...,...,...,...
28538,1185070,D3360642,0
28537,1185070,D3175561,0
28536,1185070,D5288,0
28546,1185070,D1224227,0


In [63]:
import math
import re

# Compiled once at definition time; the raw string avoids the invalid-escape
# warnings that '\/' and '\w' trigger in an ordinary string literal.
_URL_PATTERN = re.compile(
    r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
)


def findURL(s: str) -> str:
    """Return the first http/https/ftp URL found in ``s``.

    Args:
        s: Arbitrary text to scan.

    Returns:
        The first matching URL, or an empty string when ``s`` contains no URL
        (previously this case raised ``TypeError`` on ``None[0]``).
    """
    match = _URL_PATTERN.search(s)
    return match.group(0).strip() if match else ''

# Reader over the same index directory the searcher uses; the feature
# functions below query it for analyzed terms, document vectors and counts.
index_reader = IndexReader('indexes/lucene-index-msmarco-doc')
stats = index_reader.stats()
# Collection size used as the numerator of the IDF formula.
total_documents = stats['documents']

def bm25(q_id, doc_id):
    """Score document ``doc_id`` against the text of query ``q_id``.

    Delegates to ``index_reader.compute_query_document_score``.
    NOTE(review): no similarity parameters are passed here, so this may not
    use the k1/b configured on ``searcher`` -- confirm.
    """
    query_text = train_queries.loc[q_id]["text"]
    return index_reader.compute_query_document_score(doc_id, query_text)

def tf_idf(q_id, doc_id):
    """Sum of (normalised term frequency * IDF) over the analyzed query terms.

    Term frequency is the term's count in the document vector divided by the
    sum of all counts in that vector; IDF is
    ``log(total_documents / (document_frequency + 1))``.
    """
    query_terms = index_reader.analyze(train_queries.loc[q_id]["text"])
    doc_vector = index_reader.get_document_vector(doc_id)  # analyzed term -> count
    doc_term_total = sum(doc_vector.values())

    score = 0
    for term in query_terms:
        if term in doc_vector:
            term_frequency = doc_vector[term] / doc_term_total
        else:
            term_frequency = 0
        # Terms are already analyzed, so the lookup must not analyze them again.
        document_frequency, _ = index_reader.get_term_counts(term, analyzer=None)
        score += term_frequency * math.log(total_documents / (document_frequency + 1))

    return score

def tf(q_id, doc_id):
    """Sum of normalised term frequencies of the query terms in the document.

    Each analyzed query term contributes its count in the document vector
    divided by the sum of all counts in that vector (0 when absent).

    Returns:
        The summed frequency, or ``0.0`` when the document vector is missing
        or empty (previously an ``AttributeError`` / ``ZeroDivisionError``).
    """
    query_terms = index_reader.analyze(train_queries.loc[q_id]["text"])
    # A falsy result (None or empty) is treated as "no terms" rather than crashing.
    doc_vector = index_reader.get_document_vector(doc_id) or {}
    doc_term_total = sum(doc_vector.values())
    if doc_term_total == 0:
        return 0.0
    return sum(doc_vector.get(term, 0) / doc_term_total for term in query_terms)

def idf(q_id, doc_id):
    """Sum of ``log(total_documents / (document_frequency + 1))`` over query terms.

    ``doc_id`` is accepted so the signature matches the other query-document
    features, but it is not used.
    """
    query_terms = index_reader.analyze(train_queries.loc[q_id]["text"])
    document_frequencies = (
        index_reader.get_term_counts(term, analyzer=None)[0] for term in query_terms
    )
    return sum(
        math.log(total_documents / (frequency + 1)) for frequency in document_frequencies
    )

def query_length(q_id):
    """Return the number of characters in the text of query ``q_id``."""
    query_text = train_queries.loc[q_id]["text"]
    return len(query_text)

def doc_length(doc_id):
    """Return the number of characters in the raw stored form of ``doc_id``."""
    raw_document = searcher.doc(doc_id).raw()
    return len(raw_document)

def url_length(doc_id):
    """Return the length of the first URL found in the raw text of ``doc_id``."""
    raw_document = searcher.doc(doc_id).raw()
    first_url = findURL(raw_document)
    return len(first_url)

def url_slash(doc_id):
    """Count the '/' characters in the first URL found in the raw text of ``doc_id``."""
    raw_document = searcher.doc(doc_id).raw()
    first_url = findURL(raw_document)
    return first_url.count('/')

# Feature registries consumed by get_list_of_features. Features are emitted in
# this order (query-document, then document-only, then query-only), so the
# order here fixes the meaning of each feature column.
q_d_dep_features = [bm25, tf_idf, tf, idf]  # called as func(q_id, doc_id)
d_dep_features = [doc_length, url_length, url_slash]  # called as func(doc_id)
q_dep_features = [query_length]  # called as func(q_id)

In [64]:
def get_list_of_features(df):
    """Compute the feature vector for every (query, document) row of ``df``.

    Args:
        df: DataFrame whose first column holds query ids and whose second
            column holds document ids. Any further columns are ignored.

    Returns:
        A list with one inner list per row of ``df``, containing the values of
        ``q_d_dep_features``, then ``d_dep_features``, then ``q_dep_features``,
        in registry order.
    """
    features = []
    # Iterate the two id columns directly: avoids building a Series per row
    # (iterrows) and no longer requires the frame to have exactly three columns.
    for q_id, doc_id in zip(df.iloc[:, 0], df.iloc[:, 1]):
        row = [func(q_id, doc_id) for func in q_d_dep_features]
        row.extend(func(doc_id) for func in d_dep_features)
        row.extend(func(q_id) for func in q_dep_features)
        features.append(row)
    return features

# One feature vector per row of the combined positive/negative frame.
features = get_list_of_features(res)
# NOTE(review): this displays the whole nested list; consider a slice.
features

[[6.012809753417969,
  0.02509961857496166,
  0.004878048780487805,
  10.29084361573428,
  3389,
  39,
  4,
  22],
 [7.209566116333008,
  0.025320518626968016,
  0.007415254237288135,
  10.29084361573428,
  8686,
  54,
  6,
  22],
 [5.6088714599609375,
  0.037332857039042815,
  0.0040650406504065045,
  10.29084361573428,
  2077,
  69,
  3,
  22],
 [5.6088714599609375,
  0.03718171186884426,
  0.004048582995951417,
  10.29084361573428,
  2318,
  56,
  4,
  22],
 [6.515629768371582,
  0.027332984617870627,
  0.002976190476190476,
  10.29084361573428,
  5472,
  37,
  4,
  22],
 [5.585899353027344,
  0.0358745423109552,
  0.00390625,
  10.29084361573428,
  2167,
  40,
  3,
  22],
 [5.585899353027344,
  0.0365891746279065,
  0.00398406374501992,
  10.29084361573428,
  1767,
  66,
  4,
  22],
 [5.563114643096924,
  0.03452587530678395,
  0.0037593984962406013,
  10.29084361573428,
  2400,
  30,
  3,
  22],
 [5.563114643096924,
  0.033395937569471026,
  0.0036363636363636364,
  10.29084361573

In [65]:
# Feature columns are named '1'..'8' in the order the feature functions run;
# the query id and relevance label are carried over from `res` row by row.
df_features = pd.DataFrame(features, columns=list('12345678')).assign(
    q_id=res['q_id'].tolist(),
    rel=res['score'].tolist(),
)
df_features

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,rel
0,6.012810,0.025100,0.004878,10.290844,3389,39,4,22,320,0
1,7.209566,0.025321,0.007415,10.290844,8686,54,6,22,320,0
2,5.608871,0.037333,0.004065,10.290844,2077,69,3,22,320,0
3,5.608871,0.037182,0.004049,10.290844,2318,56,4,22,320,0
4,6.515630,0.027333,0.002976,10.290844,5472,37,4,22,320,0
...,...,...,...,...,...,...,...,...,...,...
99995,7.726025,0.144224,0.036566,8.812684,5061,100,4,20,1185070,0
99996,6.709054,0.387390,0.088652,8.812684,2324,74,6,20,1185070,0
99997,6.742682,0.511632,0.114504,8.812684,3161,58,4,20,1185070,0
99998,7.816648,0.184099,0.043724,8.812684,17884,56,6,20,1185070,0


In [66]:
# Persist the training matrix (features, q_id, rel) without the row index.
df_features.to_csv('X_train.csv', index=False)

In [67]:
def prepare_rerank_csv_file(path_to_csv):
    """Build the feature table for a first-stage run file.

    Args:
        path_to_csv: Path to a space-separated run file with the columns
            ``q_id Q0 doc_id rank score run`` and no header row.

    Returns:
        DataFrame with feature columns ``'1'``..``'8'`` plus the ``q_id`` and
        ``doc_id`` of each run-file row, in file order.
    """
    first_run = pd.read_csv(
        path_to_csv,
        sep=' ',
        names=['q_id', 'Q0', 'doc_id', 'rank', 'score', 'run'],
    ).drop(columns=['Q0', 'rank', 'run'])
    features = get_list_of_features(first_run)
    df_features = pd.DataFrame(features, columns=list('12345678'))
    df_features['q_id'] = first_run['q_id'].tolist()
    # The original statement was left unfinished; carry the document id over
    # the same way as the query id so reranked rows can be mapped back.
    df_features['doc_id'] = first_run['doc_id'].tolist()
    return df_features

In [None]:
# Compute features for every (query, document) row of the dev run file.
csv_with_features = prepare_rerank_csv_file('dev-bm25.trec')
csv_with_features