In [93]:
import pandas as pd
import numpy as np
from pyserini.search import SimpleSearcher
from pyserini.index import IndexReader


In [120]:
nbr_of_queries = 1000

qrels = pd.read_csv('train/msmarco-doctrain-qrels.tsv', sep=' ', names=['q_id', '0', 'doc_id', '1'])
queries = pd.read_csv('train/queries.doctrain.tsv', sep='\t', names=['q_id', 'text'])
queries.set_index('q_id', inplace=True)
queries

Unnamed: 0_level_0,text
q_id,Unnamed: 1_level_1
1185869,)what was the immediate impact of the success ...
1185868,_________ justice is designed to repair the ha...
1183785,elegxo meaning
645590,what does physical medicine do
186154,feeding rice cereal how many times per day
...,...
19285,anterolisthesis definition
558837,what are fishing flies
559149,what are fsh levels during perimenopause
706678,what is a yowie


In [121]:
qrels.drop(['0', '1'], inplace=True, axis=1)
qrels

Unnamed: 0,q_id,doc_id
0,3,D312959
1,5,D140227
2,12,D213890
3,15,D1033338
4,16,D508131
...,...,...
367008,1185862,D2008201
367009,1185864,D1126522
367010,1185865,D630512
367011,1185868,D59235


In [122]:
sampled_queries = qrels.sample(n=nbr_of_queries)
sampled_queries['score'] = 1
sampled_queries

Unnamed: 0,q_id,doc_id,score
195718,637886,D1330104,1
77052,253272,D413125,1
199336,648142,D175708,1
299043,940437,D906865,1
226949,727417,D2874892,1
...,...,...,...
326480,1019266,D2533281,1
978,4730,D128765,1
358139,1165980,D522360,1
208305,674297,D6241,1


In [124]:
searcher = SimpleSearcher('indexes/lucene-index-msmarco-doc')
searcher.set_bm25(k1=4.46, b=0.82)
number_of_hits = 100

In [126]:
to_add = []
for idx, (q_id, doc_id, _) in sampled_queries.iterrows():
    text = queries.loc[q_id]['text']
    hits = searcher.search(text, k=number_of_hits)
    counter = 1
    for hit in hits:
        if hit.docid == doc_id:
            continue
        to_add.append([q_id, hit.docid, 0])
        counter += 1
        if counter == number_of_hits:
            break
to_add

[[637886, 'D162205', 0],
 [253272, 'D3027608', 0],
 [648142, 'D3239237', 0],
 [940437, 'D1799284', 0],
 [727417, 'D2416872', 0],
 [610795, 'D2885752', 0],
 [874191, 'D1709190', 0],
 [22906, 'D3080492', 0],
 [660357, 'D415567', 0],
 [116568, 'D3475007', 0],
 [1141383, 'D2895207', 0],
 [708357, 'D2997193', 0],
 [877481, 'D2259151', 0],
 [761269, 'D1514453', 0],
 [97516, 'D536321', 0],
 [111873, 'D3081120', 0],
 [884719, 'D325210', 0],
 [769069, 'D2498912', 0],
 [196659, 'D2256411', 0],
 [151501, 'D3549301', 0],
 [845459, 'D2897884', 0],
 [476991, 'D548306', 0],
 [754, 'D280304', 0],
 [1139074, 'D1688956', 0],
 [755490, 'D1313165', 0],
 [295161, 'D2988227', 0],
 [26938, 'D895787', 0],
 [423009, 'D2380067', 0],
 [724419, 'D2876864', 0],
 [435652, 'D2537838', 0],
 [791531, 'D603231', 0],
 [911673, 'D1908132', 0],
 [19902, 'D2252006', 0],
 [631653, 'D782704', 0],
 [914338, 'D2464585', 0],
 [536605, 'D1931280', 0],
 [1059011, 'D411925', 0],
 [342877, 'D2751544', 0],
 [605070, 'D2207127', 0],


In [127]:
new_df = pd.DataFrame(to_add, columns=['q_id', 'doc_id', 'score'])
res = sampled_queries.append(new_df)
res.sort_values('q_id', inplace=True)
res

Unnamed: 0,q_id,doc_id,score
163,754,D112645,1
22,754,D280304,0
960,4637,D238656,1
700,4637,D3373482,0
978,4730,D128765,1
...,...,...,...
366693,1185108,D2727661,1
366885,1185557,D1021295,1
494,1185557,D1021292,0
330,1185796,D670838,0


In [128]:
res

Unnamed: 0,q_id,doc_id,score
163,754,D112645,1
22,754,D280304,0
960,4637,D238656,1
700,4637,D3373482,0
978,4730,D128765,1
...,...,...,...
366693,1185108,D2727661,1
366885,1185557,D1021295,1
494,1185557,D1021292,0
330,1185796,D670838,0


In [129]:
import math
import re

def findURL(s: str):
    regex = '(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    urls = re.search(regex, s)
    return urls[0].strip()

index_reader = IndexReader('indexes/lucene-index-msmarco-doc')
stats = index_reader.stats()
total_documents = stats['documents']

def bm25(q_id, doc_id):
    return index_reader.compute_query_document_score(doc_id, queries.loc[q_id]["text"])

def tf_idf(q_id, doc_id):
    text = queries.loc[q_id]["text"]
    terms = index_reader.analyze(text) # Get analyzed terms for query
    tf = index_reader.get_document_vector(doc_id) # Keys are analyzed terms
    total_terms = sum(tf.values())
    res = 0

    for term in terms:
        frequency = tf[term] / total_terms if term in tf else 0
        document_frequency, _ = index_reader.get_term_counts(term, analyzer=None)
        idf = math.log(total_documents / (document_frequency + 1))
        res += frequency * idf

    return res

def tf(q_id, doc_id):
    text = queries.loc[q_id]["text"]
    terms = index_reader.analyze(text)
    tf = index_reader.get_document_vector(doc_id)
    total_terms = sum(tf.values())
    return sum((tf[term] if term in tf else 0) / total_terms for term in terms)

def idf(q_id, doc_id):
    text = queries.loc[q_id]["text"]
    terms = index_reader.analyze(text)
    res = 0
    for term in terms:
        document_frequency, _ = index_reader.get_term_counts(term, analyzer=None)
        res += math.log(total_documents / (document_frequency + 1))
    return res

def query_length(q_id):
    return len(queries.loc[q_id]["text"])

def doc_length(doc_id):
    return len(searcher.doc(doc_id).raw())

def url_length(doc_id):
    return len(findURL(searcher.doc(doc_id).raw()))

def url_slash(doc_id):
    return findURL(searcher.doc(doc_id).raw()).count('/')

q_d_dep_features = [bm25, tf_idf, tf, idf]
d_dep_features = [doc_length, url_length, url_slash]
q_dep_features = [query_length]

In [130]:
def get_list_of_features(df):
    features = []
    for idx, (q_id, doc_id, score) in df.iterrows():
        l = []
        for func in q_d_dep_features:
            l.append(func(q_id, doc_id))
        for func in d_dep_features:
            l.append(func(doc_id))
        for func in q_dep_features:
            l.append(func(q_id))
        features.append(l)
    return features


In [131]:
features = get_list_of_features(res)
features

[[5.398359775543213,
  0.12667340076573533,
  0.03471904979442668,
  12.36671035649233,
  18217,
  86,
  4,
  36],
 [11.72272777557373,
  0.6303288332289798,
  0.18433179723502305,
  12.36671035649233,
  4141,
  88,
  4,
  36],
 [9.335668563842773,
  0.4674083145611395,
  0.13793103448275862,
  9.645074028619637,
  4020,
  66,
  4,
  35],
 [9.511495590209961,
  1.0620478651165408,
  0.3278688524590164,
  9.645074028619637,
  4999,
  77,
  4,
  35],
 [6.479376792907715,
  0.6949213695564382,
  0.3224043715846995,
  6.699864773767764,
  1666,
  43,
  5,
  16],
 [8.557600975036621,
  0.26765476368181257,
  0.08348924228250701,
  13.563881041477508,
  38728,
  45,
  4,
  38],
 [8.864699363708496,
  0.49142745255259274,
  0.163855421686747,
  13.563881041477508,
  3500,
  85,
  6,
  38],
 [8.956446647644043,
  0.7471615547817527,
  0.28391959798994976,
  9.213492700587334,
  3211,
  70,
  3,
  27],
 [8.57768440246582,
  0.2286664312353831,
  0.08225108225108226,
  9.213492700587334,
  3289,

In [132]:
df_features = pd.DataFrame(features, columns=list('12345678'))
df_features['q_id'] = res['q_id'].tolist()
df_features['rel'] = res['score'].tolist()
df_features

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,rel
0,5.398360,0.126673,0.034719,12.366710,18217,86,4,36,754,1
1,11.722728,0.630329,0.184332,12.366710,4141,88,4,36,754,0
2,9.335669,0.467408,0.137931,9.645074,4020,66,4,35,4637,1
3,9.511496,1.062048,0.327869,9.645074,4999,77,4,35,4637,0
4,6.479377,0.694921,0.322404,6.699865,1666,43,5,16,4730,1
...,...,...,...,...,...,...,...,...,...,...
1865,10.159256,0.597887,0.119298,10.731985,2482,46,4,27,1185108,1
1866,22.970654,0.103233,0.035691,30.362155,12627,51,5,73,1185557,1
1867,24.620754,0.474600,0.116700,30.362155,4171,103,5,73,1185557,0
1868,6.295106,0.369440,0.087500,6.740933,2023,44,4,20,1185796,0


In [133]:
df_features.to_csv('X_train_1.csv', index=False)

In [103]:
import lightgbm

In [104]:
df = pd.read_csv('X_train.csv')
nbr_of_rows = len(df)
split_point = int(nbr_of_rows / 10 * 8)
train_df = df[:split_point]  # first 80%
validation_df = df[split_point:]  # remaining 20%

In [105]:
qids_train = train_df.groupby("q_id")["q_id"].count().to_numpy()
X_train = train_df.drop(["q_id", "rel"], axis=1)
y_train = train_df["rel"]
qids_validation = validation_df.groupby("q_id")["q_id"].count().to_numpy()
X_validation = validation_df.drop(["q_id", "rel"], axis=1)
y_validation = validation_df["rel"]

In [106]:
model = lightgbm.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3, 5, 10, 100]
)

model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[qids_validation],
    eval_at=1,
    verbose=1,
)

[1]	valid_0's ndcg@1: 0.06	valid_0's ndcg@3: 0.107737	valid_0's ndcg@5: 0.12646	valid_0's ndcg@10: 0.151718	valid_0's ndcg@100: 0.295656
[2]	valid_0's ndcg@1: 0.135	valid_0's ndcg@3: 0.215474	valid_0's ndcg@5: 0.246022	valid_0's ndcg@10: 0.270475	valid_0's ndcg@100: 0.380876
[3]	valid_0's ndcg@1: 0.125	valid_0's ndcg@3: 0.207974	valid_0's ndcg@5: 0.240456	valid_0's ndcg@10: 0.27734	valid_0's ndcg@100: 0.380454
[4]	valid_0's ndcg@1: 0.13	valid_0's ndcg@3: 0.20351	valid_0's ndcg@5: 0.242234	valid_0's ndcg@10: 0.282075	valid_0's ndcg@100: 0.384046
[5]	valid_0's ndcg@1: 0.115	valid_0's ndcg@3: 0.199284	valid_0's ndcg@5: 0.244029	valid_0's ndcg@10: 0.282971	valid_0's ndcg@100: 0.382279
[6]	valid_0's ndcg@1: 0.115	valid_0's ndcg@3: 0.200474	valid_0's ndcg@5: 0.245658	valid_0's ndcg@10: 0.277816	valid_0's ndcg@100: 0.381212
[7]	valid_0's ndcg@1: 0.105	valid_0's ndcg@3: 0.192974	valid_0's ndcg@5: 0.234727	valid_0's ndcg@10: 0.268764	valid_0's ndcg@100: 0.375343




[8]	valid_0's ndcg@1: 0.12	valid_0's ndcg@3: 0.216129	valid_0's ndcg@5: 0.243247	valid_0's ndcg@10: 0.280569	valid_0's ndcg@100: 0.386821
[9]	valid_0's ndcg@1: 0.125	valid_0's ndcg@3: 0.216784	valid_0's ndcg@5: 0.25423	valid_0's ndcg@10: 0.289371	valid_0's ndcg@100: 0.392755
[10]	valid_0's ndcg@1: 0.12	valid_0's ndcg@3: 0.217438	valid_0's ndcg@5: 0.260249	valid_0's ndcg@10: 0.287408	valid_0's ndcg@100: 0.391335
[11]	valid_0's ndcg@1: 0.12	valid_0's ndcg@3: 0.214938	valid_0's ndcg@5: 0.256034	valid_0's ndcg@10: 0.288451	valid_0's ndcg@100: 0.391254
[12]	valid_0's ndcg@1: 0.115	valid_0's ndcg@3: 0.211784	valid_0's ndcg@5: 0.245142	valid_0's ndcg@10: 0.287736	valid_0's ndcg@100: 0.388498
[13]	valid_0's ndcg@1: 0.12	valid_0's ndcg@3: 0.214284	valid_0's ndcg@5: 0.245708	valid_0's ndcg@10: 0.289242	valid_0's ndcg@100: 0.389647
[14]	valid_0's ndcg@1: 0.13	valid_0's ndcg@3: 0.216665	valid_0's ndcg@5: 0.248089	valid_0's ndcg@10: 0.290364	valid_0's ndcg@100: 0.391518
[15]	valid_0's ndcg@1: 0.135

LGBMRanker(metric='ndcg', ndcg_eval_at=[1, 3, 5, 10, 100],
           objective='lambdarank')

In [109]:
# Get features for queries selected by BM25 on the dev set
queries = pd.read_csv('dev/queries.docdev.tsv', sep='\t', names=['q_id', 'text'])
queries.set_index('q_id', inplace=True)
dev_results = pd.read_csv('dev-bm25.trec', sep=' ', names=['q_id', 'Q0', 'doc_id', 'rank', 'score', 'run'])
dev_results.drop(['Q0', 'rank', 'run'], axis=1, inplace=True)
features = get_list_of_features(dev_results)

In [114]:
# Write these features to a file
dev_features = pd.DataFrame(features, columns=list('12345678'))
dev_features['q_id'] = dev_results['q_id'].tolist()
dev_features['doc_id'] = dev_results['doc_id'].tolist()
dev_features.to_csv('dev-bm25-features.csv', index=False)

In [115]:
dev_features

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,doc_id
0,7.100707,0.132776,0.054187,7.871985,4029,46,4,28,92542,D1118594
1,6.920385,0.328740,0.095465,7.871985,3945,54,4,28,92542,D2064696
2,6.508493,0.521713,0.147959,7.871985,1703,70,4,28,92542,D2064694
3,7.051576,0.142167,0.059098,7.871985,5604,52,5,28,92542,D340120
4,7.034795,0.106458,0.043400,7.871985,4820,42,4,28,92542,D1327250
...,...,...,...,...,...,...,...,...,...,...
519295,9.851149,0.303322,0.074468,12.770114,1335,61,4,28,247194,D2850421
519296,10.056492,0.248638,0.053541,12.770114,4340,65,4,28,247194,D2490026
519297,9.298171,0.403351,0.079545,12.770114,3810,78,5,28,247194,D639780
519298,9.933525,0.241005,0.066465,12.770114,2417,41,3,28,247194,D1640352


In [117]:
# Get features for queries selected by BM25 on the test set
queries = pd.read_csv('test/msmarco-test2019-queries.tsv', sep='\t', names=['q_id', 'text'])
queries.set_index('q_id', inplace=True)
test_results = pd.read_csv('test-bm25-100.trec', sep=' ', names=['q_id', 'Q0', 'doc_id', 'rank', 'score', 'run'])
test_results.drop(['Q0', 'rank', 'run'], axis=1, inplace=True)
features = get_list_of_features(test_results)

In [119]:
# Write the csv with these features to a file
test_features = pd.DataFrame(features, columns=list('12345678'))
test_features['q_id'] = test_results['q_id'].tolist()
test_features['doc_id'] = test_results['doc_id'].tolist()
test_features.to_csv('test-bm25-features.csv', index=False)