In [1]:
import pyterrier as pt
import pandas as pd
import numpy as np
import pyltr

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

from copy import deepcopy

In [2]:
if not pt.started():
    pt.init()

PyTerrier 0.4.0 has loaded Terrier 5.4 (built by craigm on 2021-01-16 14:17)


In [3]:
ds = pt.get_dataset("trec-deep-learning-passages")
index = pt.IndexFactory.of("./passage_index_8_bkp/data.properties")

19:00:02.641 [main] WARN  o.t.structures.CompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2.1 GiB of memory would be required.


In [4]:
print(index.getCollectionStatistics().toString())

Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 1
Number of tokens: 288759529
Field names: [text]
Positions:   false



In [5]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [6]:
pt.Experiment(
    [bm25],
    ds.get_topics('test-2019'),
    ds.get_qrels('test-2019'),
    eval_metrics=["map", "recip_rank", "ndcg"],
    names = ['BM25']
)

Unnamed: 0,name,map,recip_rank,ndcg
0,BM25,0.369987,0.795028,0.59338


In [5]:
df = ds.get_topics('train')
df.head()

Unnamed: 0,qid,query
0,121352,define extreme
1,634306,what does chattel mean on credit history
2,920825,what was the great leap forward brainly
3,510633,tattoo fixers how much does it cost
4,737889,what is decentralization process


In [8]:
df.dtypes

qid      object
query    object
dtype: object

In [9]:
df['qid'].unique()

array(['121352', '634306', '920825', ..., '210839', '908165', '50393'],
      dtype=object)

In [55]:
pipeline = pt.FeaturesBatchRetrieve(index, wmodel="BM25", features=["WMODEL:Tf", "WMODEL:PL2"]) % 10
pipeline = pipeline.compile()

Applying 8 rules


In [56]:
train_qrels = ds.get_qrels('train')

In [57]:
train_qrels.head()

Unnamed: 0,qid,docno,label
0,1185869,0,1
1,1185868,16,1
2,597651,49,1
3,403613,60,1
4,1183785,389,1


In [77]:
train_mil_queries = df[:1000].copy()
va_mil_queries = df[100:201].copy()

In [60]:
res = pipeline(train_mil_queries)
pt.io.write_results(res, 'featureFile_100q', format='letor')

In [62]:
with open("featureFile_100q") as inf:
    X_train, Y_train, qids_train, _ =  pyltr.data.letor.read_dataset(inf)

In [70]:
res = res.astype({'qid': 'int64'})

In [71]:
res.join(train_qrels, on='qid', how='left', lsuffix='_res', rsuffix='_qrels')

Unnamed: 0,qid_res,query,docid,rank,features,docno_res,score,qid_qrels,docno_qrels,label
0,121352,define extreme,6009766,0,"[4.0, 11.876832573726833]",718760,21.919845,386508,1875754,1.0
1,121352,define extreme,1038974,1,"[5.0, 11.47977026833666]",6237152,21.776986,386508,1875754,1.0
2,121352,define extreme,2521184,2,"[2.0, 10.75602754973717]",6350410,20.484073,386508,1875754,1.0
3,121352,define extreme,5621732,3,"[4.0, 10.760031725457694]",7236060,20.436113,386508,1875754,1.0
4,121352,define extreme,3420721,4,"[6.0, 10.849189347526387]",2912791,20.397157,386508,1875754,1.0
...,...,...,...,...,...,...,...,...,...,...
99005,1142466,when to start timeouts with toddlers,4622403,5,"[5.0, 12.527907630617136]",1253430,25.179912,,,
99006,1142466,when to start timeouts with toddlers,7787235,6,"[4.0, 12.494236255873759]",2531632,25.154846,,,
99007,1142466,when to start timeouts with toddlers,181753,7,"[5.0, 12.685787930573474]",1082511,25.094037,,,
99008,1142466,when to start timeouts with toddlers,4622402,8,"[5.0, 12.392980085085238]",1253425,24.955413,,,


In [78]:
# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(task="train",
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    max_bin=255,
    num_leaves=7,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3, 5, 10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=10,
    num_threads=6,
    device_type='gpu')
lmart_l_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_l, form="ltr")


In [79]:
lmart_l_pipe.fit(train_mil_queries, train_qrels, va_mil_queries, train_qrels)

[1]	valid_0's ndcg@1: 0.792079	valid_0's ndcg@2: 0.823313	valid_0's ndcg@3: 0.843115	valid_0's ndcg@4: 0.855908	valid_0's ndcg@5: 0.859738
[2]	valid_0's ndcg@1: 0.792079	valid_0's ndcg@2: 0.835807	valid_0's ndcg@3: 0.850658	valid_0's ndcg@4: 0.859187	valid_0's ndcg@5: 0.870677
[3]	valid_0's ndcg@1: 0.811881	valid_0's ndcg@2: 0.849362	valid_0's ndcg@3: 0.864214	valid_0's ndcg@4: 0.877006	valid_0's ndcg@5: 0.880836
[4]	valid_0's ndcg@1: 0.80198	valid_0's ndcg@2: 0.845708	valid_0's ndcg@3: 0.855609	valid_0's ndcg@4: 0.872665	valid_0's ndcg@5: 0.880326
[5]	valid_0's ndcg@1: 0.80198	valid_0's ndcg@2: 0.851955	valid_0's ndcg@3: 0.856905	valid_0's ndcg@4: 0.873962	valid_0's ndcg@5: 0.881622
[6]	valid_0's ndcg@1: 0.792079	valid_0's ndcg@2: 0.848301	valid_0's ndcg@3: 0.853251	valid_0's ndcg@4: 0.870308	valid_0's ndcg@5: 0.877968
[7]	valid_0's ndcg@1: 0.80198	valid_0's ndcg@2: 0.839461	valid_0's ndcg@3: 0.849362	valid_0's ndcg@4: 0.874947	valid_0's ndcg@5: 0.878777
[8]	valid_0's ndcg@1: 0.80198	



In [None]:
# this configures XGBoost as LambdaMART
lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      tree_method='gpu_hist',
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42, gpu_id=0)

lmart_x_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
lmart_x_pipe.fit(train_mil_queries, train_qrels)

In [48]:
qrels = ds.get_qrels('train')
qrels.head()

Unnamed: 0,qid,docno,label
0,1185869,0,1
1,1185868,16,1
2,597651,49,1
3,403613,60,1
4,1183785,389,1


In [90]:
df = df.astype({'qid': 'int64'})

In [56]:
qrels.dtypes

qid      object
docno    object
label     int64
dtype: object

In [58]:
full_table = df.join(qrels, on='qid', how='right', lsuffix='_topics', rsuffix='_qrels')

In [59]:
full_table.head()

Unnamed: 0,qid,qid_topics,query,qid_qrels,docno,label
,0,,,1185869,0,1
322300.0,1,1.0,a potlatch is considered an example of,1185868,16,1
,2,,,597651,49,1
36118.0,3,3.0,another name for the primary visual cortex is,403613,60,1
455738.0,4,4.0,defining alcoholism as a disease is associated...,1183785,389,1


In [14]:
test_queries = ds.get_topics('test-2019')
test_queries = test_queries.astype({'qid': 'int64'})
test_queries.head()

Unnamed: 0,qid,query
0,1108939,what slows down the flow of blood
1,1112389,what is the county for grand rapids mn
2,792752,what is ruclip
3,1119729,what do you do when you have a nosebleed from ...
4,1105095,where is sugar lake lodge located


In [15]:
test_qrels = ds.get_qrels('test-2019')
test_qrels = test_qrels.astype({'qid': 'int64'})

In [138]:
Q = test_queries.iloc[55]

In [139]:
Q.qid

1133167

In [140]:
bm25_t = pt.BatchRetrieve(index, wmodel="BM25") % 4
bm25_t = bm25_t.compile()

Applying 8 rules


In [142]:
bm25_t.search(Q.query)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,5356601,5674622,0,33.297181,how is the weather in jamaica
1,1,2823018,8160230,1,31.210468,how is the weather in jamaica
2,1,1369965,8255705,2,31.180305,how is the weather in jamaica
3,1,1498595,190809,3,30.632402,how is the weather in jamaica


In [143]:
Q_qrels = test_qrels[test_qrels.qid == Q.qid]
Q_qrels.head()

Unnamed: 0,qid,docno,label
8768,1133167,1014506,0
8769,1133167,1060131,0
8770,1133167,1064355,0
8771,1133167,1070102,0
8772,1133167,1097959,0


In [146]:
Q_qrels = Q_qrels.astype({'docno': 'int64'})

In [149]:
Q_qrels[Q_qrels.docno == 5674622]

Unnamed: 0,qid,docno,label
9040,1133167,5674622,2


1. retrieved ranks, actual ranks per Query
2. if the retreived doc doesn't exist in qrels
3. 