In [None]:
from pyserini.ltr import FeatureExtractor, AvgICTF, AvgIDF, BM25, DocSize, MatchingTermCount, \
    PMI, QueryLength, SCQ, SCS, SumMatchingTF, TFIDF, UniqueTermCount, \
    UnorderedSequentialPairs, OrderedSequentialPairs
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search import get_topics_with_reader
# Feature extractor bound to the MS MARCO passage Lucene index.
# NOTE(review): the second argument (20) is presumably a worker/thread count — confirm.
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/', 20)

# Register one instance of every feature. The order is significant: it is the
# order in which fe.feature_names() is later used to select feature columns.
for feature_cls in (
    AvgICTF,
    AvgIDF,
    BM25,
    DocSize,
    MatchingTermCount,
    PMI,
    QueryLength,
    SCQ,
    SCS,
    SumMatchingTF,
    TFIDF,
    UniqueTermCount,
):
    fe.add(feature_cls())

# The sequential-pair features are intentionally left disabled: they appear
# to be buggy. Re-enable once that is resolved.
# fe.add(UnorderedSequentialPairs(6))
# fe.add(UnorderedSequentialPairs(8))
# fe.add(UnorderedSequentialPairs(10))
# fe.add(OrderedSequentialPairs(6))
# fe.add(OrderedSequentialPairs(8))
# fe.add(OrderedSequentialPairs(10))

# Analyzer used to tokenize query text before feature extraction.
analyzer = Analyzer(get_lucene_analyzer())

In [None]:
# Combined topic lookup (qid -> topic fields) for the train and dev query files.
# Dev entries are unpacked last, so they win if a qid appears in both files.
queries = {
    **get_topics_with_reader(
        'io.anserini.search.topicreader.TsvIntTopicReader',
        'collections/msmarco-passage/queries.train.tsv',
    ),
    **get_topics_with_reader(
        'io.anserini.search.topicreader.TsvIntTopicReader',
        'collections/msmarco-passage/queries.dev.tsv',
    ),
}

In [None]:
import pandas as pd
import numpy as np
# --- Training set -----------------------------------------------------------
# Each row of the triples file is (qid, positive pid, negative pid).
train = pd.read_csv(
    'collections/msmarco-passage/qidpidtriples.train.full.tsv',
    sep="\t",
    names=['qid', 'pos_pid', 'neg_pid'],
)
# Unfold the triples into pointwise (qid, pid, rel) rows: all positives
# (rel = 1.0) stacked on top of all negatives (rel = 0.0).
train = pd.concat(
    [
        train[['qid', 'pos_pid']].rename(columns={"pos_pid": "pid"}).assign(rel=1.),
        train[['qid', 'neg_pid']].rename(columns={"neg_pid": "pid"}).assign(rel=0.),
    ],
    axis=0,
    ignore_index=True,
)
# Keep only the rows of a seeded 1% sample of the distinct training qids.
train_qid_sample = pd.Series(train['qid'].unique()).sample(frac=0.01, random_state=123456)
train = train[train['qid'].isin(train_qid_sample)].reset_index(drop=True)
del train_qid_sample

# --- Dev set ----------------------------------------------------------------
# Only the qid/pid columns of the candidate file are loaded.
dev = pd.read_csv(
    'collections/msmarco-passage/top1000.dev',
    sep="\t",
    names=['qid', 'pid', 'query', 'doc'],
    usecols=['qid', 'pid'],
)
# Keep only the rows of a seeded sample of 500 distinct dev qids.
dev_qid_sample = pd.Series(dev['qid'].unique()).sample(n=500, random_state=123456)
dev = dev[dev['qid'].isin(dev_qid_sample)]
del dev_qid_sample

In [None]:
# Load the dev qrels, then write out only the judgments whose qid is among the
# sampled dev queries. dev_qrel itself keeps the full, unfiltered table.
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
)
dev_qrel.loc[dev_qrel['qid'].isin(dev['qid'].unique())].to_csv(
    'collections/msmarco-passage/qrels.dev.500.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
from tqdm import tqdm
def extract(df, analyzer, batch_size=100):
    """Attach extracted features to every (qid, pid) row of ``df``.

    Queries are submitted to the module-level feature extractor ``fe`` with
    ``fe.lazy_extract`` and their results are collected afterwards with
    ``fe.get_result``, ``batch_size`` queries at a time.

    Args:
        df: DataFrame with at least ``qid`` and ``pid`` columns. Every qid
            must be a key of the module-level ``queries`` mapping.
        analyzer: object whose ``analyze(text)`` result is passed to
            ``fe.lazy_extract`` as the analyzed query.
        batch_size: number of submitted queries to accumulate before their
            results are fetched (default 100, the original behavior).

    Returns:
        ``df`` inner-joined on (qid, pid) with one column per name in
        ``fe.feature_names()``.
    """
    lines = []
    fetch_later = []

    def _drain():
        # Collect the results of every pending query, then empty the queue.
        for pending_qid in tqdm(fetch_later):
            for pid, feature in fe.get_result(pending_qid).items():
                lines.append((int(pending_qid), int(pid), *feature))
        fetch_later.clear()

    for qid, group in df.groupby('qid'):
        analyzed_query = analyzer.analyze(queries[qid]['title'])
        docids = [str(did) for did in group['pid'].drop_duplicates().tolist()]
        fe.lazy_extract(str(qid), analyzed_query, docids)
        fetch_later.append(str(qid))
        if len(fetch_later) >= batch_size:
            _drain()

    # Fetch the final partial batch. Without this, every query submitted after
    # the last full batch (or all of them, when there are fewer than
    # batch_size) would be silently missing from the result.
    if fetch_later:
        _drain()

    extracted = pd.DataFrame(lines, columns=['qid', 'pid'] + fe.feature_names())
    return df.merge(extracted, how='inner', on=['qid', 'pid'])

In [None]:
# Feature matrix (plus qid/pid/rel) for the sampled training queries.
train_data = extract(df=train, analyzer=analyzer)


In [None]:
# Feature matrix (plus qid/pid) for the sampled dev queries.
dev_data = extract(df=dev, analyzer=analyzer)


In [9]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
# Pointwise model: regress the 0/1 relevance label from the extracted features.
model = LGBMRegressor()
# Alternatives that were tried; swap one in to compare:
# model = LogisticRegression()
# model = RandomForestRegressor()
# model = LinearSVC()

# Feature columns are selected in the order reported by the extractor.
train_X = train_data[fe.feature_names()].to_numpy()
train_Y = train_data['rel'].to_numpy()
model.fit(train_X, train_Y)

LGBMRegressor()

In [10]:
# Score every dev (qid, pid) candidate with the fitted model, using the same
# feature column order as in training.
dev_X = dev_data[fe.feature_names()].to_numpy()
dev_data['score'] = model.predict(dev_X)

In [11]:
# Write the run as tab-separated "qid, pid, rank" lines: within each query,
# candidates are ordered by descending score and ranked from 1.
with open('lambdarank.run', 'w') as f:
    for qid, group in tqdm(dev_data.groupby('qid')):
        ranked = group.sort_values('score', ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            f.write(f'{t.qid}\t{t.pid}\t{rank}\n')

100%|██████████| 500/500 [00:02<00:00, 249.79it/s]


In [12]:
# Score the run file with the MS MARCO evaluation script, using the qrels
# subset that was written earlier for the sampled dev queries.
!python tools/scripts/msmarco/msmarco_eval.py collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run

#####################
MRR @10: 0.18552698412698415
QueriesRanked: 500
#####################


In [13]:
# Write the same ranking as six tab-separated columns per line:
# qid, the literal Q0, pid, rank, score, and the run tag.
with open('lambdarank.run.trec', 'w') as f:
    for qid, group in tqdm(dev_data.groupby('qid')):
        ranked = group.sort_values('score', ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            f.write(f'{t.qid}\tQ0\t{t.pid}\t{rank}\t{t.score}\tlambdarank\n')

100%|██████████| 500/500 [00:02<00:00, 201.39it/s]


In [14]:
# Run trec_eval with "-m all_trec" on the TREC-format run.
# NOTE(review): qrels.dev.small.trec is not produced anywhere in the visible
# cells — it is assumed to already exist on disk.
!tools/eval/trec_eval.9.0.4/trec_eval -m all_trec collections/msmarco-passage/qrels.dev.small.trec lambdarank.run.trec

runid                 	all	lambdarank
num_q                 	all	500
num_ret               	all	482514
num_rel               	all	530
num_rel_ret           	all	429
map                   	all	0.1932
gm_map                	all	0.0146
Rprec                 	all	0.0987
bpref                 	all	0.8163
recip_rank            	all	0.1982
iprec_at_recall_0.00  	all	0.1983
iprec_at_recall_0.10  	all	0.1983
iprec_at_recall_0.20  	all	0.1983
iprec_at_recall_0.30  	all	0.1983
iprec_at_recall_0.40  	all	0.1968
iprec_at_recall_0.50  	all	0.1968
iprec_at_recall_0.60  	all	0.1891
iprec_at_recall_0.70  	all	0.1891
iprec_at_recall_0.80  	all	0.1885
iprec_at_recall_0.90  	all	0.1885
iprec_at_recall_1.00  	all	0.1885
P_5                   	all	0.0600
P_10                  	all	0.0402
P_15                  	all	0.0313
P_20                  	all	0.0248
P_30                  	all	0.0182
P_100                 	all	0.0070
P_200                 	all	0.0038
P_500                 	al