In [1]:
from pyserini.ltr import FeatureExtractor, AvgICTF, AvgIDF, BM25, DocSize, MatchingTermCount, \
    PMI, QueryLength, SCQ, SCS, SumMatchingTF, TFIDF, UniqueTermCount, \
    UnorderedSequentialPairs, OrderedSequentialPairs
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search import get_topics_with_reader
# Build the learning-to-rank feature extractor over the MS MARCO passage index.
# NOTE(review): the second argument (20) is presumably a worker/thread count — confirm.
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/', 20)

# Features are registered in exactly this order.
# Deliberately left out:
#   - PMI: takes too long to compute.
#   - UnorderedSequentialPairs / OrderedSequentialPairs (tried with arguments
#     6, 8 and 10): seem to have a bug.
enabled_features = (
    AvgICTF,
    AvgIDF,
    BM25,
    DocSize,
    MatchingTermCount,
    QueryLength,
    SCQ,
    SCS,
    SumMatchingTF,
    TFIDF,
    UniqueTermCount,
)
for feature_cls in enabled_features:
    fe.add(feature_cls())

# Analyzer used to tokenize query text before it is handed to the extractor.
analyzer = Analyzer(get_lucene_analyzer())

In [2]:
# Both query files are read with the same Anserini topic reader class.
topic_reader = 'io.anserini.search.topicreader.TsvIntTopicReader'

# Start from the training queries, then fold the dev queries into the same
# mapping so that any qid can be looked up in one place.
queries = get_topics_with_reader(topic_reader,
                                 'collections/msmarco-passage/queries.train.tsv')
dev_topics = get_topics_with_reader(topic_reader,
                                    'collections/msmarco-passage/queries.dev.tsv')
queries.update(dev_topics)

In [3]:
import pandas as pd
import numpy as np
# NOTE(review): the commented-out code below looks like the recipe that produced
# the two cached JSON files loaded at the end of this cell (positive/negative
# triples flattened to (qid, pid, rel) rows, then 1% of the qids sampled with
# random_state=123456) — confirm before relying on it.
# train = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.tsv',sep="\t",
#                     names=['qid','pos_pid','neg_pid'])
# pos_half = train[['qid','pos_pid']].rename(columns={"pos_pid": "pid"})
# pos_half['rel'] = 1.
# neg_half = train[['qid','neg_pid']].rename(columns={"neg_pid": "pid"})
# neg_half['rel'] = 0.
# train = pd.concat([pos_half,neg_half],axis=0,ignore_index=True)
# del pos_half, neg_half
# sampled_train_qid=pd.Series(train['qid'].unique()).sample(frac=0.01,random_state=123456)
# sampled_train=train[train['qid'].isin(sampled_train_qid)].reset_index(drop=True).copy(deep=True)
# del train

# Load the cached sample: the query ids as a Series, the (qid, pid, rel) rows as a frame.
sampled_train_qid = pd.read_json(
    'sampled_train_qid.json',
    typ='series',
    orient='records',
)
sampled_train = pd.read_json('sampled_train.json')

In [4]:
!md5sum sampled_train.json
!md5sum sampled_train_qid.json

7581d3cf5b6baeba9b1949d5d14b8846  sampled_train.json
42f68a9f62637d5d2eccab78e323a6d0  sampled_train_qid.json


In [5]:
# Summarise the sampled training rows: frame shape, number of distinct queries,
# mean number of rows per query, and the first ten rows.
train_summaries = (
    sampled_train.shape,
    sampled_train['qid'].drop_duplicates().shape,
    sampled_train.groupby('qid').count().mean()['pid'],
    sampled_train.head(10),
)
for summary in train_summaries:
    print(summary)

(5417782, 3)
(3277,)
1653.2749465974978
       qid      pid  rel
0  1170090  3917099    1
1   129031  6620836    1
2   517222  3652956    1
3  1045550  4710372    1
4  1070824  2993656    1
5   340406  6630217    1
6   179852   376926    1
7  1031093  3391276    1
8   995835  5178179    1
9   261655   284187    1


In [6]:
# Read the dev top-1000 candidate file, keeping only the (qid, pid) columns.
dev = pd.read_csv(
    'collections/msmarco-passage/top1000.dev',
    sep="\t",
    names=['qid', 'pid', 'query', 'doc'],
    usecols=['qid', 'pid'],
)

# Draw a fixed-seed sample of 500 dev query ids and keep only their candidate rows.
sampled_dev_qid = pd.Series(dev['qid'].unique()).sample(n=500, random_state=123456)
in_sample = dev['qid'].isin(sampled_dev_qid)
sampled_dev = dev.loc[in_sample].reset_index(drop=True).copy(deep=True)
del dev

# Write the qrels restricted to the sampled queries, so evaluation covers the same 500 qids.
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
)
sampled_qrel = dev_qrel.loc[dev_qrel['qid'].isin(sampled_dev_qid)]
sampled_qrel.to_csv(
    'collections/msmarco-passage/qrels.dev.500.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [7]:
# Summarise the sampled dev candidates: frame shape, number of distinct queries,
# mean number of candidate rows per query, and the first ten rows.
dev_summaries = (
    sampled_dev.shape,
    sampled_dev['qid'].drop_duplicates().shape,
    sampled_dev.groupby('qid').count().mean()['pid'],
    sampled_dev.head(10),
)
for summary in dev_summaries:
    print(summary)

(482514, 2)
(500,)
965.028
       qid      pid
0   820267  1000130
1   991832  1000599
2  1093443  1000611
3   706950  1000647
4   996805  1000797
5    23285   100094
6   198246  1001042
7   370734  1001058
8   939866  1001239
9   425688  1001438


In [8]:
from tqdm import tqdm
def _collect_features(pending_qids, rows):
    """Fetch extractor results for queued queries and append them to ``rows``.

    For every query id in ``pending_qids`` (strings previously passed to
    ``fe.lazy_extract``), ``fe.get_result`` is called and one tuple
    ``(qid, pid, *features)`` is appended to ``rows`` per returned document.
    """
    for pending_qid in tqdm(pending_qids):
        for doc in fe.get_result(pending_qid):
            rows.append((int(pending_qid), int(doc['pid']), *doc['features']))


def extract(df, analyzer, batch_size=1000):
    """Attach the extractor's feature columns to every (qid, pid) row of ``df``.

    Args:
        df: DataFrame with at least ``qid`` and ``pid`` columns. Every qid must
            be a key of the module-level ``queries`` mapping.
        analyzer: object whose ``analyze(text)`` returns the query tokens
            passed to the feature extractor.
        batch_size: number of queries submitted through ``fe.lazy_extract``
            before their results are collected. Defaults to 1000, the value
            that used to be hard-coded.

    Returns:
        ``df`` inner-merged on (qid, pid) with one column per name in
        ``fe.feature_names()``. Candidate ids are de-duplicated per query
        before extraction, but the merge is driven by ``df``, so repeated
        (qid, pid) rows in ``df`` stay repeated in the result.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    rows = []
    pending_qids = []
    for qid, group in df.groupby('qid'):
        analyzed_query = analyzer.analyze(queries[qid]['title'])
        docids = [str(did) for did in group['pid'].drop_duplicates().tolist()]
        # Submit now, collect later.
        # NOTE(review): presumably lets the extractor work on many queries
        # concurrently — confirm against the FeatureExtractor implementation.
        fe.lazy_extract(str(qid), analyzed_query, docids)
        pending_qids.append(str(qid))
        if len(pending_qids) >= batch_size:
            _collect_features(pending_qids, rows)
            pending_qids = []

    # Collect whatever is left over from the last, partially filled batch.
    if pending_qids:
        _collect_features(pending_qids, rows)

    extracted = pd.DataFrame(rows, columns=['qid', 'pid'] + fe.feature_names())
    return df.merge(extracted, how='inner', on=['qid', 'pid'])

In [9]:
import json
def export(df, analyzer, fn):
    """Write one JSON line per query of ``df`` to the file ``fn``.

    Each line is an object with the keys ``qid``, ``queryTokens`` (the analyzed
    title of the query looked up in the module-level ``queries`` mapping) and
    ``docIds`` (the query's distinct pids, as strings, in order of appearance).
    """
    with open(fn,'w') as out:
        for query_id, rows in df.groupby('qid'):
            unique_pids = rows['pid'].drop_duplicates().tolist()
            record = {
                'qid': query_id,
                'queryTokens': analyzer.analyze(queries[query_id]['title']),
                'docIds': [str(pid) for pid in unique_pids],
            }
            out.write(json.dumps(record)+'\n')

In [10]:
# Extract features for the sampled training rows, then dump the per-query
# tokens and candidate ids to train.json.
train_data = extract(df=sampled_train, analyzer=analyzer)
export(df=sampled_train, analyzer=analyzer, fn='train.json')

100%|██████████| 1000/1000 [02:12<00:00,  7.57it/s]
100%|██████████| 1000/1000 [01:53<00:00,  8.82it/s]
100%|██████████| 1000/1000 [01:55<00:00,  8.63it/s]
100%|██████████| 277/277 [00:33<00:00,  8.35it/s]


In [11]:
# Preview the first rows of the extracted training features.
# NOTE(review): the recorded output shows the same (qid, pid) row repeated, so
# sampled_train appears to contain duplicate (qid, pid) rows that are carried
# into train_data — confirm this duplication is intended before training.
train_data.head(20)

Unnamed: 0,qid,pid,rel,AvgICTF,AvgIDF,BM25Feature,DocSize,MatchingTermCount,QueryLength,AvgSCQ,SimplifiedClarityScore,SumMatchingTF,TFIDF,UniqueQueryTerms
0,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
1,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
2,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
3,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
4,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
5,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
6,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
7,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
8,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0
9,1170090,3917099,1,2.949263,5.767209,14.047854,27.0,2.0,2.0,13.272567,8.466328,3.0,111.21155,2.0


In [12]:
# Extract features for the sampled dev candidates, then dump the per-query
# tokens and candidate ids to dev.json.
dev_data = extract(df=sampled_dev, analyzer=analyzer)
export(df=sampled_dev, analyzer=analyzer, fn='dev.json')

100%|██████████| 500/500 [01:15<00:00,  6.66it/s]


In [13]:
# Preview the first rows of the extracted dev features (qid, pid, then one column per feature).
dev_data.head(20)

Unnamed: 0,qid,pid,AvgICTF,AvgIDF,BM25Feature,DocSize,MatchingTermCount,QueryLength,AvgSCQ,SimplifiedClarityScore,SumMatchingTF,TFIDF,UniqueQueryTerms
0,820267,1000130,2.471484,4.364158,16.079428,34.0,3.0,4.0,14.322651,6.375166,4.0,120.058304,4.0
1,991832,1000599,1.425946,4.553277,12.827876,30.0,2.0,4.0,14.238573,6.543933,4.0,100.29901,4.0
2,1093443,1000611,1.069592,6.338291,13.949917,35.0,2.0,6.0,12.886062,7.774541,3.0,133.35167,6.0
3,706950,1000647,1.086032,5.197794,9.236618,78.0,1.0,3.0,13.712804,7.446469,3.0,107.50466,3.0
4,996805,1000797,0.68521,5.786142,9.838989,31.0,1.0,4.0,13.186437,7.821704,2.0,97.46449,4.0
5,23285,100094,1.559581,4.788155,10.293707,32.0,2.0,4.0,14.229664,6.611624,3.0,69.58974,4.0
6,198246,1001042,1.716994,9.43359,8.736868,62.0,1.0,2.0,10.069888,12.130479,2.0,93.42264,2.0
7,370734,1001058,1.132592,4.164123,11.240091,24.0,2.0,5.0,14.543454,5.861984,3.0,74.796036,5.0
8,939866,1001239,0.324318,4.331619,9.169763,28.0,1.0,6.0,14.313557,6.004467,4.0,92.01625,6.0
9,425688,1001438,0.461457,7.626915,8.170407,19.0,1.0,4.0,11.70463,9.58847,3.0,69.18878,4.0


In [30]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
# Design matrix: the feature columns in the extractor's order; target: the 'rel' label.
feature_columns = fe.feature_names()
train_X = train_data[feature_columns].to_numpy()
train_Y = train_data['rel'].to_numpy()

# Alternatives that were tried and are currently disabled:
# model = LogisticRegression()
# model = RandomForestRegressor()
# model = LinearSVC()
# NOTE(review): the run files written later are named 'lambdarank.*', but the
# model fitted here is an LGBMRegressor — confirm the naming/intent.
model = LGBMRegressor(random_state=12345)
model.fit(train_X, train_Y)

LGBMRegressor(random_state=12345)

In [31]:
# Score every dev (qid, pid) row with the fitted model; the feature columns are
# taken in the same extractor order that was used for training.
dev_X = dev_data[fe.feature_names()].to_numpy()
dev_data['score'] = model.predict(dev_X)

In [32]:
# Write the MS MARCO run file (qid, pid, rank per line) and count how often two
# consecutively ranked passages of a query have scores within 1e-8 of each other.
score_tie_counter = 0
score_tie_query = set()
with open('lambdarank.run','w') as f:
    for qid, group in tqdm(dev_data.groupby('qid')):
        # Every candidate passage may appear only once per query.
        assert group['pid'].is_unique
        ranked = group.sort_values(['score','pid'],ascending=False)
        # Sentinels that no real first row is expected to match.
        prev_score, prev_pid = -1e10, ''
        for rank, t in enumerate(ranked.itertuples(), start=1):
            if abs(t.score-prev_score)<1e-8:
                score_tie_counter += 1
                score_tie_query.add(qid)
            assert prev_pid != t.pid
            prev_score, prev_pid = t.score, t.pid
            f.write(f'{t.qid}\t{t.pid}\t{rank}\n')
if score_tie_counter>0:
    print(f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries')

100%|██████████| 500/500 [00:02<00:00, 180.12it/s]


score_tie occurs 302345 times in 497 queries


In [33]:
# Print the checksum of the MS MARCO-format run file so this run's output can be identified.
!md5sum lambdarank.run

40f02ea2d4d9ac59a2222c1809ecabfa  lambdarank.run


In [34]:
# Score the run file against the qrels restricted to the 500 sampled dev queries
# using the MS MARCO evaluation script (the recorded output reports MRR @10).
!python tools/scripts/msmarco/msmarco_eval.py collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run

#####################
MRR @10: 0.1844920634920635
QueriesRanked: 500
#####################


In [35]:
# Write the same ranking in TREC run format: qid, Q0, pid, rank, score, run tag.
with open('lambdarank.run.trec','w') as f:
    for qid, group in tqdm(dev_data.groupby('qid')):
        # Every candidate passage may appear only once per query.
        assert group['pid'].is_unique
        ranked = group.sort_values(['score','pid'],ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            # Subtract a rank-proportional epsilon so the written scores keep
            # decreasing with rank even when the model scores are tied.
            new_score = t.score - rank*1e-6
            f.write(f'{t.qid}\tQ0\t{t.pid}\t{rank}\t{new_score:.6f}\tlambdarank\n')

100%|██████████| 500/500 [00:02<00:00, 174.36it/s]


In [36]:
# Print the checksum of the TREC-format run file so this run's output can be identified.
!md5sum lambdarank.run.trec

eed0712c572a4d8a0f55165e5b196479  lambdarank.run.trec


In [37]:
!tools/eval/trec_eval.9.0.4/trec_eval -m all_trec collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run.trec | egrep '^map\s|recall_1000'

map                   	all	0.1920
recall_1000           	all	0.8163


MRR @10: 0.18604365079365076

edcd3516ae1df3923308a76351b19682  lambdarank.run

map                   	all	0.1913

123a47e69a9d229c5fad4264a899df8a  lambdarank.run.trec