In [None]:
from pyserini.ltr import FeatureExtractor, AvgICTF, AvgIDF, BM25, DocSize, MatchingTermCount, \
    PMI, QueryLength, SCQ, SCS, SumMatchingTF, TFIDF, UniqueTermCount
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search import get_topics_with_reader
# Feature extractor over the MS MARCO passage Lucene index. The second argument
# (20) is passed through as-is -- presumably the worker count; confirm against pyserini.
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/', 20)

# Each feature is instantiated and registered in turn, in this exact order.
for feature_cls in (
    AvgICTF,
    AvgIDF,
    BM25,
    DocSize,
    MatchingTermCount,
    # PMI,  # disabled: takes too long
    QueryLength,
    SCQ,
    SCS,
    SumMatchingTF,
    TFIDF,
    UniqueTermCount,
):
    fe.add(feature_cls())

# Analyzer used to tokenize query text before feature extraction.
analyzer = Analyzer(get_lucene_analyzer())

In [2]:
# Load the train topics first, then fold the dev topics into the same mapping
# (dev entries overwrite train entries on key collision, as with dict.update).
TOPIC_READER = 'io.anserini.search.topicreader.TsvIntTopicReader'
queries = get_topics_with_reader(TOPIC_READER,
                                 'collections/msmarco-passage/queries.train.tsv')
dev_topics = get_topics_with_reader(TOPIC_READER,
                                    'collections/msmarco-passage/queries.dev.tsv')
queries.update(dev_topics)

In [None]:
import pandas as pd
import numpy as np
# Read (qid, pos_pid, neg_pid) triples and flatten them into pointwise
# (qid, pid, rel) rows: positives are labelled rel=1.0, negatives rel=0.0.
triples = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.tsv', sep="\t",
                      names=['qid', 'pos_pid', 'neg_pid'])
train = pd.concat(
    [
        triples[['qid', pid_col]].rename(columns={pid_col: "pid"}).assign(rel=label)
        for pid_col, label in (('pos_pid', 1.), ('neg_pid', 0.))
    ],
    axis=0,
    ignore_index=True,
)
# The raw triples frame is no longer needed once the pointwise frame exists.
del triples
# Disabled alternatives: sample by query id, or reload a previously saved sample.
# sampled_train_qid=pd.Series(train['qid'].unique()).sample(frac=0.01,random_state=123456)
# sampled_train=train[train['qid'].isin(sampled_train_qid)].reset_index(drop=True).copy(deep=True)
# del train
#sampled_train = pd.read_json('sampled_train.json')
#sampled_train_qid = pd.read_json('sampled_train_qid.json',typ='series', orient='records')

In [4]:
# Reproducible row-level sample of the pointwise training frame
# (rows are sampled, not whole queries).
TRAIN_SAMPLE_FRAC = 0.01
TRAIN_SAMPLE_SEED = 123456
sampled_train = train.sample(frac=TRAIN_SAMPLE_FRAC, random_state=TRAIN_SAMPLE_SEED)

In [5]:
# Sanity summary of the training sample: overall shape, number of distinct
# queries, mean number of rows per query, and the first ten rows.
train_rows_per_qid = sampled_train.groupby('qid').count()
print(sampled_train.shape)
print(sampled_train['qid'].drop_duplicates().shape)
print(train_rows_per_qid.mean()['pid'])
print(sampled_train.head(10))

(5398380, 3)
(321598,)
16.78611185392944
               qid      pid  rel
438906840   996529  4571859  0.0
280258430  1036625  6548653  0.0
127133986   645341  5241691  1.0
529081817   972208  4618120  0.0
213327677   540059  7012294  1.0
443983768  1169063   633107  0.0
108790199   652991  2190841  1.0
343385477   511998  6030824  0.0
303077525   963043  2978761  0.0
345884074   160244  5523895  0.0


In [6]:
# Candidate (qid, pid) pairs for dev; only the id columns are read from the file.
dev = pd.read_csv('collections/msmarco-passage/top1000.dev', sep="\t",
                  names=['qid', 'pid', 'query', 'doc'], usecols=['qid', 'pid'])

# Pick 500 dev queries reproducibly and keep only their candidate rows.
sampled_dev_qid = pd.Series(dev['qid'].unique()).sample(n=500, random_state=123456)
dev_in_sample = dev['qid'].isin(sampled_dev_qid)
sampled_dev = dev[dev_in_sample].reset_index(drop=True).copy(deep=True)
del dev, dev_in_sample

# Write the matching subset of the qrels so evaluation covers the same 500 queries.
dev_qrel = pd.read_csv('collections/msmarco-passage/qrels.dev.small.tsv', sep="\t",
                       names=["qid", "q0", "pid", "rel"])
qrel_in_sample = dev_qrel['qid'].isin(sampled_dev_qid)
dev_qrel[qrel_in_sample].to_csv('collections/msmarco-passage/qrels.dev.500.tsv',
                                sep='\t', header=False, index=False)
del qrel_in_sample

In [7]:
# Sanity summary of the dev sample: overall shape, number of distinct queries,
# mean number of candidate rows per query, and the first ten rows.
dev_rows_per_qid = sampled_dev.groupby('qid').count()
print(sampled_dev.shape)
print(sampled_dev['qid'].drop_duplicates().shape)
print(dev_rows_per_qid.mean()['pid'])
print(sampled_dev.head(10))

(482514, 2)
(500,)
965.028
       qid      pid
0   820267  1000130
1   991832  1000599
2  1093443  1000611
3   706950  1000647
4   996805  1000797
5    23285   100094
6   198246  1001042
7   370734  1001058
8   939866  1001239
9   425688  1001438


In [None]:
from tqdm import tqdm
def extract(df,analyzer):
    lines = []
    fetch_later = []
    for qid,group in df.groupby('qid'):
        analyzed_query = analyzer.analyze(queries[qid]['title'])
        docids = [str(did) for did in group['pid'].drop_duplicates().tolist()]
        fe.lazy_extract(str(qid),analyzed_query,docids)
        fetch_later.append(str(qid))
        if len(fetch_later) == 1000:
            for qid in tqdm(fetch_later):
                for doc in fe.get_result(qid):
                    lines.append((int(qid), int(doc['pid']), *doc['features']))
            fetch_later = []
    #deal with rest
    if len(fetch_later) > 0:
        for qid in tqdm(fetch_later):
            for doc in fe.get_result(qid):
                lines.append((int(qid), int(doc['pid']), *doc['features']))
        fetch_later = []
    extracted = pd.DataFrame(lines, columns=['qid','pid']+fe.feature_names())
    return df.merge(extracted,how='inner',left_on=['qid','pid'],right_on=['qid','pid'])

In [None]:
import json
def export(df, analyzer, fn):
    """Write one JSON line per query in ``df`` to the file ``fn``.

    Each line holds the query id (``qid``), the analyzed query tokens
    (``queryTokens``, from ``analyzer.analyze`` on the query title looked up in
    the notebook-level ``queries`` mapping) and the de-duplicated passage ids of
    that query as strings (``docIds``).

    Args:
        df: DataFrame with ``qid`` and ``pid`` columns.
        analyzer: object exposing ``analyze(text)``.
        fn: path of the output file; it is opened in write mode and overwritten.
    """
    with open(fn, 'w') as out:
        for qid, rows in df.groupby('qid'):
            unique_pids = rows['pid'].drop_duplicates().tolist()
            record = {
                'qid': qid,
                'queryTokens': analyzer.analyze(queries[qid]['title']),
                'docIds': list(map(str, unique_pids)),
            }
            out.write(json.dumps(record) + '\n')

In [10]:
# Attach the extracted features to the sampled training pairs.
train_data = extract(df=sampled_train, analyzer=analyzer)
# Optional (disabled): dump the extraction requests as JSON lines.
#export(sampled_train,analyzer,'train.json')

100%|██████████| 1000/1000 [00:00<00:00, 6767.63it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6021.35it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6328.07it/s]
100%|██████████| 1000/1000 [00:00<00:00, 5115.66it/s]
100%|██████████| 1000/1000 [00:00<00:00, 5256.72it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6390.14it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6426.65it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7870.40it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7908.22it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6426.36it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7913.21it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7883.70it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7947.81it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7361.61it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7638.12it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8370.21it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7992.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7613.20it/s]
100%|██████████| 1000/1000 [

100%|██████████| 1000/1000 [00:00<00:00, 8838.45it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8629.92it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8774.62it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8858.20it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8825.04it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8565.84it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7832.76it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8069.01it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8472.01it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8429.64it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8472.57it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8848.61it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8274.05it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8721.44it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8891.34it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8836.74it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8661.60it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8716.65it/s]
100%|██████████| 1000/1000 [

100%|██████████| 1000/1000 [00:00<00:00, 9336.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9302.92it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9306.30it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9016.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9157.75it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9323.80it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9447.67it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9140.35it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8572.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8572.00it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8455.24it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8547.52it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8012.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8971.50it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9150.06it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9200.46it/s]
100%|██████████| 1000/1000 [00:00<00:00, 3619.01it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9477.15it/s]
100%|██████████| 1000/1000 [

In [11]:
# Attach the extracted features to the sampled dev candidates.
dev_data = extract(df=sampled_dev, analyzer=analyzer)
# Optional (disabled): dump the extraction requests as JSON lines.
#export(sampled_dev,analyzer,'dev.json')

100%|██████████| 500/500 [00:07<00:00, 66.35it/s] 


In [12]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
# Pointwise regressor on the binary relevance label; the seed keeps the fit reproducible.
model = LGBMRegressor(random_state=12345)
# Other estimators that were tried (disabled):
# model = LogisticRegression()
# model = RandomForestRegressor()
# model = LinearSVC()

# Feature matrix uses the extractor's feature columns; the target is 'rel'.
train_X = train_data[fe.feature_names()].to_numpy()
train_Y = train_data['rel'].to_numpy()
model.fit(train_X, train_Y)

LGBMRegressor(random_state=12345)

In [13]:
# Score every dev candidate with the fitted model; the score is stored on
# dev_data itself as a new 'score' column.
dev_X = dev_data[fe.feature_names()].to_numpy()
dev_data['score'] = model.predict(dev_X)

In [None]:
# Write the run as "qid<TAB>pid<TAB>rank" lines, ranking each query's candidates
# by descending score (pid, also descending, breaks ties), and count how often
# consecutive candidates have (near-)equal scores.
score_tie_counter = 0
score_tie_query = set()
with open('lambdarank.run', 'w') as run_file:
    for qid, group in tqdm(dev_data.groupby('qid')):
        # A passage must not appear twice within one query.
        pids = group['pid'].tolist()
        assert len(pids) == len(set(pids))

        ranked = group.sort_values(['score', 'pid'], ascending=False)
        # Sentinels that cannot match the first candidate of a query.
        prev_score, prev_pid = -1e10, ''
        for rank, t in enumerate(ranked.itertuples(), start=1):
            if abs(t.score - prev_score) < 1e-8:
                score_tie_counter += 1
                score_tie_query.add(qid)
            assert prev_pid != t.pid
            prev_score, prev_pid = t.score, t.pid
            run_file.write(f'{t.qid}\t{t.pid}\t{rank}\n')

if score_tie_counter > 0:
    print(f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries')

100%|██████████| 500/500 [00:02<00:00, 182.50it/s]

score_tie occurs 275217 times in 498 queries





In [None]:
!python3 tools/scripts/msmarco/msmarco_eval.py collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run

#####################
MRR @10: 0.19582222222222223
QueriesRanked: 500
#####################


In [None]:
# Write the same ranking in six-column TREC run format. The written score is
# reduced by rank * 1e-6, so rows with equal model scores get decreasing scores
# in rank order.
with open('lambdarank.run.trec', 'w') as trec_file:
    for qid, group in tqdm(dev_data.groupby('qid')):
        # A passage must not appear twice within one query.
        pids = group['pid'].tolist()
        assert len(pids) == len(set(pids))

        ranked = group.sort_values(['score', 'pid'], ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            new_score = t.score - rank * 1e-6
            trec_file.write(f'{t.qid}\tQ0\t{t.pid}\t{rank}\t{new_score:.6f}\tlambdarank\n')

100%|██████████| 500/500 [00:02<00:00, 187.29it/s]


In [None]:
!tools/eval/trec_eval.9.0.4/trec_eval -m all_trec collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run.trec | egrep '^map\s|recall_1000'

map                   	all	0.2024
recall_1000           	all	0.8163
