In [1]:
from pyserini.ltr import FeatureExtractor, AvgICTF, AvgIDF, BM25, DocSize, MatchingTermCount, \
    PMI, QueryLength, SCQ, SCS, SumMatchingTF, TFIDF, UniqueTermCount, \
    UnorderedSequentialPairs, OrderedSequentialPairs
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search import get_topics_with_reader
# Build the feature extractor over the MS MARCO passage Lucene index.
# NOTE(review): the second argument (20) is presumably a worker/thread count — confirm.
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/', 20)

# Features are registered in this exact order; each one is constructed right
# before it is added.
for feature_cls in (
    AvgICTF,
    AvgIDF,
    BM25,
    DocSize,
    MatchingTermCount,
    PMI,  # the original note here says this one "takes too long"; it is still enabled
    QueryLength,
    SCQ,
    SCS,
    SumMatchingTF,
    TFIDF,
    UniqueTermCount,
):
    fe.add(feature_cls())

# Analyzer used to tokenize query text before it is handed to `fe`.
analyzer = Analyzer(get_lucene_analyzer())

In [2]:
# Load the train topics first, then merge the dev topics into the same mapping
# (on a qid collision the dev entry overwrites the train entry).
TOPIC_READER = 'io.anserini.search.topicreader.TsvIntTopicReader'
queries = get_topics_with_reader(TOPIC_READER, 'collections/msmarco-passage/queries.train.tsv')
dev_queries = get_topics_with_reader(TOPIC_READER, 'collections/msmarco-passage/queries.dev.tsv')
queries.update(dev_queries)

In [3]:
import pandas as pd
import numpy as np
# train = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.tsv',sep="\t",
#                     names=['qid','pos_pid','neg_pid'])
# pos_half = train[['qid','pos_pid']].rename(columns={"pos_pid": "pid"})
# pos_half['rel'] = 1.
# neg_half = train[['qid','neg_pid']].rename(columns={"neg_pid": "pid"})
# neg_half['rel'] = 0.
# train = pd.concat([pos_half,neg_half],axis=0,ignore_index=True)
# del pos_half, neg_half
# sampled_train_qid=pd.Series(train['qid'].unique()).sample(frac=0.01,random_state=123456)
# sampled_train=train[train['qid'].isin(sampled_train_qid)].reset_index(drop=True).copy(deep=True)
# del train
# Load the cached training sample and its query ids.
# NOTE(review): these files were presumably produced by the commented-out
# recipe above (1% of train qids, random_state=123456) — confirm.
sampled_train = pd.read_json(
    'sampled_train.json',
)
sampled_train_qid = pd.read_json(
    'sampled_train_qid.json',
    typ='series',
    orient='records',
)

In [5]:
# Sanity-check the training sample: overall shape, number of distinct queries,
# mean number of rows per query, and a 10-row preview.
print(sampled_train.shape)
print(sampled_train['qid'].drop_duplicates().shape)
train_counts_per_query = sampled_train.groupby('qid').count()
print(train_counts_per_query.mean()['pid'])
print(sampled_train.head(10))

(5417782, 3)
(3277,)
1653.2749465974978
       qid      pid  rel
0  1170090  3917099    1
1   129031  6620836    1
2   517222  3652956    1
3  1045550  4710372    1
4  1070824  2993656    1
5   340406  6630217    1
6   179852   376926    1
7  1031093  3391276    1
8   995835  5178179    1
9   261655   284187    1


In [6]:
# Read only the (qid, pid) columns of the dev top-1000 candidate file.
dev = pd.read_csv(
    'collections/msmarco-passage/top1000.dev',
    sep="\t",
    names=['qid', 'pid', 'query', 'doc'],
    usecols=['qid', 'pid'],
)

# Draw a reproducible sample of 500 dev queries and keep their candidate rows.
sampled_dev_qid = pd.Series(dev['qid'].unique()).sample(n=500, random_state=123456)
sampled_dev = dev.loc[dev['qid'].isin(sampled_dev_qid)].reset_index(drop=True).copy(deep=True)
del dev  # the full candidate frame is no longer needed

# Write the qrels restricted to the sampled queries so evaluation matches the sample.
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
)
dev_qrel.loc[dev_qrel['qid'].isin(sampled_dev_qid)].to_csv(
    'collections/msmarco-passage/qrels.dev.500.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [7]:
# Sanity-check the dev sample: overall shape, number of distinct queries,
# mean number of candidate rows per query, and a 10-row preview.
print(sampled_dev.shape)
print(sampled_dev['qid'].drop_duplicates().shape)
dev_counts_per_query = sampled_dev.groupby('qid').count()
print(dev_counts_per_query.mean()['pid'])
print(sampled_dev.head(10))

(482514, 2)
(500,)
965.028
       qid      pid
0   820267  1000130
1   991832  1000599
2  1093443  1000611
3   706950  1000647
4   996805  1000797
5    23285   100094
6   198246  1001042
7   370734  1001058
8   939866  1001239
9   425688  1001438


In [8]:
from tqdm import tqdm
def extract(df, analyzer, batch_size=1000):
    """Attach extracted features to every (qid, pid) row of ``df``.

    For each query group, the analyzed query and its de-duplicated candidate
    pids are handed to the notebook-level feature extractor ``fe`` through
    ``fe.lazy_extract``. Results are read back with ``fe.get_result`` whenever
    ``batch_size`` queries are pending, and once more for the final partial
    batch. One tqdm progress bar is shown per drained batch.

    Relies on the notebook-level names ``fe``, ``queries``, ``tqdm`` and ``pd``.

    Args:
        df: DataFrame with at least ``qid`` and ``pid`` columns. Every ``qid``
            must be a key of ``queries``.
        analyzer: object whose ``analyze(text)`` returns the query tokens.
        batch_size: number of submitted queries to accumulate before reading
            results back. Defaults to 1000, the previously hard-coded value.

    Returns:
        ``df`` inner-joined on ``(qid, pid)`` with one extra column per name in
        ``fe.feature_names()``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    rows = []
    pending = []

    def drain():
        # Read back results for every pending query. A distinct loop variable
        # is used so the outer ``qid`` loop variable is never shadowed.
        for pending_qid in tqdm(pending):
            for doc in fe.get_result(pending_qid):
                rows.append((int(pending_qid), int(doc['pid']), *doc['features']))
        pending.clear()

    for qid, group in df.groupby('qid'):
        analyzed_query = analyzer.analyze(queries[qid]['title'])
        docids = [str(did) for did in group['pid'].drop_duplicates().tolist()]
        fe.lazy_extract(str(qid), analyzed_query, docids)
        pending.append(str(qid))
        if len(pending) >= batch_size:
            drain()

    # Collect whatever is left from the last, possibly partial, batch.
    if pending:
        drain()

    extracted = pd.DataFrame(rows, columns=['qid', 'pid'] + fe.feature_names())
    return df.merge(extracted, how='inner', on=['qid', 'pid'])

In [9]:
import json
def export(df, analyzer, fn):
    """Write one JSON line per query in ``df`` to the file ``fn``.

    Each line is an object with three keys, in this order: ``qid`` (the group
    key), ``queryTokens`` (the analyzed title looked up in the notebook-level
    ``queries`` mapping) and ``docIds`` (the query's de-duplicated pids as
    strings, in first-seen order).

    Args:
        df: DataFrame with ``qid`` and ``pid`` columns.
        analyzer: object whose ``analyze(text)`` returns the query tokens.
        fn: path of the output file; it is opened in write mode.
    """
    with open(fn, 'w') as out:
        for query_id, rows in df.groupby('qid'):
            record = {
                'qid': query_id,
                'queryTokens': analyzer.analyze(queries[query_id]['title']),
                'docIds': list(map(str, rows['pid'].drop_duplicates().tolist())),
            }
            out.write(json.dumps(record) + '\n')

In [10]:
# Compute features for the sampled training pairs.
train_data = extract(df=sampled_train, analyzer=analyzer)
# export(sampled_train, analyzer, 'train.json')

100%|██████████| 1000/1000 [01:09<00:00, 14.35it/s]
100%|██████████| 1000/1000 [00:59<00:00, 16.68it/s]
100%|██████████| 1000/1000 [01:00<00:00, 16.43it/s]
100%|██████████| 277/277 [00:17<00:00, 15.75it/s]


In [12]:
# Compute features for the sampled dev candidates.
dev_data = extract(df=sampled_dev, analyzer=analyzer)
# export(sampled_dev, analyzer, 'dev.json')

100%|██████████| 500/500 [00:39<00:00, 12.61it/s]


In [14]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
# Fit a regressor on the extracted features with `rel` as the target.
# The commented-out estimators are alternatives kept for experimentation.
model = LGBMRegressor(random_state=12345)
# model = LogisticRegression()
# model = RandomForestRegressor()
# model = LinearSVC()
feature_columns = fe.feature_names()
train_X = train_data[feature_columns].values
train_Y = train_data['rel'].values
model.fit(train_X, train_Y)

LGBMRegressor(random_state=12345)

In [15]:
# Score every dev candidate; the prediction is stored in a new `score` column of dev_data.
dev_X = dev_data[fe.feature_names()].values
dev_data['score'] = model.predict(dev_X)

In [16]:
# Write a run file with tab-separated "qid pid rank" lines, ranking each
# query's candidates by descending (score, pid). While writing, count how often
# consecutive candidates have (near-)identical scores.
with open('lambdarank.run', 'w') as run_file:
    score_tie_counter = 0
    score_tie_query = set()
    for qid, group in tqdm(dev_data.groupby('qid')):
        # Each pid may appear only once per query.
        pids = group['pid'].tolist()
        assert len(pids) == len(set(pids))

        prev_score, prev_pid = -1e10, ''
        ranked = group.sort_values(['score', 'pid'], ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            is_tie = abs(t.score - prev_score) < 1e-8
            if is_tie:
                score_tie_counter += 1
                score_tie_query.add(qid)
            assert prev_pid != t.pid
            prev_score, prev_pid = t.score, t.pid
            run_file.write(f'{t.qid}\t{t.pid}\t{rank}\n')
    if score_tie_counter > 0:
        print(f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries')

100%|██████████| 500/500 [00:03<00:00, 161.20it/s]


score_tie occurs 305143 times in 497 queries


In [18]:
# Evaluate the TSV run against the 500-query dev qrels with the MS MARCO eval script (reports MRR@10).
!python tools/scripts/msmarco/msmarco_eval.py collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run

#####################
MRR @10: 0.18456746031746044
QueriesRanked: 500
#####################


In [19]:
# Write the same ranking as a six-column, tab-separated TREC-style run.
# The written score is lowered by rank * 1e-6, so candidates with tied scores
# still get strictly decreasing scores in rank order.
with open('lambdarank.run.trec', 'w') as trec_file:
    for qid, group in tqdm(dev_data.groupby('qid')):
        # Each pid may appear only once per query.
        pids = group['pid'].tolist()
        assert len(pids) == len(set(pids))

        ranked = group.sort_values(['score', 'pid'], ascending=False)
        for rank, t in enumerate(ranked.itertuples(), start=1):
            new_score = t.score - rank*1e-6
            trec_file.write(f'{t.qid}\tQ0\t{t.pid}\t{rank}\t{new_score:.6f}\tlambdarank\n')

100%|██████████| 500/500 [00:02<00:00, 169.81it/s]


In [21]:
!tools/eval/trec_eval.9.0.4/trec_eval -m all_trec collections/msmarco-passage/qrels.dev.500.tsv lambdarank.run.trec | egrep '^map\s|recall_1000'

map                   	all	0.1926
recall_1000           	all	0.8163
