In [1]:
import argparse
import datetime
import glob
import hashlib
import json
import multiprocessing
import pickle
import os
import shutil
import subprocess
import uuid

import numpy as np
import pandas as pd
import lightgbm as lgb
from collections import defaultdict
from lightgbm.sklearn import LGBMRanker
from tqdm import tqdm

from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.ltr import *
from pyserini.search import get_topics_with_reader

def train_data_loader(neg_sample=30, random_seed=12345):
    """Load the sampled MS MARCO training pairs, building and caching them if needed.

    On a cache miss the qid/pos_pid/neg_pid triples are read from
    'collections/msmarco-passage/qidpidtriples.train.full.tsv', split into
    de-duplicated positive pairs (rel=1) and negative pairs (rel=0), and at
    most ``neg_sample`` negatives are kept per query. The result is sorted and
    indexed by (qid, pid) and pickled next to the notebook so later runs can
    skip the sampling.

    Args:
        neg_sample: maximum number of negative passages kept per query.
        random_seed: ``random_state`` passed to every per-query ``sample`` call
            (the same seed is reused for each query group).

    Returns:
        DataFrame indexed by (qid, pid) with a single int32 ``rel`` column.
    """
    cache_path = f'train_sampled_with_{neg_sample}_{random_seed}.pickle'
    cached = os.path.exists(cache_path)
    if cached:
        sampled_train = pd.read_pickle(cache_path)
    else:
        train = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.tsv', sep="\t",
                            names=['qid', 'pos_pid', 'neg_pid'], dtype=np.int32)
        pos_half = train[['qid', 'pos_pid']].rename(columns={"pos_pid": "pid"}).drop_duplicates()
        pos_half['rel'] = np.int32(1)
        neg_half = train[['qid', 'neg_pid']].rename(columns={"neg_pid": "pid"}).drop_duplicates()
        neg_half['rel'] = np.int32(0)
        # The raw triples are large; release them before sampling.
        del train
        sampled_neg_half = [
            group.sample(n=min(neg_sample, len(group)), random_state=random_seed)
            for _, group in tqdm(neg_half.groupby('qid'))
        ]
        sampled_train = pd.concat([pos_half] + sampled_neg_half, axis=0, ignore_index=True)
        sampled_train = sampled_train.sort_values(['qid','pid']).set_index(['qid','pid'])

    # Same summary for both the cached and the freshly built frame.
    print(sampled_train.shape)
    print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
    print(sampled_train.groupby('qid').count().mean())
    print(sampled_train.head(10))
    # info() writes its own report and returns None, so it must not be wrapped
    # in print() (that would emit a stray "None" line).
    sampled_train.info()

    if not cached:
        sampled_train.to_pickle(cache_path)
    return sampled_train
# Build the sampled training pairs with the default settings, or load them from the pickle cache.
sampled_train = train_data_loader()

(32429423, 1)
(327721,)
rel    98.954364
dtype: float64
            rel
qid pid        
91  30491     0
    40677     0
    228542    0
    315007    0
    412769    0
    492641    0
    517618    0
    745919    0
    777051    0
    793527    1
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 32429423 entries, (91, 30491) to (1185869, 8822597)
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rel     int32
dtypes: int32(1)
memory usage: 592.0 MB
None


In [2]:
def dev_data_loader(task='rerank'):
    """Load the dev candidates with relevance labels, building and caching them if needed.

    On a cache miss the candidate (qid, pid) pairs are read from the file that
    matches ``task``, left-joined with 'qrels.dev.small.tsv', and every pair
    without a qrel entry is labelled 0. The result is sorted and indexed by
    (qid, pid) and pickled as ``dev_{task}.pickle``.

    Args:
        task: 'rerank' reads 'collections/msmarco-passage/top1000.dev';
            'anserini' reads 'runs/msmarco-passage/run.msmarco-passage.dev.small.tsv'
            (which also contributes a ``rank`` column).

    Returns:
        DataFrame indexed by (qid, pid) with an int32 ``rel`` column.

    Raises:
        Exception: if no cache exists and ``task`` is not one of the values above.
    """
    cache_path = f'dev_{task}.pickle'
    cached = os.path.exists(cache_path)
    if cached:
        dev = pd.read_pickle(cache_path)
    else:
        if task == 'rerank':
            dev = pd.read_csv('collections/msmarco-passage/top1000.dev', sep="\t",
                              names=['qid', 'pid', 'query', 'doc'], usecols=['qid', 'pid'], dtype=np.int32)
        elif task == 'anserini':
            dev = pd.read_csv('runs/msmarco-passage/run.msmarco-passage.dev.small.tsv',sep="\t",
                            names=['qid','pid','rank'], dtype=np.int32)
        else:
            raise Exception('unknown parameters')
        dev_qrel = pd.read_csv('collections/msmarco-passage/qrels.dev.small.tsv', sep="\t",
                               names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], dtype=np.int32)
        dev = dev.merge(dev_qrel, on=['qid', 'pid'], how='left')
        # Candidates missing from the qrels come out of the left join as NaN: treat as non-relevant.
        dev['rel'] = dev['rel'].fillna(0).astype(np.int32)
        dev = dev.sort_values(['qid','pid']).set_index(['qid','pid'])

    # Same summary for both the cached and the freshly built frame.
    print(dev.shape)
    print(dev.index.get_level_values('qid').drop_duplicates().shape)
    print(dev.groupby('qid').count().mean())
    print(dev.head(10))
    # info() writes its own report and returns None, so it must not be wrapped
    # in print() (that would emit a stray "None" line).
    dev.info()

    if not cached:
        dev.to_pickle(cache_path)
    return dev
# Build the labelled dev candidates for the default 'rerank' task, or load them from the pickle cache.
dev = dev_data_loader()

(6668967, 1)
(6980,)
rel    955.439398
dtype: float64
            rel
qid pid        
2   10749     0
    63138     0
    96198     0
    98589     0
    98595     0
    112123    0
    112127    0
    112128    0
    112130    0
    112131    0
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6668967 entries, (2, 10749) to (1102400, 8837328)
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rel     int32
dtypes: int32(1)
memory usage: 253.7 MB
None


In [3]:
def query_loader(choice='default'):
    """Load the train+dev queries with their analyzed tokens, caching the result.

    On a cache miss with ``choice == 'default'`` the train and dev query files
    are read through Anserini's ``TsvIntTopicReader`` and each query's
    ``title`` is run through the default Lucene analyzer; the tokens are stored
    under the ``'tokenized'`` key of that query's dict.

    Args:
        choice: name of the tokenization scheme; only 'default' is implemented.

    Returns:
        dict mapping qid to a dict holding at least ``'title'`` and ``'tokenized'``.

    Raises:
        Exception: if no cache exists and ``choice`` is not 'default'.
    """
    cache_path = f'query_{choice}_tokenized.pickle'
    if os.path.exists(cache_path):
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # produced by this notebook.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    if choice != 'default':
        raise Exception('unknown parameters')

    analyzer = Analyzer(get_lucene_analyzer())
    queries = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader',
                                     'collections/msmarco-passage/queries.train.tsv')
    queries.update(get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader',
                                          'collections/msmarco-passage/queries.dev.tsv'))
    for value in queries.values():
        assert 'tokenized' not in value
        value['tokenized'] = analyzer.analyze(value['title'])

    # Close the handle explicitly so the cache is fully flushed before returning.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(queries, cache_file)

    return queries
# Tokenized train+dev queries keyed by qid (loaded from the pickle cache when present).
queries = query_loader()

In [4]:
# Feature extractor over the MS MARCO passage Lucene index. The second argument
# is half of the CPU count (at least 1) -- presumably the worker count; confirm
# against the pyserini FeatureExtractor signature.
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/',max(multiprocessing.cpu_count()//2,1))

# Registration order determines the order of fe.feature_names(), so the loops
# below add the features in one fixed sequence.

# Retrieval-model scores under several parameter settings.
for _k1, _b in ((0.9, 0.4), (1.2, 0.75), (2.0, 0.75)):
    fe.add(BM25(k1=_k1, b=_b))
for _mu in (1000, 1500, 2500):
    fe.add(LMDir(mu=_mu))
for _lam in (0.1, 0.4, 0.7):
    fe.add(LMJM(_lam))

# Parameter-free features.
for _feature_cls in (DFR_GL2, DFR_In_expB2, DPH, Proximity, DocSize, QueryLength,
                     UniqueTermCount, MatchingTermCount, QueryCoverageRatio, SCS):
    fe.add(_feature_cls())

# Every term statistic combined with every pooler (statistic-major order).
for _stat_cls in (tfStat, tfIdfStat, normalizedTfStat, idfStat, ictfStat, scqStat):
    for _pooler_cls in (AvgPooler, SumPooler, MinPooler, MaxPooler, VarPooler):
        fe.add(_stat_cls(_pooler_cls()))

# Term-pair features, each with arguments 3, 8 and 15.
for _pair_cls in (UnorderedSequentialPairs, OrderedSequentialPairs,
                  UnorderedQueryPairs, OrderedQueryPairs):
    for _n in (3, 8, 15):
        fe.add(_pair_cls(_n))

In [5]:
def _collect_extracted_batch(fe, batch_qids, qidpid2rel, need_rows):
    """Fetch the queued extraction results for ``batch_qids`` into one DataFrame."""
    feature_names = fe.feature_names()
    info = np.zeros(shape=(need_rows, 3), dtype=np.int32)
    feature = np.zeros(shape=(need_rows, len(feature_names)), dtype=np.float32)
    idx = 0
    for qid in batch_qids:
        rels = qidpid2rel[int(qid)]
        for doc in fe.get_result(qid):
            pid = int(doc['pid'])
            info[idx, 0] = int(qid)
            info[idx, 1] = pid
            info[idx, 2] = rels[pid]
            feature[idx, :] = doc['features']
            idx += 1
    # Keep only the rows that were actually filled: if the extractor returns
    # fewer documents than were requested, the untouched preallocated rows
    # would otherwise show up as all-zero records for a phantom qid 0.
    info = pd.DataFrame(info[:idx], columns=['qid','pid','rel'])
    feature = pd.DataFrame(feature[:idx], columns=feature_names)
    return pd.concat([info, feature], axis=1)


def extract(df, queries, fe, batch_size=10000):
    """Run the feature extractor over every (qid, pid) pair of ``df``.

    Queries are submitted with ``fe.lazy_extract`` and their results collected
    with ``fe.get_result`` in batches of ``batch_size`` queries.

    Args:
        df: frame that can be grouped by 'qid' and, after ``reset_index()``,
            exposes 'pid' and 'rel' columns.
        queries: mapping qid -> dict with a 'tokenized' entry.
        fe: feature extractor providing ``lazy_extract``, ``get_result`` (an
            iterable of dicts with 'pid' and 'features') and ``feature_names``.
        batch_size: number of queries queued before their results are fetched.

    Returns:
        (data, group): ``data`` has columns qid, pid, rel plus one float32
        column per feature, stably sorted by qid; ``group`` is the per-qid row
        count in qid order.
    """
    df_pieces = []
    fetch_later = []
    qidpid2rel = {}
    need_rows = 0
    for qid, group in tqdm(df.groupby('qid')):
        rows = group.reset_index()
        pids = rows['pid'].tolist()
        rels = dict(zip(pids, rows['rel'].tolist()))
        assert len(rels) == len(pids), f'duplicate pid for qid {qid}'
        qidpid2rel[int(qid)] = rels
        need_rows += len(pids)
        fe.lazy_extract(str(qid), queries[qid]['tokenized'], pids)
        fetch_later.append(str(qid))
        if len(fetch_later) >= batch_size:
            df_pieces.append(_collect_extracted_batch(fe, fetch_later, qidpid2rel, need_rows))
            # Start a fresh batch; dropping the label lookup keeps memory
            # bounded by one batch instead of the whole input.
            fetch_later = []
            qidpid2rel = {}
            need_rows = 0
    # Collect whatever is left in the final, partially filled batch.
    if len(fetch_later) > 0:
        df_pieces.append(_collect_extracted_batch(fe, fetch_later, qidpid2rel, need_rows))
    data = pd.concat(df_pieces, axis=0, ignore_index=True)
    # Stable sort keeps the within-query row order produced by the extractor.
    data = data.sort_values(by='qid', kind='mergesort')
    group = data.groupby('qid').agg(count=('pid', 'count'))['count']
    return data,group

In [6]:
def hash_df(df):
    h = pd.util.hash_pandas_object(df)
    return hex(h.sum().astype(np.uint64))


def hash_anserini_jar():
    """Return the MD5 hex digest of the single Anserini fat jar.

    The jar is looked up as ``*fatjar.jar`` inside the directory named by the
    ``ANSERINI_CLASSPATH`` environment variable.

    Raises:
        KeyError: if ``ANSERINI_CLASSPATH`` is not set.
        AssertionError: if the directory does not contain exactly one match.
    """
    pattern = os.environ['ANSERINI_CLASSPATH'] + "/*fatjar.jar"
    find = glob.glob(pattern)
    assert len(find) == 1, f'expected exactly one jar matching {pattern}, found {len(find)}'
    md5Hash = hashlib.md5()
    # Stream the jar in 1 MiB chunks: the handle is closed deterministically and
    # the whole file never has to sit in memory at once.
    with open(find[0], 'rb') as jar:
        for chunk in iter(lambda: jar.read(1 << 20), b''):
            md5Hash.update(chunk)
    return md5Hash.hexdigest()


def hash_fe(fe):
    """Return the MD5 hex digest of the extractor's feature names, sorted and comma-joined."""
    names = list(fe.feature_names())
    names.sort()
    signature = ','.join(names)
    return hashlib.md5(signature.encode()).hexdigest()


def data_loader(task, df, queries, fe):
    """Return extracted features for ``df``, reusing a pickle cache when possible.

    The cache file name combines ``task`` with fingerprints of the input frame,
    the Anserini fat jar and the extractor's feature names, so a change to any
    of them leads to a fresh extraction.

    Args:
        task: 'train' or 'dev'; used in the cache file name.
        df: labelled (qid, pid) frame handed to ``extract``.
        queries: mapping qid -> dict with a 'tokenized' entry.
        fe: configured feature extractor.

    Returns:
        dict with keys 'data', 'group', 'df_hash', 'jar_hash' and 'fe_hash'.

    Raises:
        Exception: if no cache exists and ``task`` is neither 'train' nor 'dev'.
    """
    df_hash = hash_df(df)
    jar_hash = hash_anserini_jar()
    fe_hash = hash_fe(fe)
    cache_path = f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle'
    cached = os.path.exists(cache_path)
    if cached:
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # produced by this notebook.
        with open(cache_path, 'rb') as cache_file:
            res = pickle.load(cache_file)
    elif task == 'train' or task == 'dev':
        data,group = extract(df, queries, fe)
        res = {'data':data,'group':group,'df_hash':df_hash,'jar_hash':jar_hash,'fe_hash':fe_hash}
    else:
        raise Exception('unknown parameters')

    # Same summary for both the cached and the freshly extracted result.
    print(res['data'].shape)
    print(res['data'].qid.drop_duplicates().shape)
    print(res['group'].mean())
    print(res['data'].head(10))
    # info() writes its own report and returns None, so it must not be wrapped
    # in print() (that would emit a stray "None" line).
    res['data'].info()

    if not cached:
        # Close the handle explicitly so the cache is fully flushed before returning.
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(res, cache_file)
    return res

In [7]:
# Extract (or load cached) features for the training and dev pairs.
train_extracted = data_loader('train', sampled_train, queries, fe)
dev_extracted = data_loader('dev', dev, queries, fe)
# The label-only frames are not referenced again below; drop them to free memory.
del sampled_train, dev

(32429423, 64)
(327721,)
98.95436362027456
   qid     pid  rel  BM25_k1_0.90_b_0.40  BM25_k1_1.20_b_0.75  \
0   91   30491    0            14.489719            15.415164   
1   91   40677    0            20.184231            22.359982   
2   91  228542    0            21.023039            22.106380   
3   91  315007    0            14.996305            14.801217   
4   91  412769    0            21.192341            20.946251   
5   91  492641    0            12.471210            12.869027   
6   91  517618    0            21.082029            22.528090   
7   91  745919    0            25.917801            26.662256   
8   91  777051    0            16.559290            17.715590   
9   91  793527    1            36.255280            37.489307   

   BM25_k1_2.00_b_0.75  LMD_mu_1000  LMD_mu_1500  LMD_mu_2500  \
0            18.494238     9.712474    10.429009    11.322591   
1            25.231373    18.332842    19.348846    20.545277   
2            23.523069    25.236155    26.3799

In [8]:
# Feature columns are selected by name; 'rel' is the label column.
feature_name = fe.feature_names()
train_X = train_extracted['data'].loc[:, feature_name]
train_Y = train_extracted['data']['rel']
dev_X = dev_extracted['data'].loc[:, feature_name]
dev_Y = dev_extracted['data']['rel']
# 'group' holds the per-qid row counts computed by extract(), whose 'data' is
# sorted by qid, so the counts line up with the row order passed here.
lgb_train = lgb.Dataset(train_X,label=train_Y,group=train_extracted['group'])
lgb_valid = lgb.Dataset(dev_X,label=dev_Y,group=dev_extracted['group'])

In [9]:
def eval_mrr(dev_data):
    """Print MRR@10 of the 'score' column of ``dev_data``, plus score-tie statistics.

    Each query's rows are ranked by descending score with a stable sort. The
    query contributes 1/rank of its first row with ``rel > 0`` when that rank
    is within the top 10, and 0 otherwise. Adjacent rows whose scores differ by
    less than 1e-8 are counted as ties, but only among the rows inspected
    before the scan for that query stops.

    Args:
        dev_data: frame with 'qid', 'pid', 'rel' and 'score' columns.
    """
    tie_total = 0
    tied_queries = set()
    reciprocal_ranks = []

    for qid, rows in tqdm(dev_data.groupby('qid')):
        rows = rows.reset_index()
        pids = rows['pid'].tolist()
        assert len(pids) == len(set(pids))
        n_rows = len(rows)
        # stable sort is also used in LightGBM
        ranked = rows.sort_values('score', ascending=False, kind='mergesort')

        last_score = None
        for position, row in enumerate(ranked.itertuples(), start=1):
            if last_score is not None and abs(row.score - last_score) < 1e-8:
                tie_total += 1
                tied_queries.add(qid)
            last_score = row.score
            if row.rel > 0:
                reciprocal_ranks.append(1.0 / position)
                break
            if position == 10 or position == n_rows:
                # No relevant passage within the cutoff (or within the whole list).
                reciprocal_ranks.append(0.)
                break

    score_tie = f'score_tie occurs {tie_total} times in {len(tied_queries)} queries'
    print(score_tie,np.mean(reciprocal_ranks))


In [None]:
# LambdaRank training repeated over several seeds; the dev predictions of the
# models are summed into one ensemble score that is re-evaluated after each seed.
params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':10,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':0,
    'bagging_fraction':0.8,
    'bagging_freq':20,
    'feature_fraction':1,
    'learning_rate':0.1,
    'num_boost_round':1000,
    'early_stopping_round':500,
    'metric':['map'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}
# These two are taken out of the dict and passed to lgb.train as explicit arguments.
num_boost_round = params.pop('num_boost_round')
early_stopping_round = params.pop('early_stopping_round')
# Filled by lgb.train through evals_result; the same dict is reused for every seed.
eval_results={}
# Running sum of the per-seed predictions on the dev set.
dev_extracted['data']['score']=0.
for seed in [12345,31345,21356,65743,68786]:
    params['seed'] = seed
    # NOTE(review): early_stopping_rounds / evals_result / verbose_eval look like the
    # pre-4.0 lgb.train keyword API (newer releases use callbacks) -- confirm the
    # LightGBM version this notebook is pinned to.
    gbm = lgb.train(params, lgb_train, 
                    valid_sets=lgb_valid,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds =early_stopping_round,
                    feature_name=feature_name,
                    evals_result=eval_results,
                    verbose_eval=False)
    dev_extracted['data']['score'] += gbm.predict(dev_X)
    best_score = gbm.best_score['valid_0']['map@10']
    print(best_score)
    best_iteration = gbm.best_iteration
    print(best_iteration)
    eval_map = eval_results['valid_0']['map@10']
    # print(eval_map)
    # Importances of the model trained with the current seed, most used first.
    feature_importances = sorted(list(zip(feature_name,gbm.feature_importance().tolist())),key=lambda x:x[1],reverse=True)
    print(feature_importances)
    # Evaluates the accumulated score, i.e. the ensemble of all seeds trained so far.
    eval_mrr(dev_extracted['data'])

[LightGBM] [Info] Total groups: 327721, total data: 32429423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 32429423, number of used features: 55
[LightGBM] [Info] Total groups: 6980, total data: 6668967
0.38917278770788793
873
[('DFR_In_expB2', 2578), ('SCQvar', 2220), ('SCQavg', 2163), ('DocSize', 1971), ('IDFsum', 1889), ('NormalizedTFvar', 1870), ('NormalizedTFmin', 1696), ('TFIDFvar', 1623), ('SCQmax', 1613), ('SCQsum', 1563), ('TFIDFmax', 1562), ('NormalizedTFavg', 1529), ('Proximity', 1513), ('DFR_GL2', 1485), ('BM25_k1_2.00_b_0.75', 1405), ('TFvar', 1393), ('BM25_k1_0.90_b_0.40', 1305), ('LMD_mu_2500', 1272), ('TFIDFavg', 1237), ('TFIDFsum', 1167), ('SCS', 999), ('UnorderedQueryPairs15', 933), ('ICTFvar', 920), ('IDFavg', 891), ('IDFvar', 868), ('TFavg', 842), ('IDFmax', 837), ('BM25_k1_1.20_b_0.75', 832), ('ICTFa

100%|██████████| 6980/6980 [00:36<00:00, 192.06it/s]


score_tie occurs 936 times in 712 queries 0.21424682767089642
[LightGBM] [Info] Total groups: 327721, total data: 32429423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 32429423, number of used features: 55
[LightGBM] [Info] Total groups: 6980, total data: 6668967
0.38958647913160804
316
[('DFR_In_expB2', 1170), ('IDFsum', 1004), ('SCQavg', 856), ('SCQsum', 746), ('NormalizedTFmin', 697), ('LMD_mu_2500', 676), ('NormalizedTFvar', 673), ('DocSize', 669), ('DFR_GL2', 633), ('SCQvar', 614), ('BM25_k1_0.90_b_0.40', 596), ('BM25_k1_2.00_b_0.75', 592), ('TFIDFvar', 591), ('TFIDFmax', 558), ('UnorderedQueryPairs15', 463), ('TFvar', 414), ('MatchingTermCount', 406), ('SCQmax', 404), ('NormalizedTFavg', 385), ('TFsum', 368), ('UnorderedQueryPairs3', 349), ('QueryCoverageRatio', 329), ('Proximity', 316), ('TFavg', 316), ('Unordere

100%|██████████| 6980/6980 [00:39<00:00, 177.77it/s]


score_tie occurs 892 times in 699 queries 0.21428753354254784
[LightGBM] [Info] Total groups: 327721, total data: 32429423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 32429423, number of used features: 55
[LightGBM] [Info] Total groups: 6980, total data: 6668967
0.3905145464744318
472
[('DFR_In_expB2', 1647), ('IDFsum', 1222), ('SCQavg', 1157), ('SCQvar', 1060), ('NormalizedTFmin', 993), ('DocSize', 975), ('SCQsum', 975), ('NormalizedTFvar', 939), ('TFIDFvar', 914), ('LMD_mu_2500', 897), ('BM25_k1_2.00_b_0.75', 861), ('DFR_GL2', 844), ('TFIDFmax', 819), ('BM25_k1_0.90_b_0.40', 817), ('TFvar', 710), ('NormalizedTFavg', 705), ('SCQmax', 704), ('UnorderedQueryPairs15', 642), ('Proximity', 636), ('TFIDFsum', 555), ('TFIDFavg', 502), ('TFavg', 497), ('TFsum', 476), ('UnorderedSequentialPairs15', 464), ('UnorderedQueryPairs8

100%|██████████| 6980/6980 [00:37<00:00, 185.66it/s]


score_tie occurs 873 times in 686 queries 0.2147775390003184
[LightGBM] [Info] Total groups: 327721, total data: 32429423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9841
[LightGBM] [Info] Number of data points in the train set: 32429423, number of used features: 55
[LightGBM] [Info] Total groups: 6980, total data: 6668967


In [None]:
# 5-fold cross-validated LambdaRank: the fold boosters are kept and their dev
# predictions summed into one ensemble score.
params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':10,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':0,
    'bagging_fraction':0.8,
    'bagging_freq':50,
    'feature_fraction':1,
    'learning_rate':0.1,
    'num_boost_round':500,
    'metric':['map'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'seed':12345,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}

# Taken out of the dict and passed to lgb.cv as an explicit argument.
num_boost_round = params.pop('num_boost_round')
# Reset for this cell; lgb.cv below does not receive it.
eval_results={}
cv_gbm = lgb.cv(params, lgb_train, nfold=5, 
                num_boost_round=num_boost_round,
                feature_name=feature_name,
                verbose_eval=False,
                return_cvbooster=True)
dev_extracted['data']['score'] = 0.
# Accumulate importances over every fold. Previously only the booster left in
# `gbm` after the loop (the last fold) was reported, although the score being
# evaluated is the sum over all folds.
importance_total = np.zeros(len(feature_name), dtype=np.int64)
for gbm in cv_gbm['cvbooster'].boosters:
    dev_extracted['data']['score']+=gbm.predict(dev_X)
    importance_total += gbm.feature_importance()
feature_importances = sorted(list(zip(feature_name,importance_total.tolist())),key=lambda x:x[1],reverse=True)
print(feature_importances)
eval_mrr(dev_extracted['data'])