In [1]:
import argparse
import datetime
import glob
import hashlib
import json
import multiprocessing
import pickle
import os
import shutil
import subprocess
import uuid

import numpy as np
import pandas as pd
import lightgbm as lgb
from collections import defaultdict
from lightgbm.sklearn import LGBMRanker
from tqdm import tqdm

from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.ltr import *
from pyserini.search import get_topics_with_reader

def train_data_loader(neg_sample=10, random_seed=12345):
    if os.path.exists(f'train_sampled_with_{neg_sample}_{random_seed}.pickle'):
        sampled_train = pd.read_pickle(f'train_sampled_with_{neg_sample}_{random_seed}.pickle')
        print(sampled_train.shape)
        print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
        print(sampled_train.groupby('qid').count().mean())
        print(sampled_train.head(10))
        print(sampled_train.info())
        return sampled_train
    else:
        train = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.tsv', sep="\t",
                            names=['qid', 'pos_pid', 'neg_pid'], dtype=np.int32)
        pos_half = train[['qid', 'pos_pid']].rename(columns={"pos_pid": "pid"}).drop_duplicates()
        pos_half['rel'] = np.int32(1)
        neg_half = train[['qid', 'neg_pid']].rename(columns={"neg_pid": "pid"}).drop_duplicates()
        neg_half['rel'] = np.int32(0)
        del train
        sampled_neg_half = []
        for qid, group in tqdm(neg_half.groupby('qid')):
            sampled_neg_half.append(group.sample(n=min(neg_sample, len(group)), random_state=random_seed))
        sampled_train = pd.concat([pos_half] + sampled_neg_half, axis=0, ignore_index=True)
        sampled_train = sampled_train.sort_values(['qid','pid']).set_index(['qid','pid'])
        print(sampled_train.shape)
        print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
        print(sampled_train.groupby('qid').count().mean())
        print(sampled_train.head(10))
        print(sampled_train.info())

        sampled_train.to_pickle(f'train_sampled_with_{neg_sample}_{random_seed}.pickle')
        return sampled_train
sampled_train = train_data_loader()

(3606272, 1)
(327721,)
rel    11.004092
dtype: float64
             rel
qid pid         
91  793527     1
    1156624    0
    1378443    0
    1480965    0
    1662964    0
    1779082    0
    2477520    0
    5668069    0
    6813163    0
    6814345    0
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3606272 entries, (91, 793527) to (1185869, 8252591)
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rel     int32
dtypes: int32(1)
memory usage: 152.7 MB
None


In [2]:
def dev_data_loader(task='rerank'):
    if os.path.exists(f'dev_{task}.pickle'):
        dev = pd.read_pickle(f'dev_{task}.pickle')
        print(dev.shape)
        print(dev.index.get_level_values('qid').drop_duplicates().shape)
        print(dev.groupby('qid').count().mean())
        print(dev.head(10))
        print(dev.info())
        return dev
    else:
        if task == 'rerank':
            dev = pd.read_csv('collections/msmarco-passage/top1000.dev', sep="\t",
                              names=['qid', 'pid', 'query', 'doc'], usecols=['qid', 'pid'], dtype=np.int32)
        elif task == 'anserini':
            dev = pd.read_csv('runs/msmarco-passage/run.msmarco-passage.dev.small.tsv',sep="\t",
                            names=['qid','pid','rank'], dtype=np.int32)
        else:
            raise Exception('unknown parameters')
        dev_qrel = pd.read_csv('collections/msmarco-passage/qrels.dev.small.tsv', sep="\t",
                               names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], dtype=np.int32)
        dev = dev.merge(dev_qrel, left_on=['qid', 'pid'], right_on=['qid', 'pid'], how='left')
        dev['rel'] = dev['rel'].fillna(0).astype(np.int32)
        dev = dev.sort_values(['qid','pid']).set_index(['qid','pid'])
        
        print(dev.shape)
        print(dev.index.get_level_values('qid').drop_duplicates().shape)
        print(dev.groupby('qid').count().mean())
        print(dev.head(10))
        print(dev.info())

        dev.to_pickle(f'dev_{task}.pickle')
        return dev
dev = dev_data_loader()

(6668967, 1)
(6980,)
rel    955.439398
dtype: float64
            rel
qid pid        
2   10749     0
    63138     0
    96198     0
    98589     0
    98595     0
    112123    0
    112127    0
    112128    0
    112130    0
    112131    0
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6668967 entries, (2, 10749) to (1102400, 8837328)
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rel     int32
dtypes: int32(1)
memory usage: 253.7 MB
None


In [3]:
def query_loader(choice='default'):
    if os.path.exists(f'query_{choice}_tokenized.pickle'):
        return pickle.load(open(f'query_{choice}_tokenized.pickle','rb'))
    else:
        if choice == 'default':
            analyzer = Analyzer(get_lucene_analyzer())
            queries = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', \
                                             'collections/msmarco-passage/queries.train.tsv')
            queries.update(get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', \
                                                  'collections/msmarco-passage/queries.dev.tsv'))
            for qid,value in queries.items():
                assert 'tokenized' not in value
                value['tokenized'] = analyzer.analyze(value['title'])
        else:
            raise Exception('unknown parameters')

        pickle.dump(queries,open(f'query_{choice}_tokenized.pickle','wb'))

        return queries
queries = query_loader()

In [4]:
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/',max(multiprocessing.cpu_count()//2,1))
fe.add(BM25(k1=0.9,b=0.4))
fe.add(BM25(k1=1.2,b=0.75))
fe.add(BM25(k1=2.0,b=0.75))
fe.add(LMDir(mu=1000))
fe.add(LMDir(mu=1500))
fe.add(LMDir(mu=2500))
fe.add(LMJM(0.1))
fe.add(LMJM(0.4))
fe.add(LMJM(0.7))
fe.add(DFR_GL2())
fe.add(DFR_In_expB2())
fe.add(DPH())
fe.add(Proximity())
fe.add(DocSize())
fe.add(QueryLength())
fe.add(UniqueTermCount())
fe.add(MatchingTermCount())
fe.add(QueryCoverageRatio())
fe.add(SCS())
fe.add(tfStat(AvgPooler()))
fe.add(tfStat(SumPooler()))
fe.add(tfStat(MinPooler()))
fe.add(tfStat(MaxPooler()))
fe.add(tfStat(VarPooler()))
fe.add(tfIdfStat(AvgPooler()))
fe.add(tfIdfStat(SumPooler()))
fe.add(tfIdfStat(MinPooler()))
fe.add(tfIdfStat(MaxPooler()))
fe.add(tfIdfStat(VarPooler()))
fe.add(normalizedTfStat(AvgPooler()))
fe.add(normalizedTfStat(SumPooler()))
fe.add(normalizedTfStat(MinPooler()))
fe.add(normalizedTfStat(MaxPooler()))
fe.add(normalizedTfStat(VarPooler()))
fe.add(idfStat(AvgPooler()))
fe.add(idfStat(SumPooler()))
fe.add(idfStat(MinPooler()))
fe.add(idfStat(MaxPooler()))
fe.add(idfStat(VarPooler()))
fe.add(ictfStat(AvgPooler()))
fe.add(ictfStat(SumPooler()))
fe.add(ictfStat(MinPooler()))
fe.add(ictfStat(MaxPooler()))
fe.add(ictfStat(VarPooler()))
fe.add(scqStat(AvgPooler()))
fe.add(scqStat(SumPooler()))
fe.add(scqStat(MinPooler()))
fe.add(scqStat(MaxPooler()))
fe.add(scqStat(VarPooler()))
fe.add(UnorderedSequentialPairs(3))
fe.add(UnorderedSequentialPairs(8))
fe.add(UnorderedSequentialPairs(15))
fe.add(OrderedSequentialPairs(3))
fe.add(OrderedSequentialPairs(8))
fe.add(OrderedSequentialPairs(15))
fe.add(UnorderedQueryPairs(3))
fe.add(UnorderedQueryPairs(8))
fe.add(UnorderedQueryPairs(15))
fe.add(OrderedQueryPairs(3))
fe.add(OrderedQueryPairs(8))
fe.add(OrderedQueryPairs(15))

In [5]:
def extract(df, queries, fe):
    df_pieces = []
    fetch_later = []
    qidpid2rel = defaultdict(dict)
    need_rows = 0
    for qid,group in tqdm(df.groupby('qid')):
        for t in group.reset_index().itertuples():
            assert t.pid not in qidpid2rel[t.qid]
            qidpid2rel[t.qid][t.pid] = t.rel
            need_rows += 1
        fe.lazy_extract(str(qid),queries[qid]['tokenized'],list(qidpid2rel[t.qid].keys()))
        fetch_later.append(str(qid))
        if len(fetch_later) == 10000:
            info = np.zeros(shape=(need_rows,3), dtype=np.int32)
            feature = np.zeros(shape=(need_rows,len(fe.feature_names())), dtype=np.float32)
            idx = 0
            for qid in fetch_later:
                for doc in fe.get_result(qid):
                    info[idx,0] = int(qid)
                    info[idx,1] = int(doc['pid'])
                    info[idx,2] = qidpid2rel[int(qid)][int(doc['pid'])]
                    feature[idx,:] = doc['features']
                    idx += 1
            info = pd.DataFrame(info, columns=['qid','pid','rel'])
            feature = pd.DataFrame(feature, columns=fe.feature_names())
            df_pieces.append(pd.concat([info,feature], axis=1))
            fetch_later = []
            need_rows = 0
    #deal with rest
    if len(fetch_later) > 0:
        info = np.zeros(shape=(need_rows,3), dtype=np.int32)
        feature = np.zeros(shape=(need_rows,len(fe.feature_names())), dtype=np.float32)
        idx = 0
        for qid in fetch_later:
            for doc in fe.get_result(qid):
                info[idx,0] = int(qid)
                info[idx,1] = int(doc['pid'])
                info[idx,2] = qidpid2rel[int(qid)][int(doc['pid'])]
                feature[idx,:] = doc['features']
                idx += 1
        info = pd.DataFrame(info, columns=['qid','pid','rel'])
        feature = pd.DataFrame(feature, columns=fe.feature_names())
        df_pieces.append(pd.concat([info,feature], axis=1))
    data = pd.concat(df_pieces, axis=0, ignore_index=True)
    data = data.sort_values(by='qid', kind='mergesort')
    group = data.groupby('qid').agg(count=('pid', 'count'))['count']
    return data,group

In [6]:
def hash_df(df):
    h = pd.util.hash_pandas_object(df)
    return hex(h.sum().astype(np.uint64))


def hash_anserini_jar():
    find = glob.glob(os.environ['ANSERINI_CLASSPATH'] + "/*fatjar.jar")
    assert len(find) == 1
    md5Hash = hashlib.md5(open(find[0], 'rb').read())
    return md5Hash.hexdigest()


def hash_fe(fe):
    return hashlib.md5(','.join(sorted(fe.feature_names())).encode()).hexdigest()


def data_loader(task, df, queries, fe):
    df_hash = hash_df(df)
    jar_hash = hash_anserini_jar()
    fe_hash = hash_fe(fe)
    if os.path.exists(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle'):
        res = pickle.load(open(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle','rb'))
        print(res['data'].shape)
        print(res['data'].qid.drop_duplicates().shape)
        print(res['group'].mean())
        print(res['data'].head(10))
        print(res['data'].info())
        return res
    else:
        if task == 'train' or task == 'dev': 
            data,group = extract(df, queries, fe)
            obj = {'data':data,'group':group,'df_hash':df_hash,'jar_hash':jar_hash,'fe_hash':fe_hash}
            print(data.shape)
            print(data.qid.drop_duplicates().shape)
            print(group.mean())
            print(data.head(10))
            print(data.info())
            pickle.dump(obj,open(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle','wb'))
            return obj
        else:
            raise Exception('unknown parameters')

In [7]:
train_extracted = data_loader('train', sampled_train, queries, fe)
dev_extracted = data_loader('dev', dev, queries, fe)
del sampled_train, dev

100%|██████████| 327721/327721 [12:33<00:00, 434.77it/s] 


(3606272, 64)
(327721,)
11.004091895240158
   qid      pid  rel  BM25_k1_0.90_b_0.40  BM25_k1_1.20_b_0.75  \
0   91   793527    1            36.255280            37.489307   
1   91  1156624    0            19.361340            21.753880   
2   91  1378443    0            17.880190            19.618870   
3   91  1480965    0            17.356880            17.810566   
4   91  1662964    0            20.182610            21.573776   
5   91  1779082    0            20.570230            22.203360   
6   91  2477520    0            19.199781            20.434164   
7   91  5668069    0            27.566326            29.139702   
8   91  6813163    0            22.318399            22.719379   
9   91  6814345    0            16.739628            17.976086   

   BM25_k1_2.00_b_0.75  LMD_mu_1000  LMD_mu_1500  LMD_mu_2500  \
0            42.093887    34.407925    36.330475    38.600681   
1            24.925566    18.220840    19.241903    20.438829   
2            22.600361    17.302290

100%|██████████| 6980/6980 [00:54<00:00, 128.22it/s]


(6668967, 64)
(6980,)
955.4393982808023
   qid     pid  rel  BM25_k1_0.90_b_0.40  BM25_k1_1.20_b_0.75  \
0    2   10749    0             8.842813             9.978682   
1    2   63138    0             9.609193            10.731856   
2    2   96198    0             8.728024             9.668857   
3    2   98589    0            12.699286            14.042357   
4    2   98595    0            11.181725            11.772854   
5    2  112123    0            15.955744            17.375216   
6    2  112127    0            21.200821            23.124138   
7    2  112128    0             8.884659             9.361135   
8    2  112130    0            18.646755            20.812979   
9    2  112131    0            15.182899            15.518018   

   BM25_k1_2.00_b_0.75  LMD_mu_1000  LMD_mu_1500  LMD_mu_2500  \
0            11.398603     6.202654     6.583692     7.055481   
1            12.636791     5.813496     6.198410     6.679400   
2            10.910498     6.206557     6.586314 

In [21]:
def train(train_extracted, dev_extracted, feature_name):
    train_X = train_extracted['data'].loc[:, feature_name]
    train_Y = train_extracted['data']['rel']
    dev_X = dev_extracted['data'].loc[:, feature_name]
    dev_Y = dev_extracted['data']['rel']
    lgb_train = lgb.Dataset(train_X,label=train_Y,group=train_extracted['group'])
    lgb_valid = lgb.Dataset(dev_X,label=dev_Y,group=dev_extracted['group'])
    
    params = {
        'boosting_type': 'gbdt',
        'objective': 'lambdarank',
        'max_bin':255,
        'num_leaves':63,
        'max_depth':10,
        'min_data_in_leaf':50,
        'min_sum_hessian_in_leaf':0,
        'bagging_fraction':0.9,
        'bagging_freq':1,
        'feature_fraction':1,
        'learning_rate':0.1,
        'num_boost_round':1000,
        'early_stopping_round':200,
        'metric':['map'],
        'eval_at':[10],
        'label_gain':[0,1],
        'lambdarank_truncation_level':20,
        'seed':12345,
        'num_threads':max(multiprocessing.cpu_count()//2,1)
    }
    num_boost_round = params.pop('num_boost_round')
    early_stopping_round = params.pop('early_stopping_round')
    eval_results={}
    gbm = lgb.train(params, lgb_train, 
                    valid_sets=lgb_valid,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds =early_stopping_round,
                    feature_name=feature_name,
                    evals_result=eval_results,
                    verbose_eval=False)
    dev_extracted['data']['score']=gbm.predict(dev_X)
    best_score = gbm.best_score['valid_0']['map@10']
    print(best_score)
    best_iteration = gbm.best_iteration
    print(best_iteration)
    eval_map = eval_results['valid_0']['map@10']
    print(eval_map)
    feature_importances = sorted(list(zip(feature_name,gbm.feature_importance().tolist())),key=lambda x:x[1],reverse=True)
    print(feature_importances)
    return {'model':gbm,'best_score':best_score,'best_iteration':best_iteration,
            'eval_map':eval_map,'feature_importances':feature_importances}

In [22]:
def eval_output(dev_data):
    with open('lambdarank.run', 'w') as f:
        score_tie_counter = 0
        score_tie_query = set()
        dev_data = dev_data.set_index('qid')
        for qid, group in dev_data.groupby('qid'):
            group = group.reset_index()
            rank = 1
            prev_score = -1e10
            prev_pid = ''
            assert len(group['pid'].tolist()) == len(set(group['pid'].tolist()))
            # stable sort is also used in LightGBM
            for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples():
                if abs(t.score - prev_score) < 1e-8:
                    score_tie_counter += 1
                    score_tie_query.add(qid)
                assert prev_pid != t.pid
                prev_score = t.score
                prev_pid = t.pid
                f.write(f'{t.qid}\t{t.pid}\t{rank}\n')
                rank += 1
        score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries'
        print(score_tie)

    with open('lambdarank.run.trec', 'w') as f:
        for qid, group in dev_data.groupby('qid'):
            group = group.reset_index()
            rank = 1
            assert len(group['pid'].tolist()) == len(set(group['pid'].tolist()))
            # stable sort is also used in LightGBM
            for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples():
                new_score = t.score - rank * 1e-8
                f.write(f'{t.qid}\tQ0\t{t.pid}\t{rank}\t{new_score:.6f}\tlambdarank\n')
                rank += 1

    mrr_10 = subprocess.check_output(
        ["python3", "tools/scripts/msmarco/msmarco_eval.py", "collections/msmarco-passage/qrels.dev.small.tsv",
         "lambdarank.run"]).decode()
    print(mrr_10)
    map_recall = subprocess.check_output(
        ["tools/eval/trec_eval.9.0.4/trec_eval", "-mmap", "-mrecall", "collections/msmarco-passage/qrels.dev.small.tsv",
         "lambdarank.run.trec"]).decode()
    print(map_recall)
    return {'score_tie': score_tie, 'mrr_10': mrr_10, 'map_recall': map_recall}

In [53]:
dev_data = dev_extracted['data']
score_tie_counter = 0
score_tie_query = set()

MRR = []
for qid, group in tqdm(dev_data.groupby('qid')):
    group = group.reset_index()
    rank = 0
    prev_score = None
    assert len(group['pid'].tolist()) == len(set(group['pid'].tolist()))
    # stable sort is also used in LightGBM
    
    for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples():
        if prev_score is not None and abs(t.score - prev_score) < 1e-8:
            score_tie_counter += 1
            score_tie_query.add(qid)
        prev_score = t.score
        prev_pid = t.pid
        rank += 1
        if t.rel>0:
            MRR.append(1.0/rank)
            break
        elif rank == 10 or rank == len(group):
            MRR.append(0.)
            break
    
score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries'
print(score_tie,np.mean(MRR),type(MRR.tolist()))


100%|██████████| 6980/6980 [00:39<00:00, 175.77it/s]


AttributeError: 'list' object has no attribute 'tolist'

In [48]:
group.shape

(1000, 66)

In [42]:
eval_output(dev_extracted['data'])

score_tie occurs 2177328 times in 6918 queries
#####################
MRR @10: 0.21191903170055007
QueriesRanked: 6980
#####################

map                   	all	0.2186
recall_5              	all	0.3268
recall_10             	all	0.4284
recall_15             	all	0.4832
recall_20             	all	0.5213
recall_30             	all	0.5647
recall_100            	all	0.6870
recall_200            	all	0.7400
recall_500            	all	0.7880
recall_1000           	all	0.8140



{'score_tie': 'score_tie occurs 2177328 times in 6918 queries',
 'mrr_10': '#####################\nMRR @10: 0.21191903170055007\nQueriesRanked: 6980\n#####################\n',
 'map_recall': 'map                   \tall\t0.2186\nrecall_5              \tall\t0.3268\nrecall_10             \tall\t0.4284\nrecall_15             \tall\t0.4832\nrecall_20             \tall\t0.5213\nrecall_30             \tall\t0.5647\nrecall_100            \tall\t0.6870\nrecall_200            \tall\t0.7400\nrecall_500            \tall\t0.7880\nrecall_1000           \tall\t0.8140\n'}

In [29]:
feature_name = fe.feature_names()
train_X = train_extracted['data'].loc[:, feature_name]
train_Y = train_extracted['data']['rel']
dev_X = dev_extracted['data'].loc[:, feature_name]
dev_Y = dev_extracted['data']['rel']
lgb_train = lgb.Dataset(train_X,label=train_Y,group=train_extracted['group'])
lgb_valid = lgb.Dataset(dev_X,label=dev_Y,group=dev_extracted['group'])

params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':10,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':0,
    'bagging_fraction':0.9,
    'bagging_freq':1,
    'feature_fraction':1,
    'learning_rate':0.1,
    'num_boost_round':1000,
    'metric':['map'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'seed':12345,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}
num_boost_round = params.pop('num_boost_round')
eval_results={}
cv_gbm = lgb.cv(params, lgb_train, nfold=10, 
                num_boost_round=num_boost_round,
                feature_name=feature_name,
                verbose_eval=False,
                return_cvbooster=True)
cv_gbm

[LightGBM] [Info] Total groups: 294948, total data: 3245643
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the train set: 3245643, number of used features: 55
[LightGBM] [Info] Total groups: 32773, total data: 360629
[LightGBM] [Info] Total groups: 294949, total data: 3245645
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the train set: 3245645, number of used features: 55
[LightGBM] [Info] Total groups: 32772, total data: 360627
[LightGBM] [Info] Total groups: 294949, total data: 3245645
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the

{'map@10-mean': [0.785237987020136,
  0.7952585357000176,
  0.7978100466955723,
  0.79847152623074,
  0.7989002281423414,
  0.7992493655424253,
  0.7996774890467989,
  0.7997218297383024,
  0.7998565114541893,
  0.800027684510167,
  0.8002420518104192,
  0.8003688388119998,
  0.8005603330193967,
  0.8008389700498245,
  0.801096054592602,
  0.8013626565912986,
  0.801532265086097,
  0.8016655455530893,
  0.801842354440892,
  0.8018974204251146,
  0.8021393092261111,
  0.8022514524996431,
  0.8023837233405537,
  0.8026399250067767,
  0.8029022429842232,
  0.8031292978000273,
  0.8032882977329298,
  0.8034614820908328,
  0.8036655009638751,
  0.8038286522409035,
  0.803981575041853,
  0.804146398783111,
  0.8042564375325705,
  0.8044875405624115,
  0.8047315222306368,
  0.804763657089618,
  0.8049679617821072,
  0.8051217563417863,
  0.805265818749804,
  0.8054445458522659,
  0.80554928088861,
  0.8057026892141492,
  0.8058993848345658,
  0.8059343428290573,
  0.806053961890447,
  0.80623

In [30]:
dev_extracted['data']['sum_score'] = 0.
for gbm in cv_gbm['cvbooster'].boosters:
    dev_extracted['data']['score']=gbm.predict(dev_X)
    eval_output(dev_extracted['data'])
    dev_extracted['data']['sum_score']+=dev_extracted['data']['score']
dev_extracted['data']['score'] = dev_extracted['data']['sum_score']
eval_output(dev_extracted['data'])

score_tie occurs 2194532 times in 6920 queries
#####################
MRR @10: 0.20583651703279163
QueriesRanked: 6980
#####################

map                   	all	0.2130
recall_5              	all	0.3206
recall_10             	all	0.4244
recall_15             	all	0.4827
recall_20             	all	0.5209
recall_30             	all	0.5659
recall_100            	all	0.6827
recall_200            	all	0.7388
recall_500            	all	0.7901
recall_1000           	all	0.8140

score_tie occurs 2188250 times in 6919 queries
#####################
MRR @10: 0.20986742165825192
QueriesRanked: 6980
#####################

map                   	all	0.2169
recall_5              	all	0.3219
recall_10             	all	0.4266
recall_15             	all	0.4806
recall_20             	all	0.5190
recall_30             	all	0.5645
recall_100            	all	0.6850
recall_200            	all	0.7387
recall_500            	all	0.7906
recall_1000           	all	0.8140

score_tie occurs 2190794 times in 69

{'score_tie': 'score_tie occurs 2177158 times in 6918 queries',
 'mrr_10': '#####################\nMRR @10: 0.21185757493064047\nQueriesRanked: 6980\n#####################\n',
 'map_recall': 'map                   \tall\t0.2188\nrecall_5              \tall\t0.3272\nrecall_10             \tall\t0.4277\nrecall_15             \tall\t0.4828\nrecall_20             \tall\t0.5231\nrecall_30             \tall\t0.5672\nrecall_100            \tall\t0.6851\nrecall_200            \tall\t0.7381\nrecall_500            \tall\t0.7895\nrecall_1000           \tall\t0.8140\n'}

In [31]:
eval_res = eval_output(dev_extracted['data'])

score_tie occurs 2177158 times in 6918 queries
#####################
MRR @10: 0.21185757493064047
QueriesRanked: 6980
#####################

map                   	all	0.2188
recall_5              	all	0.3272
recall_10             	all	0.4277
recall_15             	all	0.4828
recall_20             	all	0.5231
recall_30             	all	0.5672
recall_100            	all	0.6851
recall_200            	all	0.7381
recall_500            	all	0.7895
recall_1000           	all	0.8140



In [32]:
feature_name = fe.feature_names()
train_X = train_extracted['data'].loc[:, feature_name]
train_Y = train_extracted['data']['rel']
dev_X = dev_extracted['data'].loc[:, feature_name]
dev_Y = dev_extracted['data']['rel']
lgb_train = lgb.Dataset(train_X,label=train_Y,group=train_extracted['group'])
lgb_valid = lgb.Dataset(dev_X,label=dev_Y,group=dev_extracted['group'])

params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':10,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':0,
    'bagging_fraction':0.9,
    'bagging_freq':1,
    'feature_fraction':1,
    'learning_rate':0.1,
    'num_boost_round':1000,
    'metric':['map'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'seed':12345,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}
num_boost_round = params.pop('num_boost_round')
eval_results={}
cv_gbm = lgb.cv(params, lgb_train, nfold=5, 
                num_boost_round=num_boost_round,
                feature_name=feature_name,
                verbose_eval=False,
                return_cvbooster=True)
cv_gbm

[LightGBM] [Info] Total groups: 262176, total data: 2885016
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the train set: 2885016, number of used features: 55
[LightGBM] [Info] Total groups: 65545, total data: 721256
[LightGBM] [Info] Total groups: 262177, total data: 2885018
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the train set: 2885018, number of used features: 55
[LightGBM] [Info] Total groups: 65544, total data: 721254
[LightGBM] [Info] Total groups: 262177, total data: 2885018
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the

{'map@10-mean': [0.784799822393804,
  0.7948339328839532,
  0.7972175723384505,
  0.7982587225379985,
  0.7985826992597049,
  0.7989144869550293,
  0.7991393970512813,
  0.7992588012525208,
  0.7996432090182496,
  0.7998855865912693,
  0.8001946722865718,
  0.8002769652239227,
  0.8005691101771314,
  0.8007941654470055,
  0.800908204605741,
  0.8010354393198235,
  0.8012031144677365,
  0.8013462917400889,
  0.8014642364637339,
  0.801663175120449,
  0.8019060723802613,
  0.8020827759818786,
  0.8022229669471587,
  0.8024585454901046,
  0.802568191507536,
  0.8027647568318151,
  0.8029479813299203,
  0.8031216292485259,
  0.8033564407628033,
  0.8035168248059618,
  0.8036837999275399,
  0.8038792364585226,
  0.8040471160605422,
  0.804239433391934,
  0.8043720765120653,
  0.8045825381901663,
  0.8047576999949456,
  0.8049541259487158,
  0.8051171538196942,
  0.8052501126834036,
  0.8054180597399305,
  0.8054940913908715,
  0.8056468556059322,
  0.8058236904564966,
  0.8060360047858361,


In [33]:
dev_extracted['data']['sum_score'] = 0.
for gbm in cv_gbm['cvbooster'].boosters:
    dev_extracted['data']['score']=gbm.predict(dev_X)
    eval_output(dev_extracted['data'])
    dev_extracted['data']['sum_score']+=dev_extracted['data']['score']
dev_extracted['data']['score'] = dev_extracted['data']['sum_score']
eval_output(dev_extracted['data'])

score_tie occurs 2191794 times in 6918 queries
#####################
MRR @10: 0.20927968799745292
QueriesRanked: 6980
#####################

map                   	all	0.2163
recall_5              	all	0.3218
recall_10             	all	0.4250
recall_15             	all	0.4776
recall_20             	all	0.5177
recall_30             	all	0.5629
recall_100            	all	0.6836
recall_200            	all	0.7386
recall_500            	all	0.7886
recall_1000           	all	0.8140

score_tie occurs 2190921 times in 6918 queries
#####################
MRR @10: 0.20647280211033775
QueriesRanked: 6980
#####################

map                   	all	0.2133
recall_5              	all	0.3180
recall_10             	all	0.4201
recall_15             	all	0.4805
recall_20             	all	0.5161
recall_30             	all	0.5621
recall_100            	all	0.6865
recall_200            	all	0.7367
recall_500            	all	0.7880
recall_1000           	all	0.8140

score_tie occurs 2189672 times in 69

{'score_tie': 'score_tie occurs 2177328 times in 6918 queries',
 'mrr_10': '#####################\nMRR @10: 0.21191903170055007\nQueriesRanked: 6980\n#####################\n',
 'map_recall': 'map                   \tall\t0.2186\nrecall_5              \tall\t0.3268\nrecall_10             \tall\t0.4284\nrecall_15             \tall\t0.4832\nrecall_20             \tall\t0.5213\nrecall_30             \tall\t0.5647\nrecall_100            \tall\t0.6870\nrecall_200            \tall\t0.7400\nrecall_500            \tall\t0.7880\nrecall_1000           \tall\t0.8140\n'}

In [23]:
train_res = train(train_extracted, dev_extracted, fe.feature_names())
eval_res = eval_output(dev_extracted['data'])

[LightGBM] [Info] Total groups: 327721, total data: 3606272
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9924
[LightGBM] [Info] Number of data points in the train set: 3606272, number of used features: 55
[LightGBM] [Info] Total groups: 6980, total data: 6668967
0.384993168311578
343
[0.30804003502069405, 0.3404055047679689, 0.3525083951122633, 0.35632206910143877, 0.3595752130046543, 0.36044309326723367, 0.36091015903336826, 0.36239040872636863, 0.362651746865572, 0.3646218030351268, 0.364960971255742, 0.3660089768954382, 0.3670058727884658, 0.367182985021452, 0.3676131066084504, 0.3674312472521642, 0.3676710575188368, 0.3674989482421431, 0.3681578319006685, 0.36835607404375315, 0.36886887706371946, 0.36890611497703185, 0.3691170066402875, 0.3695365253710527, 0.36977891291823956, 0.3703334261912342, 0.37064832817877225, 0.37130846257637085, 0.3714059065205197, 0.3719902575006443, 

score_tie occurs 2315231 times in 6919 queries
#####################
MRR @10: 0.20967685450493428
QueriesRanked: 6980
#####################

map                   	all	0.2167
recall_5              	all	0.3233
recall_10             	all	0.4270
recall_15             	all	0.4824
recall_20             	all	0.5173
recall_30             	all	0.5635
recall_100            	all	0.6855
recall_200            	all	0.7369
recall_500            	all	0.7888
recall_1000           	all	0.8140



In [17]:
def gen_exp_dir():
    dirname = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S') + '_' + str(uuid.uuid1())
    assert not os.path.exists(dirname)
    os.mkdir(dirname)
    return dirname


def save_exp(dirname,
             fe,
             train_extracted, dev_extracted,
             train_res, eval_res):
    dev_extracted['data'][['qid', 'pid', 'score']].to_json(f'{dirname}/output.json')
    subprocess.check_output(['gzip', f'{dirname}/output.json'])
    with open(f'{dirname}/model.pkl', 'wb') as f:
        pickle.dump(train_res['model'], f)
    metadata = {
        'train_df_hash':train_extracted['df_hash'],
        'train_jar_hash':train_extracted['jar_hash'],
        'train_fe_hash':train_extracted['fe_hash'],
        'dev_df_hash':dev_extracted['df_hash'],
        'dev_jar_hash':dev_extracted['jar_hash'],
        'dev_fe_hash':dev_extracted['fe_hash'],
        'best_score':train_res['best_score'],
        'best_iteration':train_res['best_iteration'],
        'eval_map':train_res['eval_map'],
        'feature_importances':train_res['feature_importances'],
        'score_tie': eval_res['score_tie'], 
        'mrr_10': eval_res['mrr_10'], 
        'map_recall': eval_res['map_recall']
    }
    json.dump(metadata,open(f'{dirname}/metadata.json','w'))
    shutil.copytree('anserini_ltr_source', f'{dirname}/anserini_ltr_source')
    shutil.copytree('pyserini_ltr_source', f'{dirname}/pyserini_ltr_source')
    shutil.copy('test.py', f'{dirname}/test.py')

In [18]:
dirname = gen_exp_dir()
save_exp(dirname,fe,train_extracted,dev_extracted,train_res,eval_res)