In [None]:
import argparse
import datetime
import glob
import hashlib
import json
import multiprocessing
import pickle
import os
import shutil
import subprocess
import uuid
import random

import numpy as np
import pandas as pd
import lightgbm as lgb
from collections import defaultdict
from lightgbm.sklearn import LGBMRanker
from tqdm import tqdm

from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.ltr import *
from pyserini.search import get_topics_with_reader

def train_data_loader(task='triple', neg_sample=10, random_seed=12345):
    if os.path.exists(f'train_{task}_sampled_with_{neg_sample}_{random_seed}.pickle'):
        sampled_train = pd.read_pickle(f'train_{task}_sampled_with_{neg_sample}_{random_seed}.pickle')
        print(sampled_train.shape)
        print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
        print(sampled_train.groupby('qid').count().mean())
        print(sampled_train.head(10))
        print(sampled_train.info())
        return sampled_train
    else:
        if task == 'triple':
            train = pd.read_csv('collections/msmarco-passage/qidpidtriples.train.full.2.tsv', sep="\t",
                                names=['qid', 'pos_pid', 'neg_pid'], dtype=np.int32)
            pos_half = train[['qid', 'pos_pid']].rename(columns={"pos_pid": "pid"}).drop_duplicates()
            pos_half['rel'] = np.int32(1)
            neg_half = train[['qid', 'neg_pid']].rename(columns={"neg_pid": "pid"}).drop_duplicates()
            neg_half['rel'] = np.int32(0)
            del train
            sampled_neg_half = []
            for qid, group in tqdm(neg_half.groupby('qid')):
                sampled_neg_half.append(group.sample(n=min(neg_sample, len(group)), random_state=random_seed))
            sampled_train = pd.concat([pos_half] + sampled_neg_half, axis=0, ignore_index=True)
            sampled_train = sampled_train.sort_values(['qid','pid']).set_index(['qid','pid'])
            print(sampled_train.shape)
            print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
            print(sampled_train.groupby('qid').count().mean())
            print(sampled_train.head(10))
            print(sampled_train.info())

            sampled_train.to_pickle(f'train_{task}_sampled_with_{neg_sample}_{random_seed}.pickle')
        elif task == 'rank':
            qrel = defaultdict(list)
            with open("collections/msmarco-passage/qrels.train.tsv") as f:
                for line in f:
                    topicid, _, docid, rel = line.strip().split('\t')
                    assert rel == "1", line.split(' ')
                    qrel[topicid].append(docid)
            
            qid2pos = defaultdict(list)
            qid2neg = defaultdict(list)
            with open("runs/msmarco-passage/run.train.small.tsv") as f:
                for line in tqdm(f):
                    topicid, docid, rank = line.split()
                    assert topicid in qrel
                    if docid in qrel[topicid]:
                        qid2pos[topicid].append(docid)
                    else:
                        qid2neg[topicid].append(docid)
            sampled_train = []
            for topicid, pos_list in tqdm(qid2pos.items()):
                neg_list = random.sample(qid2neg[topicid], min(len(qid2neg[topicid]), neg_sample))
                for positive_docid in pos_list:
                    sampled_train.append((int(topicid), int(positive_docid), 1))
                for negative_docid in neg_list:
                    sampled_train.append((int(topicid), int(negative_docid), 0))
            sampled_train = pd.DataFrame(sampled_train,columns=['qid','pid','rel'],dtype=np.int32)
            sampled_train = sampled_train.sort_values(['qid','pid']).set_index(['qid','pid'])
            print(sampled_train.shape)
            print(sampled_train.index.get_level_values('qid').drop_duplicates().shape)
            print(sampled_train.groupby('qid').count().mean())
            print(sampled_train.head(10))
            print(sampled_train.info())

            sampled_train.to_pickle(f'train_{task}_sampled_with_{neg_sample}_{random_seed}.pickle')
        else:
            raise Exception('unknown parameters')
        return sampled_train
sampled_train = train_data_loader(task='rank')

In [None]:
def dev_data_loader(task='anserini'):
    if os.path.exists(f'dev_{task}.pickle'):
        dev = pd.read_pickle(f'dev_{task}.pickle')
        print(dev.shape)
        print(dev.index.get_level_values('qid').drop_duplicates().shape)
        print(dev.groupby('qid').count().mean())
        print(dev.head(10))
        print(dev.info())
        return dev
    else:
        if task == 'rerank':
            dev = pd.read_csv('collections/msmarco-passage/top1000.dev', sep="\t",
                              names=['qid', 'pid', 'query', 'doc'], usecols=['qid', 'pid'], dtype=np.int32)
        elif task == 'anserini':
            dev = pd.read_csv('runs/msmarco-passage/run.msmarco-passage.dev.small.tsv',sep="\t",
                            names=['qid','pid','rank'], dtype=np.int32)
        elif task == 'pygaggle':
            dev = pd.read_csv('../pygaggle/data/msmarco_ans_entire/run.dev.small.tsv',sep="\t",
                            names=['qid','pid','rank'], dtype=np.int32)
        else:
            raise Exception('unknown parameters')
        dev_qrel = pd.read_csv('collections/msmarco-passage/qrels.dev.small.tsv', sep="\t",
                               names=["qid", "q0", "pid", "rel"], usecols=['qid', 'pid', 'rel'], dtype=np.int32)
        dev = dev.merge(dev_qrel, left_on=['qid', 'pid'], right_on=['qid', 'pid'], how='left')
        dev['rel'] = dev['rel'].fillna(0).astype(np.int32)
        dev = dev.sort_values(['qid','pid']).set_index(['qid','pid'])
        
        print(dev.shape)
        print(dev.index.get_level_values('qid').drop_duplicates().shape)
        print(dev.groupby('qid').count().mean())
        print(dev.head(10))
        print(dev.info())

        dev.to_pickle(f'dev_{task}.pickle')
        return dev
dev = dev_data_loader(task='anserini')

In [None]:
def query_loader(choice='default'):
    if os.path.exists(f'query_{choice}_tokenized.pickle'):
        return pickle.load(open(f'query_{choice}_tokenized.pickle','rb'))
    else:
        if choice == 'default':
            analyzer = Analyzer(get_lucene_analyzer())
            nonStopAnalyzer = Analyzer(get_lucene_analyzer(stopwords=False))
            queries = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', \
                                             'collections/msmarco-passage/queries.train.tsv')
            queries.update(get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', \
                                                  'collections/msmarco-passage/queries.dev.tsv'))
            for qid,value in queries.items():
                assert 'tokenized' not in value
                value['tokenized'] = analyzer.analyze(value['title'])
                assert 'nonSW' not in value
                value['nonSW'] = nonStopAnalyzer.analyze(value['title'])
        else:
            raise Exception('unknown parameters')

        pickle.dump(queries,open(f'query_{choice}_tokenized.pickle','wb'))

        return queries
queries = query_loader()

In [11]:
fe = FeatureExtractor('indexes/msmarco-passage/lucene-index-msmarco/',max(multiprocessing.cpu_count()//2,1))
fe.add(BM25(k1=0.9,b=0.4))
fe.add(BM25(k1=1.2,b=0.75))
fe.add(BM25(k1=2.0,b=0.75))

fe.add(LMDir(mu=1000))
fe.add(LMDir(mu=1500))
fe.add(LMDir(mu=2500))

fe.add(LMJM(0.1))
fe.add(LMJM(0.4))
fe.add(LMJM(0.7))

fe.add(NTFIDF())
fe.add(ProbalitySum())

fe.add(DFR_GL2())
fe.add(DFR_In_expB2())
fe.add(DPH())

# fe.add(ContextDFR_GL2(AvgPooler()))
# fe.add(ContextDFR_GL2(VarPooler()))
# fe.add(ContextDFR_In_expB2(AvgPooler()))
# fe.add(ContextDFR_In_expB2(VarPooler()))
# fe.add(ContextDPH(AvgPooler()))
# fe.add(ContextDPH(VarPooler()))

fe.add(Proximity())
fe.add(TPscore())
fe.add(tpDist())
# fe.add(SDM())

fe.add(DocSize())
fe.add(Entropy())
fe.add(StopCover())
fe.add(StopRatio())

fe.add(QueryLength())
fe.add(QueryLengthNonStopWords())
fe.add(QueryCoverageRatio())
fe.add(UniqueTermCount())
fe.add(MatchingTermCount())
fe.add(SCS())

fe.add(tfStat(AvgPooler()))
fe.add(tfStat(SumPooler()))
fe.add(tfStat(MinPooler()))
fe.add(tfStat(MaxPooler()))
fe.add(tfStat(VarPooler()))
fe.add(tfIdfStat(AvgPooler()))
fe.add(tfIdfStat(SumPooler()))
fe.add(tfIdfStat(MinPooler()))
fe.add(tfIdfStat(MaxPooler()))
fe.add(tfIdfStat(VarPooler()))
fe.add(scqStat(AvgPooler()))
fe.add(scqStat(SumPooler()))
fe.add(scqStat(MinPooler()))
fe.add(scqStat(MaxPooler()))
fe.add(scqStat(VarPooler()))
fe.add(normalizedTfStat(AvgPooler()))
fe.add(normalizedTfStat(SumPooler()))
fe.add(normalizedTfStat(MinPooler()))
fe.add(normalizedTfStat(MaxPooler()))
fe.add(normalizedTfStat(VarPooler()))
# fe.add(normalizedDocSizeStat(AvgPooler()))
# fe.add(normalizedDocSizeStat(SumPooler()))
# fe.add(normalizedDocSizeStat(MinPooler()))
# fe.add(normalizedDocSizeStat(MaxPooler()))
# fe.add(normalizedDocSizeStat(VarPooler()))

fe.add(idfStat(AvgPooler()))
fe.add(idfStat(SumPooler()))
fe.add(idfStat(MinPooler()))
fe.add(idfStat(MaxPooler()))
fe.add(idfStat(VarPooler()))
fe.add(idfStat(MaxMinRatioPooler()))
fe.add(idfStat(ConfidencePooler()))
fe.add(ictfStat(AvgPooler()))
fe.add(ictfStat(SumPooler()))
fe.add(ictfStat(MinPooler()))
fe.add(ictfStat(MaxPooler()))
fe.add(ictfStat(VarPooler()))
fe.add(ictfStat(MaxMinRatioPooler()))
fe.add(ictfStat(ConfidencePooler()))

fe.add(UnorderedSequentialPairs(3))
fe.add(UnorderedSequentialPairs(8))
fe.add(UnorderedSequentialPairs(15))
fe.add(OrderedSequentialPairs(3))
fe.add(OrderedSequentialPairs(8))
fe.add(OrderedSequentialPairs(15))
fe.add(UnorderedQueryPairs(3))
fe.add(UnorderedQueryPairs(8))
fe.add(UnorderedQueryPairs(15))
fe.add(OrderedQueryPairs(3))
fe.add(OrderedQueryPairs(8))
fe.add(OrderedQueryPairs(15))

In [12]:
def extract(df, queries, fe):
    df_pieces = []
    fetch_later = []
    qidpid2rel = defaultdict(dict)
    need_rows = 0
    for qid,group in tqdm(df.groupby('qid')):
        for t in group.reset_index().itertuples():
            assert t.pid not in qidpid2rel[t.qid]
            qidpid2rel[t.qid][t.pid] = t.rel
            need_rows += 1
        fe.lazy_extract(str(qid),queries[qid]['nonSW'], queries[qid]['tokenized'],list(qidpid2rel[t.qid].keys()))
        fetch_later.append(str(qid))
        if len(fetch_later) == 10000:
            info = np.zeros(shape=(need_rows,3), dtype=np.int32)
            feature = np.zeros(shape=(need_rows,len(fe.feature_names())), dtype=np.float32)
            idx = 0
            for qid in fetch_later:
                for doc in fe.get_result(qid):
                    info[idx,0] = int(qid)
                    info[idx,1] = int(doc['pid'])
                    info[idx,2] = qidpid2rel[int(qid)][int(doc['pid'])]
                    feature[idx,:] = doc['features']
                    idx += 1
            info = pd.DataFrame(info, columns=['qid','pid','rel'])
            feature = pd.DataFrame(feature, columns=fe.feature_names())
            df_pieces.append(pd.concat([info,feature], axis=1))
            fetch_later = []
            need_rows = 0
    #deal with rest
    if len(fetch_later) > 0:
        info = np.zeros(shape=(need_rows,3), dtype=np.int32)
        feature = np.zeros(shape=(need_rows,len(fe.feature_names())), dtype=np.float32)
        idx = 0
        for qid in fetch_later:
            for doc in fe.get_result(qid):
                info[idx,0] = int(qid)
                info[idx,1] = int(doc['pid'])
                info[idx,2] = qidpid2rel[int(qid)][int(doc['pid'])]
                feature[idx,:] = doc['features']
                idx += 1
        info = pd.DataFrame(info, columns=['qid','pid','rel'])
        feature = pd.DataFrame(feature, columns=fe.feature_names())
        df_pieces.append(pd.concat([info,feature], axis=1))
    data = pd.concat(df_pieces, axis=0, ignore_index=True)
    data = data.sort_values(by='qid', kind='mergesort')
    group = data.groupby('qid').agg(count=('pid', 'count'))['count']
    return data,group

In [13]:
def hash_df(df):
    h = pd.util.hash_pandas_object(df)
    return hex(h.sum().astype(np.uint64))


def hash_anserini_jar():
    find = glob.glob(os.environ['ANSERINI_CLASSPATH'] + "/*fatjar.jar")
    assert len(find) == 1
    md5Hash = hashlib.md5(open(find[0], 'rb').read())
    return md5Hash.hexdigest()


def hash_fe(fe):
    return hashlib.md5(','.join(sorted(fe.feature_names())).encode()).hexdigest()


def data_loader(task, df, queries, fe):
    df_hash = hash_df(df)
    jar_hash = hash_anserini_jar()
    fe_hash = hash_fe(fe)
    if os.path.exists(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle'):
        res = pickle.load(open(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle','rb'))
        print(res['data'].shape)
        print(res['data'].qid.drop_duplicates().shape)
        print(res['group'].mean())
        print(res['data'].head(10))
        print(res['data'].info())
        return res
    else:
        if task == 'train' or task == 'dev': 
            data,group = extract(df, queries, fe)
            obj = {'data':data,'group':group,'df_hash':df_hash,'jar_hash':jar_hash,'fe_hash':fe_hash}
            print(data.shape)
            print(data.qid.drop_duplicates().shape)
            print(group.mean())
            print(data.head(10))
            print(data.info())
            pickle.dump(obj,open(f'{task}_{df_hash}_{jar_hash}_{fe_hash}.pickle','wb'))
            return obj
        else:
            raise Exception('unknown parameters')

In [14]:
import json
def export(df, fn):
    with open(fn,'w') as f:
        for qid, group in tqdm(df.groupby('qid')):
            line = {}
            line['qid'] = qid
            line['queryTokens'] = queries[qid]['tokenized']
            line['queryText'] = queries[qid]['nonSW']
            line['docIds'] = [str(did) for did in group.reset_index().pid.drop_duplicates().tolist()]
            f.write(json.dumps(line)+'\n')

In [15]:
train_extracted = data_loader('train', sampled_train, queries, fe)
# export(sampled_train, 'sampled_train_export.json')
dev_extracted = data_loader('dev', dev, queries, fe)
# export(dev, 'sampled_dev_export.json')
del sampled_train, dev

(4770762, 76)
(431848,)
11.04731757470221
   qid      pid  rel  contents_BM25_k1_0.90_b_0.40  \
0    3    23817    0                     19.794847   
1    3  1142680    1                     26.467836   
2    3  1887104    0                     19.061369   
3    3  2679073    0                     13.294892   
4    3  2724793    0                     24.698397   
5    3  3060834    0                     30.585629   
6    3  6151998    0                     24.326681   
7    3  7099153    0                     15.369918   
8    3  8073992    0                     19.706621   
9    3  8150773    0                     13.005863   

   contents_BM25_k1_1.20_b_0.75  contents_BM25_k1_2.00_b_0.75  \
0                     19.961182                     20.695307   
1                     27.822981                     32.060062   
2                     20.757126                     21.550747   
3                     13.668623                     13.829547   
4                     26.857327       

100%|██████████| 6980/6980 [00:53<00:00, 129.88it/s]


(6974598, 76)
(6980,)
999.2260744985673
   qid     pid  rel  contents_BM25_k1_0.90_b_0.40  \
0    2   55860    0                     12.348820   
1    2   72202    0                     10.927653   
2    2   72210    0                     13.675473   
3    2   98589    0                     12.699286   
4    2   98590    0                     12.492470   
5    2   98593    0                     11.077914   
6    2   98595    0                     11.181725   
7    2  112123    0                     15.955744   
8    2  112126    0                     11.468307   
9    2  112127    0                     21.200821   

   contents_BM25_k1_1.20_b_0.75  contents_BM25_k1_2.00_b_0.75  \
0                     13.151269                     14.964364   
1                     11.041017                     11.088634   
2                     13.716912                     14.579938   
3                     14.042357                     16.412817   
4                     13.508439                    

In [16]:
def eval_mrr(dev_data):
    score_tie_counter = 0
    score_tie_query = set()

    MRR = []
    for qid, group in tqdm(dev_data.groupby('qid')):
        group = group.reset_index()
        rank = 0
        prev_score = None
        assert len(group['pid'].tolist()) == len(set(group['pid'].tolist()))
        # stable sort is also used in LightGBM

        for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples():
            if prev_score is not None and abs(t.score - prev_score) < 1e-8:
                score_tie_counter += 1
                score_tie_query.add(qid)
            prev_score = t.score
            prev_pid = t.pid
            rank += 1
            if t.rel>0:
                MRR.append(1.0/rank)
                break
            elif rank == 10 or rank == len(group):
                MRR.append(0.)
                break

    score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries'
    print(score_tie,np.mean(MRR))


In [25]:
def eval_recall(dev_data):
    score_tie_counter = 0
    score_tie_query = set()

    recall10 = []
    recall20 = []
    recall30 = []
    recall50 = []
    recall100 = []
    recall200 = []
    upper_limit = []
    for qid, group in tqdm(dev_data.groupby('qid')):
        group = group.reset_index()
        rank = 0
        prev_score = None
        assert len(group['pid'].tolist()) == len(set(group['pid'].tolist()))
        # stable sort is also used in LightGBM

        for t in group.sort_values('score', ascending=False, kind='mergesort').itertuples():
            if prev_score is not None and abs(t.score - prev_score) < 1e-8:
                score_tie_counter += 1
                score_tie_query.add(qid)
            prev_score = t.score
            prev_pid = t.pid
            rank += 1
            if t.rel>0:
                if rank <= 10:
                    recall10.append(1.0)
                elif rank <= 20:
                    recall20.append(1.0)
                elif rank <= 30:
                    recall30.append(1.0)
                elif rank <= 50:
                    recall50.append(1.0)
                elif rank <= 100:
                    recall100.append(1.0)
                elif rank <= 200:
                    recall200.append(1.0)
                else:
                    upper_limit.append(1.0)
                break

    score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries'
    print(score_tie,
          np.mean(recall10),
          np.mean(recall20),
          np.mean(recall30),
          np.mean(recall50),
          np.mean(recall100),
          np.mean(recall200),
          np.mean(upper_limit))


In [18]:
feature_name = fe.feature_names()
train_X = train_extracted['data'].loc[:, feature_name]
train_Y = train_extracted['data']['rel']
dev_X = dev_extracted['data'].loc[:, feature_name]
dev_Y = dev_extracted['data']['rel']
lgb_train = lgb.Dataset(train_X,label=train_Y,group=train_extracted['group'])
lgb_valid = lgb.Dataset(dev_X,label=dev_Y,group=dev_extracted['group'])

In [19]:
fe.feature_names()

['contents_BM25_k1_0.90_b_0.40',
 'contents_BM25_k1_1.20_b_0.75',
 'contents_BM25_k1_2.00_b_0.75',
 'contents_LMD_mu_1000',
 'contents_LMD_mu_1500',
 'contents_LMD_mu_2500',
 'contents_LMJM_lambda_0.10',
 'contents_LMJM_lambda_0.40',
 'contents_LMJM_lambda_0.70',
 'contents_NTFIDF',
 'contents_Prob',
 'contents_DFR_GL2',
 'contents_DFR_In_expB2',
 'contents_DPH',
 'contents_Proximity',
 'contents_TPscore',
 'contents_tpDistWindow100',
 'contents_DocSize',
 'Entropy',
 'StopCover',
 'StopRatio',
 'QueryLength',
 'QueryLengthNonStopWords',
 'contents_QueryCoverageRatio',
 'UniqueQueryTerms',
 'contents_MatchingTermCount',
 'contents_SCS',
 'contents_TF_avg',
 'contents_TF_sum',
 'contents_TF_min',
 'contents_TF_max',
 'contents_TF_var',
 'contents_TFIDF_avg',
 'contents_TFIDF_sum',
 'contents_TFIDF_min',
 'contents_TFIDF_max',
 'contents_TFIDF_var',
 'contents_SCQ_avg',
 'contents_SCQ_sum',
 'contents_SCQ_min',
 'contents_SCQ_max',
 'contents_SCQ_var',
 'contents_NormalizedTF_avg',
 'con

In [22]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':-1,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':0,
    'bagging_fraction':0.8,
    'bagging_freq':50,
    'feature_fraction':1,
    'learning_rate':0.05,
    'num_boost_round':2000,
    'early_stopping_round':500,
    'metric':['map'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}
num_boost_round = params.pop('num_boost_round')
early_stopping_round = params.pop('early_stopping_round')
eval_results={}
dev_extracted['data']['score']=0.
for seed in [12345,31345,21356,65743,68786]:
    params['seed'] = seed
    gbm = lgb.train(params, lgb_train, 
                    valid_sets=lgb_valid,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds =early_stopping_round,
                    feature_name=feature_name,
                    evals_result=eval_results,
                    verbose_eval=False)
    dev_extracted['data']['score'] += gbm.predict(dev_X)
    best_score = gbm.best_score['valid_0']['map@10']
    print(best_score)
    best_iteration = gbm.best_iteration
    print(best_iteration)
    eval_map = eval_results['valid_0']['map@10']
    # print(eval_map)
    feature_importances = sorted(list(zip(feature_name,gbm.feature_importance().tolist())),key=lambda x:x[1],reverse=True)
    print(feature_importances)
    eval_mrr(dev_extracted['data'])

[LightGBM] [Info] Total groups: 431848, total data: 4770762
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 4770762, number of used features: 65
[LightGBM] [Info] Total groups: 6980, total data: 6974598
0.34519569328845834
739
[('contents_DFR_In_expB2', 2233), ('contents_Prob', 1570), ('contents_SCQ_var', 1570), ('contents_SCQ_avg', 1473), ('contents_NormalizedTF_var', 1328), ('contents_TPscore', 1258), ('contents_NTFIDF', 1236), ('contents_DFR_GL2', 1235), ('contents_TFIDF_var', 1217), ('contents_TFIDF_max', 1216), ('contents_Proximity', 1178), ('contents_NormalizedTF_avg', 1177), ('contents_SCQ_max', 1164), ('contents_NormalizedTF_min', 1086), ('contents_IDF_maxminratio', 1002), ('contents_ICTF_maxminratio', 1001), ('contents_DocSize', 941), ('contents_NormalizedTF_sum', 940), ('contents_TFIDF_sum', 932), ('contents_BM2

100%|██████████| 6980/6980 [00:42<00:00, 163.42it/s]


score_tie occurs 889 times in 666 queries 0.21174358711966162
[LightGBM] [Info] Total groups: 431848, total data: 4770762
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 4770762, number of used features: 65
[LightGBM] [Info] Total groups: 6980, total data: 6974598
0.34236094055578287
255
[('contents_DFR_In_expB2', 1026), ('contents_Prob', 717), ('contents_DFR_GL2', 549), ('contents_SCQ_avg', 534), ('contents_NormalizedTF_var', 513), ('contents_TPscore', 467), ('contents_TFIDF_var', 458), ('contents_LMJM_lambda_0.70', 447), ('contents_NormalizedTF_min', 423), ('contents_TFIDF_max', 416), ('contents_SCQ_var', 391), ('contents_QueryCoverageRatio', 381), ('contents_BM25_k1_0.90_b_0.40', 379), ('contents_MatchingTermCount', 379), ('contents_UnorderedQueryPairs_15', 361), ('contents_LMD_mu_1000', 360), ('contents_TF_var', 323),

100%|██████████| 6980/6980 [00:36<00:00, 191.60it/s]


score_tie occurs 836 times in 627 queries 0.21011449947696365
[LightGBM] [Info] Total groups: 431848, total data: 4770762
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 4770762, number of used features: 65
[LightGBM] [Info] Total groups: 6980, total data: 6974598
0.34248846856475795
219
[('contents_DFR_In_expB2', 882), ('contents_Prob', 640), ('contents_DFR_GL2', 499), ('contents_SCQ_avg', 491), ('contents_LMJM_lambda_0.70', 457), ('contents_NormalizedTF_var', 444), ('contents_TPscore', 409), ('contents_QueryCoverageRatio', 383), ('contents_SCQ_var', 369), ('contents_NormalizedTF_min', 367), ('contents_TFIDF_var', 366), ('contents_TFIDF_max', 361), ('contents_MatchingTermCount', 325), ('contents_BM25_k1_0.90_b_0.40', 319), ('contents_LMD_mu_1000', 318), ('contents_NormalizedTF_sum', 307), ('contents_UnorderedQueryPairs_1

100%|██████████| 6980/6980 [00:38<00:00, 183.31it/s]


score_tie occurs 834 times in 621 queries 0.2099906194569518
[LightGBM] [Info] Total groups: 431848, total data: 4770762
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 4770762, number of used features: 65
[LightGBM] [Info] Total groups: 6980, total data: 6974598
0.34337646488076284
942
[('contents_DFR_In_expB2', 2631), ('contents_SCQ_var', 1980), ('contents_Prob', 1859), ('contents_SCQ_avg', 1813), ('contents_NTFIDF', 1697), ('contents_SCQ_max', 1680), ('contents_TFIDF_max', 1621), ('contents_DFR_GL2', 1617), ('contents_NormalizedTF_var', 1613), ('contents_TPscore', 1560), ('contents_Proximity', 1558), ('contents_TFIDF_var', 1525), ('contents_NormalizedTF_avg', 1483), ('contents_IDF_maxminratio', 1429), ('contents_ICTF_maxminratio', 1369), ('contents_NormalizedTF_min', 1364), ('contents_DocSize', 1300), ('contents_TFIDF_

100%|██████████| 6980/6980 [00:39<00:00, 175.66it/s]


score_tie occurs 827 times in 621 queries 0.20922982898985762
[LightGBM] [Info] Total groups: 431848, total data: 4770762
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 4770762, number of used features: 65
[LightGBM] [Info] Total groups: 6980, total data: 6974598
0.34078870468913447
222
[('contents_DFR_In_expB2', 905), ('contents_Prob', 686), ('contents_SCQ_avg', 501), ('contents_DFR_GL2', 469), ('contents_NormalizedTF_var', 418), ('contents_TPscore', 414), ('contents_LMJM_lambda_0.70', 407), ('contents_TFIDF_var', 398), ('contents_QueryCoverageRatio', 385), ('contents_TFIDF_max', 382), ('contents_LMD_mu_1000', 376), ('contents_SCQ_var', 366), ('contents_BM25_k1_0.90_b_0.40', 349), ('contents_NormalizedTF_min', 344), ('contents_MatchingTermCount', 327), ('contents_UnorderedQueryPairs_15', 294), ('contents_NormalizedTF_su

100%|██████████| 6980/6980 [00:39<00:00, 177.16it/s]

score_tie occurs 837 times in 626 queries 0.20870855505525993





In [None]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'max_bin':255,
    'num_leaves':63,
    'max_depth':-1,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':100,
    'bagging_fraction':0.8,
    'bagging_freq':50,
    'feature_fraction':1,
    'learning_rate':0.1,
    'num_boost_round':1000,
    'metric':['recall'],
    'eval_at':[10],
    'label_gain':[0,1],
    'lambdarank_truncation_level':20,
    'seed':12345,
    'num_threads':max(multiprocessing.cpu_count()//2,1)
}

num_boost_round = params.pop('num_boost_round')
eval_results={}
cv_gbm = lgb.cv(params, lgb_train, nfold=5, 
                num_boost_round=num_boost_round,
                feature_name=feature_name,
                verbose_eval=False,
                return_cvbooster=True)
dev_extracted['data']['score'] = 0.
for gbm in cv_gbm['cvbooster'].boosters:
    dev_extracted['data']['score']+=gbm.predict(dev_X)
feature_importances = sorted(list(zip(feature_name,gbm.feature_importance().tolist())),key=lambda x:x[1],reverse=True)
print(feature_importances)
eval_mrr(dev_extracted['data'])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 3816608, number of used features: 65
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 3816609, number of used features: 65
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 3816607, number of used features: 65
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12028
[LightGBM] [Info] Number of data points in the train set: 3816612, number of used featu

In [26]:
eval_recall(dev_extracted['data'])

100%|██████████| 6980/6980 [00:48<00:00, 142.96it/s]

score_tie occurs 242967 times in 2952 queries 1.0 1.0 1.0 1.0 1.0 1.0 1.0



