In [1]:
import pandas as pd
import numpy as np
# Load the run stored in output.json and the dev-small qrels, then label every
# (qid, pid) pair of the run: pairs absent from the qrels get rel = 0.
ltr_dev = pd.read_json('2020-11-29_16:56:08_b0444fd2-328d-11eb-a5d8-bfed4aec1d90/output.json')
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
    usecols=['qid', 'pid', 'rel'],
)
ltr_dev = (
    ltr_dev
    .merge(dev_qrel, how='left', on=['qid', 'pid'])  # left join keeps every run row
    .fillna({'rel': 0})                              # unjudged pairs count as non-relevant
    .sort_values(['qid', 'pid'])
    .set_index(['qid', 'pid'])
)
ltr_dev.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6974598 entries, (2, 55860) to (1102400, 8830447)
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   score   float64
 1   rel     float64
dtypes: float64(2)
memory usage: 335.9 MB


In [2]:
import pandas as pd
import numpy as np
# Load the monoBERT run (six tab-separated columns; only qid, pid and score are
# kept) and attach relevance labels from the dev-small qrels.
bert_dev = pd.read_csv(
    '../pygaggle/runs/run.monobert.ans_entire.dev.trec',
    sep="\t",
    names=['qid', 'q0', 'pid', 'rank', 'score', 'tag'],
    usecols=['qid', 'pid', 'score'],
)
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
    usecols=['qid', 'pid', 'rel'],
)
bert_dev = (
    bert_dev
    .merge(dev_qrel, how='left', on=['qid', 'pid'])  # left join keeps every run row
    .fillna({'rel': 0})                              # pairs missing from the qrels -> rel 0
    .sort_values(['qid', 'pid'])
    .set_index(['qid', 'pid'])
)
bert_dev.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6974598 entries, (2, 55860) to (1102400, 8830447)
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   score   float64
 1   rel     float64
dtypes: float64(2)
memory usage: 335.9 MB


In [3]:
import pandas as pd
import numpy as np
# Load the BM25 run, which only carries a rank per (qid, pid). A score is
# derived as the reciprocal rank so that a higher score means a better rank.
bm25_dev = pd.read_csv(
    '../pygaggle/data/msmarco_ans_entire/run.bm25.dev.small.tsv',
    sep="\t",
    names=['qid', 'pid', 'rank'],
)
dev_qrel = pd.read_csv(
    'collections/msmarco-passage/qrels.dev.small.tsv',
    sep="\t",
    names=["qid", "q0", "pid", "rel"],
    usecols=['qid', 'pid', 'rel'],
)
bm25_dev = (
    bm25_dev
    .assign(score=lambda run: 1 / run['rank'])
    .merge(dev_qrel, how='left', on=['qid', 'pid'])  # left join keeps every run row
    .fillna({'rel': 0})                              # pairs missing from the qrels -> rel 0
    .sort_values(['qid', 'pid'])
    .set_index(['qid', 'pid'])
)
bm25_dev.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6974598 entries, (2, 55860) to (1102400, 8830447)
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   rank    int64  
 1   score   float64
 2   rel     float64
dtypes: float64(2), int64(1)
memory usage: 389.1 MB


In [4]:
from tqdm import tqdm
def eval_mrr(dev_data, cutoff=10):
    """Compute MRR@cutoff for a scored run and report near-tied scores.

    For every query the rows are ordered by ``score`` (descending, stable) and
    walked from the top. The query contributes ``1 / rank`` for the first row
    with ``rel > 0`` found within the first ``cutoff`` rows, otherwise ``0``.
    While walking, consecutive rows whose scores differ by less than ``1e-8``
    are counted as ties; rows below the stopping point are never inspected,
    so ties there are not counted.

    Parameters
    ----------
    dev_data : pandas.DataFrame
        Frame with ``score`` and ``rel`` columns in which ``qid`` and ``pid``
        are available as index levels (this is how the notebook builds its
        frames). ``pid`` must be unique within each query.
    cutoff : int, default 10
        Ranking depth of the metric. Must be at least 1.

    Returns
    -------
    dict
        ``{'score_tie': <summary string>, 'mrr_<cutoff>': <float>}``; with the
        default cutoff the metric key is ``'mrr_10'``. The metric is ``nan``
        when ``dev_data`` contains no queries.

    Raises
    ------
    ValueError
        If ``cutoff`` is smaller than 1.
    AssertionError
        If a query lists the same ``pid`` more than once.
    """
    if cutoff < 1:
        raise ValueError(f'cutoff must be >= 1, got {cutoff}')

    score_tie_counter = 0
    score_tie_query = set()
    MRR = []
    for qid, group in tqdm(dev_data.groupby('qid')):
        group = group.reset_index()
        assert group['pid'].is_unique, f'duplicate pid for qid {qid}'
        # stable sort is also used in LightGBM
        ranked = group.sort_values('score', ascending=False, kind='mergesort')

        reciprocal_rank = 0.
        prev_score = None
        for rank, t in enumerate(ranked.itertuples(), start=1):
            if rank > cutoff:
                break
            if prev_score is not None and abs(t.score - prev_score) < 1e-8:
                score_tie_counter += 1
                score_tie_query.add(qid)
            prev_score = t.score
            if t.rel > 0:
                reciprocal_rank = 1.0 / rank
                break
        MRR.append(reciprocal_rank)

    score_tie = f'score_tie occurs {score_tie_counter} times in {len(score_tie_query)} queries'
    print(score_tie)
    # np.mean([]) would emit a RuntimeWarning; report nan directly instead.
    mrr = np.mean(MRR).item() if MRR else float('nan')
    print(f'MRR@{cutoff}:{mrr} with {len(MRR)} queries')
    return {'score_tie': score_tie, f'mrr_{cutoff}': mrr}

In [5]:
# Baseline: MRR@10 of the run held in ltr_dev, ranked by its own scores.
eval_mrr(ltr_dev)

100%|██████████| 6980/6980 [00:21<00:00, 320.64it/s]

score_tie occurs 895 times in 647 queries
MRR@10:0.21159025787965619 with 6980 queries





{'score_tie': 'score_tie occurs 895 times in 647 queries',
 'mrr_10': 0.21159025787965619}

In [6]:
# Baseline: MRR@10 of the monoBERT run held in bert_dev, ranked by its own scores.
eval_mrr(bert_dev)

100%|██████████| 6980/6980 [00:24<00:00, 285.69it/s]

score_tie occurs 106 times in 103 queries
MRR@10:0.37926735116205024 with 6980 queries





{'score_tie': 'score_tie occurs 106 times in 103 queries',
 'mrr_10': 0.37926735116205024}

In [7]:
# Baseline: MRR@10 of the BM25 run held in bm25_dev (score = 1 / rank).
eval_mrr(bm25_dev)

100%|██████████| 6980/6980 [00:20<00:00, 340.71it/s]

score_tie occurs 0 times in 0 queries
MRR@10:0.1873645222176741 with 6980 queries





{'score_tie': 'score_tie occurs 0 times in 0 queries',
 'mrr_10': 0.1873645222176741}

In [8]:
# Put the LTR and BERT scores side by side on the shared (qid, pid) index.
# Both frames carry a rel column; after checking they agree, keep one as 'rel'.
merge_dev = ltr_dev.join(bert_dev, lsuffix='ltr', rsuffix='bert')
assert merge_dev['relltr'].eq(merge_dev['relbert']).all()
merge_dev = (
    merge_dev
    .drop(columns=['relbert'])
    .rename(columns={"relltr": "rel"})
)
merge_dev.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6974598 entries, (2, 55860) to (1102400, 8830447)
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   scoreltr   float64
 1   rel        float64
 2   scorebert  float64
dtypes: float64(3)
memory usage: 709.1 MB


In [11]:
def genLTRBertRerank(merge_dev, cutoff=100,
                     first_stage_col='scoreltr', rerank_col='scorebert'):
    """Re-rank the top ``cutoff`` first-stage candidates of each query.

    Per query, rows are ordered by ``first_stage_col`` (descending, stable),
    the first ``cutoff`` rows are kept, and those rows are then re-ordered by
    ``rerank_col`` (descending, stable).

    Parameters
    ----------
    merge_dev : pandas.DataFrame
        Frame with ``qid`` and ``pid`` available as index levels and with the
        columns ``first_stage_col``, ``rerank_col`` and ``rel``.
    cutoff : int, default 100
        Number of first-stage candidates kept per query.
    first_stage_col : str, default 'scoreltr'
        Column that defines the candidate ranking.
    rerank_col : str, default 'scorebert'
        Column used to re-order the kept candidates.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(qid, pid)`` with columns ``score`` (the re-rank score),
        ``rank`` (1-based position in the first-stage ranking) and ``rel``.
    """
    bert_rerank = []
    for qid, group in tqdm(merge_dev.groupby('qid')):
        first_stage = group.reset_index().sort_values(
            first_stage_col, ascending=False, kind='mergesort')
        rows = zip(first_stage['pid'], first_stage[rerank_col], first_stage['rel'])
        temp = []
        for rank, (pid, rerank_score, rel) in enumerate(rows, start=1):
            if rank > cutoff:
                break
            temp.append((qid, pid, rerank_score, rank, rel))
        # list.sort is stable, so equal re-rank scores keep first-stage order.
        temp.sort(key=lambda row: row[2], reverse=True)
        bert_rerank.extend(temp)
    return pd.DataFrame(bert_rerank,
                        columns=['qid', 'pid', 'score', 'rank', 'rel']
                        ).set_index(['qid', 'pid'])

In [12]:
# Sweep the number of LTR candidates handed to BERT (100, 200, ..., 1000) and
# report MRR@10 of each re-ranked list; results print in increasing depth order.
for depth in range(100, 1001, 100):
    eval_mrr(genLTRBertRerank(merge_dev, cutoff=depth))

100%|██████████| 6980/6980 [00:24<00:00, 286.78it/s]
100%|██████████| 6980/6980 [00:21<00:00, 321.93it/s]


score_tie occurs 107 times in 104 queries
MRR@10:0.36238891162959935 with 6980 queries


100%|██████████| 6980/6980 [00:26<00:00, 259.23it/s]
100%|██████████| 6980/6980 [00:20<00:00, 340.69it/s]


score_tie occurs 107 times in 102 queries
MRR@10:0.37100764087870103 with 6980 queries


100%|██████████| 6980/6980 [00:29<00:00, 236.83it/s]
100%|██████████| 6980/6980 [00:20<00:00, 336.76it/s]


score_tie occurs 108 times in 103 queries
MRR@10:0.3739002023923227 with 6980 queries


100%|██████████| 6980/6980 [00:26<00:00, 266.79it/s]
100%|██████████| 6980/6980 [00:21<00:00, 325.09it/s]


score_tie occurs 110 times in 105 queries
MRR@10:0.3757991085641515 with 6980 queries


100%|██████████| 6980/6980 [00:34<00:00, 202.68it/s]
100%|██████████| 6980/6980 [00:20<00:00, 333.27it/s]


score_tie occurs 113 times in 108 queries
MRR@10:0.3772741847455314 with 6980 queries


100%|██████████| 6980/6980 [00:21<00:00, 330.39it/s]
100%|██████████| 6980/6980 [00:18<00:00, 382.53it/s]


score_tie occurs 113 times in 108 queries
MRR@10:0.37806078591895215 with 6980 queries


100%|██████████| 6980/6980 [00:23<00:00, 303.23it/s]
100%|██████████| 6980/6980 [00:14<00:00, 467.02it/s]


score_tie occurs 112 times in 107 queries
MRR@10:0.37832349797607673 with 6980 queries


100%|██████████| 6980/6980 [00:24<00:00, 287.47it/s]
100%|██████████| 6980/6980 [00:15<00:00, 459.82it/s]


score_tie occurs 110 times in 106 queries
MRR@10:0.37886592077136494 with 6980 queries


100%|██████████| 6980/6980 [00:26<00:00, 264.09it/s]
100%|██████████| 6980/6980 [00:14<00:00, 487.87it/s]


score_tie occurs 105 times in 103 queries
MRR@10:0.37900594669577475 with 6980 queries


100%|██████████| 6980/6980 [00:28<00:00, 246.82it/s]
100%|██████████| 6980/6980 [00:13<00:00, 535.28it/s]

score_tie occurs 106 times in 103 queries
MRR@10:0.3792195956701687 with 6980 queries





In [22]:
# Put the BM25 and BERT scores side by side on the shared (qid, pid) index.
# Both frames carry a rel column; after checking they agree, keep one as 'rel'.
# Note: this rebinds merge_dev, replacing the frame built from ltr_dev.
merge_dev = bm25_dev.join(bert_dev, lsuffix='bm25', rsuffix='bert')
assert merge_dev['relbm25'].eq(merge_dev['relbert']).all()
merge_dev = (
    merge_dev
    .drop(columns=['relbert'])
    .rename(columns={"relbm25": "rel"})
)
merge_dev.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6974598 entries, (2, 55860) to (1102400, 8830447)
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   rank       int64  
 1   scorebm25  float64
 2   rel        float64
 3   scorebert  float64
dtypes: float64(3), int64(1)
memory usage: 762.3 MB


In [23]:
def genBM25BertRerank(merge_dev, cutoff=100,
                      first_stage_col='scorebm25', rerank_col='scorebert'):
    """Re-rank the top ``cutoff`` first-stage candidates of each query.

    Per query, rows are ordered by ``first_stage_col`` (descending, stable),
    the first ``cutoff`` rows are kept, and those rows are then re-ordered by
    ``rerank_col`` (descending, stable).

    Parameters
    ----------
    merge_dev : pandas.DataFrame
        Frame with ``qid`` and ``pid`` available as index levels and with the
        columns ``first_stage_col``, ``rerank_col`` and ``rel``.
    cutoff : int, default 100
        Number of first-stage candidates kept per query.
    first_stage_col : str, default 'scorebm25'
        Column that defines the candidate ranking.
    rerank_col : str, default 'scorebert'
        Column used to re-order the kept candidates.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(qid, pid)`` with columns ``score`` (the re-rank score),
        ``rank`` (1-based position in the first-stage ranking) and ``rel``.
    """
    bert_rerank = []
    for qid, group in tqdm(merge_dev.groupby('qid')):
        candidates = group.reset_index().sort_values(
            first_stage_col, ascending=False, kind='mergesort')
        rows = zip(candidates['pid'], candidates[rerank_col], candidates['rel'])
        temp = []
        for rank, (pid, rerank_score, rel) in enumerate(rows, start=1):
            if rank > cutoff:
                break
            temp.append((qid, pid, rerank_score, rank, rel))
        # list.sort is stable, so equal re-rank scores keep first-stage order.
        temp.sort(key=lambda row: row[2], reverse=True)
        bert_rerank.extend(temp)
    return pd.DataFrame(bert_rerank,
                        columns=['qid', 'pid', 'score', 'rank', 'rel']
                        ).set_index(['qid', 'pid'])

In [24]:
# Sweep the number of BM25 candidates handed to BERT (100, 200, ..., 1000) and
# report MRR@10 of each re-ranked list; results print in increasing depth order.
for depth in range(100, 1001, 100):
    eval_mrr(genBM25BertRerank(merge_dev, cutoff=depth))

100%|██████████| 6980/6980 [00:14<00:00, 483.47it/s]
100%|██████████| 6980/6980 [00:12<00:00, 581.20it/s]


score_tie occurs 101 times in 99 queries
MRR@10:0.3537466457452131 with 6980 queries


100%|██████████| 6980/6980 [00:15<00:00, 460.39it/s]
100%|██████████| 6980/6980 [00:12<00:00, 570.91it/s]


score_tie occurs 101 times in 98 queries
MRR@10:0.36375744758266243 with 6980 queries


100%|██████████| 6980/6980 [00:16<00:00, 424.23it/s]
100%|██████████| 6980/6980 [00:12<00:00, 578.50it/s]


score_tie occurs 102 times in 99 queries
MRR@10:0.36836568199390546 with 6980 queries


100%|██████████| 6980/6980 [00:17<00:00, 400.04it/s]
100%|██████████| 6980/6980 [00:12<00:00, 558.31it/s]


score_tie occurs 101 times in 98 queries
MRR@10:0.3710188975303589 with 6980 queries


100%|██████████| 6980/6980 [00:18<00:00, 374.41it/s]
100%|██████████| 6980/6980 [00:12<00:00, 557.17it/s]


score_tie occurs 108 times in 105 queries
MRR@10:0.37218953245099373 with 6980 queries


100%|██████████| 6980/6980 [00:19<00:00, 350.02it/s]
100%|██████████| 6980/6980 [00:12<00:00, 553.16it/s]


score_tie occurs 108 times in 105 queries
MRR@10:0.3751480420248329 with 6980 queries


100%|██████████| 6980/6980 [00:21<00:00, 328.82it/s]
100%|██████████| 6980/6980 [00:12<00:00, 548.16it/s]


score_tie occurs 108 times in 106 queries
MRR@10:0.37607569245463224 with 6980 queries


100%|██████████| 6980/6980 [00:22<00:00, 312.15it/s]
100%|██████████| 6980/6980 [00:12<00:00, 544.55it/s]


score_tie occurs 107 times in 105 queries
MRR@10:0.37704177468504113 with 6980 queries


100%|██████████| 6980/6980 [00:23<00:00, 297.79it/s]
100%|██████████| 6980/6980 [00:12<00:00, 542.17it/s]


score_tie occurs 107 times in 105 queries
MRR@10:0.37852566289170875 with 6980 queries


100%|██████████| 6980/6980 [00:24<00:00, 282.88it/s]
100%|██████████| 6980/6980 [00:13<00:00, 532.18it/s]

score_tie occurs 106 times in 103 queries
MRR@10:0.3792195956701687 with 6980 queries



