# Retrieval and data fusion

## Make runs with 3, 5, 10, or 100 queries for all three prompt strategies

In [None]:
from tqdm import tqdm
import pandas as pd 
from ranx import Qrels, Run, fuse

import pyterrier as pt
# Initialise PyTerrier only when pt.started() reports it is not running yet.
if not pt.started():
    pt.init()
  

def strip_run(run, topics):
    """Return a new Run that contains `run`'s results minus `topics`.

    The run is exported with ``to_dict()``, the entry of every id in
    `topics` is removed from that dict, and a fresh ``Run`` is built from
    what is left. Every id in `topics` is expected to be present in `run`.
    """
    remaining = run.to_dict()
    for qid in topics:
        remaining.pop(qid)
    return Run.from_dict(remaining)

def expand_run(run, topics):
    """Add a placeholder result to `run` for each id in `topics`.

    Every topic receives the single document id 'unknown' with score 0.0
    through ``run.add``. The same `run` object is returned.
    """
    for missing_topic in topics:
        run.add(q_id=missing_topic, doc_ids=['unknown'], scores=[0.0])
    return run
  
# Settings of every supported test collection:
#   collection name -> (ir_datasets id, path of the Terrier index).
# Dataset documentation:
#   core17    https://ir-datasets.com/nyt.html
#   core18    https://ir-datasets.com/wapo.html
#   robust04  https://ir-datasets.com/disks45.html
#   robust05  https://ir-datasets.com/aquaint.html
COLLECTIONS = {
    'core17': ('nyt/trec-core-2017', '../indices/nyt'),
    'core18': ('wapo/v2/trec-core-2018', '../indices/wapo_v2/'),
    'robust04': ('disks45/nocr/trec-robust-2004', '../indices/disks45_nocr'),
    'robust05': ('aquaint/trec-robust-2005', '../indices/aquaint'),
}

# Change only this value to run the cell for another collection; all
# dependent settings below are derived from it, so they cannot get out of sync.
collection = 'core17'
irds_id, index_path = COLLECTIONS[collection]

# Generated query variants, read as a semicolon-separated file with the
# four column names assigned here.
qv = pd.read_csv('../queries/gpt-4o/' + collection + '.csv', names=['qno', 'prompt', 'qid', 'query'], sep=';')
dataset = pt.get_dataset('irds:' + irds_id)
index_ref = pt.IndexRef.of(index_path)
qrels = Qrels.from_ir_datasets(irds_id)

bm25 = pt.BatchRetrieve(index_ref, wmodel='BM25')

# Characters deleted from the generated queries before retrieval
# (presumably because the Terrier query parser chokes on them — TODO confirm).
strip_chars = str.maketrans('', '', "?'/:*()_!")

# The judged topics do not change inside the loops, so collect them once.
judged_topics = set(qrels.keys())

fuse_method = 'rrf'

for prompt in ['P-1', 'P-2', 'P-3']:
    for num_queries in [3, 5, 10, 100]:
        # One BM25 run per query variant 1..num_queries of this prompt.
        runs = []
        for i in tqdm(range(1, num_queries + 1)):
            # .copy() gives an independent frame, so cleaning the 'query'
            # column below does not write into a slice of qv.
            queries = qv.loc[(qv['qno'] == i) & (qv['prompt'] == prompt), ['qid', 'query']].copy()
            queries['query'] = queries['query'].str.translate(strip_chars)

            # Round-trip through a TREC-format file named 'tmp' in the
            # working directory to turn the PyTerrier results into a ranx Run.
            pt.io.write_results(bm25(queries), 'tmp', format='trec', append=False)
            _run = Run.from_file('tmp', kind='trec')

            # Make the run cover exactly the judged topics: first add a
            # placeholder result for judged topics without results, then
            # drop topics that have no relevance judgments.
            _run = expand_run(_run, judged_topics - set(_run.keys()))
            _run = strip_run(_run, set(_run.keys()) - judged_topics)
            runs.append(_run)

        # Fuse all variant runs into a single ranking per topic and store it.
        run_fuse = fuse(runs=runs, method=fuse_method)
        run_fuse.name = fuse_method
        run_fuse.save(f'../runs/gpt-4o/{collection}.{num_queries}.{fuse_method}.{prompt}.txt')

## Make BM25 baselines

In [None]:
from tqdm import tqdm
import pandas as pd 
from ranx import Qrels, Run
import pyterrier as pt

# Initialise PyTerrier only when pt.started() reports it is not running yet.
if not pt.started():
    pt.init()

def strip_run(run, topics):
    """Return a new Run that contains `run`'s results minus `topics`.

    The run is exported with ``to_dict()``, the entry of every id in
    `topics` is removed from that dict, and a fresh ``Run`` is built from
    what is left. Every id in `topics` is expected to be present in `run`.
    """
    remaining = run.to_dict()
    for qid in topics:
        remaining.pop(qid)
    return Run.from_dict(remaining)

def expand_run(run, topics):
    """Add a placeholder result to `run` for each id in `topics`.

    Every topic receives the single document id 'unknown' with score 0.0
    through ``run.add``. The same `run` object is returned.
    """
    for missing_topic in topics:
        run.add(q_id=missing_topic, doc_ids=['unknown'], scores=[0.0])
    return run
  
# Settings of every supported test collection:
#   collection name -> (ir_datasets id, path of the Terrier index).
# Dataset documentation:
#   core17    https://ir-datasets.com/nyt.html
#   core18    https://ir-datasets.com/wapo.html
#   robust04  https://ir-datasets.com/disks45.html
#   robust05  https://ir-datasets.com/aquaint.html
COLLECTIONS = {
    'core17': ('nyt/trec-core-2017', '../indices/nyt'),
    'core18': ('wapo/v2/trec-core-2018', '../indices/wapo_v2/'),
    'robust04': ('disks45/nocr/trec-robust-2004', '../indices/disks45_nocr'),
    'robust05': ('aquaint/trec-robust-2005', '../indices/aquaint'),
}

# Change only this value to run the cell for another collection; all
# dependent settings below are derived from it, so they cannot get out of sync.
collection = 'core17'
irds_id, index_path = COLLECTIONS[collection]

# Generated query variants, read as a semicolon-separated file with the
# four column names assigned here.
qv = pd.read_csv('../queries/gpt-4o/' + collection + '.csv', names=['qno', 'prompt', 'qid', 'query'], sep=';')
dataset = pt.get_dataset('irds:' + irds_id)
index_ref = pt.IndexRef.of(index_path)
qrels = Qrels.from_ir_datasets(irds_id)

# Plain BM25 retrieval over the selected index.
bm25 = pt.BatchRetrieve(index_ref, wmodel='BM25')

# Retrieve for the dataset's 'title' topics.
baseline = bm25(dataset.get_topics('title'))

# Round-trip through a TREC-format file named 'tmp' in the working
# directory to turn the PyTerrier results into a ranx Run.
pt.io.write_results(baseline, 'tmp', format='trec', append=False)
baseline_ranx = Run.from_file('tmp', kind='trec')

# Label the run and store it lz4-compressed under the collection name.
baseline_ranx.name = 'BM25'
baseline_ranx.save(f'../runs/baselines/{collection}.BM25.lz4', kind='lz4')


## Make BM25+RM3 baselines

In [None]:
from tqdm import tqdm
import pandas as pd 
from ranx import Qrels, Run
import pyterrier as pt

# Initialise PyTerrier with the terrier-prf boot package, but only when
# pt.started() reports it is not running yet.
# NOTE(review): if PyTerrier was already started elsewhere (e.g. by an
# earlier cell calling pt.init() without boot_packages), this branch is
# skipped and the package is not requested here — confirm RM3 still works
# in that case or restart the kernel before running this cell.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

def strip_run(run, topics):
    """Return a new Run that contains `run`'s results minus `topics`.

    The run is exported with ``to_dict()``, the entry of every id in
    `topics` is removed from that dict, and a fresh ``Run`` is built from
    what is left. Every id in `topics` is expected to be present in `run`.
    """
    remaining = run.to_dict()
    for qid in topics:
        remaining.pop(qid)
    return Run.from_dict(remaining)

def expand_run(run, topics):
    """Add a placeholder result to `run` for each id in `topics`.

    Every topic receives the single document id 'unknown' with score 0.0
    through ``run.add``. The same `run` object is returned.
    """
    for missing_topic in topics:
        run.add(q_id=missing_topic, doc_ids=['unknown'], scores=[0.0])
    return run
  
# Settings of every supported test collection:
#   collection name -> (ir_datasets id, path of the Terrier index).
# Dataset documentation:
#   core17    https://ir-datasets.com/nyt.html
#   core18    https://ir-datasets.com/wapo.html
#   robust04  https://ir-datasets.com/disks45.html
#   robust05  https://ir-datasets.com/aquaint.html
COLLECTIONS = {
    'core17': ('nyt/trec-core-2017', '../indices/nyt'),
    'core18': ('wapo/v2/trec-core-2018', '../indices/wapo_v2/'),
    'robust04': ('disks45/nocr/trec-robust-2004', '../indices/disks45_nocr'),
    'robust05': ('aquaint/trec-robust-2005', '../indices/aquaint'),
}

# Change only this value to run the cell for another collection; all
# dependent settings below are derived from it, so they cannot get out of sync.
collection = 'core17'
irds_id, index_path = COLLECTIONS[collection]

# Generated query variants, read as a semicolon-separated file with the
# four column names assigned here.
qv = pd.read_csv('../queries/gpt-4o/' + collection + '.csv', names=['qno', 'prompt', 'qid', 'query'], sep=';')
dataset = pt.get_dataset('irds:' + irds_id)
index_ref = pt.IndexRef.of(index_path)
qrels = Qrels.from_ir_datasets(irds_id)

# Three-stage pipeline: BM25 retrieval, RM3 query rewriting on that
# output, then a second BM25 retrieval.
pipe = (
    pt.BatchRetrieve(index_ref, wmodel="BM25")
    >> pt.rewrite.RM3(index_ref)
    >> pt.BatchRetrieve(index_ref, wmodel="BM25")
)

# Run the pipeline for the dataset's 'title' topics.
baseline = pipe(dataset.get_topics('title'))

# Round-trip through a TREC-format file named 'tmp' in the working
# directory to turn the PyTerrier results into a ranx Run.
pt.io.write_results(baseline, 'tmp', format='trec', append=False)
baseline_ranx = Run.from_file('tmp', kind='trec')

# Label the run and store it lz4-compressed under the collection name.
baseline_ranx.name = 'BM25+RM3'
baseline_ranx.save(f'../runs/baselines/{collection}.BM25+RM3.lz4', kind='lz4')