# The Revenge of Rocchio's Angels

We will be #1 this time

Local Script Dependencies

In [1]:
from engine import SearchEngine
from evaluate_map import *
from optimizing import Optimize
import shutil
import os

  from .autonotebook import tqdm as notebook_tqdm


cuda


Full (Current) Pipeline

In [2]:
# Load the three query files and the relevance judgments (qrels) used by the cells below.
topics, topics_expanded, topics_thes = (
    load_topics(query_file)
    for query_file in (
        "Data/queriesROBUST.txt",
        "Data/chatExpandedQueries.txt",
        "Data/chatQueries.txt",
    )
)
qrels = load_qrels("Data/qrels_50_Queries")

In [3]:
def subset_topics(topics, first_id=301, last_id=350):
    """Return the topics whose numeric id lies in ``[first_id, last_id]``.

    Args:
        topics: Mapping of topic id -> query. Every key must be convertible
            with ``int()``.
        first_id: Smallest topic id to keep (inclusive). Defaults to 301.
        last_id: Largest topic id to keep (inclusive). Defaults to 350.

    Returns:
        A new dict holding the selected entries in their original order;
        ``topics`` itself is not modified.

    Raises:
        ValueError: If a key cannot be converted to ``int``.
    """
    return {
        topic_id: query
        for topic_id, query in topics.items()
        if first_id <= int(topic_id) <= last_id
    }

# Apply the same topic-id window to each of the three loaded query sets.
topics_subset, topics_expanded_subset, topics_thes_subset = (
    subset_topics(topic_set)
    for topic_set in (topics, topics_expanded, topics_thes)
)


In [6]:
def compare_rerankers(topics_lists, qrels, rerankers, fusion_weights):
    """Run retrieval once per reranker and print MAP for each rerank fusion weight.

    The ``Results`` directory is emptied (or created) before anything runs, so
    run files from earlier calls are discarded.

    Args:
        topics_lists: Passed unchanged to ``SearchEngine.search_all_queries``.
        qrels: Relevance judgments handed to ``mean_average_precision``.
        rerankers: Iterable of reranker identifiers given to ``set_searcher``;
            ``None`` is treated specially (see below).
        fusion_weights: Rerank fusion weights. For each weight the run file
            ``Results/run_{reranker}_rrf_{weight}.txt`` is read back and scored.

    Returns:
        None. Scores are printed.
    """
    # A bare rmtree raises FileNotFoundError on a fresh checkout where
    # "Results" has never been created, so only remove it when present.
    if os.path.isdir("Results"):
        shutil.rmtree("Results")
    os.makedirs("Results", exist_ok=True)

    for reranker in rerankers:
        print(f"Starting retrieval with reranker {reranker}")
        se = SearchEngine()
        se.set_searcher(
            approach="bm25",
            fb_terms=20,
            fb_docs=5,
            original_query_weight=0.6,
            mu=300,
            reranker=reranker,
        )
        se.search_all_queries(
            topics_lists,
            k=1000,
            m=400,
            output_file=f"run_{reranker}",
            rerank_fusion_weights=fusion_weights,
            llm_query_fusion_weights=[0.8, 0.2, 0.0],
        )
        for fusion_weight in fusion_weights:
            run = load_run(f"Results/run_{reranker}_rrf_{fusion_weight}.txt")
            map_score, _ = mean_average_precision(qrels, run)
            print(f"MAP for reranker {reranker} with rrf {fusion_weight}: {map_score}")
            # With no reranker only the first fusion weight is scored
            # (presumably because the weight has no effect then — confirm).
            if reranker is None:
                break


In [7]:
# All three query sets are passed as their topic-id subsets so the lists cover the same topics
# (the original call mixed two subsets with the full `topics_thes`).
compare_rerankers(
    [topics_subset, topics_expanded_subset, topics_thes_subset],
    qrels,
    ["CE"],
    fusion_weights=[0, 0.2, 0.5, 0.7, 1],
)

Starting retrieval with reranker CE


Searching topics: 100%|██████████| 50/50 [09:52<00:00, 11.86s/it]


MAP for reranker CE with rrf 0: 0.3036872117859504
MAP for reranker CE with rrf 0.2: 0.3149248183435617
MAP for reranker CE with rrf 0.5: 0.31691938068195874
MAP for reranker CE with rrf 0.7: 0.3082415261382576
MAP for reranker CE with rrf 1: 0.28099778493024374


Get LLM datasets and optimize

In [2]:
from processing import create_llm_generated_queries
# Per the recorded cell output, this returns three paths:
# queries_col1.txt, queries_col2.txt and queries_col3.txt.
create_llm_generated_queries("Data/LLM_outputs.txt")

(WindowsPath('queries_col1.txt'),
 WindowsPath('queries_col2.txt'),
 WindowsPath('queries_col3.txt'))

In [None]:
def compare_llm_weights(queries_paths, qrels, rerankers, fusion_weights):
    """Run retrieval over several query files and print MAP per rerank fusion weight.

    The ``Results`` directory is emptied (or created) before anything runs.

    Args:
        queries_paths: Paths of query files; each is loaded with ``load_topics``
            and the resulting list is passed to ``search_all_queries``.
        qrels: Relevance judgments handed to ``mean_average_precision``.
        rerankers: Iterable of reranker identifiers given to ``set_searcher``.
        fusion_weights: Rerank fusion weights. For each weight the run file
            ``Results/run_{reranker}_rrf_{weight}.txt`` is read back and scored.

    Returns:
        None. Scores are printed.
    """
    topics_per_path = [load_topics(path) for path in queries_paths]

    # A bare rmtree raises FileNotFoundError when "Results" does not exist yet.
    if os.path.isdir("Results"):
        shutil.rmtree("Results")
    os.makedirs("Results", exist_ok=True)

    for reranker in rerankers:
        print(f"Starting retrieval with reranker {reranker}")
        se = SearchEngine()
        # Use the reranker being iterated; it was previously pinned to "CE"
        # while the output file and log lines were named after `reranker`.
        se.set_searcher(
            approach="bm25",
            fb_terms=20,
            fb_docs=5,
            original_query_weight=0.6,
            mu=300,
            reranker=reranker,
        )
        # Search the topics loaded from `queries_paths` (not the notebook-global
        # `topics`) and request every fusion weight that is read back below.
        # NOTE(review): `llm_query_fusion_weights` is not passed here, so the
        # engine's default applies — confirm that is intended for this sweep.
        se.search_all_queries(
            topics_per_path,
            k=1000,
            m=100,
            output_file=f"run_{reranker}",
            rerank_fusion_weights=fusion_weights,
        )
        for fusion_weight in fusion_weights:
            run = load_run(f"Results/run_{reranker}_rrf_{fusion_weight}.txt")
            map_score, _ = mean_average_precision(qrels, run)
            print(f"MAP for reranker {reranker} with rrf {fusion_weight}: {map_score}")
            # With no reranker only the first fusion weight is scored.
            if reranker is None:
                break


Save intermediate results

In [5]:
from processing import write_topk_jsonl_query, iter_query_hits
def save_inter_results(topics_subset, output_path="inter_bm25_rm3.jsonl", k=1000):
    """Retrieve the top-``k`` hits for every topic and write them to a JSONL file.

    A fresh ``SearchEngine`` is configured with the BM25 settings used elsewhere
    in this notebook and no reranker.

    Args:
        topics_subset: Mapping of topic id -> query text.
        output_path: File handed to ``write_topk_jsonl_query`` for every topic.
            Defaults to the previously hard-coded ``"inter_bm25_rm3.jsonl"``.
        k: Number of hits requested per query. Defaults to 1000.

    Returns:
        None.
    """
    se = SearchEngine()
    se.set_searcher(
        approach="bm25",
        fb_terms=20,
        fb_docs=5,
        original_query_weight=0.6,
        mu=300,
        reranker=None,
    )
    # NOTE(review): the same path is written once per topic, so
    # write_topk_jsonl_query presumably appends — if so, delete the file
    # before re-running this cell to avoid duplicated records. Confirm.
    for qid, query in topics_subset.items():
        hits = se.get_top_k(query, k, clean=True)
        write_topk_jsonl_query(hits, output_path, qid)

In [6]:
save_inter_results(topics_subset)

In [7]:
def check_jsonl_results(jsonl_path):
    """Convert saved JSONL hits into a run file and print its MAP.

    Every hit yielded by ``iter_query_hits(jsonl_path)`` is written to
    ``Results/jsonl_res.txt`` as ``"<qid> Q0 <docid> <rank> <score> 1"``, with
    ranks starting at 1 in the order the hits are yielded. The file is then
    loaded with ``load_run`` and scored against the notebook-global ``qrels``.

    Args:
        jsonl_path: Path of the JSONL file to read.

    Returns:
        None. The MAP score is printed.
    """
    run_path = "Results/jsonl_res.txt"
    os.makedirs(os.path.dirname(run_path), exist_ok=True)

    # Open once in write mode: appending (as before) made every re-run of this
    # cell add a second copy of each line to the run file being scored.
    with open(run_path, "w", encoding="utf-8") as f:
        for qid, hits in iter_query_hits(jsonl_path):
            for rank, hit in enumerate(hits, start=1):
                f.write(f"{qid} Q0 {hit.docid} {rank} {hit.score:.6f} 1\n")

    run = load_run(run_path)
    map_score, _ = mean_average_precision(qrels, run)
    print(f"MAP is: {map_score}")

In [8]:
check_jsonl_results("inter_bm25_rm3.jsonl")

MAP is: 0.27210851494078553


 Extract Train Set Results

In [None]:
# Topic ids singled out as "hard"; hits for these topics are additionally
# collected into `hard_hits` by the extraction loop in the next cell.
HARD_QUERIES =[309, 308, 338, 344, 348, 320, 328, 334, 303, 339] # From EDA

In [None]:
import pickle

# The original cell used `se` without any notebook-level definition, which
# raises NameError on a fresh kernel.
# NOTE(review): the settings below mirror save_inter_results (BM25 settings,
# no reranker), matching the "rm3" in the pickle names — confirm this is the
# configuration the train-set pickles should be built with.
se = SearchEngine()
se.set_searcher(
    approach="bm25",
    fb_terms=20,
    fb_docs=5,
    original_query_weight=0.6,
    mu=300,
    reranker=None,
)

N_TRAIN_TOPICS = 50  # only the first 50 topics (in file order) are processed

all_hits = {}
hard_hits = {}
for i, (qid, topic) in enumerate(topics.items()):
    results = se.get_top_k(topic, k=1000, clean=True)
    all_hits[f"{qid}_{topic}"] = results
    if int(qid) in HARD_QUERIES:
        hard_hits[f"{qid}_{topic}"] = results
    if i == N_TRAIN_TOPICS - 1:
        print(qid)  # id of the last topic included
        break

# Make sure the output directory exists before writing the pickles.
os.makedirs("pkls", exist_ok=True)
with open("pkls/top1000_rm3_train.pkl", "wb") as f:
    pickle.dump(all_hits, f)
with open("pkls/top1000_rm3_train_hard.pkl", "wb") as f:
    pickle.dump(hard_hits, f)

In [None]:
# NOTE(review): `res` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel. Looks like a leftover scratch cell — confirm,
# then remove it or point it at the intended variable.
res

In [None]:
# from pyserini.analysis import Analyzer, get_lucene_analyzer
# analyzer = get_lucene_analyzer(stemmer='porter', stopwords=False)
# se.reader.get_term_counts("spanish",analyzer) #(df,cf)

In [None]:
# Writes a run file in submission format; the file name is controlled by `output_file`.
# NOTE(review): elsewhere in this notebook `output_file` is a bare stem (e.g. "run_CE") and the
# results are read back from "Results/<stem>_rrf_<weight>.txt" — confirm that a value which already
# contains a directory and ".txt" is handled as intended. The next cell reads "Results/run.txt".
# `se` must already exist in the kernel; this cell does not create it.
se.search_all_queries(topics, k=5, m=2, output_file="Results/hey.txt")

In [None]:
# Score a previously written run file against the relevance judgments.
qrels = load_qrels("Data/qrels_50_Queries")   # or "qrel301.txt"
run   = load_run("Results/run.txt")

# The first returned value is the MAP shown as the cell output; the second is
# bound to `ap_by_q` (presumably average precision per query — confirm).
map_score, ap_by_q = mean_average_precision(qrels, run)
map_score

In [None]:
# Report total terms divided by document count, using the index reader's statistics.
stats = se.reader.stats()
print("average terms per doc: " + str(stats['total_terms'] / stats['documents']))

In [None]:
# Look up a single document by id and display whatever its `raw()` method returns.
# `se` must already exist in the kernel; this cell does not create it.
doc = se.searcher.doc("FT921-3160")
text = doc.raw()
text

In [None]:
# Bound to `map_from_paths` rather than `map`, so the built-in `map` is not
# shadowed for the rest of the kernel session.
map_from_paths = get_map_by_paths("Data/qrels_50_Queries", "Results/run.txt")

In [None]:
opti = Optimize()

# Grids for the other parameters, currently disabled:
# fb_terms_values = [5, 6, 8, 10, 15, 20]
# fb_docs_values = [5, 7, 10, 15]
# og_query_weight_values = [0.3, 0.4, 0.5, 0.6, 0.7]
mus = list(range(200, 1201, 100))  # 200, 300, ..., 1200

# Only `mus` is swept; the single-element lists pin the other three arguments
# (presumably fb_terms, fb_docs and original-query weight, in the order of the
# disabled grids above — confirm against Optimize.optimize_qld).
opti.optimize_qld(topics, [20], [5], [0.6], mus, k=1000)