# Reranking Capability of SuRe 

- Note: We assume that you have already run (1) SuRe to obtain the conditional summarization, and (2) Preference Evaluation to obtain the generic summarization.

In [None]:
import pprint
import json
import copy
import numpy as np
from tqdm import tqdm
import time
from datetime import timedelta, datetime
pp = pprint.PrettyPrinter(indent=4)

## Loading dataset
- Available: ['nq-test', 'wq-test', 'hotpotqa', '2wikimultihopqa']

In [None]:
data_type = '2wikimultihopqa'

In [None]:
# Load the BM25 file for the selected dataset. The context manager closes the
# file handle as soon as parsing is done instead of leaving it to the GC.
with open(f'./datasets/{data_type}-bm25.json', encoding='utf-8') as f:
    dataset = json.load(f)

## Setup OpenAI
- Caution: a valid OpenAI API key must be provided before running the cells below.

In [None]:
import os

import openai

# Read the key from the OPENAI_API_KEY environment variable so that a secret
# never has to be written into (and committed with) the notebook. Assigning a
# literal "" here would also override a key already present in the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# Chat model used for every API call in this notebook.
model = "gpt-3.5-turbo"

In [None]:
from functions import api_query

# Reranking Results

## Method #1: Top-1 with BM25

In [None]:
from functions import use_api_base

In [None]:
base_retrieval1 = use_api_base(model, dataset, iters=1, n_articles=1)

In [None]:
from data_utils import get_em_f1

In [None]:
em_top1, f1_top1 = get_em_f1(dataset, base_retrieval1)

In [None]:
print("EM: {}, F1: {}".format(em_top1.mean(), f1_top1.mean()))

## Method #2: Similarity with question (using sentence embedding)

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
model_sent = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')

In [None]:
def get_sim_question(model, examples, n_articles=10):
    """Select, for every example, the context that scores highest against its question.

    Args:
        model: Encoder exposing ``encode``; it is called once on the question
            string and once on the list of passage texts.
        examples: Iterable of dicts with a ``'question'`` entry and a
            ``'contexts'`` list whose items carry a ``'text'`` entry.
        n_articles: Number of leading contexts scored per example; each
            example must provide at least this many.

    Returns:
        A list with one selected context dict per example, in input order.
    """
    selected = []

    for _, sample in tqdm(enumerate(examples)):
        question_vec = model.encode(sample['question'])

        passage_texts = [sample['contexts'][k]['text'] for k in range(n_articles)]
        passage_vecs = model.encode(passage_texts)

        # Dot-product scores of the question against each passage; row 0 is
        # the (only) query row.
        scores = util.dot_score(question_vec, passage_vecs)
        best = int(np.argmax(scores[0]))
        selected.append(sample['contexts'][best])

    return selected

In [None]:
rerank_sim_q = get_sim_question(model_sent, dataset, n_articles=10)

In [None]:
def get_pred_selected(dataset, selected, idx):
    """Build the answer-prediction prompt for one example and its chosen passage.

    Args:
        dataset: Sequence of example dicts; ``dataset[idx]['question']`` is used.
        selected: Sequence of passage dicts aligned with ``dataset``; the
            ``'title'`` and ``'text'`` of ``selected[idx]`` are used.
        idx: Position of the example to build the prompt for.

    Returns:
        The prompt string: the passage, the task description, the question
        and a trailing ``"Answer: "`` cue.
    """
    record = dataset[idx]
    passage = selected[idx]
    return (
        f"Passage #1 Title: {passage['title']}\n"
        f"Passage #1 Text: {passage['text']} \n\n"
        "Task description: predict the answer to the following question. Do not exceed 3 words."
        f"\n\nQuestion: {record['question']}."
        "\n\nAnswer: "
    )

In [None]:
def use_api_rerank(model, dataset, selected, iters=1, temp=0.0):
    """Query the LLM for every example, using its pre-selected passage as context.

    Args:
        model: Model identifier forwarded to ``api_query``.
        dataset: Sequence of example dicts.
        selected: Sequence of passage dicts aligned with ``dataset``.
        iters: Forwarded to ``api_query`` as its fourth positional argument.
        temp: Forwarded to ``api_query`` as its third positional argument.

    Returns:
        A list holding the ``api_query`` result for each example, in order.
    """
    return [
        api_query(model, get_pred_selected(dataset, selected, idx=pos), temp, iters)
        for pos, _ in tqdm(enumerate(dataset))
    ]

In [None]:
pred_sim_q = use_api_rerank(model, dataset, rerank_sim_q)

In [None]:
em_sim_q, f1_sim_q = get_em_f1(dataset, pred_sim_q)

In [None]:
print("EM: {}, F1: {}".format(em_sim_q.mean(), f1_sim_q.mean()))

## Method #3: LLM as Reranker
- This idea was published at EMNLP 2023 ("Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents").
- The code is adapted from [the official repository](https://github.com/sunnweiwei/RankGPT).

In [None]:
def prompt_rerank(examples, idx, n_articles=10, start_idx=0):
    """Compose the RankGPT-style ranking prompt for a window of passages.

    Args:
        examples: Sequence of example dicts with ``'question'`` and
            ``'contexts'`` (items carrying ``'title'`` and ``'text'``).
        idx: Position of the example to build the prompt for.
        n_articles: Number of passages listed in the prompt.
        start_idx: Index of the first context listed; indices wrap around the
            end of the context list.

    Returns:
        The full prompt string; passages are labelled ``[1]``..``[n_articles]``.
    """
    sample = examples[idx]
    passages = sample['contexts']
    total = len(passages)
    query = f"Question: {sample['question']}"

    parts = [
        "This is RankGPT, an intelligent assistant that can rank passages based on their relevancy to the query.\n\n",
        f"The following are {n_articles} passages, each indicated by number identifier []. I can rank them based on their relevance to query: {query}\n\n",
    ]

    # Labels are 1-based in the prompt; the source index wraps modulo the
    # number of available contexts.
    for label in range(1, n_articles + 1):
        passage = passages[(label - 1 + start_idx) % total]
        parts.append(f"[{label}] Title: {passage['title']}\nText: {passage['text']}\n\n")

    parts.append(f"The search query is: {query}\n\n")
    parts.append(
        f"I will rank the {n_articles} passages above based on their relevance to the search query. The passages will be listed in descending order using identifiers, and the most relevant passages should be listed first, and the output format should be [] > [] > etc, e.g., [1] > [2] > etc.\n\n"
    )
    parts.append(f"The ranking results of the {n_articles} passages (only identifiers) is:")

    return "".join(parts)

In [None]:
def parsing_output(output, window_size=5):
    """Turn an LLM ranking string such as ``"[2] > [1] > [3]"`` into indices.

    Every bracketed identifier ``[k]`` is read in order of appearance.
    Identifiers outside ``1..window_size`` and repeated identifiers are
    ignored, and any position the model did not mention is appended in its
    original order. The result is therefore always a valid permutation, which
    the caller relies on when it reorders a window by fancy-indexing.

    Args:
        output: Raw text returned by the model.
        window_size: Number of passages that were shown to the model.

    Returns:
        ``np.ndarray`` of dtype int64 and length ``window_size`` holding the
        zero-based passage positions, most relevant first. If no usable
        identifier is found the identity ordering is returned.
    """
    import re  # local import keeps this notebook cell self-contained

    ranking = []
    seen = set()
    for token in re.findall(r'\[(\d+)\]', output):
        position = int(token) - 1  # prompt labels are 1-based
        if 0 <= position < window_size and position not in seen:
            seen.add(position)
            ranking.append(position)

    # Keep unmentioned passages, in their original relative order, at the end.
    ranking.extend(p for p in range(window_size) if p not in seen)

    return np.array(ranking, dtype=np.int64)

In [None]:
def sliding_window(model, examples, idx, n_articles=10, window_size=5, step_size=3):
    """Rerank the first ``n_articles`` contexts of one example with an LLM.

    Windows of ``window_size`` passages are ranked starting from the tail of
    the list and moving ``step_size`` positions towards the head per step; the
    last step always covers the head (start index 0).

    Each prompt lists the passages in their *current* ranked order, so the
    permutation returned for a window is applied to exactly the passages the
    model was shown. Building the prompt from the original order would, after
    the first window, apply the model's ranking to different passages.

    Args:
        model: Model identifier forwarded to ``api_query``.
        examples: Sequence of example dicts with ``'question'`` and ``'contexts'``.
        idx: Position of the example to rerank.
        n_articles: Number of leading contexts taking part in the reranking.
        window_size: Number of passages shown to the model per call.
            NOTE: assumed to be <= ``n_articles``.
        step_size: Distance between consecutive window starts.

    Returns:
        ``np.ndarray`` of length ``n_articles`` with original context indices,
        most relevant first.
    """
    data = examples[idx]
    contexts = data['contexts']
    n_ctxs = n_articles
    ranking = np.arange(n_ctxs)
    n_windows = n_ctxs // step_size

    for step in range(n_windows):
        if step == n_windows - 1:
            start_idx = 0
        else:
            # Clamp at 0: for some window/step combinations the schedule would
            # otherwise go negative and select an empty slice.
            start_idx = max(0, n_ctxs - window_size - step * step_size)

        # View of the example whose leading contexts follow the current
        # ranking. The modulo mirrors prompt_rerank's wrap-around when fewer
        # than n_articles contexts exist.
        reordered = [contexts[j % len(contexts)] for j in ranking]
        view = {**data, 'contexts': reordered + list(contexts[n_ctxs:])}

        query_rerank = prompt_rerank([view], 0, window_size, start_idx)
        output = api_query(model, query_rerank, temp=0, iters=1)[0]
        ordering = parsing_output(output, window_size)

        window = slice(start_idx, start_idx + window_size)
        ranking[window] = ranking[window][ordering]

    return ranking

In [None]:
def rerank_all_data(model, dataset):
    """Run ``sliding_window`` with its default settings on every example.

    Args:
        model: Model identifier forwarded to ``sliding_window``.
        dataset: Sequence of example dicts.

    Returns:
        A list with one ``sliding_window`` result per example, in order.
    """
    return [sliding_window(model, dataset, pos) for pos in tqdm(range(len(dataset)))]

In [None]:
get_ranking_window = rerank_all_data(model, dataset)

In [None]:
# For every example keep the context whose index the LLM reranker placed first.
rerank_llm = [
    item['contexts'][int(get_ranking_window[pos][0])]
    for pos, item in enumerate(dataset)
]

In [None]:
pred_llm = use_api_rerank(model, dataset, rerank_llm)

In [None]:
em_llm, f1_llm = get_em_f1(dataset, pred_llm)

In [None]:
print("EM: {}, F1: {}".format(em_llm.mean(), f1_llm.mean()))

## Method #4: Similarity with Generic Summarization 

In [None]:
generic_loc = './temp.json'

In [None]:
# Load all generic summaries without truncation: get_sim_summary reads
# summaries[i] for every dataset example, so a shortened list raises
# IndexError. The context manager also closes the file handle.
with open(generic_loc, encoding='utf-8') as f:
    generic_summary = json.load(f)

In [None]:
def get_sim_summary(model, dataset, summaries, n_articles=10):
    """Select, for every example, the context that scores highest against its summary.

    Args:
        model: Encoder exposing ``encode``; it is called once on the summary
            and once on the list of passage texts.
        dataset: Iterable of dicts with a ``'contexts'`` list whose items
            carry a ``'text'`` entry.
        summaries: Sequence indexed by example position; entry ``i`` is
            encoded as the query for example ``i``.
        n_articles: Number of leading contexts scored per example; each
            example must provide at least this many.

    Returns:
        A list with one selected context dict per example, in input order.
    """
    selected = []

    for row, sample in tqdm(enumerate(dataset)):
        summary_vec = model.encode(summaries[row])

        passage_texts = [sample['contexts'][k]['text'] for k in range(n_articles)]
        passage_vecs = model.encode(passage_texts)

        # Dot-product scores of the summary against each passage; row 0 is
        # the (only) query row.
        scores = util.dot_score(summary_vec, passage_vecs)
        best = int(np.argmax(scores[0]))
        selected.append(sample['contexts'][best])

    return selected

In [None]:
rerank_sim_gen_summary = get_sim_summary(model_sent, dataset, generic_summary)

In [None]:
pred_sim_gen_summary = use_api_rerank(model, dataset, rerank_sim_gen_summary)

In [None]:
em_sim_gen_summary, f1_sim_gen_summary = get_em_f1(dataset, pred_sim_gen_summary)

In [None]:
print("EM: {}, F1: {}".format(em_sim_gen_summary.mean(), f1_sim_gen_summary.mean()))

## Method #5: Similarity with SuRe's Summarization 

In [None]:
sure_loc = './temp2/results_summary.json'

In [None]:
# Load all SuRe summaries without truncation: get_sim_summary reads
# summaries[i] for every dataset example, so a shortened list raises
# IndexError. The context manager also closes the file handle.
with open(sure_loc, encoding='utf-8') as f:
    sure_summary = json.load(f)

In [None]:
rerank_sim_sure_summary = get_sim_summary(model_sent, dataset, sure_summary)

In [None]:
pred_sim_sure_summary = use_api_rerank(model, dataset, rerank_sim_sure_summary)

In [None]:
em_sim_sure_summary, f1_sim_sure_summary = get_em_f1(dataset, pred_sim_sure_summary)

In [None]:
print("EM: {}, F1: {}".format(em_sim_sure_summary.mean(), f1_sim_sure_summary.mean()))