# Preference Evaluation between SuRe's summarization and Generic summarization

- Note: We assume that SuRe has already been run and its conditional summarizations have been obtained.

In [None]:
import pprint
import json
import copy
import numpy as np
from tqdm import tqdm
import time
from datetime import timedelta, datetime
# Pretty-printer configured with a 4-space indent (not referenced in the cells visible here).
pp = pprint.PrettyPrinter(indent=4)

## Loading dataset
- Available: ['nq-test', 'wq-test', 'hotpotqa', '2wikimultihopqa']

In [None]:
# Dataset identifier: selects ./datasets/{data_type}.json and prefixes the output file names.
data_type = '2wikimultihopqa-bm25'

In [None]:
# Load the evaluation examples. The context manager closes the file handle
# deterministically instead of leaving it open, and the file is read as UTF-8,
# the same encoding this notebook uses when writing its own JSON outputs.
with open(f'./datasets/{data_type}.json', encoding='utf-8') as reader:
    dataset = json.load(reader)

## Setup OpenAI
- Caution: a valid OpenAI API key must be inserted below before running the remaining cells.

In [None]:
import openai

# Placeholder: insert a valid OpenAI API key here before running the cells below.
openai.api_key = ""
# Model name handed to the use_api_* helpers, which forward it to api_query.
model = "gpt-3.5-turbo"

## Generic Summarization

In [None]:
from functions import api_query

### Generation

In [None]:
def summarize(dataset, idx, n_articles=10, start_idx=0):
    """Build the generic-summarization prompt for one example.

    Args:
        dataset: Indexable collection of examples; ``dataset[idx]`` must provide
            a ``'contexts'`` list (items with ``'title'`` and ``'text'``) and a
            ``'question'``.
        idx: Position of the example inside ``dataset``.
        n_articles: Number of passages listed in the prompt.
        start_idx: Position of the first passage to list.

    Returns:
        The prompt string: the listed passages, the writing instruction, the
        question, and a trailing ``"Passage: "`` cue.
    """
    example = dataset[idx]
    contexts = example['contexts']
    n_contexts = len(contexts)

    passages = []
    for position in range(start_idx, start_idx + n_articles):
        # Wrap around when the requested window runs past the available contexts;
        # the displayed passage number keeps counting from the unwrapped position.
        ctx = contexts[position % n_contexts]
        label = position + 1
        passages.append(
            f"\n\nPassage #{label} Title: {ctx['title']}\nPassage #{label} Text: {ctx['text']}"
        )

    instruction = "\n\nYour job is to act as a professional writer. You will write a good-quality passage that can support the prediction about the question only based on the information in the provided supporting passages.\n\nNow, let's start. After you write, please write [DONE] to indicate you are done. Do not write a prefix (e.g., \"Response:\") while writing a passage."
    question_part = f"\n\nQuestion: {example['question']}"

    return "".join(passages) + instruction + question_part + "\nPassage: "

In [None]:
def use_api_summarize(model, dataset, n_articles=10, start_idx=0, temp=0, iters=1):
    """Query the model for a generic summary of every example in ``dataset``.

    Args:
        model: Model name forwarded to ``api_query``.
        dataset: Collection of examples accepted by ``summarize``.
        n_articles: Number of passages placed in each prompt.
        start_idx: Position of the first passage placed in each prompt.
        temp: Forwarded to ``api_query`` (presumably the sampling temperature).
        iters: Forwarded to ``api_query`` unchanged.

    Returns:
        A list holding one ``api_query`` result per example, in dataset order.
    """
    res = []

    for i, _ in tqdm(enumerate(dataset)):
        # Forward start_idx too: it used to be accepted by this function but
        # never reached summarize, so every prompt started at passage 0.
        query = summarize(dataset, i, n_articles, start_idx)
        res.append(api_query(model, query, temp, iters))
    return res

In [None]:
# Generate one generic summary per dataset example, with 10 passages per prompt.
generic_summary = use_api_summarize(model, dataset, n_articles=10)

In [None]:
# Directory (relative to the working directory) that receives the JSON outputs written below.
save_loc = 'output_path'

In [None]:
# Persist the generic summaries as indented UTF-8 JSON followed by a newline.
summary_path = f'./{save_loc}/{data_type}_generic_summary.json'
with open(summary_path, "w", encoding='utf-8') as writer:
    serialized = json.dumps(generic_summary, indent=4, ensure_ascii=False)
    writer.write(serialized + "\n")

### Inference

In [None]:
def prediction(dataset, idx, summarization):
    """Build the answer-prediction prompt for one example.

    Args:
        dataset: Indexable collection of examples; ``dataset[idx]`` must provide
            a ``'question'``.
        idx: Position of the example in ``dataset`` and ``summarization``.
        summarization: Indexable collection whose ``idx``-th entry is inserted
            as the passage (formatted with its default string form).

    Returns:
        The prompt string: instruction, passage, question, and an
        ``"Answer: "`` cue.
    """
    example = dataset[idx]
    passage = summarization[idx]

    sections = [
        "Below is the passage related to the question at the end. After reading the passage, provide correct answer to the question at the end. Answer should not exceed 3 words.",
        f"\n\nPassage:\n{passage}",
        f"\n\nQuestion:\n{example['question']}",
        "\n\nAnswer: ",
    ]
    return "".join(sections)

In [None]:
def use_api_prediction(model, dataset, summarization, temp=0, iters=1):
    """Query the model for an answer to every example, given its summary.

    Args:
        model: Model name forwarded to ``api_query``.
        dataset: Collection of examples accepted by ``prediction``.
        summarization: Per-example summaries, aligned with ``dataset``.
        temp: Forwarded to ``api_query`` (presumably the sampling temperature).
        iters: Forwarded to ``api_query`` unchanged.

    Returns:
        A list holding one ``api_query`` result per example, in dataset order.
    """
    answers = []
    for position, _ in tqdm(enumerate(dataset)):
        prompt = prediction(dataset, position, summarization)
        answers.append(api_query(model, prompt, temp, iters))
    return answers

In [None]:
# Predict an answer for every example from its generic summary.
generic_pred = use_api_prediction(model, dataset, generic_summary)

In [None]:
# Persist the generic predictions as indented UTF-8 JSON followed by a newline.
pred_path = f'./{save_loc}/{data_type}_generic_pred.json'
with open(pred_path, "w", encoding='utf-8') as writer:
    serialized = json.dumps(generic_pred, indent=4, ensure_ascii=False)
    writer.write(serialized + "\n")

In [None]:
from data_utils import get_em_f1

In [None]:
# Score the generic predictions against the dataset with data_utils.get_em_f1
# (presumably per-example exact-match and F1 scores — confirm against data_utils).
em_generic, f1_generic = get_em_f1(dataset, generic_pred)

In [None]:
# Positions where em_generic equals 1 (assumes em_generic is a 1-D array of per-example scores — confirm).
generic_ans_idx = np.where(em_generic == 1)[0]

## Load SuRe's Data

In [None]:
# Folder with the saved SuRe outputs read below (sure_ans_idx.npy, summary1.json, summary2.json, indices.json).
sure_loc = './test_folder/2wiki_start0_end5_sure_ret10/'

In [None]:
# Indices saved by the SuRe run (presumably the examples SuRe answered correctly — confirm).
sure_ans_idx = np.load(sure_loc + 'sure_ans_idx.npy')

# Read each JSON artifact through a context manager so its file handle is
# closed right away instead of being leaked; the files are read as UTF-8.
with open(sure_loc + 'summary1.json', encoding='utf-8') as reader:
    sure_summary1 = json.load(reader)
with open(sure_loc + 'summary2.json', encoding='utf-8') as reader:
    sure_summary2 = json.load(reader)
with open(sure_loc + 'indices.json', encoding='utf-8') as reader:
    all_indices = json.load(reader)

# NOTE(review): entries 0 and 2 of indices.json are used and entry 1 is not
# read here — confirm this matches the layout written by the SuRe run.
sure_choice1_idx = np.array(all_indices[0])
sure_choice2_idx = np.array(all_indices[2])

### Obtain indices of mutually correct predictions 

In [None]:
# Keep the indices present in both answer-index arrays that also appear in
# one of the two SuRe choice index arrays.
# (The spelling of `intesrsect` is kept because other cells may reference it.)
intesrsect = np.intersect1d(generic_ans_idx, sure_ans_idx)
# Build the combined choice array once instead of on every loop iteration.
sure_choice_idx = np.concatenate([sure_choice1_idx, sure_choice2_idx])
test_idx = [i for i in intesrsect if i in sure_choice_idx]
print(len(test_idx))

# Prompting for evaluation

In [None]:
def preference1(dataset, idx, naive_reason, ours_reason, ours_idx):
    """Build the comparison prompt with the ``naive_reason`` summary listed first.

    Args:
        dataset: Indexable collection of examples; ``dataset[idx]`` must provide
            ``'question'`` and ``'answers'`` (a sequence of strings).
        idx: Position of the example.
        naive_reason: Per-example summaries shown as "Summary 1".
        ours_reason: Sequence of per-example summary collections; the one at
            ``ours_idx`` supplies "Summary 2".
        ours_idx: Which entry of ``ours_reason`` to use.

    Returns:
        The prompt string asking which of the two summaries better supports
        the target question-answer pair.
    """
    example = dataset[idx]
    target_question = example['question']
    target_answers = example['answers']
    first_summary = naive_reason[idx]
    second_summary = ours_reason[ours_idx][idx]

    sections = [
        "Question: Given the following summaries for the target question, determine which one is more informative and plausible as rationale to support a given target question-answer pair.",
        f"\n\nSummary 1:\n{first_summary}",
        f"\n\nSummary 2:\n{second_summary}",
        f"\n\nTarget Question:\n{target_question}",
        "\n\nTarget Answer:\n",
        # All acceptable answers on one line, comma-separated.
        ", ".join(target_answers),
        "\n\nYour Task:\nIdentify which summary (Summary 1 or Summary 2) is more informative and plausible as rationale to support a given answer at hand. Choices: [Summary 1, Summary 2].\n\nAnswer:",
    ]
    return "".join(sections)

def preference2(dataset, idx, naive_reason, ours_reason, ours_idx):
    """Build the comparison prompt with the ``ours_reason`` summary listed first.

    Same prompt as the other ordering, but the two summaries swap positions so
    that the judge sees them in the opposite order.

    Args:
        dataset: Indexable collection of examples; ``dataset[idx]`` must provide
            ``'question'`` and ``'answers'`` (a sequence of strings).
        idx: Position of the example.
        naive_reason: Per-example summaries shown as "Summary 2".
        ours_reason: Sequence of per-example summary collections; the one at
            ``ours_idx`` supplies "Summary 1".
        ours_idx: Which entry of ``ours_reason`` to use.

    Returns:
        The prompt string asking which of the two summaries better supports
        the target question-answer pair.
    """
    example = dataset[idx]
    target_question = example['question']
    target_answers = example['answers']
    second_summary = naive_reason[idx]
    first_summary = ours_reason[ours_idx][idx]

    sections = [
        "Question: Given the following summaries for the target question, determine which one is more informative and plausible as rationale to support a given target question-answer pair.",
        f"\n\nSummary 1:\n{first_summary}",
        f"\n\nSummary 2:\n{second_summary}",
        f"\n\nTarget Question:\n{target_question}",
        "\n\nTarget Answer:\n",
        # All acceptable answers on one line, comma-separated.
        ", ".join(target_answers),
        "\n\nYour Task:\nIdentify which summary (Summary 1 or Summary 2) is more informative and plausible as rationale to support a given answer at hand. Choices: [Summary 1, Summary 2].\n\nAnswer:",
    ]
    return "".join(sections)

In [None]:
def use_api_preference(model, dataset, test_idx, naive_reason, ours_reason, choice1_idx, choice2_idx, iters=1, temp=0.0):
    """Ask the model to compare the two summaries of each example, in both orders.

    For every index in ``test_idx`` two prompts are built, one with
    ``preference1`` and one with ``preference2``, and each is sent to
    ``api_query``.

    Args:
        model: Model name forwarded to ``api_query``.
        dataset: Collection of examples accepted by the prompt builders.
        test_idx: Example indices to evaluate.
        naive_reason: Per-example summaries for the ``naive_reason`` slot.
        ours_reason: Two per-example summary collections; entry 0 is used for
            indices in ``choice1_idx`` and entry 1 for indices in ``choice2_idx``.
        choice1_idx: Indices that select ``ours_reason[0]``.
        choice2_idx: Indices that select ``ours_reason[1]``.
        iters: Forwarded to ``api_query`` unchanged.
        temp: Forwarded to ``api_query`` (presumably the sampling temperature).

    Returns:
        A pair ``(res1, res2)``: the first element of each ``api_query`` result
        for the ``preference1`` prompts and for the ``preference2`` prompts,
        both aligned with ``test_idx``.

    Raises:
        ValueError: If an index of ``test_idx`` is in neither ``choice1_idx``
            nor ``choice2_idx``.
    """
    res1, res2 = [], []
    for data_idx in tqdm(test_idx):
        if data_idx in choice1_idx:
            ours_idx_tmp = 0
        elif data_idx in choice2_idx:
            ours_idx_tmp = 1
        else:
            # Previously this only printed a message and carried on with an
            # unbound (first iteration) or stale (later iterations) selector,
            # silently comparing against the wrong summary.
            raise ValueError(
                f"Index {data_idx} is in neither choice1_idx nor choice2_idx"
            )

        query1 = preference1(dataset, data_idx, naive_reason, ours_reason, ours_idx_tmp)
        query2 = preference2(dataset, data_idx, naive_reason, ours_reason, ours_idx_tmp)

        # Only the first entry of each api_query result is kept.
        res1.append(api_query(model, query1, temp, iters)[0])
        res2.append(api_query(model, query2, temp, iters)[0])
    return res1, res2

In [None]:
# Run the pairwise comparison in both summary orders; generic summaries fill the
# naive_reason slot and the two SuRe summary sets fill ours_reason.
preference1_result, preference2_result = use_api_preference(model, dataset, test_idx, generic_summary, [sure_summary1, sure_summary2], sure_choice1_idx, sure_choice2_idx)

In [None]:
# Turn each pair of verdicts into a one-hot [lose, tie, win] vector.
# In the first prompt "Summary 1" is the generic summary and "Summary 2" is
# SuRe's; the second prompt swaps them. A win or loss is counted only when both
# orderings agree on the same underlying summary; anything else is a tie.
result = []
for pos, first_reply in enumerate(preference1_result):
    second_reply = preference2_result[pos]
    # Only the opening characters of each reply are inspected.
    head1, head2 = first_reply[:11], second_reply[:11]

    if 'Summary 1' in head1 and 'Summary 2' in head2:
        slot = 0  # generic summary preferred in both orderings
    elif 'Summary 2' in head1 and 'Summary 1' in head2:
        slot = 2  # SuRe summary preferred in both orderings
    else:
        slot = 1  # inconsistent or unparseable verdicts

    outcome = np.zeros(3)
    outcome[slot] += 1
    result.append(outcome)

In [None]:
# Average the per-example [lose, tie, win] vectors into overall rates.
lose = 0
tie = 0
win = 0
for outcome in result:
    total = sum(outcome)
    lose += outcome[0] / total
    tie += outcome[1] / total
    win += outcome[2] / total

if result:
    print(lose/len(result), tie/len(result), win/len(result))
else:
    # With nothing compared the rates are undefined; report that instead of
    # failing with ZeroDivisionError.
    print('No examples were compared; lose/tie/win rates are undefined.')