In [1]:
import json
import numpy as np
from openai import OpenAI
from order_by.sorting import *
from order_by.utils import *
import openai
from collections import defaultdict

# The API key is kept in a local file next to the notebook rather than in the notebook itself.
with open(".open_ai_api_key") as f:
    api_key = f.read().strip()

# Record the installed SDK version in the output, then build the client used by later cells.
print(openai.__version__)
client = OpenAI(api_key=api_key)

1.95.1


# Sample and save data

In [2]:
# import datasets

# data = datasets.load_dataset("ms_marco", "v1.1", split="train")
# sampled_data = data.shuffle(seed=42).select(range(100))

# data_list = [dict(item) for item in sampled_data]
# with open("msmarco_v1.1_sampled_100.json", "w") as f:
#     json.dump(data_list, f, indent=2)

# print("Saved to 'msmarco_v1.1_sampled_100.json'")

# Load Experiment Data

In [3]:
# Load the sampled records; later cells read 'query' and 'passages' from each record.
with open("msmarco_v1.1_sampled_100.json") as f:
    data = json.load(f)

# Prompt Templates

In [None]:
# TODO: many output scores are identical (ties), so we still need to decide how to
# measure precision in those scenarios.

# Pointwise: score a single passage against the question.
# Placeholders: {question}, {passage}.
pointwise_prompt_template = (
    "You are given a question and a passage. "
    "Evaluate how well the passage answers the question by assigning a score from 0 to 5.\n"
    "Output only a float score.\n"
    "Question: {question}\n"
    "Passage: {passage}\n"
    "Score:"
)

# External pointwise: score every passage of a numbered list in one request.
# Placeholders: {question}, {numbered_passages}.
external_pointwise_prompt_template = (
    "You are given a question and a list of passages. "
    "Evaluate how well each passage answers the question. "
    "For each passage, assign a float score from 0 to 5.\n"
    "Output a JSON list of float scores in the same order as the input passages.\n"
    "Question: {question}\n"
    "Passages:\n"
    "{numbered_passages}\n"
    "Score:"
)

# Pairwise: decide which of two passages answers the question better.
# Placeholders: {question}, {passage_a}, {passage_b}.
pairwise_comparison_prompt_template = (
    "You are given a question and two passages. "
    "Determine which passage answers the question better.\n"
    "Question: {question}\n"
    "Passage A: {passage_a}\n"
    "Passage B: {passage_b}\n"
    "Output only one word: 'A' if Passage A is better, "
    "'B' if Passage B is better, or 'Equal' if both are equally good."
)

# External comparison: rank a whole numbered list of passages in one request.
# Placeholders: {question}, {numbered_passages}.
external_comparison_prompt_template = (
    "You are given a question and a list of passages. "
    "Rank the passages based on how well they answer the question, from best to worst.\n"
    "Question: {question}\n"
    "Passages:\n"
    "{numbered_passages}\n"
    "Output a JSON list of passage numbers in ranked order (best to worst)."
)

In [18]:
def create_numbered_passages(passages):
    """Render passages as a newline-separated list numbered from 1 ("1. ...", "2. ...")."""
    numbered_lines = (f"{number}. {text}" for number, text in enumerate(passages, start=1))
    return "\n".join(numbered_lines)

def precision(selected_ground_truth, output):
    """Compute precision@k for a batch of rankings, with k taken per row from the labels.

    For each row, k is the sum of that row of ``selected_ground_truth`` (the number of
    relevant items when labels are 0/1). The score is the summed relevance of the first
    k ranked items divided by k.

    Args:
        selected_ground_truth: 2-D array-like of relevance labels, one row per query.
        output: 2-D array-like of 1-based item numbers, best first, one row per query.

    Returns:
        1-D array with one precision value per row. A row whose labels sum to zero
        divides 0 by 0 and therefore yields NaN.
    """
    truth = np.asarray(selected_ground_truth)
    relevant_counts = truth.sum(axis=1)

    # Rankings are 1-based; shift them to 0-based column indices.
    ranked = np.asarray(output) - 1

    # hits_by_rank[r, j] is the relevance label of the item ranked j-th in row r.
    row_selector = np.arange(ranked.shape[0]).reshape(-1, 1)
    hits_by_rank = truth[row_selector, ranked]

    # Only the first k positions of each row count towards the score.
    within_top_k = np.arange(ranked.shape[1]) < relevant_counts.reshape(-1, 1)
    hits_in_top_k = (within_top_k * hits_by_rank).sum(axis=1)
    return hits_in_top_k / relevant_counts

In [14]:
# Dry run over the dataset: report every record in which no passage is marked as selected.
for i, s in enumerate(data):
    query = s['query']
    passage = s['passages']['passage_text']
    selected = s['passages']['is_selected']
    if sum(selected) == 0:
        print(f"skip bad experiment {i}", selected, sep="\n")

    # Uncomment to preview the rendered prompts for a record:
    # print(pointwise_prompt_template.format(question=query, passage=passage[0]))
    # print(external_pointwise_prompt_template.format(question=query, numbered_passages=create_numbered_passages(passage)))
    # print(pairwise_comparison_prompt_template.format(question=query, passage_a=passage[0], passage_b=passage[1]))
    # print(external_comparison_prompt_template.format(question=query, numbered_passages=create_numbered_passages(passage)))

skip bad experiment 6
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 11
[0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 67
[0, 0, 0, 0, 0, 0, 0]
skip bad experiment 97
[0, 0, 0, 0, 0, 0]


In [36]:
def store_metadata(d, raw_scores, rank_indices, precision, api_calls, tokens, memory_size = 0):
    """Record one experiment's metrics in ``d``.

    With ``memory_size == 0`` the five metric keys are written directly into ``d``.
    Otherwise ``d[memory_size]`` is replaced by a fresh dict holding the same five keys.

    Args:
        d: Mutable mapping that receives the metrics (modified in place).
        raw_scores: Stored under ``'raw_scores'``.
        rank_indices: Stored under ``'rank_indices'``.
        precision: Stored under ``'precision'``.
        api_calls: Stored under ``'api_calls'``.
        tokens: Stored under ``'tokens'``.
        memory_size: ``0`` to write into ``d`` itself, any other value to nest under that key.
    """
    if memory_size == 0:
        target = d
    else:
        # Start from an empty dict so a previous entry for this memory size is discarded.
        target = d[memory_size] = {}

    target.update(
        rank_indices=rank_indices,
        precision=precision,
        api_calls=api_calls,
        tokens=tokens,
        raw_scores=raw_scores,
    )

In [None]:
from order_by.pointwise import PointwiseRelevanceKey


async def PointwiseRelevanceExperiment(client, data, prompt_template, modelname, output_type):
    """Score every passage of one query independently and rank the passages by score.

    Args:
        client: API client forwarded to ``PointwiseRelevanceKey.value``.
        data: One dataset record; reads ``data['query']``,
            ``data['passages']['passage_text']`` and ``data['passages']['is_selected']``.
        prompt_template: Prompt template handed to ``PointwiseRelevanceKey``.
        modelname: Model identifier forwarded to ``PointwiseRelevanceKey.value``.
        output_type: Forwarded to ``PointwiseRelevanceKey.value``
            (presumably the type the model output is parsed into; confirm in order_by.pointwise).

    Returns:
        ``None`` when no passage is marked as selected (callers must check before
        unpacking). Otherwise the tuple
        ``(total_api_calls, total_tokens, precision, rank_indices, raw_scores)`` where
        ``rank_indices`` holds 1-based passage numbers ordered best to worst.
    """
    # Imported locally so the function does not depend on `asyncio` leaking into the
    # notebook namespace through a wildcard import.
    import asyncio

    print(f"pointwise_sort {modelname}")
    query = data['query']
    passages = data['passages']['passage_text']
    selected = data['passages']['is_selected']

    if sum(selected) == 0:
        print("skip bad experiment")
        return

    async def compute_sort_key(query, passage):
        pwKey = PointwiseRelevanceKey(query, passage, prompt_template)
        return await pwKey.value(client, modelname, output_type)

    # Score all passages concurrently; gather returns results in passage order.
    tasks = [compute_sort_key(query, passage) for passage in passages]
    results = await asyncio.gather(*tasks)

    # Each result is unpacked as a 4-tuple: (score, api_calls, tokens, <unused>).
    raw_scores = np.array([s[0] for s in results])
    # Tied scores are frequent, so use a stable sort: ties keep their original passage
    # order and the ranking is reproducible across NumPy builds and CPUs.
    rank_indices = np.argsort(-raw_scores, kind="stable") + 1
    total_api_calls = sum(api_calls for _, api_calls, _, _ in results)
    total_tokens = sum(tokens for _, _, tokens, _ in results)
    accuracy = precision([selected], [rank_indices])
    print(f"raw score: {raw_scores}, ranked: {rank_indices}, precision {accuracy}")

    return total_api_calls, total_tokens, accuracy[0], rank_indices, raw_scores

In [57]:
output_type = float
results = {}

for modelname in ["gpt-4o-mini", "gpt-4o"]:
    results[modelname] = {}

    for i, d in enumerate(data):
        selected = d['passages']['is_selected']
        if sum(selected) == 0:
            print(f"skip bad experiment {i}")
            continue

        results[modelname][i] = {'pointwise_sort':{}, 'quick_sort':{}, 'heap_sort':{},\
                              'external_pointwise_sort':{}, 'external_bubble_sort':{},\
                              'external_merge_sort':{}}
    
        total_api_calls, total_tokens, p, rank_indices, raw_scores = await PointwiseRelevanceExperiment(client, d, pointwise_prompt_template, modelname, output_type)
        store_metadata(results[modelname][i]['pointwise_sort'], raw_scores.tolist(), rank_indices.tolist(), float(p), total_api_calls, total_tokens)



pointwise_sort gpt-4o-mini
raw score: [3. 3. 3. 5. 5. 4. 4.], ranked: [4 5 6 7 1 2 3], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [5. 5. 4. 5. 5. 5.], ranked: [1 2 4 5 6 3], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [4.  2.5 2.  3.  1.  1.  3.  4. ], ranked: [1 8 4 7 2 3 5 6], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [1. 5. 5. 5. 1. 5.], ranked: [2 3 4 6 1 5], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [5.  5.  1.  1.  3.  2.  4.5 5. ], ranked: [1 2 8 7 5 6 3 4], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [2.  5.  5.  4.5 5.  5.  5. ], ranked: [2 3 5 6 7 4 1], precision [0.]
skip bad experiment 6
skip bad experiment 7
pointwise_sort gpt-4o-mini
raw score: [3.5 2.  3.  3.  4.5 4.  3. ], ranked: [5 6 1 3 4 7 2], precision [1.]
pointwise_sort gpt-4o-mini
raw score: [4.5 4.  3.  4.  1.  2.  2. ], ranked: [1 2 4 3 6 7 5], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [4.  5.  5.  3.5 5.  4.  4.  2. ], ranked: [2 3 5 1 6 7 4 8], pre

In [58]:
# Number of evaluated queries, then the mean precision per model and algorithm.
print(len(results['gpt-4o']))
for modelname in ["gpt-4o-mini", "gpt-4o"]:
    per_query = results[modelname]
    for algo in ['pointwise_sort']:
    # for algo in ['pointwise_sort', 'quick_sort', 'heap_sort', 'external_pointwise_sort', 'external_bubble_sort', 'external_merge_sort']:
        # A generator expression keeps the loop variable local, so the builtin `iter`
        # is no longer shadowed in the notebook namespace.
        precision_total = sum(entry[algo]['precision'] for entry in per_query.values())
        # Report NaN instead of raising ZeroDivisionError when a model has no results.
        average_precision = precision_total / len(per_query) if per_query else float('nan')

        print(f'model: {modelname}, algo: {algo} average_precision: {average_precision}')

print(results)

95
model: gpt-4o-mini, algo: pointwise_sort average_precision: 0.29298245614035084
model: gpt-4o, algo: pointwise_sort average_precision: 0.3157894736842105
{'gpt-4o-mini': {0: {'pointwise_sort': {'rank_indices': [4, 5, 6, 7, 1, 2, 3], 'precision': 0.0, 'api_calls': 0, 'tokens': 2833, 'raw_scores': [3.0, 3.0, 3.0, 5.0, 5.0, 4.0, 4.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': {}, 'external_merge_sort': {}}, 1: {'pointwise_sort': {'rank_indices': [1, 2, 4, 5, 6, 3], 'precision': 0.0, 'api_calls': 0, 'tokens': 2027, 'raw_scores': [5.0, 5.0, 4.0, 5.0, 5.0, 5.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': {}, 'external_merge_sort': {}}, 2: {'pointwise_sort': {'rank_indices': [1, 8, 4, 7, 2, 3, 5, 6], 'precision': 0.0, 'api_calls': 0, 'tokens': 3548, 'raw_scores': [4.0, 2.5, 2.0, 3.0, 1.0, 1.0, 3.0, 4.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': 

In [None]:
# with open("result_pointwise.json", "w") as f:
#     json.dump(results, f, indent=2)