In [102]:
import json
import numpy as np
from openai import OpenAI, AsyncOpenAI
from order_by.sorting import *
from order_by.utils import *
import openai
from collections import defaultdict

# Read the OpenAI API key from a local file so the key itself never appears in the notebook.
with open(".open_ai_api_key") as f:
    api_key = f.read().strip()

# Record the SDK version in the cell output for reproducibility.
print(openai.__version__)

# The experiments await their API calls, so the asynchronous client is used.
# client = OpenAI(api_key=api_key)
client = AsyncOpenAI(api_key=api_key)


1.95.1


# Sample and save data

In [2]:
# import datasets

# data = datasets.load_dataset("ms_marco", "v1.1", split="train")
# sampled_data = data.shuffle(seed=42).select(range(100))

# data_list = [dict(item) for item in sampled_data]
# with open("msmarco_v1.1_sampled_100.json", "w") as f:
#     json.dump(data_list, f, indent=2)

# print("Saved to 'msmarco_v1.1_sampled_100.json'")

# Load Experiment Data

In [3]:
# Load the sampled MS MARCO records from the JSON file in the working directory.
with open("msmarco_v1.1_sampled_100.json") as f:
    data = json.loads(f.read())

# Prompt Templates

In [None]:
# TODO: many passages receive identical scores, so ties are common; decide how
# precision should be measured when scores tie.

# Scores a single passage against the question (one request per passage).
pointwise_prompt_template = (
    "You are given a question and a passage. "
    "Evaluate how well the passage answers the question by assigning a score from 0 to 5.\n"
    "Output only a float score.\n"
    "Question: {question}\n"
    "Passage: {passage}\n"
    "Score:"
)

# Scores every passage of a numbered list in one request.
external_pointwise_prompt_template = (
    "You are given a question and a list of passages. "
    "Evaluate how well each passage answers the question. "
    "For each passage, assign a float score from 0 to 5.\n"
    "Output a JSON list of float scores in the same order as the input passages.\n"
    "Question: {question}\n"
    "Passages:\n"
    "{numbered_passages}\n"
    "Score:"
)

# Compares two passages; the expected answer is 'A', 'B' or 'Equal'.
pairwise_comparison_prompt_template = (
    "You are given a question and two passages. "
    "Determine which passage answers the question better.\n"
    "Question: {question}\n"
    "Passage A: {passage_a}\n"
    "Passage B: {passage_b}\n"
    "Output only one word: 'A' if Passage A is better, 'B' if Passage B is better, "
    "or 'Equal' if both are equally good."
)

# Ranks a numbered list of passages in one request.
external_comparison_prompt_template = (
    "You are given a question and a list of passages. "
    "Rank the passages based on how well they answer the question, from best to worst.\n"
    "Question: {question}\n"
    "Passages:\n"
    "{numbered_passages}\n"
    "Output a JSON list of passage numbers in ranked order (best to worst)."
)

In [18]:
def create_numbered_passages(passages):
    """Render passages as a 1-based numbered list, one "N. text" entry per line."""
    numbered_lines = []
    for number, text in enumerate(passages, start=1):
        numbered_lines.append(f"{number}. {text}")
    return "\n".join(numbered_lines)

def precision(selected_ground_truth, output):
    """Compute precision@k for each ranked list, with k = number of relevant items in that row.

    precision@k = (#relevant items among the top k ranked items) / k

    Args:
        selected_ground_truth: 2-D array-like of 0/1 relevance flags, one row per
            query; column c holds the flag of passage number c + 1.
        output: 2-D array-like of 1-based passage numbers in ranked order (best
            first), one row per query.

    Returns:
        1-D float array with one precision@k value per row. A row without any
        relevant item (k == 0) scores 0.0 instead of dividing by zero.
    """
    relevance_flags = np.asarray(selected_ground_truth)
    k = np.sum(relevance_flags, axis=1)
    idxs = np.asarray(output) - 1  # 1-based passage numbers -> 0-based columns
    rows = np.arange(idxs.shape[0])[:, None]
    # Relevance flag of the passage sitting at each rank position.
    ranked_relevance = relevance_flags[rows, idxs]
    # Only the first k rank positions of each row count towards precision@k.
    in_top_k = np.arange(idxs.shape[1]) < k[:, None]
    hits = np.sum(in_top_k * ranked_relevance, axis=1)
    # `where=k > 0` leaves the pre-filled 0.0 in place for rows with no relevant item.
    return np.divide(hits, k, out=np.zeros(hits.shape, dtype=float), where=k > 0)

In [14]:
# Scan the dataset and report every record in which no passage is marked as selected.
for i, s in enumerate(data):
    query = s['query']
    passage = s['passages']['passage_text']
    selected = s['passages']['is_selected']
    if sum(selected) != 0:
        # Usable record. Uncomment to preview the four prompt styles for it:
        # print(pointwise_prompt_template.format(question=query, passage=passage[0]))
        # print(external_pointwise_prompt_template.format(question=query, numbered_passages=create_numbered_passages(passage)))
        # print(pairwise_comparison_prompt_template.format(question=query, passage_a=passage[0], passage_b=passage[1]))
        # print(external_comparison_prompt_template.format(question=query, numbered_passages=create_numbered_passages(passage)))
        continue

    print(f"skip bad experiment {i}")
    print(selected)

skip bad experiment 6
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 11
[0, 0, 0, 0, 0, 0, 0, 0, 0]
skip bad experiment 67
[0, 0, 0, 0, 0, 0, 0]
skip bad experiment 97
[0, 0, 0, 0, 0, 0]


In [36]:
def store_metadata(d, raw_scores, rank_indices, precision, api_calls, tokens, memory_size = 0):
    """Record one experiment's outcome in the results dict ``d``.

    With the default ``memory_size`` of 0 the five fields are written directly
    onto ``d``; otherwise ``d[memory_size]`` is replaced by a fresh dict that
    receives them.
    """
    if memory_size == 0:
        target = d
    else:
        target = d[memory_size] = {}

    fields = (
        ('rank_indices', rank_indices),
        ('precision', precision),
        ('api_calls', api_calls),
        ('tokens', tokens),
        ('raw_scores', raw_scores),
    )
    for field_name, field_value in fields:
        target[field_name] = field_value

In [None]:
from order_by.pointwise import PointwiseRelevanceKey


async def PointwiseRelevanceExperiment(client, data, prompt_template, modelname, output_type):
    """Score every passage of one record independently and rank the passages by score.

    Args:
        client: API client handed through to ``PointwiseRelevanceKey.value``.
        data: record with ``data['query']`` and ``data['passages']``, the latter
            holding the parallel lists ``'passage_text'`` and ``'is_selected'``.
        prompt_template: prompt template given to ``PointwiseRelevanceKey``.
        modelname: model identifier passed to ``PointwiseRelevanceKey.value``.
        output_type: output type passed to ``PointwiseRelevanceKey.value``.

    Returns:
        ``None`` when no passage is marked as selected; otherwise the tuple
        ``(total_api_calls, total_tokens, precision, rank_indices, raw_scores)``
        where ``rank_indices`` holds 1-based passage numbers, best first.
    """
    # Imported locally so this cell does not depend on the `import asyncio`
    # of a later cell having been executed first.
    import asyncio

    print(f"pointwise_sort {modelname}")
    query = data['query']
    passages = data['passages']['passage_text']
    selected = data['passages']['is_selected']

    if sum(selected) == 0:
        print("skip bad experiment")
        return

    async def compute_sort_key(query, passage):
        pwKey = PointwiseRelevanceKey(query, passage, prompt_template)
        return await pwKey.value(client, modelname, output_type)

    # Score all passages concurrently; gather keeps the results in passage order.
    tasks = [compute_sort_key(query, passage) for passage in passages]
    results = await asyncio.gather(*tasks)

    # Each result is unpacked as (score, api_calls, tokens, _); the fourth field is unused here.
    raw_scores = np.array([s[0] for s in results])
    # A stable sort keeps tied scores in their original passage order, so the
    # ranking is deterministic even when many passages share the same score.
    rank_indices = np.argsort(-raw_scores, kind="stable") + 1
    total_api_calls = sum(api_calls for _, api_calls, _, _ in results)
    total_tokens = sum(tokens for _, _, tokens, _ in results)
    accuracy = precision([selected], [rank_indices])
    print(f"raw score: {raw_scores}, ranked: {rank_indices}, precision {accuracy}")

    return total_api_calls, total_tokens, accuracy[0], rank_indices, raw_scores

<diskcache.core.Cache object at 0x118cbe060>


In [57]:
output_type = float
results = {}

for modelname in ["gpt-4o-mini", "gpt-4o"]:
    results[modelname] = {}

    for i, d in enumerate(data):
        selected = d['passages']['is_selected']
        if sum(selected) == 0:
            print(f"skip bad experiment {i}")
            continue

        results[modelname][i] = {'pointwise_sort':{}, 'quick_sort':{}, 'heap_sort':{},\
                              'external_pointwise_sort':{}, 'external_bubble_sort':{},\
                              'external_merge_sort':{}}
    
        total_api_calls, total_tokens, p, rank_indices, raw_scores = await PointwiseRelevanceExperiment(client, d, pointwise_prompt_template, modelname, output_type)
        store_metadata(results[modelname][i]['pointwise_sort'], raw_scores.tolist(), rank_indices.tolist(), float(p), total_api_calls, total_tokens)



pointwise_sort gpt-4o-mini
raw score: [3. 3. 3. 5. 5. 4. 4.], ranked: [4 5 6 7 1 2 3], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [5. 5. 4. 5. 5. 5.], ranked: [1 2 4 5 6 3], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [4.  2.5 2.  3.  1.  1.  3.  4. ], ranked: [1 8 4 7 2 3 5 6], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [1. 5. 5. 5. 1. 5.], ranked: [2 3 4 6 1 5], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [5.  5.  1.  1.  3.  2.  4.5 5. ], ranked: [1 2 8 7 5 6 3 4], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [2.  5.  5.  4.5 5.  5.  5. ], ranked: [2 3 5 6 7 4 1], precision [0.]
skip bad experiment 6
skip bad experiment 7
pointwise_sort gpt-4o-mini
raw score: [3.5 2.  3.  3.  4.5 4.  3. ], ranked: [5 6 1 3 4 7 2], precision [1.]
pointwise_sort gpt-4o-mini
raw score: [4.5 4.  3.  4.  1.  2.  2. ], ranked: [1 2 4 3 6 7 5], precision [0.]
pointwise_sort gpt-4o-mini
raw score: [4.  5.  5.  3.5 5.  4.  4.  2. ], ranked: [2 3 5 1 6 7 4 8], pre

In [58]:
# Report how many records were evaluated, then the mean precision per model and algorithm.
print(len(results['gpt-4o']))
for modelname in ["gpt-4o-mini", "gpt-4o"]:
    per_record = results[modelname]
    for algo in ['pointwise_sort']:
    # for algo in ['pointwise_sort', 'quick_sort', 'heap_sort', 'external_pointwise_sort', 'external_bubble_sort', 'external_merge_sort']:
        # The loop variable is deliberately not called `iter`: at notebook top level
        # that would shadow the built-in iter() for every later cell.
        precision_total = 0
        for record_id in per_record:
            precision_total += per_record[record_id][algo]['precision']

        # Guard against a model with no evaluated records instead of raising ZeroDivisionError.
        average_precision = precision_total / len(per_record) if per_record else float("nan")
        print(f'model: {modelname}, algo: {algo} average_precision: {average_precision}')

print(results)

95
model: gpt-4o-mini, algo: pointwise_sort average_precision: 0.29298245614035084
model: gpt-4o, algo: pointwise_sort average_precision: 0.3157894736842105
{'gpt-4o-mini': {0: {'pointwise_sort': {'rank_indices': [4, 5, 6, 7, 1, 2, 3], 'precision': 0.0, 'api_calls': 0, 'tokens': 2833, 'raw_scores': [3.0, 3.0, 3.0, 5.0, 5.0, 4.0, 4.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': {}, 'external_merge_sort': {}}, 1: {'pointwise_sort': {'rank_indices': [1, 2, 4, 5, 6, 3], 'precision': 0.0, 'api_calls': 0, 'tokens': 2027, 'raw_scores': [5.0, 5.0, 4.0, 5.0, 5.0, 5.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': {}, 'external_merge_sort': {}}, 2: {'pointwise_sort': {'rank_indices': [1, 8, 4, 7, 2, 3, 5, 6], 'precision': 0.0, 'api_calls': 0, 'tokens': 3548, 'raw_scores': [4.0, 2.5, 2.0, 3.0, 1.0, 1.0, 3.0, 4.0]}, 'quick_sort': {}, 'heap_sort': {}, 'external_pointwise_sort': {}, 'external_bubble_sort': 

In [None]:
# with open("result_pointwise.json", "w") as f:
#     json.dump(results, f, indent=2)

# Pairwise experiments

In [192]:
from typing import List, Callable, Tuple
import asyncio
import diskcache
import hashlib
from order_by.pair_comparison import ComparisonReasoning

# Module-level cache and client used by the pairwise experiments.
# `cache` is a diskcache.Cache rooted at ./pair_wise_cache; AsyncComparator
# reads and writes its comparison results through it.
# `client` rebinds the name created in the first cell to a new AsyncOpenAI
# client built from the same `api_key`.
cache = diskcache.Cache('./pair_wise_cache')
client = AsyncOpenAI(api_key=api_key)

# Define the comparator

In [250]:
def hash_text(text: str) -> str:
    """Return the 8-byte BLAKE2b digest of the UTF-8 encoded text as 16 hex characters."""
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

class AsyncComparator:
    """LLM-backed pairwise passage comparator whose verdicts are memoized in the global ``cache``.

    Cache entries are keyed by (experiment_id, model_id, hash of the first text,
    hash of the second text), so a stored verdict can also answer the mirrored
    comparison by swapping "A" and "B".
    """

    # Verdicts the prompt asks for; anything else is treated as an invalid answer.
    VALID_KEYS = ("A", "B", "Equal")
    # Number of API attempts before falling back to a random verdict.
    MAX_ATTEMPTS = 3

    def __init__(self, question, texts, experiment_id, model_id):
        self.question = question
        self.texts = texts
        self.experiment_id = experiment_id
        # Hash each text once; the hashes stand in for the texts inside cache keys.
        self.text_hashes = [hash_text(t) for t in texts]
        self.model_id = model_id

    def _make_key(self, i, j):
        """Build the cache key for comparing texts[i] (as Passage A) with texts[j] (as Passage B)."""
        h1 = self.text_hashes[i]
        h2 = self.text_hashes[j]
        return (self.experiment_id, self.model_id, h1, h2)

    def _cached_verdict(self, key):
        """Return ``(verdict, tokens)`` stored under ``key``, or ``None``.

        ``None`` is returned both when the key is absent and when the stored
        verdict is not one of ``VALID_KEYS``, so an invalid entry written by an
        earlier run is handled like a cache miss.
        """
        if key not in cache:
            return None
        cached = cache[key]
        parsed = ComparisonReasoning(**cached['parsed'])
        if parsed.key not in self.VALID_KEYS:
            return None
        return parsed.key, cached['tokens']

    async def compare_indices(self, i: int, j: int) -> Tuple[str, int, int]:
        """Compare texts[i] (Passage A) with texts[j] (Passage B).

        Returns:
            ``(verdict, api_calls, tokens)``. ``verdict`` is "A", "B" or "Equal".
            A cache hit reports 0 API calls together with the token count stored
            with the cached entry. If no valid verdict is obtained within
            ``MAX_ATTEMPTS`` calls, a random verdict is returned.
        """
        key = self._make_key(i, j)
        hit = self._cached_verdict(key)
        if hit is not None:
            verdict, cached_tokens = hit
            return verdict, 0, cached_tokens

        # The mirrored comparison answers this one once "A" and "B" are swapped.
        hit = self._cached_verdict(self._make_key(j, i))
        if hit is not None:
            verdict, cached_tokens = hit
            mirrored = "A" if verdict == "B" else "B" if verdict == "A" else "Equal"
            return mirrored, 0, cached_tokens

        # Format the prompt
        prompt = pairwise_comparison_prompt_template.format(
            question=self.question,
            passage_a=self.texts[i],
            passage_b=self.texts[j])

        api_call = 0
        total_tokens = 0
        while api_call < self.MAX_ATTEMPTS:
            api_call += 1
            try:
                response = await client.responses.parse(
                    model=self.model_id,
                    input=[
                        {"role": "system", "content": "You are a helpful agent. Think step by step. Output a JSON object."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.0,
                    text_format=ComparisonReasoning
                )
                parsed = response.output[0].content[0].parsed
                total_tokens += response.usage.total_tokens

                if parsed.key in self.VALID_KEYS:
                    # Cache only validated verdicts: an invalid answer must not be
                    # persisted and then served from the cache on later runs.
                    cache[key] = {
                        'parsed': parsed.model_dump(),
                        'tokens': total_tokens,
                    }
                    return parsed.key, api_call, total_tokens

                print(f"[WARN] Unexpected output: {parsed.key}; retrying...\n")

            except Exception as e:
                # Best effort: log the failure and retry until the attempts are used up.
                print(f"[ERROR] Attempt {api_call}: {e}. {self.experiment_id}, {self.model_id}")

        # After 3 failures or invalid results, return random fallback
        fallback = np.random.choice(list(self.VALID_KEYS))
        print(f"[FALLBACK] Returning {fallback} {self.experiment_id}, {self.model_id}")
        return fallback, api_call, total_tokens


# Define the sort algorithm

In [239]:
async def async_bubble_sort(
    question: str,
    texts: List[str],
    experiment_id: str,
    model_id: str,
) -> Tuple[List[int], int, int]:
    """Rank ``texts`` for ``question`` with a bubble sort driven by pairwise comparisons.

    Comparisons are awaited one at a time, because each one depends on the
    swaps made before it.

    Returns:
        sorted_indices (List[int]): indices sorted from best to worst (0-based)
        total_api_calls (int)
        total_tokens (int)
    """
    n = len(texts)
    total_api_calls = 0
    total_tokens = 0

    indices = list(range(n))
    comparator = AsyncComparator(question, texts, experiment_id, model_id)

    for i in range(n):
        # After pass i the last i + 1 positions are settled, so stop before them.
        for j in range(0, n - i - 1):
            result, api_calls, tokens = await comparator.compare_indices(indices[j], indices[j + 1])
            total_api_calls += api_calls
            total_tokens += tokens

            # Swap only when the second passage is strictly better. "Equal" leaves
            # the pair alone, so tied passages keep their relative order.
            if result == "B":
                indices[j], indices[j + 1] = indices[j + 1], indices[j]

    return indices, total_api_calls, total_tokens


async def async_quick_sort(
    question: str,
    texts: List[str],
    experiment_id: str,
    model_id: str,
) -> Tuple[List[int], int, int]:
    """Rank ``texts`` for ``question`` with a quick sort driven by pairwise comparisons.

    The first index of each partition is the pivot. All comparisons against the
    pivot are issued concurrently, and the two sub-partitions are sorted as
    separate tasks.

    Returns:
        sorted_indices (List[int]): indices sorted from best to worst (0-based)
        total_api_calls (int)
        total_tokens (int)
    """
    comparator = AsyncComparator(question, texts, experiment_id, model_id)

    async def _sort_partition(candidates: List[int]) -> Tuple[List[int], int, int]:
        # Zero or one candidate is already sorted and costs nothing.
        if len(candidates) < 2:
            return candidates, 0, 0

        pivot, rest = candidates[0], candidates[1:]

        # Each candidate plays "Passage A" against the pivot as "Passage B".
        verdicts = await asyncio.gather(
            *(comparator.compare_indices(idx, pivot) for idx in rest)
        )

        better, not_better = [], []
        pivot_calls, pivot_tokens = 0, 0
        for idx, verdict in zip(rest, verdicts):
            outcome, call_count, token_count = verdict
            pivot_calls += call_count
            pivot_tokens += token_count
            # Only a strict win ("A") ranks ahead of the pivot; ties and losses go after it.
            bucket = better if outcome == "A" else not_better
            bucket.append(idx)

        # Schedule both halves before awaiting either so they run concurrently.
        better_task = asyncio.create_task(_sort_partition(better))
        rest_task = asyncio.create_task(_sort_partition(not_better))
        better_sorted, better_calls, better_tokens = await better_task
        rest_sorted, rest_calls, rest_tokens = await rest_task

        ranked = better_sorted + [pivot] + rest_sorted
        return (
            ranked,
            pivot_calls + better_calls + rest_calls,
            pivot_tokens + better_tokens + rest_tokens,
        )

    return await _sort_partition(list(range(len(texts))))


In [249]:
for i, d in enumerate(data):
    print(d['query'])
    for p in d['passages']['passage_text']:
        print(p)
    print(d)
    resb = await async_bubble_sort(d['query'], d['passages']['passage_text'], i, "gpt-4o-mini")
    resq = await async_quick_sort(d['query'], d['passages']['passage_text'], i, "gpt-4o-mini")
    print(resb, resq)
    break

what makes a sore throat sore
A sore throat refers to pain, itchiness, or irritation of the throat. You may have difficulty swallowing food and liquids, and the pain may get worse when you try to swallow. A bacterial infection can also cause a sore throat. These types of infections include: 1  strep throat: inflammation of the throat caused by the Streptococcal bacteria. 2  diphtheria: infectious disease that causes throat inflammation.
Viruses cause most sore throats, but bacteria, such as streptococcus, which causes strep throat, may be the cause of a sore throat. An antibiotic can help get rid of the bacteria and provide relief. If your sore throat isn’t gone within a week, or if it gets worse and leads to a fever, you may need a prescription. 
Feel better fast. by Amanda MacMillan. A sore throat can be the first sign of a cold, a side effect of strained vocal cords, or an indication of something more serious (like strep throat). Regardless of the cause, your immediate concern when 

# Define experiments

In [None]:
async def _run_pairwise_sort_experiment(label, sort_fn, i, data, modelname):
    """Shared body of the pairwise sort experiments.

    Args:
        label: text printed in front of the model name and record index.
        sort_fn: coroutine function called as ``sort_fn(question, passages, i,
            modelname)`` and returning ``(indices, total_api_calls, total_tokens)``
            with 0-based indices ordered best to worst.
        i: record index, also passed to ``sort_fn`` as the experiment id.
        data: record with ``data['query']`` and ``data['passages']``, the latter
            holding the parallel lists ``'passage_text'`` and ``'is_selected'``.
        modelname: model identifier passed to ``sort_fn``.

    Returns:
        ``None`` when no passage is marked as selected; otherwise
        ``(total_api_calls, total_tokens, precision, indices)`` where ``indices``
        is a numpy array of 1-based passage numbers, best first.
    """
    print(f"{label} {modelname} {i}")
    question = data['query']
    passages = data['passages']['passage_text']
    selected = data['passages']['is_selected']

    if sum(selected) == 0:
        print("skip bad experiment")
        return None

    indices, total_api_calls, total_tokens = await sort_fn(question, passages, i, modelname)
    # precision() expects 1-based passage numbers.
    indices = np.array(indices) + 1
    accuracy = precision([selected], [indices])

    return total_api_calls, total_tokens, accuracy[0], indices


async def BubbleSortExperiment(i, data, modelname):
    """Rank one record's passages with ``async_bubble_sort`` and score the ranking.

    Returns ``None`` for a record without any selected passage, otherwise
    ``(total_api_calls, total_tokens, precision, indices)``.
    """
    return await _run_pairwise_sort_experiment("bubble sort", async_bubble_sort, i, data, modelname)


async def QuickSortExperiment(i, data, modelname):
    """Rank one record's passages with ``async_quick_sort`` and score the ranking.

    Returns ``None`` for a record without any selected passage, otherwise
    ``(total_api_calls, total_tokens, precision, indices)``.
    """
    return await _run_pairwise_sort_experiment("quick sort", async_quick_sort, i, data, modelname)

In [253]:
# output_type = float
# results = {}
def store_metadata(d, raw_scores, rank_indices, precision, api_calls, tokens, memory_size = 0):
    """Record one experiment's outcome in the results dict ``d``.

    With the default ``memory_size`` of 0 the five fields are written directly
    onto ``d``; otherwise ``d[memory_size]`` is replaced by a fresh dict that
    receives them.
    """
    if memory_size == 0:
        target = d
    else:
        target = d[memory_size] = {}

    fields = (
        ('rank_indices', rank_indices),
        ('precision', precision),
        ('api_calls', api_calls),
        ('tokens', tokens),
        ('raw_scores', raw_scores),
    )
    for field_name, field_value in fields:
        target[field_name] = field_value

for modelname in ["gpt-4o-mini", "gpt-4o"]:
    results[modelname] = {}

    for i, d in enumerate(data):
        selected = d['passages']['is_selected']
        if sum(selected) == 0:
            print(f"skip bad experiment {i}")
            continue

        results[modelname][i] = {'bubble_sort':{}, 'quick_sort':{}}
    
        total_api_calls, total_tokens, pr, rank_indices = await BubbleSortExperiment(i, d, modelname)
        store_metadata(results[modelname][i]['bubble_sort'], [], rank_indices.tolist(), float(pr), total_api_calls, total_tokens)

        total_api_calls, total_tokens, pr, rank_indices = await QuickSortExperiment(i, d, modelname)
        store_metadata(results[modelname][i]['quick_sort'], [], rank_indices.tolist(), float(pr), total_api_calls, total_tokens)
        break

bubble sort gpt-4o-mini 0
quick sort gpt-4o-mini 0
[7 4 5 2 6 1 3]
bubble sort gpt-4o 0
quick sort gpt-4o 0
[5 4 7 6 2 1 3]
