In [1]:
import json
import pandas as pd

In [2]:
def jsonl_to_json(jsonl_path, json_path):
    """Convert a JSON Lines file into a single pretty-printed JSON array.

    Args:
        jsonl_path: Path of the input file; every non-blank line is parsed as
            one JSON document.
        json_path: Path of the output file, written as UTF-8 with two-space
            indentation and non-ASCII characters left unescaped.
    """
    with open(jsonl_path, "r", encoding="utf-8") as src:
        # Whitespace-only lines carry no record and are skipped.
        records = [json.loads(raw_line) for raw_line in src if raw_line.strip()]

    with open(json_path, "w", encoding="utf-8") as dst:
        json.dump(records, dst, indent=2, ensure_ascii=False)

    print(f"Converted {jsonl_path} → {json_path}")


# Example usage: convert the JSONL model responses into a JSON array file.
# NOTE(review): the input name starts with "models_" while the output starts
# with "model_" — confirm the differing prefix is intentional.
jsonl_to_json("./data/models_responses_900.jsonl", "./data/model_responses_900.json")

Converted ./data/models_responses_900.jsonl → ./data/model_responses_900.json


In [14]:
import itertools
import os
from openai import OpenAI
from tqdm import tqdm

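# NOTE: `client` must be defined before run_judge() is called, otherwise the
# call fails with NameError on a fresh kernel. Do not hard-code a key here;
# create the client with the key supplied through the environment
# (e.g. `client = OpenAI()` with OPENAI_API_KEY set).
# client = OpenAI(api_key="")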

# System message for the judge model: describes the comparison task and
# restricts the reply to one of four bare labels. The instruction sentences
# are joined by single spaces; the label list and the closing rule each sit on
# their own line.
SYSTEM_PROMPT = "\n".join((
    " ".join((
        "You are an expert math evaluator.",
        "Compare two model answers for the SAME math question.",
        "Focus on mathematical correctness, sound reasoning, and clarity.",
        "Choose which answer is better.",
        "If both answers are equally good, output 'tie'.",
        "If both are clearly wrong or nonsensical, output 'both_bad'.",
        "Respond ONLY with one choice exactly from this set:",
    )),
    "model_a, model_b, tie, both_bad.",
    "Do NOT include any extra words or explanation.",
))

def build_user_prompt(question, ans_a, ans_b):
    """Assemble the user message shown to the judge.

    The message has four sections separated by blank lines: the question,
    answer A (tagged ``model_a``), answer B (tagged ``model_b``) and the
    closing instruction.

    Args:
        question: Text of the math question.
        ans_a: Answer presented as Answer A.
        ans_b: Answer presented as Answer B.

    Returns:
        The formatted prompt string.
    """
    sections = [
        f"Question:\n{question}",
        f"Answer A (model_a):\n{ans_a}",
        f"Answer B (model_b):\n{ans_b}",
        "Which answer is better for correctness and reasoning?",
    ]
    return "\n\n".join(sections)
    
# The only labels the judge is allowed to return.
_JUDGE_LABELS = frozenset({"model_a", "model_b", "tie", "both_bad"})


def run_judge(question, ans_a, ans_b):
    """Ask the LLM judge to compare two answers to the same question.

    Uses the module-level ``client``, ``SYSTEM_PROMPT`` and
    ``build_user_prompt``.

    Args:
        question: Text of the math question.
        ans_a: Answer presented to the judge as ``model_a``.
        ans_b: Answer presented to the judge as ``model_b``.

    Returns:
        One of ``"model_a"``, ``"model_b"``, ``"tie"`` or ``"both_bad"``.

    Raises:
        ValueError: If the reply is empty or is not one of the four labels
            after normalisation.
    """
    user_prompt = build_user_prompt(question, ans_a, ans_b)

    resp = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        max_tokens=5
    )

    # The message content may be None (no text returned); treat that as an
    # empty reply so it is reported as an invalid label instead of failing
    # with AttributeError on .strip().
    content = resp.choices[0].message.content
    text = (content or "").strip()

    # Accept harmless decoration around an otherwise valid label: different
    # letter case, surrounding quotes/backticks, or a trailing period.
    label = text.strip("\"'`.").lower()
    if label not in _JUDGE_LABELS:
        raise ValueError(f"LLM judge produced invalid label: {text}")

    return label


# ----------------------------------------------------------
# Main conversion: read JSON array → add judge_label → save JSON array
# ----------------------------------------------------------

def add_judge_labels(input_path, output_path):
    """Label every record of a JSON dataset with the LLM judge's verdict.

    The input file is read in full as one JSON array (not JSON Lines). Each
    record must provide the keys ``question``, ``answer_a`` and ``answer_b``;
    the verdict returned by ``run_judge`` is stored under ``judge_label`` and
    the whole array is written to ``output_path``.

    Nothing is written until every record has been judged, so an exception
    raised by ``run_judge`` part-way through leaves no output file.

    Args:
        input_path: Path of the JSON array to label.
        output_path: Path of the labelled JSON array to write.
    """
    # Load the whole JSON array
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in tqdm(data):
        item["judge_label"] = run_judge(
            item["question"], item["answer_a"], item["answer_b"]
        )

    # Write back as JSON array
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Saved judged dataset with judge_label to {output_path}")

  


# Example usage:
# add_judge_labels("arena_like_data.json", "arena_with_labels.json")

In [15]:
# Judge every record of the filtered arena file; add_judge_labels makes one
# judge call per record (the recorded run covered 894 records in about six
# minutes), so avoid re-running this cell unnecessarily.
add_judge_labels("./data/arena_140k_math_filtered.json", "./data/arena_math_900.json")

100%|██████████| 894/894 [06:01<00:00,  2.47it/s]

Saved judged dataset with judge_label to ./data/arena_math_900.json





In [20]:
# Measure how often the LLM judge's verdict matches the human label.
with open("./data/arena_math_900.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only records where the judge picked a winner (model_a / model_b) are scored.
# For those records any differing human_label, whatever its value, counts as
# a disagreement.
total = 0
num_agree = 0
for item in data:
    if item["judge_label"] in ["model_a", "model_b"]:
        if item["human_label"] == item["judge_label"]:
            num_agree += 1
        total += 1

# With no decisive verdicts the ratio is undefined; report NaN instead of
# failing with ZeroDivisionError.
agree_ratio = num_agree / total if total else float("nan")
print(f"LLM judge agreement with human labels = {agree_ratio:.2f}")

LLM judge agreement with human labels = 0.43


In [28]:
def combine_static_csvs(path_a, path_b, out_path=None):
    """Stack two result CSVs and group their rows by ``question_id``.

    Both files must have a ``question_id`` column. The rows are concatenated
    and sorted by ``question_id``; within one question, rows from ``path_a``
    come before rows from ``path_b``. A warning is printed when the two files
    do not list their question ids in the same order.

    Args:
        path_a: Path of the first CSV.
        path_b: Path of the second CSV.
        out_path: Optional destination; when given, the combined frame is
            also written there without the index.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index.
    """
    frame_a = pd.read_csv(path_a)
    frame_b = pd.read_csv(path_b)

    # Sanity check only: a mismatch is reported, never fatal.
    ids_a = frame_a["question_id"].tolist()
    ids_b = frame_b["question_id"].tolist()
    if ids_a != ids_b:
        print("Warning: question_id order differs between files. "
              "We'll still combine but you may want to inspect.")

    # A temporary "source" tag decides which file's rows come first inside
    # each question; it is removed again before returning.
    stacked = pd.concat(
        [frame_a.assign(source="A"), frame_b.assign(source="B")],
        ignore_index=True,
    )
    ordered = stacked.sort_values(["question_id", "source"]).reset_index(drop=True)
    combined = ordered.drop(columns=["source"])

    if out_path is None:
        return combined

    combined.to_csv(out_path, index=False)
    print(f"Saved combined csv to {out_path}")
    return combined


In [29]:
# Merge the two static result files, save the merged CSV and keep the frame
# for inspection.
combined_df = combine_static_csvs(
    "./data/static_1.csv",
    "./data/static_2.csv",
    out_path="./data/static_10_models.csv",
)

# Preview the first 20 rows of the merged frame.
print(combined_df.head(20))   # you should see 10 rows per question

Saved combined csv to ./data/static_10_models.csv
   question_id                                           question  \
0     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
1     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
2     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
3     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
4     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
5     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
6     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
7     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
8     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
9     GSM-8k_1  Janet’s ducks lay 16 eggs per day. She eats th...   
10   GSM-8k_10  Eliza's rate per hour for the first 40 hours s...   
11   GSM-8k_10  Eliza's rate per hour for the first 40 hours s...   
12   GSM-8k_10  Eliza's rate per hour for the first 4

In [27]:
# Replace one model's rows in the static results with a regenerated file.
# NOTE(review): this cell writes ./data/static_1.csv, which the combine cell
# reads, yet it appears after that cell in the notebook (In [27] vs In [28] and
# In [29]) — on a fresh top-to-bottom run it must execute before the combine step.

# 1. Load original static file (all models) and the new file (one model)
orig = pd.read_csv("./data/static_1_old.csv")
new  = pd.read_csv("./data/gemini_static.csv")

# 2. Figure out which model is being updated. The new file must hold exactly
#    one model: an empty file would otherwise fail with IndexError, and extra
#    models would be appended without their old rows being removed.
if new["model_name"].nunique(dropna=False) != 1:
    raise ValueError(
        "./data/gemini_static.csv must contain rows for exactly one model_name"
    )
model_to_replace = new["model_name"].iloc[0]

# 3. Remove old rows for this model from the original dataframe
orig_wo_model = orig[orig["model_name"] != model_to_replace].copy()

# 4. Make sure columns align (raises KeyError if the new file lacks a column)
new = new[orig.columns]

# 5. Add the updated rows and sort
updated = pd.concat([orig_wo_model, new], ignore_index=True)
updated = updated.sort_values(["question_id", "model_name"]).reset_index(drop=True)

# 6. Save to a new CSV (the old file static_1_old.csv is left untouched)
updated.to_csv("./data/static_1.csv", index=False)