In [None]:
import json
from pathlib import Path

# Root directory of the run group that is scanned for journal.json files.
path = Path("/data/userdata/v-lijingyuan/JobAndExp/amlt_project/amlt/adjusted-phoenix")

# with open(path, "r") as f:
#     data = json.load(f)

# print(type(data))


In [None]:
import json
from collections import defaultdict
from itertools import combinations
import os

import re

def find_nearest_parent_with_metric(node_id, node2parent, node_dict):
    """Return the closest ancestor of ``node_id`` that carries a metric value.

    The search starts at the parent of ``node_id`` (the node itself is never
    returned) and follows ``node2parent`` links upward.

    Args:
        node_id: Id of the node whose ancestors are searched.
        node2parent: Mapping from a node id to its parent id. Ids without an
            entry end the walk.
        node_dict: Mapping from node id to the node dict.

    Returns:
        The first ancestor dict whose ``metric["value"]`` is not ``None``, or
        ``None`` when the chain ends, references an id that is missing from
        ``node_dict``, or loops back onto an id that was already visited.
    """
    seen = set()
    cur = node2parent.get(node_id)
    while cur is not None and cur not in seen:
        seen.add(cur)
        node = node_dict.get(cur)
        if node is None:
            return None
        # "metric" may be absent or explicitly null; ``or {}`` covers both,
        # whereas ``.get("metric", {})`` only covers the absent case.
        if (node.get("metric") or {}).get("value") is not None:
            return node
        cur = node2parent.get(cur)
    return None

# Run directories are named "<YYYYMMDD>-<HHMMSS>_<competition>".
_RUN_DIR_RE = re.compile(r"[0-9]{8}-[0-9]{6}_(.+)")


def infer_competition_from_path(path: str) -> str:
    """Extract the competition name from a journal path.

    Expected layout::

        .../20260107-174936_xxx/logs/logs/run/.../journal.json

    The first path component shaped like ``<YYYYMMDD>-<HHMMSS>_<name>`` is
    used and ``<name>`` is returned, so runs from any year are recognised
    rather than only those whose directory starts with a fixed year.

    Args:
        path: Journal path, as ``str`` or any ``os.PathLike``.

    Returns:
        Everything after the first underscore of the run directory name,
        e.g. ``"aerial-cactus-identification"``.

    Raises:
        ValueError: If no component of ``path`` looks like a run directory.
    """
    text = os.fspath(path)
    if os.altsep:
        # Accept both separators on platforms that have an alternative one.
        text = text.replace(os.altsep, os.sep)

    for part in text.split(os.sep):
        found = _RUN_DIR_RE.fullmatch(part)
        if found:
            # 20260107-174936_aerial-cactus-identification
            return found.group(1)

    raise ValueError(f"Cannot infer competition from path: {path}")

def build_metric_plan_chain(node_id, node2parent, node_dict):
    """Join the plans of ``node_id`` and its scored ancestors, oldest first.

    Starting at ``node_id`` itself, the ``node2parent`` links are followed
    upward. A node contributes its ``plan`` only when it has a non-``None``
    ``metric["value"]`` and a non-empty plan.

    Args:
        node_id: Id of the node at the end of the chain.
        node2parent: Mapping from a node id to its parent id.
        node_dict: Mapping from node id to the node dict.

    Returns:
        The collected plans joined with ``"->"`` in ancestor-to-descendant
        order, or ``"ROOT"`` when no plan was collected.
    """
    steps = []
    seen = set()
    cur = node_id

    while cur is not None and cur not in seen:
        seen.add(cur)
        node = node_dict.get(cur)
        if node is None:
            break

        # "metric" may be absent or explicitly null; ``or {}`` covers both.
        has_metric = (node.get("metric") or {}).get("value") is not None
        plan = node.get("plan")
        if has_metric and plan:
            steps.append(plan)

        cur = node2parent.get(cur)

    if not steps:
        return "ROOT"

    # Steps were gathered child-first; flip them so the chain reads root-first.
    return "->".join(reversed(steps))




def build_pairs_from_journal(path: str):
    """Turn one ``journal.json`` into pairwise plan preferences.

    Every node with a metric value is scored by its difference to the nearest
    ancestor that also has a metric value. All scored nodes are then compared
    pairwise and the one with the better difference becomes the winner.

    Args:
        path: Path to a ``journal.json`` file. The competition name is derived
            from this path with ``infer_competition_from_path``.

    Returns:
        A list of dicts with the keys ``agent``, ``comptation_name`` (spelling
        kept as-is because consumers read this exact key), ``winner``,
        ``loser`` and ``score_diff``. The list is empty when fewer than two
        nodes can be scored.

    Raises:
        ValueError: If the competition cannot be inferred from ``path``.
    """
    # -------- 1. competition --------
    comp_name = infer_competition_from_path(path)

    # -------- 2. load --------
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    nodes = data["nodes"]
    node2parent = data.get("node2parent", {})
    node_dict = {n["id"]: n for n in nodes}

    # -------- 3. collect valid nodes (with metric) --------
    items = []

    for node in nodes:
        # "metric" may be absent or explicitly null; ``or {}`` covers both.
        metric = node.get("metric") or {}
        cur_val = metric.get("value")
        if cur_val is None:
            continue

        parent_node = find_nearest_parent_with_metric(
            node["id"], node2parent, node_dict
        )
        if parent_node is None:
            # No scored ancestor to compare against, so no difference exists.
            continue

        parent_val = (parent_node.get("metric") or {}).get("value")
        if parent_val is None:
            continue

        items.append({
            "node_id": node["id"],
            "plan_chain": build_metric_plan_chain(
                node["id"], node2parent, node_dict
            ),
            "score_diff": float(cur_val) - float(parent_val),
            "maximize": metric.get("maximize", True),
        })

    if len(items) < 2:
        return []

    # -------- 4. pairwise preference (global) --------
    # The optimisation direction of the first scored node is applied to the
    # whole journal.
    direction = 1 if items[0]["maximize"] else -1

    pairs = []

    for a, b in combinations(items, 2):
        sa = direction * a["score_diff"]
        sb = direction * b["score_diff"]

        if sa == sb:
            # Ties carry no preference signal.
            continue

        winner, loser = (a, b) if sa > sb else (b, a)

        pairs.append({
            "agent": "ml-master",
            "comptation_name": comp_name,
            "winner": winner["plan_chain"],
            "loser": loser["plan_chain"],
            "score_diff": abs(winner["score_diff"] - loser["score_diff"]),
        })

    return pairs

import os


import os
import random
from collections import defaultdict

def find_journal_json(comp_dir: str) -> str | None:
    for root, _, files in os.walk(comp_dir):
        if "journal.json" in files:
            return os.path.join(root, "journal.json")
    return None

def build_pairs_from_run_group(
    run_group_dir: str,
    max_pairs_per_comp: int = 500,
    seed: int = 42,
):
    """Collect preference pairs from every run directory of a run group.

    Each sub-directory of ``run_group_dir`` is searched for a ``journal.json``
    which is converted with ``build_pairs_from_journal``. When a directory
    yields more than ``max_pairs_per_comp`` pairs, a random subset of that
    size is kept. Directories are processed in sorted order and sampling uses
    a private random generator, so the result is reproducible for a given
    ``seed`` and the global ``random`` state is left untouched.

    Args:
        run_group_dir: Directory whose sub-directories are scanned.
        max_pairs_per_comp: Upper bound on pairs kept per sub-directory.
        seed: Seed for the sub-sampling generator.

    Returns:
        One flat list with the pairs of all sub-directories. Progress and a
        summary are printed; a failing sub-directory is reported and skipped.
    """
    rng = random.Random(seed)

    comp2pairs = defaultdict(list)
    missing = []
    all_pairs = []
    for name in sorted(os.listdir(run_group_dir)):
        comp_dir = os.path.join(run_group_dir, name)
        if not os.path.isdir(comp_dir):
            continue

        journal_path = find_journal_json(comp_dir)
        if journal_path is None:
            # Record the gap explicitly so the summary count below is real.
            missing.append(name)
            print(f"[MISSING] {name}: no journal.json found")
            continue

        try:
            pairs = build_pairs_from_journal(journal_path)
            if len(pairs) > max_pairs_per_comp:
                pairs = rng.sample(pairs, max_pairs_per_comp)

            all_pairs.extend(pairs)

            # Group key taken from the directory name.
            # NOTE(review): for a name shaped "<timestamp>_<competition>" this
            # yields the part before the first underscore - confirm that is
            # the intended key; it only feeds the summary count below.
            comp_name = name.split("_")[0]
            comp2pairs[comp_name].extend(pairs)

            print(f"[OK] {name}: {len(pairs)} pairs")

        except Exception as e:
            # Best effort: one broken journal must not abort the whole scan.
            print(f"[ERROR] {name}: {e}")

    print("=" * 80)
    print(f"Total competitions processed: {len(comp2pairs)}")
    print(f"Total preference pairs: {len(all_pairs)}")
    print(f"Missing journal.json: {len(missing)}")

    return all_pairs


In [None]:
# Build the preference pairs for every run directory under `path`,
# using the function's default per-directory cap and seed.
all_pairs1 = build_pairs_from_run_group(path)


In [None]:
# Number of preference pairs collected across the run group.
len(all_pairs1)

In [None]:
# Competition label of every pair; the key is spelled "comptation_name"
# because that is the key build_pairs_from_journal writes.
na = [n["comptation_name"] for n in all_pairs1]

In [None]:
# Show the first pair to inspect the record layout.
all_pairs1[0]

In [None]:
import json

# Write the collected pairs as indented UTF-8 JSON, keeping non-ASCII text readable.
out_path = "/data/userdata/v-lijingyuan/dpo/ml-master1.json"

with open(out_path, mode="w", encoding="utf-8") as sink:
    json.dump(all_pairs1, sink, indent=2, ensure_ascii=False)

print(f"Saved {len(all_pairs1)} pairs to {out_path}")


In [None]:
import json

# Hard-coded per-competition results: number of evaluated pairs and the
# pairwise accuracy measured on them.
data = [
    {"competition": "aerial-cactus-identification", "num_pairs": 67, "pairwise_accuracy": 0.47761194029850745},
    {"competition": "aptos2019-blindness-detection", "num_pairs": 21, "pairwise_accuracy": 0.42857142857142855},
    {"competition": "denoising-dirty-documents", "num_pairs": 100, "pairwise_accuracy": 0.93},
    {"competition": "detecting-insults-in-social-commentary", "num_pairs": 91, "pairwise_accuracy": 0.4725274725274725},
    {"competition": "dog-breed-identification", "num_pairs": 1, "pairwise_accuracy": 1.0},
    {"competition": "mlsp-2013-birds", "num_pairs": 100, "pairwise_accuracy": 0.59},
    {"competition": "new-york-city-taxi-fare-prediction", "num_pairs": 61, "pairwise_accuracy": 0.5901639344262295},
    {"competition": "nomad2018-predict-transparent-conductors", "num_pairs": 100, "pairwise_accuracy": 0.65},
    {"competition": "plant-pathology-2020-fgvc7", "num_pairs": 73, "pairwise_accuracy": 0.4246575342465753},
    {"competition": "random-acts-of-pizza", "num_pairs": 100, "pairwise_accuracy": 0.66},
    {"competition": "siim-isic-melanoma-classification", "num_pairs": 1, "pairwise_accuracy": 1.0},
    {"competition": "spooky-author-identification", "num_pairs": 100, "pairwise_accuracy": 0.49},
    {"competition": "tabular-playground-series-may-2022", "num_pairs": 62, "pairwise_accuracy": 0.3870967741935484},
]

# Pair-weighted mean: each competition counts in proportion to its pair count.
pair_counts = [row["num_pairs"] for row in data]
total_pairs = sum(pair_counts)
weighted_correct = sum(
    count * row["pairwise_accuracy"] for count, row in zip(pair_counts, data)
)
global_accuracy = weighted_correct / total_pairs

print(f"Total pairs: {total_pairs}")
print(f"Global pairwise accuracy: {global_accuracy:.4f}")


In [None]:
# NOTE(review): unlabeled literal - presumably an accuracy figure recorded by
# hand; confirm where it comes from and label it.
0.5416

In [None]:
# 519 of 1011 is about 0.5134; kept as a plain expression so the cell
# displays the exact quotient instead of failing to parse.
519 / 1011