In [None]:

import json
from pathlib import Path

from itertools import combinations
from collections import defaultdict
from transformers import AutoTokenizer



In [None]:
# Load the tokenizer for Qwen3-4B-Thinking-2507. build_hypo_chain uses this
# notebook-level `tokenizer` to count tokens when trimming hypothesis chains.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Thinking-2507")
tokenizer  # bare expression: show the tokenizer's repr as the cell output

In [None]:
import ijson

def load_first_n_json(path, n):
    """Stream the first ``n`` elements of a top-level JSON array.

    The file is parsed incrementally with ``ijson`` so a very large array
    never has to be loaded as a whole.

    Args:
        path: Path of a JSON file whose top-level value is an array.
        n: Maximum number of array elements to return. Values <= 0 yield ``[]``.

    Returns:
        A list holding at most ``n`` parsed elements, in file order.
    """
    from itertools import islice

    # ijson parses bytes; handing it a text-mode file triggers a deprecation
    # warning and an extra str -> utf-8 re-encoding step, so open in binary mode.
    with open(path, "rb") as f:
        # islice stops right after the n-th element instead of parsing element
        # n + 1 only to discard it; max() keeps negative n from raising.
        return list(islice(ijson.items(f, "item"), max(n, 0)))

# Read only the first 10,000 records of the dataset (streamed, not fully parsed).
pairs = load_first_n_json(
    n=10000,
    path="/data/userdata/v-lijingyuan/dpo/final_data.json",
)
print(len(pairs))


In [None]:
# from collections import Counter

# winner_lens = [len(p["winner"]) for p in pairs]
# loser_lens = [len(p["loser"]) for p in pairs]

# print("Winner max length:", max(winner_lens))
# print("Loser max length:", max(loser_lens))

# # Length distribution (number of occurrences per length)
# winner_len_counter = Counter(winner_lens)
# loser_len_counter = Counter(loser_lens)

# print("Winner length distribution:", winner_len_counter)
# print("Loser length distribution:", loser_len_counter)

# # Visualization (histogram)
# import matplotlib.pyplot as plt

# plt.hist(winner_lens, bins=30, alpha=0.5, label="winner")
# plt.hist(loser_lens, bins=30, alpha=0.5, label="loser")
# plt.xlabel("Sequence length")
# plt.ylabel("Count")
# plt.title("Winner/Loser Length Distribution")
# plt.legend()
# plt.show()



In [None]:
import json

# Load the saved entries. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which may differ, e.g. on Windows).
with open("/data/userdata/v-lijingyuan/dpo/final_pairs.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} data points for inference.")

def get_parent_hypotheses(id_to_entry, entry):
    """
    Walk the ``parent_id`` chain upward (iteratively) starting from ``entry``.

    For every ancestor whose ``input['feedback_decision']`` is exactly ``True``
    the ancestor's ``input['hypothesis_chain']`` is collected.  The result is
    ordered nearest ancestor first.  The walk stops on a missing/falsy parent
    id, on an id that is not in ``id_to_entry``, or when an id repeats.
    """
    accepted = []
    seen_ids = set()  # protects against circular parent references
    cursor = entry['input'].get('parent_id')

    while True:
        if not cursor or cursor in seen_ids:
            return accepted
        seen_ids.add(cursor)

        ancestor = id_to_entry.get(cursor)
        if ancestor is None:
            # Parent id points at nothing we know about: end of the chain.
            return accepted

        fields = ancestor.get('input', {})
        if fields.get('feedback_decision') is True:
            accepted.append(fields.get('hypothesis_chain'))

        # Step one level further up.
        cursor = fields.get('parent_id')

# Keep only the entries that belong to the target competition.
entries = [
    record
    for record in data
    if record["input"].get("comptation_name") == "tweet-sentiment-extraction"
]
print(f"Filtered to {len(entries)} tweet-sentiment-extraction entries.")

# Index the entries by "<exp_name> <comptation_name> <loop_id>" so that
# parent ids can be resolved to their entries.
id_to_entry = {
    " ".join(
        str(record["input"][field])
        for field in ("exp_name", "comptation_name", "loop_id")
    ): record
    for record in entries
}

# Attach the list of accepted ancestor hypotheses to every entry.
for e in entries:
    parent_hyps = get_parent_hypotheses(entry=e, id_to_entry=id_to_entry)
    e.update(parent_hypotheses=parent_hyps)



In [None]:
# Preview only the first few entries; displaying the whole list floods the
# notebook output and bloats the saved file.
entries[:3]

In [None]:
#"/home/bowen/workspace/fine-tune/amlt_jsons_new/needed-killdeer.json"

In [None]:
# Toy inputs for trying out build_hypo_chain by hand.
max_tokens = 35  # deliberately tiny so the truncation is easy to observe

current_hyp = "Stack a transformer encoder on top of tabular embeddings and tune via SWA"

parent_hyps = [
    "Try logistic regression with standard scaling",
    "Switch to XGBoost with tuned learning rate",
    "Add cross-validation and early stopping",
]


In [None]:
# Demo call of build_hypo_chain with the toy inputs from the previous cell.
# NOTE(review): in this notebook build_hypo_chain is defined in the cell *below*,
# so on a fresh kernel this cell fails with NameError unless that definition
# cell has been executed first.
build_hypo_chain(parent_hyps = parent_hyps, current_hyp = current_hyp, max_tokens = max_tokens)

In [None]:
def build_hypo_chain(parent_hyps, current_hyp, max_tokens=9000, tok=None):
    """Join ancestor hypotheses and the current one into one bounded string.

    ``parent_hyps`` is taken nearest-ancestor-first and reversed, so the
    result reads oldest -> newest -> ``current_hyp``, separated by ``"->"``.
    While the text exceeds ``max_tokens`` tokens the oldest remaining ancestor
    is dropped.  If ``current_hyp`` alone is still too long, only its last
    ``max_tokens`` tokens are kept and decoded back to text.

    Args:
        parent_hyps: List of ancestor hypothesis strings, nearest ancestor first.
            The list itself is not modified.
        current_hyp: Hypothesis string of the current entry.
        max_tokens: Token budget for the returned text.  A budget <= 0 returns
            an empty string.
        tok: Optional tokenizer-like object: callable as
            ``tok(text, add_special_tokens=False)["input_ids"]`` and offering
            ``tok.decode(ids, skip_special_tokens=True)``.  Defaults to the
            notebook-level ``tokenizer``.

    Returns:
        The joined (and possibly truncated) hypothesis chain.
    """
    # A non-positive budget cannot hold any token.  Without this guard the
    # slice ``ids[-0:]`` below would keep *everything* instead of nothing.
    if max_tokens <= 0:
        return ""
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer

    sep = "->"
    chain = parent_hyps[::-1] + [current_hyp]  # oldest ancestor ... newest, then current

    while True:
        chain_text = sep.join(chain)
        input_ids = tok(chain_text, add_special_tokens=False)["input_ids"]

        # Fits into the budget: done.
        if len(input_ids) <= max_tokens:
            return chain_text

        # Too long: drop the oldest ancestor and measure again.
        if len(chain) > 1:
            chain.pop(0)
            continue

        # Only current_hyp is left and it is still too long:
        # keep its last max_tokens tokens.
        return tok.decode(input_ids[-max_tokens:], skip_special_tokens=True)

        
        
def get_parent_hypotheses(id_to_entry, entry):
    """Collect the hypotheses of accepted ancestors, nearest ancestor first.

    Follows ``entry['input']['parent_id']`` upward through ``id_to_entry``
    (iteratively) and keeps ``input['hypothesis']`` of every ancestor whose
    ``input['feedback_decision']`` equals ``True``.  The walk ends on a
    missing/falsy parent id, on an id that is absent from ``id_to_entry``, or
    when an id repeats.

    Note: an earlier cell of this notebook defines a function with the same
    name that collects ``hypothesis_chain`` instead; whichever definition was
    executed last is the one in effect.

    Args:
        id_to_entry: Mapping from entry id to entry dict (each with an
            ``'input'`` dict).
        entry: The entry whose ancestors are looked up.

    Returns:
        List of ancestor hypotheses, nearest ancestor first.

    Raises:
        KeyError: If a visited ancestor lacks ``'input'``,
            ``'feedback_decision'`` or (when accepted) ``'hypothesis'``.
    """
    hypotheses = []
    seen_ids = set()  # a parent_id cycle would otherwise loop forever
    parent_id = entry['input'].get('parent_id')
    while parent_id and parent_id not in seen_ids:
        seen_ids.add(parent_id)
        parent_entry = id_to_entry.get(parent_id)
        if not parent_entry:
            break
        parent_input = parent_entry['input']
        # `== True` is kept on purpose so that values equal to True (e.g. 1)
        # are still treated as accepted, exactly as before.
        if parent_input["feedback_decision"] == True:  # noqa: E712
            hypotheses.append(parent_input['hypothesis'])
        parent_id = parent_input.get('parent_id')
    return hypotheses


def get_parent_scores(id_to_entry, entry):
    """Collect the validation scores of accepted ancestors, nearest first.

    Follows ``entry['input']['parent_id']`` upward through ``id_to_entry``
    (iteratively) and keeps ``input['valid_score']`` of every ancestor whose
    ``input['feedback_decision']`` equals ``True``.  The walk ends on a
    missing/falsy parent id, on an id that is absent from ``id_to_entry``, or
    when an id repeats.

    Args:
        id_to_entry: Mapping from entry id to entry dict (each with an
            ``'input'`` dict).
        entry: The entry whose ancestors are looked up.

    Returns:
        List of ancestor ``valid_score`` values, nearest ancestor first.

    Raises:
        KeyError: If a visited ancestor lacks ``'input'``,
            ``'feedback_decision'`` or (when accepted) ``'valid_score'``.
    """
    scores = []
    seen_ids = set()  # a parent_id cycle would otherwise loop forever
    parent_id = entry['input'].get('parent_id')
    while parent_id and parent_id not in seen_ids:
        seen_ids.add(parent_id)
        parent_entry = id_to_entry.get(parent_id)
        if not parent_entry:
            break
        parent_input = parent_entry['input']
        # `== True` is kept on purpose so that values equal to True (e.g. 1)
        # are still treated as accepted, exactly as before.
        if parent_input["feedback_decision"] == True:  # noqa: E712
            scores.append(parent_input['valid_score'])
        parent_id = parent_input.get('parent_id')
    return scores


In [None]:
# Build DPO preference pairs ranked by *improvement over the parent*:
# for every experiment file, turn each finished loop into an entry, attach its
# hypothesis chain and the score of its nearest accepted ancestor, then pair up
# entries of the same competition.

# Value stored in ``parent_score`` when an entry has no accepted ancestor.
NO_PARENT_SCORE = 10000000

final_data = []   # preference pairs (winner/loser hypothesis chains), all experiments
final_pairs = []  # every per-loop entry, all experiments
amlt_json_path = Path("/home/bowen/workspace/fine-tune/amlt_jsons")

# Sort the files so the output order is reproducible (iterdir() order is
# arbitrary) and ignore anything that is not a .json experiment dump.
for exp_json in sorted(p for p in amlt_json_path.iterdir() if p.suffix == ".json"):
    with open(exp_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Pass 1: read each competition's metric direction from its "scenario"
    # record.  Doing this up front means loop records never inherit a direction
    # left over from a previous file and do not depend on key order in the file.
    direction_by_comp = {}
    for ids, loop_data in data.items():
        id_parts = ids.split(" ")
        if id_parts[-1] == "scenario":
            direction_by_comp[id_parts[1]] = int(loop_data['metric_direction'])

    # Pass 2: one entry per loop record that has hypothesis, feedback and code.
    required_keys = ("final_hypothesis", "feedback", "code")
    all_pairs = []
    for ids, loop_data in data.items():
        id_parts = ids.split(" ")
        if id_parts[-1] == "scenario" or not all(k in loop_data for k in required_keys):
            continue
        comptation_name = id_parts[1]
        if comptation_name not in direction_by_comp:
            print(f"Skipping {ids!r}: no scenario/metric_direction for it in {exp_json.name}")
            continue

        # Only the first metric listed under valid_score is used.
        first_metric = next(iter(loop_data["valid_score"].values()))
        all_pairs.append({
            "input": {
                "exp_name": exp_json.name.replace(".json", ""),
                "comptation_name": comptation_name,
                "bigger_is_better": direction_by_comp[comptation_name],
                "loop_id": int(id_parts[-1]),
                "hypothesis": loop_data["final_hypothesis"]["hypothesis"],
                "valid_score": first_metric.get("ensemble", None),
                "feedback_decision": loop_data["feedback"]['decision'],
                "parent_id": loop_data.get("parent_id", None),
            }
        })

    # Index entries by "<exp_name> <comptation_name> <loop_id>" for parent lookups.
    id_to_entry = {
        f"{e['input']['exp_name']} {e['input']['comptation_name']} {e['input']['loop_id']}": e
        for e in all_pairs
    }

    # Attach the hypothesis chain and the nearest accepted ancestor's score.
    for target_entry in all_pairs:
        target_input = target_entry['input']
        parent_hyps = get_parent_hypotheses(id_to_entry, target_entry)
        target_input['hypothesis_chain'] = build_hypo_chain(parent_hyps, target_input.get('hypothesis'))
        parent_scores = get_parent_scores(id_to_entry, target_entry)
        target_input['parent_score'] = parent_scores[0] if parent_scores else NO_PARENT_SCORE
    final_pairs.extend(all_pairs)

    # --- Step 1: group entries by competition name ---
    groups = defaultdict(list)
    for item in all_pairs:
        groups[item["input"]["comptation_name"]].append(item["input"])

    # --- Step 2: build the C(n, 2) preference pairs inside each competition ---
    preference_pairs = []
    for comp_name, items in groups.items():
        if len(items) < 2:
            continue  # a single entry cannot form a pair

        bigger_is_better = items[0]["bigger_is_better"]

        for a, b in combinations(items, 2):
            operands = (a["valid_score"], a["parent_score"], b["valid_score"], b["parent_score"])
            # A missing score cannot be compared; skip instead of raising TypeError.
            if any(value is None for value in operands):
                continue
            # An improvement is only defined when BOTH sides have an accepted parent.
            if NO_PARENT_SCORE in (a["parent_score"], b["parent_score"]):
                continue

            # Improvement of each entry over ITS OWN parent (b previously used
            # a's parent score, which cancelled the subtraction out).
            score_a = a["valid_score"] - a["parent_score"]
            score_b = b["valid_score"] - b["parent_score"]

            # Decide the preference; on a tie b is taken as the winner.
            if bigger_is_better == 1:
                winner, loser = (a, b) if score_a > score_b else (b, a)
            else:
                winner, loser = (a, b) if score_a < score_b else (b, a)

            preference_pairs.append({
                "comptation_name": comp_name,
                "loop_pair": (a["loop_id"], b["loop_id"]),
                "winner": winner["hypothesis_chain"],
                "loser": loser["hypothesis_chain"],
            })
    final_data.extend(preference_pairs)

In [None]:
# Number of preference pairs accumulated in final_data by the cell above.
len(final_data)

In [None]:
# Number of per-loop entries accumulated in final_pairs.
len(final_pairs)

In [None]:
import json

# Persist both result lists as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
for out_name, payload in (
    ("final_data_diff.json", final_data),
    ("final_pairs_diff.json", final_pairs),
):
    with open(out_name, mode="w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


    

In [None]:
cp -f /data/userdata/v-lijingyuan/dpo/final_data.json /home/bowen/workspace/fine-tune/custom/

In [None]:
# Alias the in-memory entries as `data`.
# NOTE(review): the next cell rebinds `data` inside its file loop (json.load), so
# this alias only matters for cells that read `data` when run out of order,
# such as the tweet-sentiment-extraction filtering cell earlier in the notebook.
data = final_pairs

In [None]:
# Build DPO preference pairs ranked by the raw validation score, with the
# current SOTA hypothesis appended to every hypothesis text.
final_data = []   # preference pairs, all experiments
final_pairs = []  # every per-loop entry, all experiments

# Sort the files so the output order is reproducible (iterdir() order is
# arbitrary) and ignore anything that is not a .json experiment dump.
for exp_json in sorted(p for p in amlt_json_path.iterdir() if p.suffix == ".json"):
    with open(exp_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Pass 1: read each competition's metric direction from its "scenario"
    # record, so loop records never inherit a direction left over from a
    # previous file and do not depend on key order inside the file.
    direction_by_comp = {}
    for ids, loop_data in data.items():
        id_parts = ids.split(" ")
        if id_parts[-1] == "scenario":
            direction_by_comp[id_parts[1]] = int(loop_data['metric_direction'])

    # Pass 2: one entry per loop record that has all required sections.
    required_keys = ("final_hypothesis", "feedback", "code", "sota_hypothesis")
    all_pairs = []
    for ids, loop_data in data.items():
        id_parts = ids.split(" ")
        if id_parts[-1] == "scenario" or not all(k in loop_data for k in required_keys):
            continue
        comptation_name = id_parts[1]
        if comptation_name not in direction_by_comp:
            print(f"Skipping {ids!r}: no scenario/metric_direction for it in {exp_json.name}")
            continue

        # Only the first metric listed under valid_score is used.
        first_metric = next(iter(loop_data["valid_score"].values()))
        all_pairs.append({
            "input": {
                "comptation_name": comptation_name,
                "bigger_is_better": direction_by_comp[comptation_name],
                "loop_id": int(id_parts[-1]),
                # Separator fixed: it used to read "currnet" and was glued to
                # the preceding hypothesis without a space.
                "hypothesis": (
                    loop_data["final_hypothesis"]["hypothesis"]
                    + " current sota hypothesis is "
                    + loop_data["sota_hypothesis"]["hypothesis"]
                ),
                # test_report is not among the required keys, so tolerate its absence.
                "test_report": (loop_data.get("test_report") or {}).get("score"),
                "valid_score": first_metric.get("ensemble", None),
                "feedback_decision": loop_data["feedback"]['decision'],
            }
        })
    final_pairs.extend(all_pairs)

    # --- Step 1: group entries by competition name ---
    groups = defaultdict(list)
    for item in all_pairs:
        groups[item["input"]["comptation_name"]].append(item["input"])

    # --- Step 2: build the C(n, 2) preference pairs inside each competition ---
    preference_pairs = []
    for comp_name, items in groups.items():
        if len(items) < 2:
            continue  # a single entry cannot form a pair

        bigger_is_better = items[0]["bigger_is_better"]

        for a, b in combinations(items, 2):
            score_a = a["valid_score"]
            score_b = b["valid_score"]
            # A missing score cannot be compared; skip instead of raising TypeError.
            if score_a is None or score_b is None:
                continue

            # Decide the preference; on a tie b is taken as the winner.
            if bigger_is_better == 1:
                winner, loser = (a, b) if score_a > score_b else (b, a)
            else:
                winner, loser = (a, b) if score_a < score_b else (b, a)

            preference_pairs.append({
                "comptation_name": comp_name,
                "loop_pair": (a["loop_id"], b["loop_id"]),
                "winner": winner["hypothesis"],
                "loser": loser["hypothesis"],
                "score_diff": abs(score_a - score_b),  # absolute score gap of the pair
            })
    final_data.extend(preference_pairs)