In [3]:
import pandas as pd
import numpy as np

# ========= 1. Load the 500-pair CSV file =========
df_pairs = pd.read_csv("human_feedback_pairs (1).csv")

# Quick sanity check: which columns arrived and how many pairs were read.
print("输入列：", list(df_pairs.columns))
print("样本数：", df_pairs.shape[0])

# ========= 2. Use resume_a_sim_score / resume_b_sim_score =========
def decide_preference(row, margin=0.02, p_equal=0.2, rng=None):
    """Simulate a rater's preference between resume A and resume B.

    The decision is driven by ``diff = sim_a - sim_b``, where the two scores
    are read from ``row["resume_a_sim_score"]`` and
    ``row["resume_b_sim_score"]``:

    * ``diff > margin``   -> ``"A"``
    * ``diff < -margin``  -> ``"B"``
    * otherwise (near-tie) -> ``"Equal"`` with probability ``p_equal``,
      else the side with the higher score (``"A"`` on an exact tie)
    * either score is NaN -> ``"Equal"`` (no random number is drawn)

    Parameters
    ----------
    row : mapping
        Any object indexable by the two score keys (a DataFrame row, a dict).
    margin : float, default 0.02
        Score gap above which the higher-scoring resume always wins.
    p_equal : float, default 0.2
        Probability of answering ``"Equal"`` when the gap is within ``margin``.
    rng : object with a ``rand()`` method, optional
        Source of uniform random numbers. When omitted, a fresh *unseeded*
        ``np.random.RandomState`` is created on every call, so results are not
        reproducible; pass a seeded generator for reproducible output.

    Returns
    -------
    str
        One of ``"A"``, ``"B"`` or ``"Equal"``.
    """
    if rng is None:
        rng = np.random.RandomState()

    diff = row["resume_a_sim_score"] - row["resume_b_sim_score"]

    # NaN compares False against everything, so without this guard a missing
    # score would fall through every test below and be labelled "B".
    if np.isnan(diff):
        return "Equal"

    # Large similarity gap -> the higher-scoring resume wins outright.
    if diff > margin:
        return "A"
    if diff < -margin:
        return "B"

    # Scores are close -> a fraction of the answers are "Equal".
    if rng.rand() < p_equal:
        return "Equal"
    return "A" if diff >= 0 else "B"

# ========= 3. Decide choice_raw automatically =========
# One seeded generator is shared by every row, so the simulated choices are
# reproducible for a given row order.
rng = np.random.RandomState(42)
simulated_choices = []
for _row_label, pair_row in df_pairs.iterrows():
    simulated_choices.append(decide_preference(pair_row, rng=rng))
df_pairs["choice_raw"] = simulated_choices

# Build choice = A/B/EQ (here EQ is represented by "Equal").
def normalize_choice(c):
    """Return ``c`` unchanged when it is "A" or "B"; otherwise return "Equal"."""
    if c in ("A", "B"):
        return c
    return "Equal"

# Normalize every raw choice element-wise into the final `choice` column.
df_pairs["choice"] = df_pairs["choice_raw"].map(normalize_choice)

# ========= 4. similarity_ratio_b_over_a =========
# B's score divided by A's score; the 1e-8 added to the denominator keeps it
# non-zero when the A score is exactly 0.
df_pairs["similarity_ratio_b_over_a"] = df_pairs["resume_b_sim_score"].div(
    df_pairs["resume_a_sim_score"].add(1e-8)
)

# ========= 5. label_b_is_better =========
def label_b_better(row):
    """Turn ``row["choice"]`` into a soft label for "B is better".

    Returns 1.0 when the choice is "B", 0.0 when it is "A", and 0.5 for
    anything else (i.e. "Equal").
    """
    picked = row["choice"]
    return 1.0 if picked == "B" else (0.0 if picked == "A" else 0.5)

# Row-wise soft label: 1.0 = B preferred, 0.0 = A preferred, 0.5 = Equal.
df_pairs["label_b_is_better"] = df_pairs.apply(func=label_b_better, axis="columns")

# ========= 6. Add required fields =========
# Every row is attributed to the same simulated rater; `question` mirrors pair_id.
df_pairs["rater_id"] = "simulated_rater_1"
df_pairs["question"] = df_pairs["pair_id"]

# Rename columns to match the earlier human-feedback file.
df_pairs.rename(
    columns=dict(
        resume_a_group="candidate_a_group",
        resume_b_group="candidate_b_group",
        resume_a_name="candidate_a_label",
        resume_b_name="candidate_b_label",
        resume_a_sim_score="similarity_a",
        resume_b_sim_score="similarity_b",
    ),
    inplace=True,
)

# ========= 7. Select output columns =========
# Output column order, grouped by role: rating fields, texts/ids,
# similarity features, then candidate metadata.
cols_out = (
    ["rater_id", "question", "choice_raw", "pair_id", "choice", "label_b_is_better"]
    + ["jd_text", "resume_a_id", "resume_b_id", "resume_a_text", "resume_b_text"]
    + ["similarity_a", "similarity_b", "similarity_ratio_b_over_a"]
    + ["candidate_a_group", "candidate_b_group", "candidate_a_label", "candidate_b_label"]
)

# Independent copy so later edits to df_pairs do not affect the export frame.
df_clean = df_pairs.loc[:, cols_out].copy()

# ========= 8. Export final clean file =========
# Write without the index so the CSV holds only the selected columns.
df_clean.to_csv(path_or_buf="human_preferences_clean.csv", encoding="utf-8", index=False)
print("✅ 导出成功：human_preferences_clean.csv")
print("行数：", df_clean.shape[0])



输入列： ['pair_id', 'jd_id', 'jd_text', 'resume_a_id', 'resume_b_id', 'resume_a_text', 'resume_b_text', 'resume_a_name', 'resume_b_name', 'resume_a_group', 'resume_b_group', 'resume_a_sim_score', 'resume_b_sim_score', 'choice', 'label_b_is_better']
样本数： 500
✅ 导出成功：human_preferences_clean.csv
行数： 500


In [7]:
# Re-export the curated frame. Writing `df_pairs` to this path would overwrite
# the cleaned export with the working frame, which carries columns outside
# `cols_out` (e.g. jd_id) and does not follow the selected column order.
df_clean.to_csv("human_preferences_clean.csv", index=False, encoding="utf-8")
print("Saved: human_preferences_clean.csv")

Saved: human_preferences_clean.csv


In [9]:
from IPython.display import FileLink
# Leaving FileLink as the cell's last expression renders a clickable link to
# the exported CSV in the notebook output.
FileLink("human_preferences_clean.csv")