In [None]:
import json
# Load the raw pair records. The encoding is pinned to UTF-8 so the read does
# not depend on the platform default; the hypothesis chains contain non-ASCII
# characters and every other JSON read/write in this notebook already uses UTF-8.
with open("/data/userdata/v-lijingyuan/dpo/final_pairs_diff_1.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [None]:

from collections import defaultdict

def build_teacher_chain_topk(data, k=2):
    """Select the ``k`` best-scoring hypothesis chains per (experiment, competition).

    Args:
        data: iterable of records. Each record's ``"input"`` dict must provide
            ``exp_name``, ``comptation_name``, ``bigger_is_better``,
            ``valid_score`` and ``hypothesis_chain``.
        k: maximum number of entries kept per group.

    Returns:
        dict mapping ``(exp_name, comptation_name)`` to a list, best first, of
        dicts with the keys ``comptation_name``, ``hypothesis_chain`` and
        ``score``.

    Entries whose ``valid_score`` is ``None`` or NaN are ignored: ``None``
    cannot be compared during sorting and NaN makes the resulting order
    unreliable, so neither can be ranked as a "teacher" chain. A group with no
    usable score maps to an empty list.
    """
    import math

    def _has_usable_score(entry):
        # True when the entry's score can take part in an ordering.
        score = entry["valid_score"]
        if score is None:
            return False
        return not (isinstance(score, float) and math.isnan(score))

    groups = defaultdict(list)

    # Group by (exp_name, comptation_name).
    for item in data:
        inp = item["input"]
        key = (inp["exp_name"], inp["comptation_name"])
        groups[key].append(inp)

    teacher_chain = {}

    for key, items in groups.items():
        # The metric direction is read from the first record of the group.
        bigger_is_better = items[0]["bigger_is_better"]

        rankable = [x for x in items if _has_usable_score(x)]
        # Descending when a larger score is better, ascending otherwise.
        sorted_items = sorted(
            rankable,
            key=lambda x: x["valid_score"],
            reverse=(bigger_is_better == 1),
        )

        teacher_chain[key] = [
            {
                "comptation_name": x["comptation_name"],
                "hypothesis_chain": x["hypothesis_chain"],
                "score": x["valid_score"]
            }
            for x in sorted_items[:k]
        ]
    return teacher_chain


In [35]:
# Keep the three best chains of every group, then merge the per-group lists
# into one flat list and persist it as JSON.
teacher = build_teacher_chain_topk(data, k=3)
flat_list = [entry for group in teacher.values() for entry in group]

with open("teacher_chain_top3.json", "w", encoding="utf-8") as out_file:
    json.dump(flat_list, out_file, ensure_ascii=False, indent=2)

In [43]:
# Competitions whose chains are selected for rewriting (membership is checked
# against each record's "comptation_name").
evo = [
    "cassava-leaf-disease-classification",
    "h-and-m-personalized-fashion-recommendations",
    "jigsaw-toxic-comment-classification-challenge",
    "leaf-classification",
    "tweet-sentiment-extraction",
    "us-patent-phrase-to-phrase-matching",
    "whale-categorization-playground",
    "learning-agency-lab-automated-essay-scoring-2",
    "aptos2019-blindness-detection",
    "kuzushiji-recognition",
    "herbarium-2020-fgvc7",
    "text-normalization-challenge-russian-language",
    "rsna-miccai-brain-tumor-radiogenomic-classification",
    "freesound-audio-tagging-2019",
    "mlsp-2013-birds",
    "spooky-author-identification",
    "hubmap-kidney-segmentation",
]

In [38]:
# Display the first flattened record to check its keys and value types.
flat_list[0]

{'comptation_name': 'iwildcam-2020-fgvc7',
 'hypothesis_chain': 'Exploit the known test label distribution: after computing logits for all test images, fit a per-class additive bias vector b by minimizing L(b) = sum_i logsumexp(logits_i + b) - counts^T b so that the summed softmax(logits_i + b) matches the exact per-class counts given in iwildcam2020_test_information.json. Then apply the allowed-class mask and take argmax on logits + b for the final predictions.->Disable horizontal-flip TTA at inference and enable model EMA (decay≈0.999, warmup≈100 steps) during training; rely on the existing test-count calibration to preserve accuracy while reducing inference time by ~45–50%.',
 'score': 0.8131679389312977}

In [39]:
# Mapping used later as comp_to_scen[competition_name] to obtain the scenario
# text that is inserted into the rewrite prompt and the reward-model prompt.
with open("/data/userdata/v-lijingyuan/dpo/comp_to_scen.json", "r", encoding="utf-8") as scen_file:
    comp_to_scen = json.load(scen_file)

In [None]:
# Prompt asking an LLM to rewrite a "->"-delimited optimization chain.
# Filled with str.format(); the only placeholders are {SCENARIO} and {CHAIN}.
prompt_template = """
You are a data science expert and Kaggle Grandmaster.

Below is a competition scenario description:
{SCENARIO}

Below is an optimization chain derived from iterative feedback:
{CHAIN}

Notes:
- The optimization chain is composed of multiple reasoning steps.
- Each step is separated by the delimiter "->".
- A step may be a diagnosis, feedback, or hypothesis derived from iterative refinement.

Your tasks:
1. Rewrite the optimization chain by removing exactly one of the reasoning steps.
2. Optionally insert a new hypothesis that is logically consistent with the competition scenario.
3. Maintain the original "->" delimiter structure.
4. Ensure the rewritten chain is coherent, logically sound, and preserves the intended optimization flow.
5. Do NOT remove all steps; keep the chain meaningful.

Output:
- A single rewritten optimization chain using "->" as the delimiter.
""".strip()


In [1]:
import os
import json
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


class RewardModelInference(nn.Module):
    """Inference wrapper: a LoRA-adapted causal LM followed by a linear reward head.

    The base model is loaded with ``AutoModelForCausalLM``, wrapped with the
    PEFT adapter found at ``adapter_path``, and a ``nn.Linear(hidden_size, 1)``
    head is restored from ``reward_head_path``. The reward of a sequence is the
    head applied to the final-layer hidden state of its last non-padding token.
    """

    def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda"):
        """Load the base model, the adapter and the reward head onto ``device``.

        Args:
            base_model_name: model id or path passed to ``from_pretrained``.
            adapter_path: directory of the PEFT adapter.
            reward_head_path: file holding the reward head ``state_dict``.
            device: device that hosts the model and the head.
        """
        super().__init__()
        self.device = device
        self.base = AutoModelForCausalLM.from_pretrained(base_model_name)
        self.base = PeftModel.from_pretrained(self.base, adapter_path)
        # Gradient checkpointing is deliberately not enabled: this class only
        # runs forward passes for scoring, so there is no backward pass whose
        # activations would need to be recomputed.
        if hasattr(self.base.config, "use_cache"):
            self.base.config.use_cache = False
        hs = getattr(self.base.config, "hidden_size",
                     getattr(self.base.config, "n_embd",
                     getattr(self.base.config, "d_model", None)))
        if hs is None:
            hs = self.base.get_input_embeddings().embedding_dim

        # Keep the backbone on the same device as the head so the wrapper is
        # usable without a separate .cuda()/.to() call by the caller.
        self.base.to(device)

        self.reward_head = nn.Linear(hs, 1).to(device)
        # weights_only=True restricts unpickling to tensors/containers, which is
        # all a Linear state_dict contains.
        self.reward_head.load_state_dict(
            torch.load(reward_head_path, map_location=device, weights_only=True)
        )

    @staticmethod
    def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
        """Return the hidden state of the last attended token of every row.

        Args:
            last_hidden: tensor indexed as (batch, position, hidden).
            attn_mask: tensor indexed as (batch, position); non-zero marks real tokens.

        Returns:
            Tensor of shape (batch, hidden).

        The index is the highest position whose mask is set, so the result is
        correct for right-padded and left-padded batches alike. A row with an
        empty mask falls back to position 0.
        """
        positions = torch.arange(attn_mask.size(1), device=attn_mask.device)
        # mask * position is maximal at the last attended position.
        last_idx = (attn_mask.to(torch.long) * positions).argmax(dim=1)
        # gather() needs the index on the same device as the source tensor.
        last_idx = last_idx.to(last_hidden.device)
        idx = last_idx.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1))
        return last_hidden.gather(1, idx).squeeze(1)

    def forward(self, input_ids, attention_mask):
        """Compute one raw reward per input row.

        Args:
            input_ids: token id tensor, (batch, position).
            attention_mask: matching attention mask.

        Returns:
            1-D tensor with one reward value per row.
        """
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        out = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            use_cache=False
        )
        last_hidden = out.hidden_states[-1]
        pooled = self.pool_last_nonpad(last_hidden, attention_mask)
        reward = self.reward_head(pooled).squeeze(-1)
        return reward

    def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, device="cuda"):
        """Score hypothesis chains and return ``exp(reward)`` for each one.

        Args:
            texts: a single chain (``str``) or an iterable of chains. A bare
                string is treated as one chain rather than iterated character
                by character.
            tokenizer: callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors.
            comp_description: competition description placed in every prompt.
            system_prompt: optional prompt header. When given it is stored on
                the instance and reused by later calls; otherwise a built-in
                default is stored on first use.
            device: device the tokenized batch is moved to.

        Returns:
            list of floats, one per input chain (empty for empty input).
        """
        if system_prompt is not None:
            self.system_prompt = system_prompt
        elif not hasattr(self, "system_prompt"):
            self.system_prompt = (
                "You are a senior data science competition judge and solution expert.\n"
                "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n"
                "A hypothesis chain shows iterative improvement of solutions.\n"
                "You should assess:\n"
                "1) reasoning correctness and consistency across steps,\n"
                "2) improvement and refinement through the chain,\n"
                "3) final hypothesis quality and practicality.\n"
                "Be strict and fair. Provide expert-level insight."
            )

        # Normalize the input to a list of chains.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)
        if not texts:
            return []

        inputs = []
        for s in texts:
            prompt = (
                f"{self.system_prompt}\n\n"
                f"Competition description:\n{comp_description}\n\n"
                "Hypothesis Chain (each step separated by '->'):\n"
                f"{s}\n\n"
                "<think>\n"
                "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n"
                "Focus on clarity, correctness, and improvement.\n"
                "Make sure to consider the chain direction from earliest to latest.\n"
                "</think>\n\n"
                "Final Evaluation:\n"
            )

            inputs.append(prompt)

        enc = tokenizer(
            inputs,
            truncation=True,
            padding=True,
            max_length=2300,
            return_tensors="pt"
        )

        enc = {k: v.to(device) for k, v in enc.items()}

        # Scoring only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            rewards = self.forward(enc["input_ids"], enc["attention_mask"])

        return torch.exp(rewards).cpu().tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
import torch
import os

# Locations of the trained reward-model artifacts.
logdir = "/data/userdata/v-lijingyuan/last_run_5"
base_model = "Qwen/Qwen3-0.6B"
adapter_path, reward_head_path = (
    os.path.join(logdir, artifact) for artifact in ("lora_adapter", "reward_head.pt")
)

# Padding needs a pad token; fall back to EOS when the tokenizer has none.
tokenizer = AutoTokenizer.from_pretrained(base_model)
if not getattr(tokenizer, "pad_token", None):
    tokenizer.pad_token = tokenizer.eos_token

model = RewardModelInference(base_model, adapter_path, reward_head_path)

# --- multi-GPU inference ---
# NOTE(review): DataParallel only dispatches __call__/forward; custom methods
# such as compute_reward remain reachable through `model.module`.
model = torch.nn.DataParallel(model).cuda()   # wrap, then move to the GPUs
model.eval()


  adapters_weights = torch.load(filename, map_location=torch.device(device))
  self.reward_head.load_state_dict(torch.load(reward_head_path, map_location=device))


DataParallel(
  (module): RewardModelInference(
    (base): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): Qwen3ForCausalLM(
          (model): Qwen3Model(
            (embed_tokens): Embedding(151936, 1024)
            (layers): ModuleList(
              (0-27): 28 x Qwen3DecoderLayer(
                (self_attn): Qwen3Attention(
                  (q_proj): Linear(
                    in_features=1024, out_features=2048, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterD

In [5]:
from dotenv import load_dotenv
# Populate os.environ from the RD-Agent .env file (presumably where
# OPENAI_API_KEY, read in a later cell, comes from — verify).
load_dotenv("/data/userdata/v-lijingyuan/RD-Agent-fix-mcts/RD-Agent/.env")

True

In [4]:
from rdagent.utils.agent.tpl import T
from rdagent.oai.llm_utils import APIBackend, md5_hash
from typing import Any, Dict, List, Optional, Tuple
import os 

ValidationError: 1 validation error for LLMSettings
chat_temperature
  Input should be a valid number, unable to parse string as a number [type=float_parsing, input_value='1  # o1 have to set CHAT...EMPERATURE=1 in litellm', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/float_parsing

In [6]:
# Report only whether the key is present; the key itself is never printed.
api_key = os.getenv("OPENAI_API_KEY")
key_present = bool(api_key)
print("OPENAI_API_KEY loaded:", key_present)

OPENAI_API_KEY loaded: True


In [7]:
sys_prompt = '\n\nYou are an expert mathematician and problem solver. \nYou will be given a **mathematics problem**.\nQuestion: Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).\n\nInstructions:\n- Carefully read the problem and reason through it step by step.\n- Show your full reasoning in detail (chain-of-thought), explaining each step clearly.\n- After completing the reasoning, give the **final numerical or exact answer**.\n- - Return a JSON dictionary:\n{\n "reasoning": "...", # your step-by-step reasoning here\n "answer": "..." # the final answer\n}\n'

# Smoke test of the LLM backend: the whole request is sent as the system
# prompt and the user prompt is left empty.
response = APIBackend().build_messages_and_create_chat_completion(
            system_prompt=sys_prompt,
            user_prompt = '',
        )

NameError: name 'APIBackend' is not defined

In [None]:
final_teacher_chain = []

MAX_TRIALS = 10           # maximum number of rewrite attempts per chain
THRESHOLD = 0.001         # minimum reward gain required to accept a rewrite

# compute_reward is defined on RewardModelInference. When `model` has been
# wrapped in DataParallel the method is only reachable through `.module`.
reward_model = getattr(model, "module", model)
evo_competitions = set(evo)


def _score_chain(chain_text, scenario_text):
    """Return the scalar reward of a single chain.

    The chain is passed as a one-element list (compute_reward returns one
    value per list entry) and the single value is unpacked, so scores can be
    compared numerically.
    """
    with torch.no_grad():
        return reward_model.compute_reward([chain_text], tokenizer, scenario_text)[0]


for items in flat_list:

    comp = items["comptation_name"]
    chain = items["hypothesis_chain"]

    # Only chains of the selected competitions are rewritten.
    if comp not in evo_competitions:
        continue

    scenario = comp_to_scen[comp]

    # Reward of the original chain; every rewrite is compared against it.
    base_score = _score_chain(chain, scenario)

    # Chain that the next rewrite attempt starts from.
    current_chain = chain
    improved_chain = None

    # Try several rounds of generation.
    for _ in range(MAX_TRIALS):

        prompt = prompt_template.format(
            SCENARIO=scenario,
            CHAIN=current_chain
        )

        # Ask the LLM to rewrite the chain (the key step).
        new_chain = APIBackend().build_messages_and_create_chat_completion(
                    system_prompt=prompt,
                    user_prompt = '',
                )

        # Score the rewritten chain.
        new_score = _score_chain(new_chain, scenario)

        # Keep the rewrite as soon as the reward improves by more than THRESHOLD.
        if new_score - base_score > THRESHOLD:
            improved_chain = new_chain
            break

        # Otherwise keep trying, feeding the latest rewrite into the next round.
        current_chain = new_chain

    # Store the result (without an improvement, the last generated version is kept).
    final_teacher_chain.append({
        "comptation_name": comp,
        "hypothesis_chain": chain,
        "new_hypothesis_chain": improved_chain if improved_chain else current_chain
    })


You are a data science expert and Kaggle Grandmaster.

Below is a competition scenario description:
A natural language processing sequence-to-sequence task that performs token-level text normalization, predicting the spoken form of each token. Evaluation is based on token-level exact-match accuracy.

Below is an optimization chain derived from iterative feedback:
Build a before->after frequency dictionary from train and at inference output the top mapping only if its empirical probability >= 0.995; otherwise copy the before token (this includes copying when identity is the top mapping). This yields a high-precision gate that transforms only when the mapping is near-certain and otherwise defaults to identity.->Calibrate the frequency gate threshold by maximizing token-level exact-match accuracy on the held-out fold over a grid (e.g., 0.90–0.999 with 0.001 steps, then a finer local search around the best). Keep the pipeline unchanged otherwise. Implement efficiently by evaluating all can

In [34]:
# Number of flattened teacher-chain records.
print(len(flat_list))

5122


In [6]:
def build_prompt(
    competition: str,
    description: str,
    history_chain: str,
    history_scores: list
) -> str:
    """Assemble the next-hypothesis prompt with per-step scores.

    Args:
        competition: competition name, printed under "### Competition".
        description: competition description, printed under "### Description".
        history_chain: earlier hypotheses separated by "->"; empty pieces are dropped.
        history_scores: scores aligned with the steps by position; steps
            without a matching score are shown with "N/A".

    Returns:
        The full prompt text, ending with the instruction section.
    """
    system_text = (
        "You are a world-class data scientist and machine learning engineer with deep expertise "
        "in statistics, mathematics, and machine learning. "
        "You reason using high-level principles, avoid dataset-specific shortcuts, "
        "and propose generalizable hypotheses that improve model performance."
    )

    # Header: system text, competition info and the chain heading.
    header = (
        f"{system_text}\n\n"
        f"### Competition\n{competition}\n\n"
        f"### Description\n{description}\n\n"
        "### Historical Reasoning Chain\n"
        "(Earlier → Later, with observed performance scores.)\n"
    )

    # One bullet per non-empty step, numbered from 1.
    stripped_pieces = (piece.strip() for piece in history_chain.split("->"))
    step_lines = []
    for position, step_text in enumerate(piece for piece in stripped_pieces if piece):
        if position < len(history_scores):
            shown_score = history_scores[position]
        else:
            shown_score = "N/A"
        step_lines.append(f"- Step {position + 1}: {step_text}  (score: {shown_score})\n")

    # Closing instruction for the model.
    instruction = (
        "\n### Instruction\n"
        "Given the competition description and the historical reasoning trajectory, "
        "propose exactly ONE new, high-level hypothesis.\n"
        "- It should generalize to many ML competitions.\n"
        "- It should reflect an improvement direction beyond the last step.\n"
        "- Avoid dataset-specific tricks or low-level preprocessing details.\n"
        "- Output only the hypothesis (one sentence or short paragraph).\n"
    )

    return header + "".join(step_lines) + instruction


In [7]:
# Hand-written sample record; its first four keys match the parameters of
# build_prompt (competition, description, history_chain, history_scores).
test_item = {
    "competition": "house-price-prediction",
    "description": "Predict the sale price of homes based on various features.",
    "history_chain": "baseline linear model -> add regularization -> feature engineering -> tree-based model",
    "history_scores": [0.71, 0.74, 0.78, 0.82],
    "target_hypothesis": "Use model stacking to combine linear and tree models."
}




In [None]:
#!/usr/bin/env python3
# split_chains_to_sft.py
"""
Split long hypothesis chains into stepwise SFT pairs.

Inputs:
- input_path: JSONL file or JSON list. Each record must contain at least:
    - "comptation_name" (or "competition")
    - "hypothesis_chain" (steps separated by "->")
    - optional "description"

Outputs:
- output_path: JSONL where each line is one training example:
    {"prompt": "...", "target": "..."}
  or chat-style:
    {"messages":[{"role":"system","content":...}, {"role":"user","content":...}, {"role":"assistant","content":...}]}
"""

import json
import os
from typing import List

# System prompt used by build_prompt_no_score (when include_system is set) and
# as the "system" message of chat-style examples written by convert().
SYSTEM_PROMPT = (
    "You are a world-class data scientist and machine learning engineer with deep expertise "
    "in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data "
    "analysis techniques, advanced machine learning algorithms, and their practical applications "
    "to solve complex real-world problems."
)

def build_prompt_no_score(competition: str, description: str, history_chain: str, include_system: bool = True) -> str:
    """Build the next-hypothesis prompt without any score information.

    Args:
        competition: competition name.
        description: competition description; a falsy value is replaced by
            "No description provided.".
        history_chain: earlier hypotheses separated by "->"; empty pieces are dropped.
        include_system: when true, SYSTEM_PROMPT is placed first.

    Returns:
        All sections joined with blank lines.
    """
    step_texts = [piece.strip() for piece in history_chain.split("->") if piece.strip()]

    sections = [SYSTEM_PROMPT] if include_system else []
    sections += [
        f"### Competition\n{competition}\n",
        f"### Description\n{description or 'No description provided.'}\n",
        "### Historical Hypothesis Chain\n",
    ]
    # Steps are numbered starting from 1.
    sections += [f"- Step {number}: {text}" for number, text in enumerate(step_texts, 1)]
    sections.append("\n### Instruction\nPropose ONE new, high-level hypothesis that generalizes beyond dataset-specific tricks. Output only the hypothesis.")
    return "\n\n".join(sections)

def load_input(input_path: str):
    """Read records from a JSON-list file or a JSONL file.

    The file is treated as one JSON document when its first non-whitespace
    character is "["; otherwise each non-blank line is parsed as its own JSON
    value. An empty or whitespace-only file yields an empty list.
    """
    with open(input_path, "r", encoding="utf-8") as handle:
        content = handle.read().lstrip()

    if not content:
        return []
    if content.startswith("["):
        return json.loads(content)

    # JSONL: one JSON value per non-blank line.
    stripped_rows = (raw.strip() for raw in content.splitlines())
    return [json.loads(row) for row in stripped_rows if row]

def split_chain_record(record: dict, min_history_steps: int = 1):
    """Expand one chain into (history, next_step) training pairs.

    The chain is read from "hypothesis_chain", falling back to
    "history_chain", and split on "->" with empty pieces removed. For every
    cut position from ``min_history_steps`` up to the last step, the steps
    before the cut (joined with " -> ") form the history and the step at the
    cut is the target.

    Returns:
        list of ``(history_chain_str, next_step_str)`` tuples.
    """
    raw_chain = record.get("hypothesis_chain") or record.get("history_chain") or ""
    pieces = [part.strip() for part in raw_chain.split("->") if part.strip()]
    return [
        (" -> ".join(pieces[:cut]), pieces[cut])
        for cut in range(min_history_steps, len(pieces))
    ]

def convert(
    input_path: str,
    output_path: str,
    chat_style: bool = False,
    min_history_steps: int = 1,
    include_system: bool = True,
    max_examples: int = None
):
    """Turn chain records into stepwise SFT examples written as JSONL.

    Args:
        input_path: file read through load_input.
        output_path: JSONL file to write; its directory is created if missing.
        chat_style: write {"messages": [...]} instead of {"prompt", "target"}.
        min_history_steps: forwarded to split_chain_record.
        include_system: forwarded to build_prompt_no_score.
        max_examples: stop once this many examples were written; a falsy
            value means no limit.

    Returns:
        Number of examples written.
    """
    records = load_input(input_path)
    target_dir = os.path.dirname(output_path) or "."
    os.makedirs(target_dir, exist_ok=True)

    count = 0
    with open(output_path, "w", encoding="utf-8") as sink:
        for record in records:
            comp = record.get("comptation_name") or record.get("competition") or "unknown-competition"
            desc = record.get("description", "") or record.get("comp_description", "") or ""
            for history, target in split_chain_record(record, min_history_steps=min_history_steps):
                prompt_text = build_prompt_no_score(comp, desc, history, include_system=include_system)
                if not chat_style:
                    example = {"prompt": prompt_text, "target": target}
                else:
                    # NOTE(review): the chat form always carries SYSTEM_PROMPT;
                    # include_system only affects prompt_text.
                    user_content = f"Competition: {comp}\nDescription:\n{desc}\n\nHistorical chain:\n{history}\n\nInstruction:\nPropose ONE new, high-level hypothesis. Output only the hypothesis."
                    example = {
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_content},
                            {"role": "assistant", "content": target},
                        ]
                    }
                sink.write(json.dumps(example, ensure_ascii=False) + "\n")
                count += 1
                if max_examples and count >= max_examples:
                    print(f"[convert] reached max_examples={max_examples}, stopping.")
                    return count

    print(f"[convert] done. wrote {count} examples to {output_path}")
    return count

# --- CLI usage ---
if __name__ == "__main__":
    import argparse

    # Command-line front end for convert().
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", "-i", required=True, help="input JSONL or JSON list")
    parser.add_argument("--output", "-o", required=True, help="output JSONL for SFT (prompt/target or messages)")
    parser.add_argument("--chat-style", action="store_true", help="emit chat-style messages array")
    parser.add_argument("--min-history", type=int, default=1, help="minimum history steps before target (default 1)")
    parser.add_argument("--no-system", dest="system", action="store_false", help="omit the system prompt in generated prompt")
    parser.add_argument("--max-examples", type=int, default=None, help="stop after generating this many examples")
    cli = parser.parse_args()

    convert(
        cli.input,
        cli.output,
        chat_style=cli.chat_style,
        min_history_steps=cli.min_history,
        include_system=cli.system,
        max_examples=cli.max_examples,
    )


'You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and machine learning. You reason using high-level principles, avoid dataset-specific shortcuts, and propose generalizable hypotheses that improve model performance.\n\n### Competition\nhouse-price-prediction\n\n### Description\nPredict the sale price of homes based on various features.\n\n### Historical Reasoning Chain\n(Earlier → Later, with observed performance scores.)\n- Step 1: baseline linear model  (score: 0.71)\n- Step 2: add regularization  (score: 0.74)\n- Step 3: feature engineering  (score: 0.78)\n- Step 4: tree-based model  (score: 0.82)\n\n### Instruction\nGiven the competition description and the historical reasoning trajectory, propose exactly ONE new, high-level hypothesis.\n- It should generalize to many ML competitions.\n- It should reflect an improvement direction beyond the last step.\n- Avoid dataset-specific tricks or low-level preprocessing details