In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm


# Load model and LoRA adapters

In [None]:
class CFG:
    """Static settings for this notebook, read as ``CFG.<NAME>``.

    Only ``MODEL_NAME`` is consumed by the cells visible in this notebook;
    the remaining values are presumably used by training code that is not
    shown here -- TODO confirm before removing any of them.
    """

    # --- Model ---
    MODEL_NAME = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
    USE_PEFT = True

    # --- Data / sequence limits ---
    MAX_TRAIN = 100
    MAX_TOKENS = 2048

    # --- Training loop knobs ---
    NUM_GENERATIONS = 4
    BATCH_SIZE = 1
    MAX_STEPS = 80
    step_count = 10

    # --- Optimisation hyper-parameters ---
    BETA = 0.04
    LR = 1e-5

    # --- Misc ---
    DEBUG = False

    # System message describing the expected answer layout:
    # reasoning inside <think> tags, then 'Letter: Full Answer Text'.
    SYSTEM_PROMPT = """You are an expert clinical reasoning agent specializing in pharmaceutical therapeutics. Your task is to analyze multiple-choice questions. First, provide a detailed step-by-step reasoning process within <think> tags, explaining why the correct option is right and the others are wrong. Then, state the final correct answer in the format 'Letter: Full Answer Text'."""

    # --- Filesystem locations ---
    DATA_FILE_PATH = "/kaggle/input/curebench-training/curebench_validation_preprocessed.json"
    OUTPUT_DIR = "/kaggle/working/cureseek-sft-v1"

In [None]:
# Location of the LoRA adapter (path taken from the previous training script).
ADAPTER_PATH = "model_ckpoints/results/cureseek-sft-v1/final_adapter"

# Destination directory for the merged model.
MERGED_MODEL_PATH = "model_ckpoints/cureseek-sft-v1-merged"



In [None]:

# --- Base model and tokenizer ---
# Both are loaded from the checkpoint named in CFG.MODEL_NAME. The model
# weights are requested in bfloat16 and placed via device_map="auto".
print("Loading base model...")
original_model = AutoModelForCausalLM.from_pretrained(
    CFG.MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

In [None]:

# --- Attach the LoRA adapter, then fold it into the base weights ---
print(f"Loading LoRA adapter from {ADAPTER_PATH}...")
# Wrap the already-loaded base model with the adapter stored at ADAPTER_PATH.
model = PeftModel.from_pretrained(model=original_model, model_id=ADAPTER_PATH)

print("Merging adapter weights with the base model...")
# merge_and_unload() returns the model with the LoRA weights merged in.
merged_model = model.merge_and_unload()
print("Merge complete.")

In [None]:


# --- Persist the merged model ---
# Model and tokenizer are written to the same directory so the result can be
# reloaded as a standalone checkpoint, without the PEFT library.
print(f"Saving merged model to {MERGED_MODEL_PATH}...")
merged_model.save_pretrained(save_directory=MERGED_MODEL_PATH)
tokenizer.save_pretrained(save_directory=MERGED_MODEL_PATH)
print("Merged model and tokenizer saved successfully!")

In [None]:

# --- Configuration ---
# MERGED_MODEL_PATH must be the directory the merge step saved to. It was
# previously reset to "./cureseek-sft-v1-merged", which is not where the
# merged model was written ("model_ckpoints/cureseek-sft-v1-merged"), so the
# reload below would have pointed at a directory that was never created.
MERGED_MODEL_PATH = "model_ckpoints/cureseek-sft-v1-merged"  # The model we just created
ORIGINAL_DATA_FILE = "./path/to/your/full_curebench_data.json"  # The ORIGINAL dataset with all questions
TXAGENT_OUTPUT_FILE = "./your_filtered_data.json"  # Your file with the "correct_or_not" flag

# --- Load the Fine-Tuned Model and Tokenizer ---
# trust_remote_code is intentionally not passed: the base checkpoint is loaded
# without it earlier in this notebook, and the merged model is a copy of that
# same architecture saved to a local directory.
print(f"Loading fine-tuned model from {MERGED_MODEL_PATH}...")
model = AutoModelForCausalLM.from_pretrained(
    MERGED_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set for generation

# --- Prepare the Data ---
# We need to find the questions that TxAgent answered incorrectly.
# Both files are read as JSON Lines (one record per line).
print("Preparing data for inference...")
original_df = pd.read_json(ORIGINAL_DATA_FILE, lines=True)
txagent_df = pd.read_json(TXAGENT_OUTPUT_FILE, lines=True)

# Left-join the TxAgent correctness flag onto the full question set by 'id'.
# Questions with no TxAgent record get a missing flag.
merged_df = pd.merge(original_df, txagent_df[['id', 'correct_or_not']], on='id', how='left')

# Keep only rows explicitly flagged False. Rows with a missing flag compare
# unequal to False and are therefore excluded as well.
incorrect_df = merged_df[merged_df['correct_or_not'].eq(False)].copy()
print(f"Found {len(incorrect_df)} questions that TxAgent answered incorrectly.")

# --- Define the Generation Prompt Template ---
# The model is steered by being told the correct answer up front.
def create_generation_prompt(question, options, correct_answer_letter):
    """Build the answer-conditioned chat prompt for one multiple-choice question.

    Args:
        question: Question text, interpolated verbatim.
        options: Mapping of option key to option text; rendered one
            ``key: text`` pair per line in the mapping's iteration order.
        correct_answer_letter: Key of the correct option. It is stated
            several times in the prompt so the model explains that answer.

    Returns:
        A string containing a single user turn followed by an opened
        assistant turn, wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers
        (per the original author's note, the same ChatML layout that was
        used for training).
    """
    option_lines = []
    for letter, text in options.items():
        option_lines.append(f"{letter}: {text}")
    option_block = "\n".join(option_lines)

    instruction = (
        "You are an expert in pharmaceutical therapeutics. "
        "Your task is to answer the following multiple-choice question "
        "and provide a step-by-step reasoning process. "
        f"The correct final answer is known to be '{correct_answer_letter}'. "
        f"Explain in detail why '{correct_answer_letter}' is correct and the other options are wrong."
    )

    # The sections of the user message are separated by one blank line each.
    user_message = "\n\n".join([
        instruction,
        f"Question: {question}",
        f"Options:\n{option_block}",
        f"The correct answer is {correct_answer_letter}.",
    ])

    return "<|im_start|>user\n" + user_message + "<|im_end|>\n<|im_start|>assistant\n"

# --- Generate Reasoning ---
# For every question TxAgent answered incorrectly, sample one answer-conditioned
# explanation from the fine-tuned model and collect it as the "chosen" response.
results = []
for _, row in tqdm(incorrect_df.iterrows(), total=len(incorrect_df), desc="Generating Reasoning"):
    # Create the specific prompt for this question
    prompt = create_generation_prompt(row['question'], row['options'], row['correct_answer'])

    # Tokenize the input and remember how many tokens belong to the prompt,
    # so the reply can be separated from it by position rather than by text.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate the output
    # We use do_sample=True and a non-zero temperature to encourage more creative/natural reasoning
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; decode only what
    # follows them. Previously the full sequence was decoded and split on the
    # assistant header, which depended on the prompt text surviving the
    # tokenize/decode round trip and left the EOS token text in the response.
    generated_ids = outputs[0][prompt_token_count:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=False)

    # Anything after the first end-of-turn marker is not part of the reply.
    completion = completion.split("<|im_end|>", 1)[0]
    # Drop the literal EOS token text if the model emitted it.
    if tokenizer.eos_token:
        completion = completion.replace(tokenizer.eos_token, "")
    reasoning_and_answer = completion.strip()

    results.append({
        "id": row['id'],
        "question": row['question'],
        "options": row['options'],
        "correct_answer_letter": row['correct_answer'],
        "chosen_response": reasoning_and_answer  # This is the high-quality response for DPO
    })

# --- Save the "Chosen" Dataset for DPO ---
# One JSON record per line, in the order the questions were processed.
chosen_df = pd.DataFrame(results)
chosen_df.to_json("dpo_chosen_responses.jsonl", orient="records", lines=True)
print("Successfully generated and saved the 'chosen' responses for DPO.")