In [None]:
# Minimal sanity-check cell: prints a fixed greeting (presumably to confirm the kernel is responsive).
print("hello")

In [None]:
import torch
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import numpy as np

# ==========================================
# 1. Settings
# ==========================================
# Identifier of the base checkpoint handed to the from_pretrained loaders.
BASE_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
SUBSPACE_PATH = "common_subspace.pt"  # file created in an earlier step
SAVED_MODEL_PATH = "./saved_models/run-research-hybrid-01" # NOTE: change this to the actual save path!

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# ==========================================
# 2. Score computation class (restated)
# ==========================================
class ResidualCuriosityRewardModel(torch.nn.Module):
    """Scores hidden states by the part of them that lies outside a stored subspace.

    The file at ``path`` must contain a mapping with two tensors:

    * ``"basis"`` - used as a ``[K, D]`` matrix whose rows span the common
      subspace. The projection in :meth:`get_score` is only an orthogonal
      projection if those rows are orthonormal; this is assumed, not checked.
    * ``"mean"`` - a vector broadcastable against ``[Seq, D]`` that is
      subtracted before projecting.

    Both tensors are stored as float16 buffers on ``device``.
    """

    def __init__(self, path, device):
        """Load the subspace definition from ``path`` and place it on ``device``."""
        super().__init__()
        # SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
        # Python objects, which can execute code. Only point `path` at files you
        # created yourself; if the file holds nothing but tensors, switching to
        # weights_only=True is the safer choice.
        data = torch.load(path, map_location="cpu", weights_only=False)
        self.register_buffer("basis", data["basis"].to(device, dtype=torch.float16))
        self.register_buffer("mean", data["mean"].to(device, dtype=torch.float16))

    def get_score(self, hidden_states):
        """Compute the curiosity score from hidden states shaped ``[Seq, D]``.

        Each row is centred, its projection onto the stored subspace is removed,
        and the score is the mean over rows of ``log1p(||residual||)``.

        Args:
            hidden_states: tensor of shape ``[Seq, D]``; any dtype or device.

        Returns:
            The score as a Python float. An empty input yields ``0.0`` rather
            than the NaN that averaging zero rows would produce.
        """
        if hidden_states.numel() == 0:
            return 0.0

        # Move to the buffers' device as well as dtype: with a sharded model the
        # hidden states may live on a different device than this scorer.
        h = hidden_states.to(device=self.basis.device, dtype=self.basis.dtype)
        h_centered = h - self.mean
        z_common = h_centered @ self.basis.T   # coordinates inside the subspace
        h_common = z_common @ self.basis       # component inside the subspace
        h_residual = h_centered - h_common     # what the subspace cannot explain
        norms = torch.norm(h_residual, dim=-1)
        return torch.log1p(norms).mean().item()

# ==========================================
# 3. Inference and comparison functions
# ==========================================
def compare_models(prompts, seed=0):
    """Generate with the base model and the adapter-augmented model, then print a comparison.

    Loads the tokenizer and base model named by ``BASE_MODEL_NAME``, builds the
    scorer from ``SUBSPACE_PATH``, generates a response per prompt with the base
    model, attaches the adapter stored at ``SAVED_MODEL_PATH``, generates again,
    and hands both result lists to ``print_comparison``.

    Args:
        prompts: iterable of prompt strings.
        seed: value passed to ``torch.manual_seed`` immediately before each of
            the two generation runs, so both models sample from the same random
            stream and the comparison is reproducible. Pass ``None`` to leave
            the random state untouched.

    Returns:
        None. Results are printed.
    """
    print(f"Loading Base Model: {BASE_MODEL_NAME} ...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

    # Load the base model.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Load the evaluator.
    scorer = ResidualCuriosityRewardModel(SUBSPACE_PATH, DEVICE)

    # --- 1. Base model generation ---
    print("\n>>> Generating with [BASE MODEL]...")
    if seed is not None:
        torch.manual_seed(seed)
    base_results = generate(base_model, tokenizer, prompts, scorer)

    # --- 2. Trained model generation ---
    print(f"\n>>> Loading Adapter from {SAVED_MODEL_PATH}...")
    # Attach the LoRA adapter on top of the already-loaded base model.
    # NOTE(review): the adapter wraps `base_model` itself (and appears to modify
    # it in place - confirm), so the base run above must stay before this line.
    trained_model = PeftModel.from_pretrained(base_model, SAVED_MODEL_PATH)

    print(">>> Generating with [RDRL TRAINED MODEL]...")
    if seed is not None:
        # Reseed so sampling noise is shared between the two runs.
        torch.manual_seed(seed)
    trained_results = generate(trained_model, tokenizer, prompts, scorer)

    # --- 3. Show results ---
    print_comparison(prompts, base_results, trained_results)

def _last_layer_step_states(step_hidden_states):
    """Stack one last-layer hidden vector per decoding step into a ``[steps, D]`` tensor.

    ``step_hidden_states`` is the per-step tuple returned by ``generate``: one
    entry per step, each a tuple with one ``[batch, seq, D]`` tensor per layer.
    The first step covers the whole prompt (``seq == prompt length``) while
    later steps cover a single new token, so only the final position of each
    step is kept; that gives every step the same ``[D]`` shape.

    Returns ``None`` when there are no steps to stack.
    """
    if not step_hidden_states:
        return None
    return torch.stack(
        [layers[-1][0, -1, :] for layers in step_hidden_states],
        dim=0,
    )


def generate(model, tokenizer, prompts, scorer):
    """Sample one response per prompt and score its hidden states.

    Args:
        model: causal LM exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        prompts: iterable of prompt strings; each is processed on its own.
        scorer: object whose ``get_score`` accepts a ``[Seq, D]`` tensor.

    Returns:
        A list with one ``{"text": str, "score": float}`` dict per prompt.
        ``text`` holds only the newly generated tokens; ``score`` is ``0.0``
        when generation returned no hidden states.
    """
    results = []
    for prompt in prompts:
        # Place inputs where the model's first parameters live; this stays
        # correct when the model was loaded with device_map="auto".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=True,
                temperature=0.9,
                top_p=0.95,
                repetition_penalty=1.1,
                output_hidden_states=True,
                return_dict_in_generate=True
            )

        # Text: decode only the tokens after the prompt. Slicing the decoded
        # string by len(prompt) is unreliable because decoding does not always
        # reproduce the prompt character for character.
        new_token_ids = outputs.sequences[0][prompt_len:]
        response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

        # Score: one last-layer vector per decoding step, starting with the
        # last prompt position (the state that produced the first new token).
        h_seq = _last_layer_step_states(outputs.hidden_states)
        score = scorer.get_score(h_seq) if h_seq is not None else 0.0

        results.append({"text": response, "score": score})
    return results

def print_comparison(prompts, base_res, trained_res):
    """Print a round-by-round report of base vs. RDRL results.

    For every (prompt, base result, trained result) triple the prompt, both
    scores and both wrapped texts are printed, followed by the round winner.
    The RDRL model wins a round only when its score is strictly higher; a tie
    is reported as a base-model win. A final line reports the number of RDRL
    wins against ``len(prompts)``.

    Args:
        prompts: sequence of prompt strings.
        base_res: per-prompt dicts with ``"text"`` and ``"score"`` keys.
        trained_res: same structure, for the trained model.
    """
    heavy_rule = "=" * 80
    print("\n" + heavy_rule)
    print(" CREATIVITY SHOWDOWN: Base vs RDRL")
    print(heavy_rule)

    rdrl_wins = 0
    rounds = zip(prompts, base_res, trained_res)

    for round_no, (prompt, base, trained) in enumerate(rounds, start=1):
        print(f"\nPrompt {round_no}: {prompt.strip()}")
        print("-" * 40)

        # Both contestants are rendered the same way: score line, wrapped text, rule.
        for label, entry in (("Base Model", base), ("RDRL Model", trained)):
            print(f"[{label}] (Score: {entry['score']:.4f})")
            print(textwrap.fill(entry['text'].strip(), width=80))
            print("-" * 20)

        # Verdict
        margin = trained['score'] - base['score']
        if margin > 0:
            print(f"üèÜ WINNER: RDRL Model (+{margin:.4f})")
            rdrl_wins += 1
        else:
            print(f"üëæ WINNER: Base Model ({margin:.4f})")

    print(heavy_rule)
    print(f"Final Result: RDRL Model won {rdrl_wins} / {len(prompts)} rounds.")

In [None]:
# ==========================================
# 4. Test prompts (research ideas)
# ==========================================
# Instruction header shared by every test prompt. It is defined once so the
# required output format cannot drift between prompts.
PROMPT_PREAMBLE = (
    "You are an expert LLM researcher. Propose a novel and concrete research idea about large language models.\n"
    "Output ONLY in the following format:\n"
    "Title: <concise LLM research title>\n"
    "Abstract: <150-220 word abstract>\n"
)

# One task line per prompt; each is appended directly after the shared header.
PROMPT_TASKS = [
    "Draft a research proposal about Mixture of Experts (MoE).",
    "Propose an experiment improving Hallucination Detection with a focus on interpretability.",
]

# Full prompts passed to the comparison run: header followed by the task line.
test_prompts = [PROMPT_PREAMBLE + task for task in PROMPT_TASKS]

In [None]:
# Run the base-vs-adapter comparison on the test prompts defined above.
# compare_models loads the models itself, so all of the heavy work happens in this cell.
compare_models(test_prompts)