In [None]:
# !pip install -U -q transformers==4.51.3 accelerate==1.6.0 datasets==3.5.0 bitsandbytes==0.45.5 triton==3.2.0 unsloth==2025.3.19 torch==2.6.0 peft==0.15.2 trl==0.15.2 wandb==0.19.10

In [None]:
import os
from getpass import getpass

# The W&B API key must never be written into the notebook. Reuse a key that is
# already exported in the environment; otherwise ask for it interactively so it
# only lives in this kernel's process environment.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

### Генерация ответов моделями

In [None]:
from unsloth import FastLanguageModel
import random
import pandas as pd
import torch
from datasets import load_dataset
from transformers import BitsAndBytesConfig
from peft import PeftModel
import wandb
import numpy as np
from tqdm import tqdm

# --- Sampling of the evaluation prompts -------------------------------------
SEED = 42          # seed passed to random.seed() before the prompts are sampled
NUM_SAMPLES = 150  # upper bound on the number of prompts drawn from the dataset
DATASET_NAME = "HuggingFaceH4/ultrafeedback_binarized"

# --- Models under comparison --------------------------------------------------
# Every entry except "Base Model" is passed to wandb.use_artifact() by
# load_model(); "Base Model" is passed straight to FastLanguageModel.
MODELS = {
    "BNF Model": "animavestra888-independent/Coursework/model-crmnp6jy:v24",
    "DPO Model": "animavestra888-independent/Coursework/model-izl7baxm:v14",
    "GRPO Model": "animavestra888-independent/Coursework/model-6enu8o1w:v7",
    "Base Model": "Qwen/Qwen2.5-0.5B-Instruct",
}

# System turn prepended to every user prompt by format_prompt().
SYSTEM_PROMPT = "You are a helpful AI assistant."

# --- Loading / decoding settings ---------------------------------------------
# 4-bit NF4 quantization with double quantization enabled.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Keyword arguments read one by one in generate_answer().
GENERATION_CONFIG = dict(
    max_new_tokens=768,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# --- Output location ----------------------------------------------------------
OUTPUT_DIR = "evaluation_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_model(model_id, is_peft=False):
    """Load a quantized model and its tokenizer, optionally with a PEFT adapter.

    Args:
        model_id: When ``is_peft`` is false, the identifier handed directly to
            ``FastLanguageModel.from_pretrained``. When ``is_peft`` is true, the
            W&B artifact reference that holds the adapter weights.
        is_peft: If true, ``Qwen/Qwen2.5-0.5B-Instruct`` is loaded as the base
            model and the adapter downloaded from W&B is applied on top of it.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode and the
        tokenizer guaranteed to have a ``pad_token_id``.

    Raises:
        Whatever the underlying loaders raise; errors are not swallowed.
    """
    base_id = "Qwen/Qwen2.5-0.5B-Instruct" if is_peft else model_id

    # Both paths load the backbone the same way, so do it once.
    model, tokenizer = FastLanguageModel.from_pretrained(
        base_id,
        attn_implementation="flash_attention_2",
        quantization_config=bnb_cfg,
        load_in_4bit=True,
    )

    if is_peft:
        # The W&B run is opened only after the base model loaded successfully
        # and everything done while it is open sits inside try/finally, so a
        # failure can never leave a dangling run behind.
        wandb.init(project="Coursework", entity="animavestra888-independent", job_type="artifact_download")
        try:
            art = wandb.use_artifact(model_id, type="model")
            peft_dir = art.download()
            model = PeftModel.from_pretrained(model, peft_dir)
        finally:
            wandb.finish()

    model.eval()
    # generate() needs a pad token id; fall back to EOS when none is defined.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    return model, tokenizer

def format_prompt(prompt, tokenizer):
    """Render ``prompt`` as a system + user chat using the tokenizer's template.

    The result is the untokenized template text with the generation prompt
    appended, i.e. whatever ``tokenizer.apply_chat_template`` returns for
    ``tokenize=False, add_generation_prompt=True``.
    """
    system_turn = dict(role="system", content=SYSTEM_PROMPT)
    user_turn = dict(role="user", content=prompt)
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return chat_text

def generate_answer(model, tokenizer, prompt):
    """Generate one completion for ``prompt`` and return it as stripped text.

    The prompt is wrapped with ``format_prompt``, tokenized, moved to the
    model's device and passed to ``model.generate`` with the settings read
    from ``GENERATION_CONFIG``. Only the tokens after the prompt are decoded.
    """
    chat_text = format_prompt(prompt, tokenizer)
    inputs = tokenizer(chat_text, return_tensors="pt", padding=True).to(model.device)

    # Pick the decoding settings out of the shared config by name.
    sampling_kwargs = {
        key: GENERATION_CONFIG[key]
        for key in ("max_new_tokens", "do_sample", "temperature", "top_p")
    }

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            **sampling_kwargs,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; drop them.
    prompt_length = inputs.input_ids.shape[1]
    completion_ids = outputs[0][prompt_length:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return completion.strip()


def main():
    """Sample prompts, generate one answer per model, and save them to CSV.

    Steps:
      1. Load the ``test_sft`` split of ``DATASET_NAME`` and draw up to
         ``NUM_SAMPLES`` examples with a ``SEED``-seeded ``random.sample``.
      2. Load every entry of ``MODELS`` via ``load_model``; every entry other
         than ``"Base Model"`` is loaded as a PEFT adapter.
      3. For each sampled prompt, call ``generate_answer`` once per model.
      4. Write ``<OUTPUT_DIR>/model_responses.csv`` with a ``prompt`` column
         followed by one column per model name.

    Returns:
        None. The result is the CSV file written to disk.
    """
    print("Загрузка датасета...")
    dataset = load_dataset(DATASET_NAME, split="test_sft")
    random.seed(SEED)
    selected_indices = random.sample(range(len(dataset)), min(NUM_SAMPLES, len(dataset)))
    dataset = dataset.select(selected_indices)
    results_file = os.path.join(OUTPUT_DIR, "model_responses.csv")

    print("Загрузка моделей...")
    models_dict = {}
    tokenizers = {}

    for name, model_id in MODELS.items():
        is_peft = name != "Base Model"
        print(f"Загрузка {name}...")
        models_dict[name], tokenizers[name] = load_model(model_id, is_peft)

    # Sampling (do_sample=True) draws from torch's RNG, which random.seed()
    # does not touch. Seed it after the models are loaded so that any random
    # numbers consumed during loading cannot shift the generation stream.
    torch.manual_seed(SEED)

    print("\nГенерация ответов...")
    results = []

    for example in tqdm(dataset, desc="Генерация"):
        prompt = example["prompt"]
        responses = {
            name: generate_answer(model, tokenizers[name], prompt)
            for name, model in models_dict.items()
        }
        results.append({"prompt": prompt, **responses})

    df = pd.DataFrame(results)

    # Make the function usable on its own even if the directory was removed
    # since the configuration cell ran.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df.to_csv(results_file, index=False)
    print(f"Ответы моделей сохранены в: {results_file}")

main()

### Оценка ответов с помощью LLM

In [None]:
import pandas as pd
import random
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig
import wandb
import os
import numpy as np

# --- Reproducibility ----------------------------------------------------------
SEED = 42
random.seed(SEED)  # drives the per-row label shuffle in run_llm_judge_evaluation()

# --- Weights & Biases run settings -------------------------------------------
USE_WANDB = True
WANDB_ENTITY = "animavestra888-independent"
WANDB_PROJECT = "llm-evaluation"
WANDB_NAME = "llm-judge-evaluation"

# --- Models whose responses are being judged ---------------------------------
# The keys are used as column names of the responses CSV.
MODELS = {
    "BNF Model": "animavestra888-independent/Coursework/model-crmnp6jy:v24",
    "DPO Model": "animavestra888-independent/Coursework/model-izl7baxm:v14",
    "GRPO Model": "animavestra888-independent/Coursework/model-6enu8o1w:v7",
    "Base Model": "Qwen/Qwen2.5-0.5B-Instruct",
}

# --- Judge loading / file locations ------------------------------------------
# 4-bit NF4 quantization with double quantization enabled.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

INPUT_FILE = "/kaggle/input/responses/model_responses.csv"
OUTPUT_DIR = "evaluation_results/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Start the tracking run ---------------------------------------------------
if USE_WANDB:
    wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=WANDB_NAME)
    # Plot every "evaluation/*" metric against the custom "step" metric.
    wandb.define_metric("step")
    wandb.define_metric("evaluation/*", step_metric="step")
    # Record the evaluation settings alongside the run.
    wandb.config.update(dict(
        seed=SEED,
        judge_model="Qwen/Qwen2.5-14B-Instruct",
        eval_models=list(MODELS),
        temperature=0,
        max_new_tokens=10,
    ))

def load_judge_model():
    """Load the judge model and tokenizer and prepare them for inference.

    Loads ``Qwen/Qwen2.5-14B-Instruct`` through ``FastLanguageModel`` with the
    module-level ``bnb_cfg`` and a 4096-token ``max_seq_length``, switches the
    model to eval mode, and sets the pad token id to the EOS id when the
    tokenizer has none.

    Returns:
        ``(judge_model, judge_tokenizer)``.
    """
    print("Загрузка модели-судьи...")

    loader_kwargs = dict(
        attn_implementation="flash_attention_2",
        quantization_config=bnb_cfg,
        load_in_4bit=True,
        max_seq_length=4096,
    )
    judge_model, judge_tokenizer = FastLanguageModel.from_pretrained(
        "Qwen/Qwen2.5-14B-Instruct", **loader_kwargs
    )
    judge_model.eval()

    pad_is_missing = judge_tokenizer.pad_token_id is None
    if pad_is_missing:
        judge_tokenizer.pad_token_id = judge_tokenizer.eos_token_id

    return judge_model, judge_tokenizer

def format_judge_prompt(prompt, responses):
    """Build the two-message chat (system + user) shown to the judge model.

    Args:
        prompt: The original user prompt the candidate models answered.
        responses: Mapping of label -> response text. Entries are rendered in
            the mapping's iteration order as ``Response <label>:`` sections.

    Returns:
        A list with one system message and one user message, each a dict with
        ``role`` and ``content`` keys.
    """
    system_text = (
        "You are an expert evaluator. Compare responses based on "
        "helpfulness, accuracy, coherence, and alignment with human values. "
        "Respond ONLY with the letter of the best response (A, B, C, or D)."
    )

    candidate_sections = [
        f"Response {label}:\n{resp}" for label, resp in responses.items()
    ]
    user_text = "".join([
        f"Prompt: {prompt}\n\n",
        "\n\n".join(candidate_sections),
        "\n\nWhich response is best? Answer with A, B, C, or D only.",
    ])

    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    
def _response_text(value):
    """Return a model response as text, mapping missing values (None/NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def _extract_verdict(judge_response, labels):
    """Return the first label that appears as a standalone token, else None."""
    import re

    # Require word boundaries so that letters inside ordinary words
    # ("Answer: B", "Best: C") are not mistaken for a verdict.
    pattern = r"\b(" + "|".join(re.escape(label) for label in labels) + r")\b"
    match = re.search(pattern, judge_response)
    return match.group(1) if match else None


def _log_judge_results_to_wandb(df, judge_counts, win_rates, total_evaluated, wandb_table):
    """Log summary tables, the wins bar chart, the per-row table and win rates to W&B."""
    summary_table = wandb.Table(
        data=[
            [model, judge_counts.get(model, 0), win_rates.get(model, 0)]
            for model in MODELS
        ],
        columns=["Model", "Wins", "Win Rate (%)"],
    )

    comparison_table = wandb.Table(columns=["Model", "Wins", "Win Rate (%)"])
    for model in MODELS:
        comparison_table.add_data(
            model,
            judge_counts.get(model, 0),
            f"{win_rates.get(model, 0):.1f}%",
        )

    bar_chart = wandb.plot.bar(
        comparison_table,
        "Model",
        "Wins",
        title="Model Preferences",
    )

    wandb.log({
        "evaluation/summary": summary_table,
        "evaluation/comparison_chart": bar_chart,
        "evaluation/detailed_responses": wandb_table,
        "evaluation/total_examples": len(df),
        "evaluation/evaluated_examples": total_evaluated,
        **{f"evaluation/win_rate_{model.replace(' ', '_')}": win_rates.get(model, 0)
           for model in MODELS},
    })


def run_llm_judge_evaluation(df):
    """Ask the judge model to pick the best of the four responses in every row.

    For each row the model names in ``MODELS`` are shuffled and assigned the
    labels A-D in that order, the judge is queried with greedy decoding, and
    the first standalone label in its reply is taken as the verdict.

    Args:
        df: DataFrame with a ``prompt`` column and one response column per key
            of ``MODELS``. It is modified in place: ``model_A`` .. ``model_D``
            (which model got which label), ``judge_choice`` (the label, or
            None when no verdict could be parsed) and ``judge_model`` (the
            corresponding model name) are added or overwritten.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: If ``MODELS`` does not contain exactly four entries.
    """
    print("\n" + "="*50)
    print("Запуск оценки LLM (4 модели)")
    print("="*50)

    labels = ['A', 'B', 'C', 'D']
    if len(MODELS) != len(labels):
        raise ValueError(
            f"Expected exactly {len(labels)} models to compare, got {len(MODELS)}"
        )

    # Keep existing values when the columns are already present.
    for column in [f"model_{label}" for label in labels] + ["judge_choice", "judge_model"]:
        df[column] = df.get(column, None)

    judge_counts = {model: 0 for model in MODELS}
    judge_model, judge_tokenizer = load_judge_model()

    # Defined in every case so the logging guards below never hit an unbound
    # name when W&B is disabled.
    wandb_table = None
    if USE_WANDB:
        wandb_table = wandb.Table(columns=[
            "Prompt",
            "Model A", "Response A",
            "Model B", "Response B",
            "Model C", "Response C",
            "Model D", "Response D",
            "Judge Verdict", "Chosen Model",
        ])

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
        prompt = row["prompt"]

        # Randomize which model sits behind which label for this row.
        model_names = list(MODELS)
        random.shuffle(model_names)
        label_to_model = dict(zip(labels, model_names))

        # An empty generation comes back from CSV as NaN; show it to the judge
        # as an empty response rather than the literal text "nan".
        responses = {
            label: _response_text(row[model])
            for label, model in label_to_model.items()
        }
        for label, model in label_to_model.items():
            df.at[idx, f"model_{label}"] = model

        messages = format_judge_prompt(prompt, responses)
        formatted_prompt = judge_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = judge_tokenizer(formatted_prompt, return_tensors="pt").to(judge_model.device)
        with torch.no_grad():
            outputs = judge_model.generate(
                **inputs,
                max_new_tokens=10,
                temperature=0,
                do_sample=False,
                pad_token_id=judge_tokenizer.pad_token_id,
                eos_token_id=judge_tokenizer.eos_token_id,
            )

        # Decode only the tokens generated after the prompt.
        judge_response = judge_tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        ).strip()

        verdict = _extract_verdict(judge_response, labels)
        chosen_model = label_to_model.get(verdict)
        if chosen_model is not None:
            judge_counts[chosen_model] += 1

        df.at[idx, "judge_choice"] = verdict
        df.at[idx, "judge_model"] = chosen_model

        if wandb_table is not None:
            row_cells = [prompt]
            for label in labels:
                row_cells.extend([label_to_model[label], responses[label]])
            wandb_table.add_data(
                *row_cells,
                verdict or "N/A",
                chosen_model or "N/A",
            )

    # Plain int so it prints and logs as a number rather than a numpy scalar.
    total_evaluated = int(df["judge_choice"].notna().sum())
    print("\n" + "="*50)
    print("Результаты")
    print("="*50)

    win_rates = {}
    for model, count in judge_counts.items():
        win_rates[model] = (count / total_evaluated * 100) if total_evaluated > 0 else 0
        print(f"{model}: {count} wins ({win_rates[model]:.1f}%)")

    if USE_WANDB:
        _log_judge_results_to_wandb(df, judge_counts, win_rates, total_evaluated, wandb_table)

    return df


def main():
    """Run the judge over the saved responses and write the verdicts to CSV.

    Reads ``INPUT_FILE``, passes the frame through
    ``run_llm_judge_evaluation``, stores the result as
    ``judge_evaluation.csv`` under ``OUTPUT_DIR`` and, when ``USE_WANDB`` is
    set, closes the W&B run.
    """
    responses_df = pd.read_csv(INPUT_FILE)
    judged_df = run_llm_judge_evaluation(responses_df)

    judge_file = os.path.join(OUTPUT_DIR, "judge_evaluation.csv")
    judged_df.to_csv(judge_file, index=False)
    print(f"\nРезультаты оценки LLM-судьей сохранены в: {judge_file}")

    if not USE_WANDB:
        return
    wandb.finish()

main()