# Lab 4.1 - OpenCompass 評估實戰
## Notebook 02: 執行模型評估

**學習目標**:
1. 配置評估參數 (batch size, 推理模式)
2. 執行 C-Eval 評估 (STEM, 社會科學, 人文學科)
3. 收集評估日誌與中間結果
4. 處理評估過程中的常見問題

**預計時間**: 1-2 小時 (取決於硬體)

---

## 1. 載入配置與模型

載入 01-Setup.ipynb 產生的評估配置 (`data/eval_config.json`),並依配置重新載入模型。

In [None]:
import torch
import json
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
from datetime import datetime

# Input (data) and output (results) directories; the results directory is
# created here if it does not exist yet
DATA_DIR = Path("./data")
RESULTS_DIR = Path("./results")
RESULTS_DIR.mkdir(exist_ok=True)

# Read the evaluation configuration from the data directory
config_path = DATA_DIR / "eval_config.json"
with config_path.open('r', encoding='utf-8') as f:
    eval_config = json.load(f)

# Echo the loaded configuration, keeping non-ASCII characters readable
print("✅ 配置已載入")
print(json.dumps(eval_config, indent=2, ensure_ascii=False))

### 重新載入模型

In [None]:
# Quantization settings handed to the model loader: 4-bit NF4 weights with
# double quantization enabled and float16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

def load_model(model_name: str):
    """
    Load a causal language model together with its tokenizer.

    Args:
        model_name: Model identifier passed to ``from_pretrained``

    Returns:
        (model, tokenizer)
    """
    print(f"📥 載入模型: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Quantize only when CUDA is available; otherwise no quantization config is passed
    quant_cfg = quantization_config if torch.cuda.is_available() else None
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )

    print("✅ 模型載入完成")
    return model, tokenizer

# Load every model flagged as "loaded" in the config. A model that fails to
# load is reported and skipped so the remaining models can still be evaluated.
models = {}
tokenizers = {}

for model_key, model_info in eval_config["models"].items():
    if not model_info["loaded"]:
        continue
    try:
        model, tokenizer = load_model(model_info["name"])
    except Exception as e:
        print(f"❌ 載入 {model_key} 失敗: {e}")
    else:
        models[model_key] = model
        tokenizers[model_key] = tokenizer

### 載入評估數據集

In [None]:
# Subjects to evaluate, as listed in the evaluation config
ceval_subjects = eval_config["datasets"]["ceval"]["subjects"]

print("📥 載入 C-Eval 數據集...")
ceval_data = {}

# Fetch the "val" split of each subject and report how many questions it holds
for subject in ceval_subjects:
    dataset = load_dataset("ceval/ceval-exam", subject, split="val", trust_remote_code=True)
    ceval_data[subject] = dataset
    print(f"  ✅ {subject}: {len(dataset)} 題")

# Total number of questions across all loaded subjects
print(f"\n總計: {sum(map(len, ceval_data.values()))} 題")

## 2. 實作評估函數

實作選擇題 (四選一單選) 的評估邏輯:比較模型對各選項字母的下一個 token logit。

In [None]:
def format_ceval_prompt(question: str, choices: dict) -> str:
    """
    Build the prompt text for one C-Eval multiple-choice question.

    Args:
        question: Question text
        choices: Mapping with the keys "A", "B", "C" and "D" holding the option texts

    Returns:
        The prompt: an instruction line, the question, the four lettered
        options and a trailing answer cue, separated by blank lines
    """
    question_line = f"問題: {question}"
    option_lines = [f"{letter}. {choices[letter]}" for letter in ("A", "B", "C", "D")]

    sections = [
        "以下是一道選擇題,請選出正確答案。",
        question_line,
        "\n".join(option_lines),
        "答案:",
    ]
    # Consecutive sections are separated by exactly one blank line
    return "\n\n".join(sections)


def evaluate_multiple_choice(model, tokenizer, question: str, choices: dict, correct_answer: str):
    """
    Score one multiple-choice question by comparing next-token logits.

    The formatted prompt is run through the model once. At the last prompt
    position, the logit of the first token of each option letter (encoded as
    " A", " B", " C", " D") is read out, and the letter with the highest
    logit is taken as the prediction. Raw logits are compared directly;
    no normalisation is applied, which does not affect which option is largest.

    Args:
        model: Causal language model; called as ``model(**inputs)`` and
            expected to expose ``.device`` and return an object with ``.logits``
        tokenizer: Tokenizer matching the model
        question: Question text
        choices: Option mapping with the keys "A", "B", "C" and "D"
        correct_answer: Reference answer letter (A/B/C/D)

    Returns:
        (predicted_answer, is_correct, option_logits) where option_logits
        maps each option letter to its logit as a Python float
    """
    prompt = format_ceval_prompt(question, choices)

    # Encode the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # The forward pass depends only on the prompt, so it is run once
        # instead of once per option.
        last_token_logits = model(**inputs).logits[0, -1, :]

    option_logits = {}
    for option in ["A", "B", "C", "D"]:
        option_ids = tokenizer.encode(f" {option}", add_special_tokens=False)
        # NOTE(review): only the first token id is used. If a tokenizer splits
        # " A" into several tokens, all options may share that first id --
        # confirm against the tokenizers actually evaluated.
        option_logits[option] = last_token_logits[option_ids[0]].item()

    # Highest logit wins; on a tie the earliest letter is kept
    predicted_answer = max(option_logits, key=option_logits.get)
    is_correct = (predicted_answer == correct_answer)

    return predicted_answer, is_correct, option_logits


# Smoke-test the evaluation function: first sample of the first subject,
# scored with the first loaded model and tokenizer
if ceval_data and models:
    test_subject = list(ceval_data.keys())[0]
    test_sample = ceval_data[test_subject][0]
    test_model = list(models.values())[0]
    test_tokenizer = list(tokenizers.values())[0]

    print("🧪 測試評估函數...")
    print(f"問題: {test_sample['question']}")

    # Pick out just the four option columns from the sample row
    test_choices = {letter: test_sample[letter] for letter in ['A', 'B', 'C', 'D']}
    pred, correct, logits = evaluate_multiple_choice(
        test_model, test_tokenizer, test_sample['question'], test_choices, test_sample['answer']
    )

    print(
        f"\n預測答案: {pred}",
        f"正確答案: {test_sample['answer']}",
        f"是否正確: {correct}",
        f"\nLogits: {logits}",
        sep="\n",
    )

## 3. 執行完整評估

對所有模型和學科進行評估。

In [None]:
def evaluate_subject(model, tokenizer, dataset, subject_name: str):
    """
    Evaluate a model on every question of a single subject.

    Args:
        model: Model passed through to ``evaluate_multiple_choice``
        tokenizer: Tokenizer passed through to ``evaluate_multiple_choice``
        dataset: Sized iterable of samples; each sample is indexed by the
            keys 'question', 'A', 'B', 'C', 'D' and 'answer'
        subject_name: Subject name, used for the progress bar and the report

    Returns:
        Dictionary with the keys 'subject', 'accuracy', 'correct', 'total'
        and 'details' (one record per question). For an empty dataset the
        accuracy is reported as 0.0.
    """
    print(f"\n評估學科: {subject_name}")
    print("=" * 60)

    correct_count = 0
    total_count = len(dataset)

    results = []

    for sample in tqdm(dataset, desc=subject_name):
        question = sample['question']
        choices = {k: sample[k] for k in ['A', 'B', 'C', 'D']}
        correct_answer = sample['answer']

        pred, is_correct, logits = evaluate_multiple_choice(
            model,
            tokenizer,
            question,
            choices,
            correct_answer
        )

        if is_correct:
            correct_count += 1

        # Keep the per-question record so later analysis can inspect errors
        results.append({
            'question': question,
            'choices': choices,
            'correct_answer': correct_answer,
            'predicted_answer': pred,
            'is_correct': is_correct,
            'logits': logits
        })

    # Guard against an empty dataset, which would otherwise divide by zero
    accuracy = correct_count / total_count if total_count else 0.0

    print(f"\n準確率: {accuracy:.2%} ({correct_count}/{total_count})")
    print("=" * 60)

    return {
        'subject': subject_name,
        'accuracy': accuracy,
        'correct': correct_count,
        'total': total_count,
        'details': results
    }


def evaluate_model_on_ceval(model, tokenizer, model_name: str, ceval_data: dict,
                            subject_categories=None):
    """
    Evaluate one model on all loaded C-Eval subjects and aggregate the results.

    Args:
        model: Model passed through to ``evaluate_subject``
        tokenizer: Tokenizer passed through to ``evaluate_subject``
        model_name: Display name stored in the result and printed in the header
        ceval_data: Mapping of subject name -> dataset
        subject_categories: Optional mapping of category name -> list of
            subject names. When omitted, the built-in STEM / Humanities /
            Social Science grouping is used.

    Returns:
        Dictionary with the keys 'model_name', 'overall_accuracy',
        'category_accuracies', 'subject_results' and 'timestamp' (ISO-8601
        string). Accuracies are 0.0 when there are no questions to score.
    """
    if subject_categories is None:
        # NOTE(review): these names must match the keys of ceval_data exactly;
        # subjects not listed here count towards the overall accuracy only.
        subject_categories = {
            'STEM': ['computer_science', 'physics', 'mathematics'],
            'Humanities': ['chinese_language_and_literature', 'history'],
            'Social Science': ['law', 'economics']
        }

    print(f"\n{'=' * 60}")
    print(f"評估模型: {model_name}")
    print(f"{'=' * 60}")

    subject_results = [
        evaluate_subject(model, tokenizer, dataset, subject)
        for subject, dataset in ceval_data.items()
    ]

    # Overall accuracy is weighted by question count, not averaged per subject
    total_correct = sum(r['correct'] for r in subject_results)
    total_questions = sum(r['total'] for r in subject_results)
    # Guard against having no questions at all (e.g. no subjects loaded)
    overall_accuracy = total_correct / total_questions if total_questions else 0.0

    # Per-category accuracy; categories with no evaluated subject are left out
    category_accuracies = {}
    for category, subjects in subject_categories.items():
        category_results = [r for r in subject_results if r['subject'] in subjects]
        if category_results:
            cat_correct = sum(r['correct'] for r in category_results)
            cat_total = sum(r['total'] for r in category_results)
            category_accuracies[category] = cat_correct / cat_total if cat_total else 0.0

    return {
        'model_name': model_name,
        'overall_accuracy': overall_accuracy,
        'category_accuracies': category_accuracies,
        'subject_results': subject_results,
        'timestamp': datetime.now().isoformat()
    }

### 執行評估 (Llama-2-7B)

In [None]:
# Evaluate Llama-2-7B only when it was loaded, then persist the results as JSON
if 'llama-2-7b' not in models:
    print("⚠️ Llama-2-7B 未載入,跳過評估")
else:
    llama_results = evaluate_model_on_ceval(
        model=models['llama-2-7b'],
        tokenizer=tokenizers['llama-2-7b'],
        model_name='Llama-2-7B',
        ceval_data=ceval_data,
    )

    # Write the result file; values json cannot encode natively are stringified
    results_path = RESULTS_DIR / "llama2_7b_results.json"
    with results_path.open('w', encoding='utf-8') as f:
        json.dump(llama_results, f, indent=2, ensure_ascii=False, default=str)

    print(f"\n✅ 結果已保存: {results_path}")

### 執行評估 (Qwen-7B)

In [None]:
# Evaluate Qwen-7B only when it was loaded, then persist the results as JSON
if 'qwen-7b' not in models:
    print("⚠️ Qwen-7B 未載入,跳過評估")
else:
    qwen_results = evaluate_model_on_ceval(
        model=models['qwen-7b'],
        tokenizer=tokenizers['qwen-7b'],
        model_name='Qwen-7B',
        ceval_data=ceval_data,
    )

    # Write the result file; values json cannot encode natively are stringified
    results_path = RESULTS_DIR / "qwen_7b_results.json"
    with results_path.open('w', encoding='utf-8') as f:
        json.dump(qwen_results, f, indent=2, ensure_ascii=False, default=str)

    print(f"\n✅ 結果已保存: {results_path}")

## 4. 初步結果查看

In [None]:
def display_results_summary(results: dict):
    """
    Print a readable summary of one model's evaluation results.

    Args:
        results: Result dictionary with the keys 'model_name',
            'overall_accuracy', 'category_accuracies' (name -> accuracy) and
            'subject_results' (list of dicts with 'subject' and 'accuracy')
    """
    rule = '=' * 60

    # Header
    print(f"\n{rule}")
    print(f"模型: {results['model_name']}")
    print(rule)

    print(f"\n整體準確率: {results['overall_accuracy']:.2%}")

    # One line per category, names padded to a fixed width for alignment
    print("\n分類準確率:")
    for name, acc in results['category_accuracies'].items():
        print(f"  {name:20s}: {acc:.2%}")

    # One line per subject
    print("\n學科準確率:")
    for entry in results['subject_results']:
        print(f"  {entry['subject']:35s}: {entry['accuracy']:.2%}")

    print(f"\n{rule}")

# Show a summary for each model whose results exist in this session
for _results_name in ('llama_results', 'qwen_results'):
    if _results_name in locals():
        display_results_summary(locals()[_results_name])

## 5. 模型對比

In [None]:
# Side-by-side comparison; only possible when both models were evaluated
if not ('llama_results' in locals() and 'qwen_results' in locals()):
    print("⚠️ 需要兩個模型的結果才能進行對比")
else:
    separator = "=" * 60
    print("\n" + separator)
    print("模型對比")
    print(separator)

    # Overall accuracy; the gap is reported as Qwen minus Llama
    print("\n整體準確率:")
    llama_overall = llama_results['overall_accuracy']
    qwen_overall = qwen_results['overall_accuracy']
    print(f"  Llama-2-7B: {llama_overall:.2%}")
    print(f"  Qwen-7B:    {qwen_overall:.2%}")
    print(f"  差距:       {(qwen_overall - llama_overall):.2%}")

    # Per-category accuracy, iterating over the categories present for Llama
    print("\n分類準確率對比:")
    for category, llama_acc in llama_results['category_accuracies'].items():
        qwen_acc = qwen_results['category_accuracies'][category]
        diff = qwen_acc - llama_acc
        print(f"  {category:20s}: Llama {llama_acc:.2%} | Qwen {qwen_acc:.2%} | Δ {diff:+.2%}")

    print("\n" + separator)

## 📝 總結

在本 notebook 中,我們完成了:

1. ✅ 配置評估參數
2. ✅ 實作選擇題評估邏輯 (比較各選項字母的下一個 token logit)
3. ✅ 執行 C-Eval 評估
4. ✅ 保存評估結果
5. ✅ 初步結果對比

### 下一步

前往 **03-Analyze.ipynb** 進行深入的結果分析。

---

**觀察**:
- Qwen-7B 通常在中文任務上表現更好
- STEM 類別通常比人文類別更具挑戰性
- 4-bit 量化可能略微降低準確率 (約 1-2%)
