# Lab 4.1 - OpenCompass 評估實戰
## Notebook 03: 結果分析

**學習目標**:
1. 載入評估結果並進行統計分析
2. 錯誤分析 (找出常見錯誤模式)
3. 信心度分析 (檢查 logits 分佈)
4. 學科難度排名
5. 模型強弱項識別

**預計時間**: 30-45 分鐘

---

## 1. 載入評估結果

從 02-Evaluate.ipynb 產生的 JSON 檔案中載入結果。

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Input and output locations used throughout this notebook.
RESULTS_DIR = Path("./results")
ANALYSIS_DIR = Path("./analysis")
ANALYSIS_DIR.mkdir(exist_ok=True)

print("📂 載入評估結果...")
print("=" * 60)

# Model key -> JSON result file expected from 02-Evaluate.ipynb.
result_files = {
    'llama-2-7b': RESULTS_DIR / "llama2_7b_results.json",
    'qwen-7b': RESULTS_DIR / "qwen_7b_results.json",
}

# Load every result file that is present and report the ones that are not.
results = {}
for model_key, result_file in result_files.items():
    if not result_file.exists():
        print(f"⚠️  {model_key}: 檔案不存在 ({result_file})")
        continue

    with result_file.open('r', encoding='utf-8') as f:
        results[model_key] = json.load(f)
    print(f"✅ {model_key}: {result_file.name}")

# The rest of the notebook needs at least one model's results.
if len(results) == 0:
    raise FileNotFoundError("未找到任何評估結果,請先執行 02-Evaluate.ipynb")

print("\n✅ 結果載入完成")
print(f"共載入 {len(results)} 個模型的結果")
print("=" * 60)

## 2. 整體統計分析

計算各模型的整體表現統計。

In [None]:
def calculate_overall_statistics(results: dict) -> pd.DataFrame:
    """
    Build a one-row-per-model table of headline evaluation statistics.

    Args:
        results: Mapping of model key to that model's evaluation result.
            Each result is read for 'model_name', 'overall_accuracy',
            'subject_results' (entries with 'total' and 'correct') and
            'category_accuracies'.

    Returns:
        DataFrame with one row per model. Accuracies are rendered as
        percentage strings; one extra column is added per category.
    """
    rows = []

    for result in results.values():
        subjects = result['subject_results']
        question_count = sum(entry['total'] for entry in subjects)
        correct_count = sum(entry['correct'] for entry in subjects)

        row = {
            '模型': result['model_name'],
            '整體準確率': f"{result['overall_accuracy']:.2%}",
            '正確題數': correct_count,
            '總題數': question_count,
            '錯誤題數': question_count - correct_count,
            '評估學科數': len(subjects),
        }
        # Category columns follow the fixed columns, in the order they appear
        # in the result's 'category_accuracies' mapping.
        row.update(
            (f'{category}準確率', f"{accuracy:.2%}")
            for category, accuracy in result['category_accuracies'].items()
        )
        rows.append(row)

    return pd.DataFrame(rows)


# Build the headline statistics table for every loaded model.
stats_df = calculate_overall_statistics(results)

# Show the table between separator rules.
print(
    "\n" + "=" * 60,
    "整體統計分析",
    "=" * 60,
    stats_df.to_string(index=False),
    "=" * 60,
    sep="\n",
)

# Persist the table; 'utf-8-sig' writes a BOM at the start of the file.
stats_csv_path = ANALYSIS_DIR / "overall_statistics.csv"
stats_df.to_csv(stats_csv_path, index=False, encoding='utf-8-sig')
print(f"\n✅ 統計結果已保存: {stats_csv_path}")

## 3. 學科難度分析

分析每個學科的難度 (基於所有模型的平均準確率)。

In [None]:
def analyze_subject_difficulty(results: dict) -> pd.DataFrame:
    """
    Rank subjects by difficulty, based on the accuracy of all models.

    Difficulty is defined as ``1 - mean accuracy`` across the models that
    report the subject. A difficulty above 0.6 is labelled '困難', above 0.4
    '中等', otherwise '簡單'.

    Args:
        results: Mapping of model key to evaluation result. Each result is
            read for 'subject_results', whose entries provide 'subject',
            'accuracy' and 'total'.

    Returns:
        DataFrame sorted from hardest to easiest with the columns '學科',
        '題數', '平均準確率', '準確率標準差', '難度分數' and '難度等級'. The
        numeric columns are rendered as strings. When no subject results are
        present, an empty DataFrame with those columns is returned.
    """
    columns = ['學科', '題數', '平均準確率', '準確率標準差', '難度分數', '難度等級']

    # Gather every model's accuracy per subject, in first-seen subject order.
    accuracies_by_subject = defaultdict(list)
    question_counts = {}
    for result in results.values():
        for subject_result in result['subject_results']:
            subject = subject_result['subject']
            accuracies_by_subject[subject].append(subject_result['accuracy'])
            # The count from the last model that reports the subject wins.
            question_counts[subject] = subject_result['total']

    rows = []
    for subject, accuracies in accuracies_by_subject.items():
        avg_accuracy = np.mean(accuracies)
        # np.std with its default ddof=0 (population standard deviation).
        std_accuracy = np.std(accuracies)

        # Higher score means harder subject.
        difficulty = 1 - avg_accuracy
        if difficulty > 0.6:
            level = '困難'
        elif difficulty > 0.4:
            level = '中等'
        else:
            level = '簡單'

        rows.append({
            '學科': subject,
            '題數': question_counts[subject],
            '平均準確率': avg_accuracy,
            '準確率標準差': std_accuracy,
            '難度分數': difficulty,
            '難度等級': level,
        })

    # Passing the column list keeps the schema even when there are no rows,
    # so callers never hit a KeyError on an empty evaluation.
    df = pd.DataFrame(rows, columns=columns)
    if df.empty:
        return df

    # Sort on the numeric score before it is turned into a string.
    df = df.sort_values('難度分數', ascending=False)

    df['平均準確率'] = df['平均準確率'].apply(lambda x: f"{x:.2%}")
    df['準確率標準差'] = df['準確率標準差'].apply(lambda x: f"{x:.2%}")
    df['難度分數'] = df['難度分數'].apply(lambda x: f"{x:.3f}")

    return df


# Rank the subjects from hardest to easiest across all loaded models.
difficulty_df = analyze_subject_difficulty(results)

# Show the ranking between separator rules.
print(
    "\n" + "=" * 80,
    "學科難度排名 (由難到易)",
    "=" * 80,
    difficulty_df.to_string(index=False),
    "=" * 80,
    sep="\n",
)

# Persist the ranking; 'utf-8-sig' writes a BOM at the start of the file.
difficulty_csv_path = ANALYSIS_DIR / "subject_difficulty.csv"
difficulty_df.to_csv(difficulty_csv_path, index=False, encoding='utf-8-sig')
print(f"\n✅ 難度分析已保存: {difficulty_csv_path}")

## 4. 錯誤分析

深入分析模型的錯誤模式。

In [None]:
def analyze_errors(results: dict) -> Dict[str, pd.DataFrame]:
    """
    Collect and summarise the wrongly answered questions of every model.

    For each model with at least one error, a summary is printed (total
    errors, the five most common "correct→predicted" pairs, errors per
    subject) and the error rows are written to
    ``ANALYSIS_DIR / "<model_key>_errors.csv"``.

    Args:
        results: Mapping of model key to evaluation result. Each result is
            read for 'model_name' and 'subject_results'; every subject entry
            provides 'subject' and 'details', and every detail provides
            'is_correct', 'question', 'correct_answer' and 'predicted_answer'.

    Returns:
        Mapping of model key to a dict with the keys 'errors' (DataFrame of
        error rows), 'error_type_counts' (Counter) and 'subject_errors'
        (Series of error counts per subject). Models without errors are
        omitted. NOTE(review): the declared return annotation says DataFrame
        values, but the values are the dicts described here.
    """
    analysis_by_model = {}
    rule = '=' * 60

    for model_key, result in results.items():
        model_name = result['model_name']

        # Gather one row per wrongly answered question.
        wrong_rows = []
        for subject_result in result['subject_results']:
            subject = subject_result['subject']

            for detail in subject_result['details']:
                if detail['is_correct']:
                    continue

                # Questions longer than 50 characters are cut and marked.
                if len(detail['question']) > 50:
                    question = detail['question'][:50] + '...'
                else:
                    question = detail['question']

                expected = detail['correct_answer']
                predicted = detail['predicted_answer']
                wrong_rows.append({
                    '學科': subject,
                    '問題': question,
                    '正確答案': expected,
                    '預測答案': predicted,
                    '錯誤類型': f"{expected}→{predicted}",
                })

        if not wrong_rows:
            print(f"\n✨ {model_name}: 沒有錯誤 (完美表現!)")
            continue

        error_df = pd.DataFrame(wrong_rows)
        error_total = len(wrong_rows)

        # Frequency of each "correct→predicted" confusion pair.
        error_type_counts = Counter(error_df['錯誤類型'])

        print(f"\n{rule}")
        print(f"模型: {model_name}")
        print(f"{rule}")
        print(f"\n總錯誤數: {error_total}")
        print(f"\n最常見錯誤類型 (Top 5):")
        for error_type, count in error_type_counts.most_common(5):
            print(f"  {error_type}: {count} 次 ({count/error_total:.1%})")

        # Errors per subject, most error-prone subject first.
        per_subject = error_df.groupby('學科').size()
        subject_errors = per_subject.sort_values(ascending=False)
        print(f"\n各學科錯誤數:")
        for subject, count in subject_errors.items():
            print(f"  {subject}: {count}")

        analysis_by_model[model_key] = {
            'errors': error_df,
            'error_type_counts': error_type_counts,
            'subject_errors': subject_errors,
        }

        # Persist the error rows for this model.
        error_file = ANALYSIS_DIR / f"{model_key}_errors.csv"
        error_df.to_csv(error_file, index=False, encoding='utf-8-sig')
        print(f"\n✅ 錯誤詳情已保存: {error_file}")

    return analysis_by_model


# Run the error analysis over all loaded models and keep the returned
# per-model results in error_analysis.
error_analysis = analyze_errors(results)

## 5. 信心度分析

分析模型對答案的信心度 (基於 logits 分佈)。

In [None]:
def analyze_confidence(results: dict) -> Dict[str, pd.DataFrame]:
    """
    Measure how confident each model was in its answers.

    For every answered question the option logits are turned into two
    indicators: the logit range (max minus min) and the largest softmax
    probability. A summary comparing correct and incorrect answers is printed
    per model and the per-question table is written to
    ``ANALYSIS_DIR / "<model_key>_confidence.csv"``.

    Args:
        results: Mapping of model key to evaluation result. Each result is
            read for 'model_name' and 'subject_results'; every subject entry
            provides 'subject' and 'details', and every detail provides
            'logits' (a mapping whose values are the logits), 'is_correct',
            'predicted_answer' and 'correct_answer'.

    Returns:
        Mapping of model key to the per-question confidence DataFrame.
    """
    tables_by_model = {}
    rule = '=' * 60

    for model_key, result in results.items():
        model_name = result['model_name']

        rows = []
        for subject_result in result['subject_results']:
            subject = subject_result['subject']

            for detail in subject_result['details']:
                scores = list(detail['logits'].values())
                top_score = max(scores)
                lowest_score = min(scores)

                # Softmax; subtracting the top logit first avoids overflow.
                shifted = np.exp(np.array(scores) - top_score)
                probabilities = shifted / shifted.sum()

                rows.append({
                    '學科': subject,
                    '是否正確': detail['is_correct'],
                    'Logit範圍': top_score - lowest_score,
                    '最大機率': probabilities.max(),
                    '預測答案': detail['predicted_answer'],
                    '正確答案': detail['correct_answer'],
                })

        conf_df = pd.DataFrame(rows)

        # Split the questions into correctly and incorrectly answered ones.
        answered_right = conf_df['是否正確']
        correct_conf = conf_df[answered_right]
        incorrect_conf = conf_df[~answered_right]
        has_correct = len(correct_conf) > 0
        has_incorrect = len(incorrect_conf) > 0

        print(f"\n{rule}")
        print(f"模型: {model_name} - 信心度分析")
        print(f"{rule}")

        if has_correct:
            print(f"\n正確答案的信心度:")
            print(f"  平均 Logit 範圍: {correct_conf['Logit範圍'].mean():.3f}")
            print(f"  平均最大機率:    {correct_conf['最大機率'].mean():.3f}")

        if has_incorrect:
            print(f"\n錯誤答案的信心度:")
            print(f"  平均 Logit 範圍: {incorrect_conf['Logit範圍'].mean():.3f}")
            print(f"  平均最大機率:    {incorrect_conf['最大機率'].mean():.3f}")

            # Wrong answers given with a top probability above 0.8.
            overconfident = incorrect_conf[incorrect_conf['最大機率'] > 0.8]
            print(f"\n過度自信的錯誤 (機率 > 0.8): {len(overconfident)} 題")
            if len(overconfident) > 0:
                print(f"  佔總錯誤的比例: {len(overconfident)/len(incorrect_conf):.1%}")

        if has_correct:
            # Right answers given with a top probability below 0.5.
            underconfident = correct_conf[correct_conf['最大機率'] < 0.5]
            print(f"\n保守的正確答案 (機率 < 0.5): {len(underconfident)} 題")
            if len(underconfident) > 0:
                print(f"  佔總正確的比例: {len(underconfident)/len(correct_conf):.1%}")

        tables_by_model[model_key] = conf_df

        # Persist the per-question table for this model.
        conf_file = ANALYSIS_DIR / f"{model_key}_confidence.csv"
        conf_df.to_csv(conf_file, index=False, encoding='utf-8-sig')
        print(f"\n✅ 信心度分析已保存: {conf_file}")

    return tables_by_model


# Run the confidence analysis over all loaded models and keep the returned
# per-model tables in confidence_analysis.
confidence_analysis = analyze_confidence(results)

## 6. 模型強弱項對比

識別每個模型的強項和弱項學科。

In [None]:
def compare_model_strengths(results: dict) -> pd.DataFrame:
    """
    Compare the per-subject accuracy of the evaluated models.

    Args:
        results: Mapping of model key to evaluation result. Each result is
            read for 'model_name' and 'subject_results', whose entries
            provide 'subject' and 'accuracy'.

    Returns:
        DataFrame indexed by subject with one column per model name. When
        exactly two models are present it also contains '差異' (first model
        minus second model) and '優勢模型' (the name of the better model,
        '相同' for a tie, or '無法比較' when only one of the two models has a
        result for the subject), and rows are sorted by '差異' in descending
        order with incomparable subjects last. Float columns are rendered as
        percentage strings.
    """
    # subject -> {model name -> accuracy}
    accuracy_by_subject = defaultdict(dict)
    for result in results.values():
        model_name = result['model_name']
        for subject_result in result['subject_results']:
            subject = subject_result['subject']
            accuracy_by_subject[subject][model_name] = subject_result['accuracy']

    # Transpose so that subjects become rows and models become columns.
    comparison_df = pd.DataFrame(accuracy_by_subject).T

    if len(comparison_df.columns) == 2:
        model1, model2 = comparison_df.columns

        def _better_model(gap):
            # A missing accuracy on either side makes the gap NaN; NaN fails
            # both comparisons below, so it must be handled first or it would
            # be reported as a tie.
            if pd.isna(gap):
                return '無法比較'
            if gap > 0:
                return model1
            if gap < 0:
                return model2
            return '相同'

        comparison_df['差異'] = comparison_df[model1] - comparison_df[model2]
        comparison_df['優勢模型'] = comparison_df['差異'].apply(_better_model)

        # Sort on the numeric gap before it is turned into a string.
        comparison_df = comparison_df.sort_values('差異', ascending=False)

    # Render every float column (accuracies and the gap) as a percentage.
    for col in comparison_df.columns:
        if col == '優勢模型':
            continue
        if comparison_df[col].dtype in [np.float64, np.float32]:
            comparison_df[col] = comparison_df[col].apply(lambda x: f"{x:.2%}")

    return comparison_df


if len(results) >= 2:
    # Per-subject accuracy table. With exactly two model columns it also
    # carries '差異' (first model minus second model) and '優勢模型', and is
    # sorted by that difference in descending order.
    comparison_df = compare_model_strengths(results)

    print("\n" + "=" * 80)
    print("模型強弱項對比 (按差異排序)")
    print("=" * 80)
    print(comparison_df.to_string())
    print("=" * 80)

    # Persist the comparison table (the subject index is kept in the CSV).
    comparison_df.to_csv(ANALYSIS_DIR / "model_comparison.csv", encoding='utf-8-sig')
    print(f"\n✅ 對比結果已保存: {ANALYSIS_DIR / 'model_comparison.csv'}")

    # Summarise strengths and weaknesses per model.
    print("\n" + "=" * 60)
    print("強弱項總結")
    print("=" * 60)

    model_columns = [
        col for col in comparison_df.columns if col not in ('差異', '優勢模型')
    ]

    if '優勢模型' not in comparison_df.columns:
        # The winner column only exists for a two-model comparison; without
        # it there is nothing to summarise.
        print("\n⚠️  強弱項總結僅適用於剛好 2 個模型的對比")
    else:
        for position, model_name in enumerate(model_columns):
            rivals = [other for other in model_columns if other != model_name]
            print(f"\n{model_name}:")

            # Subjects this model wins, and subjects the other model wins.
            # Ties and incomparable subjects belong to neither group.
            won = comparison_df[comparison_df['優勢模型'] == model_name]
            lost = comparison_df[comparison_df['優勢模型'].isin(rivals)]

            # Rows are ordered by (first - second) descending. For the first
            # model the largest leads come first and the largest deficits
            # last; for the second model it is the other way round. Reverse
            # where needed so both lists start with the largest gap.
            if position == 0:
                strengths = won.head(3)
                weaknesses = lost.iloc[::-1].head(3)
            else:
                strengths = won.iloc[::-1].head(3)
                weaknesses = lost.head(3)

            # Strengths: up to 3 subjects with the largest lead.
            if len(strengths) > 0:
                print(f"  強項:")
                for subject, accuracy in strengths[model_name].items():
                    print(f"    - {subject}: {accuracy}")

            # Weaknesses: up to 3 subjects with the largest deficit.
            if len(weaknesses) > 0:
                print(f"  弱項:")
                for subject, accuracy in weaknesses[model_name].items():
                    print(f"    - {subject}: {accuracy}")
else:
    print("\n⚠️  需要至少 2 個模型的結果才能進行對比分析")

## 7. 生成分析摘要

生成完整的分析摘要報告。

In [None]:
def generate_analysis_summary(results: dict, output_file: Path):
    """
    Write a Markdown report summarising the evaluation analysis.

    The report contains the overall performance of each model, the subject
    difficulty ranking, error counts per model and subject, and a short list
    of key findings.

    Args:
        results: Mapping of model key to evaluation result. Each result is
            read for 'model_name', 'overall_accuracy', 'category_accuracies'
            and 'subject_results' (entries with 'subject', 'total' and
            'details', whose items provide 'is_correct').
        output_file: Path of the Markdown file to write (overwritten).
    """
    # Computed once and before the file is opened, so a failure here does not
    # leave a truncated report behind.
    difficulty_df = analyze_subject_difficulty(results)
    try:
        difficulty_table = difficulty_df.to_markdown(index=False)
    except ImportError:
        # DataFrame.to_markdown relies on the optional 'tabulate' package;
        # fall back to a plain-text table inside a code fence.
        difficulty_table = "```\n" + difficulty_df.to_string(index=False) + "\n```"

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# C-Eval 評估分析報告\n\n")
        f.write(f"生成時間: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # 1. Overall performance per model.
        f.write("## 1. 整體表現\n\n")
        for result in results.values():
            f.write(f"### {result['model_name']}\n\n")
            f.write(f"- 整體準確率: {result['overall_accuracy']:.2%}\n")
            f.write(f"- 評估學科數: {len(result['subject_results'])}\n")
            f.write(f"\n分類準確率:\n")
            for category, accuracy in result['category_accuracies'].items():
                f.write(f"  - {category}: {accuracy:.2%}\n")
            f.write("\n")

        # 2. Subject difficulty ranking.
        f.write("## 2. 學科難度排名\n\n")
        f.write(difficulty_table)
        f.write("\n\n")

        # 3. Error counts per model and subject.
        f.write("## 3. 錯誤統計\n\n")
        for result in results.values():
            # (subject, wrong answers, total questions) for each subject.
            error_counts = [
                (
                    sr['subject'],
                    sum(1 for d in sr['details'] if not d['is_correct']),
                    sr['total'],
                )
                for sr in result['subject_results']
            ]
            total_errors = sum(wrong for _, wrong, _ in error_counts)

            f.write(f"### {result['model_name']}\n\n")
            f.write(f"- 總錯誤數: {total_errors}\n")

            f.write(f"\n各學科錯誤數:\n")
            for subject, wrong, total in error_counts:
                f.write(f"  - {subject}: {wrong}/{total}\n")
            f.write("\n")

        # 4. Key findings.
        f.write("## 4. 關鍵發現\n\n")

        # The ranking is sorted hardest-first; skip when there are no subjects.
        if len(difficulty_df) > 0:
            hardest_subject = difficulty_df.iloc[0]['學科']
            easiest_subject = difficulty_df.iloc[-1]['學科']
            f.write(f"- 最困難學科: {hardest_subject}\n")
            f.write(f"- 最簡單學科: {easiest_subject}\n")

        # Best model and accuracy spread, only meaningful with 2+ models.
        if len(results) >= 2:
            accuracies = [r['overall_accuracy'] for r in results.values()]
            # max() keeps the first model on ties.
            best = max(results.values(), key=lambda r: r['overall_accuracy'])
            f.write(f"- 最佳模型: {best['model_name']} ({best['overall_accuracy']:.2%})\n")
            f.write(f"- 準確率差距: {(max(accuracies) - min(accuracies)):.2%}\n")

        f.write("\n---\n\n")
        f.write("詳細圖表請參閱 04-Visualize_and_Report.ipynb\n")

    print(f"\n✅ 分析摘要已生成: {output_file}")


# Write the Markdown summary report into the analysis directory.
summary_file = ANALYSIS_DIR / "analysis_summary.md"
generate_analysis_summary(results, summary_file)

print("\n" + "=" * 60, "分析完成!", "=" * 60, sep="\n")
print(f"\n所有分析結果已保存至: {ANALYSIS_DIR}")

# List everything that now exists in the analysis directory, sorted by path.
print("\n生成的檔案:")
for generated_path in sorted(ANALYSIS_DIR.glob("*")):
    print(f"  - {generated_path.name}")

## 📝 總結

在本 notebook 中,我們完成了:

1. ✅ 載入評估結果並計算整體統計
2. ✅ 學科難度分析 (識別最難和最簡單的學科)
3. ✅ 錯誤分析 (找出常見錯誤模式)
4. ✅ 信心度分析 (檢查模型的過度自信和保守傾向)
5. ✅ 模型強弱項對比 (識別各模型的優劣勢)
6. ✅ 生成分析摘要報告

### 下一步

前往 **04-Visualize_and_Report.ipynb** 生成視覺化圖表和完整評估報告。

---

**關鍵發現**:
- 學科難度在此定義為 1 − 平均準確率,因此直接反映各模型在該學科的整體表現
- 中文優化模型 (如 Qwen) 通常在中文任務上表現更好
- 錯誤模式可以幫助識別模型的系統性弱點
- 信心度分析可以揭示模型的校準品質
