# Lab 4.1 - OpenCompass 評估實戰
## Notebook 04: 視覺化與報告生成

**學習目標**:
1. 繪製雷達圖 (多維度能力分佈)
2. 繪製熱力圖 (學科表現矩陣)
3. 繪製柱狀圖 (模型對比)
4. 生成完整的評估報告 (Markdown/HTML)
5. 匯出圖表為 PNG 檔案

**預計時間**: 30-45 分鐘

---

## 1. 環境設定與數據載入

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List
import warnings
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Font configuration for chart text.
# NOTE(review): 'DejaVu Sans' is listed first; confirm that the installed
# matplotlib falls back to the later CJK-capable entries for Chinese glyphs.
plt.rcParams.update({
    'font.sans-serif': ['DejaVu Sans', 'Arial Unicode MS', 'SimHei', 'Microsoft YaHei'],
    'axes.unicode_minus': False,  # draw the minus sign with a plain hyphen glyph
})

# Global seaborn look and feel
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.2)

# Working directories (the charts directory is created on demand)
RESULTS_DIR = Path("./results")
ANALYSIS_DIR = Path("./analysis")
CHARTS_DIR = Path("./charts")
CHARTS_DIR.mkdir(exist_ok=True)

print("📊 視覺化環境準備中...")
print("=" * 60)

# Load the per-model evaluation results that exist on disk
results = {}
result_files = {
    'llama-2-7b': RESULTS_DIR / "llama2_7b_results.json",
    'qwen-7b': RESULTS_DIR / "qwen_7b_results.json",
}

for model_key, result_file in result_files.items():
    if not result_file.exists():
        print(f"⚠️  未找到: {result_file.name}")
        continue
    with open(result_file, 'r', encoding='utf-8') as f:
        results[model_key] = json.load(f)
    print(f"✅ 載入: {result_file.name}")

# Nothing to visualise without at least one result file
if len(results) == 0:
    raise FileNotFoundError("未找到評估結果,請先執行 02-Evaluate.ipynb")

print(f"\n✅ 共載入 {len(results)} 個模型的結果")
print("=" * 60)

## 2. 整體表現對比 - 柱狀圖

使用柱狀圖對比模型的整體表現和分類表現。

In [None]:
def plot_overall_comparison(results: dict, save_path: Path = None):
    """
    Draw side-by-side bar charts of overall and per-category accuracy.

    The left subplot has one bar per model (overall accuracy). The right
    subplot has one group per category with one bar per model. Any number of
    models is supported: colours cycle through a fixed palette and the bar
    width shrinks so that neighbouring groups do not overlap.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name', 'overall_accuracy' and 'category_accuracies';
            accuracies are multiplied by 100 and shown as percentages.
        save_path: If given, the figure is also saved to this path at 300 DPI.
    """
    model_results = list(results.values())
    model_names = [r['model_name'] for r in model_results]
    overall_accs = [r['overall_accuracy'] * 100 for r in model_results]
    n_models = len(model_results)

    # The category list (and its order) is taken from the first model
    categories = list(model_results[0]['category_accuracies'].keys())

    # One list of percentages per model; NaN marks a category that a model
    # did not report, so the bar is simply left out instead of raising
    values_per_model = [
        [r['category_accuracies'].get(cat, np.nan) * 100 for cat in categories]
        for r in model_results
    ]

    # Cycle the palette so every model gets a colour, however many there are
    palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
    colors = [palette[i % len(palette)] for i in range(n_models)]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Subplot 1: overall accuracy
    ax1 = axes[0]
    bars1 = ax1.bar(model_names, overall_accs, color=colors, alpha=0.8, edgecolor='black')
    ax1.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax1.set_title('Overall Accuracy Comparison', fontsize=14, fontweight='bold', pad=20)
    ax1.set_ylim(0, 100)
    ax1.grid(axis='y', alpha=0.3)

    # Value labels just above each bar
    for bar, acc in zip(bars1, overall_accs):
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{acc:.1f}%', ha='center', va='bottom', fontsize=11, fontweight='bold')

    # Subplot 2: per-category accuracy
    ax2 = axes[1]
    x = np.arange(len(categories))
    # 0.35 for one or two models (the original look); narrower beyond that
    # so that a whole group always fits within 0.8 of a category slot
    width = min(0.35, 0.8 / max(n_models, 1))

    for i, (model_name, values, color) in enumerate(zip(model_names, values_per_model, colors)):
        bars = ax2.bar(x + i*width, values, width, label=model_name, color=color, alpha=0.8, edgecolor='black')

        # Value labels (skipped for categories this model did not report)
        for bar, val in zip(bars, values):
            if np.isnan(val):
                continue
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{val:.1f}%', ha='center', va='bottom', fontsize=9)

    ax2.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax2.set_title('Category-wise Accuracy Comparison', fontsize=14, fontweight='bold', pad=20)
    # Centre each tick under its group of n_models bars
    ax2.set_xticks(x + width * (n_models - 1) / 2)
    ax2.set_xticklabels(categories, rotation=15, ha='right')
    ax2.set_ylim(0, 100)
    ax2.legend(loc='upper right', framealpha=0.9)
    ax2.grid(axis='y', alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ 圖表已儲存: {save_path}")

    plt.show()


# Draw the overall comparison chart and save it into CHARTS_DIR
plot_overall_comparison(results, CHARTS_DIR / "01_overall_comparison.png")

## 3. 學科表現熱力圖

使用熱力圖展示所有學科的表現。

In [None]:
def plot_subject_heatmap(results: dict, save_path: Path = None):
    """
    Draw a subject x model heatmap of accuracy percentages.

    Rows follow the subject order of the first model in ``results``. A subject
    that another model did not report becomes NaN, which keeps every row the
    same length as the list of models.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name' and 'subject_results' (a list of dicts with
            'subject' and 'accuracy').
        save_path: If given, the figure is also saved to this path at 300 DPI.
    """
    model_results = list(results.values())
    model_names = [r['model_name'] for r in model_results]

    # Subject list (and row order) comes from the first model
    all_subjects = [sr['subject'] for sr in model_results[0]['subject_results']]

    # Per-model subject -> accuracy lookup; setdefault keeps the first
    # occurrence if a subject appears more than once
    accuracy_lookups = []
    for result in model_results:
        by_subject = {}
        for sr in result['subject_results']:
            by_subject.setdefault(sr['subject'], sr['accuracy'])
        accuracy_lookups.append(by_subject)

    heatmap_data = []
    subject_names = []
    for subject in all_subjects:
        heatmap_data.append([
            lookup[subject] * 100 if subject in lookup else np.nan
            for lookup in accuracy_lookups
        ])
        # Shorten long subject names so the y-axis labels stay readable
        display_name = subject.replace('_', ' ').title()
        if len(display_name) > 30:
            display_name = display_name[:27] + '...'
        subject_names.append(display_name)

    df = pd.DataFrame(heatmap_data, columns=model_names, index=subject_names)

    # Keep the original 10x8 canvas for short lists; grow taller for long ones
    fig_height = max(8, 0.4 * len(subject_names))
    fig, ax = plt.subplots(figsize=(10, fig_height))

    sns.heatmap(
        df,
        annot=True,
        fmt='.1f',
        cmap='RdYlGn',
        center=50,
        vmin=0,
        vmax=100,
        cbar_kws={'label': 'Accuracy (%)'},
        linewidths=0.5,
        linecolor='gray',
        ax=ax
    )

    ax.set_title('Subject Performance Heatmap', fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Model', fontsize=12, fontweight='bold')
    ax.set_ylabel('Subject', fontsize=12, fontweight='bold')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ 圖表已儲存: {save_path}")

    plt.show()


# Draw the subject heatmap and save it into CHARTS_DIR
plot_subject_heatmap(results, CHARTS_DIR / "02_subject_heatmap.png")

## 4. 能力雷達圖

使用雷達圖展示模型在不同學科類別的多維度能力。

In [None]:
def plot_radar_chart(results: dict, save_path: Path = None):
    """
    Draw a radar (polar) chart of per-category accuracy for every model.

    Each category of the first model becomes one spoke. Every model is drawn
    as a closed polygon with its own colour and marker, both cycling through
    a fixed set.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name' and 'category_accuracies'; accuracies are
            multiplied by 100 and shown as percentages.
        save_path: If given, the figure is also saved to this path at 300 DPI.
    """
    model_results = list(results.values())

    # Spokes follow the category order of the first model
    categories = list(model_results[0]['category_accuracies'].keys())
    num_vars = len(categories)

    # Evenly spaced spoke angles; repeat the first one to close the polygon
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

    colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
    markers = ['o', 's', '^', 'D']

    for i, result in enumerate(model_results):
        model_name = result['model_name']
        values = [result['category_accuracies'][cat] * 100 for cat in categories]
        values += values[:1]  # close the polygon

        color = colors[i % len(colors)]
        # Only the line style goes in the format string; the marker is given
        # once via the keyword, so matplotlib does not warn about a marker
        # defined twice
        ax.plot(angles, values, '-', linewidth=2, label=model_name,
                color=color, marker=markers[i % len(markers)], markersize=8)
        ax.fill(angles, values, alpha=0.15, color=color)

    # One tick per category (the closing duplicate angle is dropped)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=12)

    # Radial axis in percent
    ax.set_ylim(0, 100)
    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(['20%', '40%', '60%', '80%', '100%'], fontsize=10)
    ax.set_rlabel_position(0)

    ax.grid(True, linestyle='--', alpha=0.5)

    ax.set_title('Multi-Dimensional Capability Radar Chart',
                 fontsize=16, fontweight='bold', pad=30)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=11, framealpha=0.9)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ 圖表已儲存: {save_path}")

    plt.show()


# Draw the radar chart and save it into CHARTS_DIR
plot_radar_chart(results, CHARTS_DIR / "03_radar_chart.png")

## 5. 學科表現詳細對比

繪製每個學科的詳細對比柱狀圖。

In [None]:
def plot_detailed_subject_comparison(results: dict, save_path: Path = None):
    """
    Draw a grouped bar chart comparing every model on every subject.

    Subjects (and their order) come from the first model in ``results``. A
    subject that another model did not report is stored as NaN, so each
    model's bar heights stay aligned with the x positions. Any number of
    models is supported: colours cycle through a fixed palette and the bar
    width shrinks so groups do not overlap.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name' and 'subject_results' (a list of dicts with
            'subject' and 'accuracy').
        save_path: If given, the figure is also saved to this path at 300 DPI.
    """
    model_results = list(results.values())
    model_names = [r['model_name'] for r in model_results]
    n_models = len(model_results)
    all_subjects = [sr['subject'] for sr in model_results[0]['subject_results']]

    # Per-model accuracy percentages, held by position rather than by display
    # name so two models sharing a name cannot overwrite each other
    series = []
    for result in model_results:
        by_subject = {}
        for sr in result['subject_results']:
            by_subject.setdefault(sr['subject'], sr['accuracy'])  # first occurrence wins
        series.append([
            by_subject[subject] * 100 if subject in by_subject else np.nan
            for subject in all_subjects
        ])

    # Shortened names for the x-axis tick labels
    subject_display_names = []
    for subject in all_subjects:
        display_name = subject.replace('_', ' ').title()
        if len(display_name) > 25:
            display_name = display_name[:22] + '...'
        subject_display_names.append(display_name)

    fig, ax = plt.subplots(figsize=(14, 8))

    x = np.arange(len(all_subjects))
    # 0.35 for one or two models (the original look); narrower beyond that
    width = min(0.35, 0.8 / max(n_models, 1))
    palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

    for i, (model_name, values) in enumerate(zip(model_names, series)):
        # Centre the group of n_models bars on each tick
        offset = width * (i - n_models/2 + 0.5)
        bars = ax.bar(x + offset, values, width, label=model_name,
                     color=palette[i % len(palette)], alpha=0.8, edgecolor='black')

        # Labels are drawn inside the bar, so only bars above 30% get one
        # (NaN compares False and is skipped as well)
        for bar, val in zip(bars, values):
            if val > 30:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height - 3,
                       f'{val:.0f}', ha='center', va='top', fontsize=8, color='white', fontweight='bold')

    ax.set_xlabel('Subject', fontsize=12, fontweight='bold')
    ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax.set_title('Detailed Subject-wise Performance Comparison',
                 fontsize=14, fontweight='bold', pad=20)
    ax.set_xticks(x)
    ax.set_xticklabels(subject_display_names, rotation=45, ha='right', fontsize=9)
    ax.set_ylim(0, 100)
    ax.legend(loc='upper right', framealpha=0.9)
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ 圖表已儲存: {save_path}")

    plt.show()


# Draw the detailed per-subject comparison chart and save it into CHARTS_DIR
plot_detailed_subject_comparison(results, CHARTS_DIR / "04_detailed_comparison.png")

## 6. 錯誤率分析圖

視覺化錯誤率分佈。

In [None]:
def plot_error_analysis(results: dict, save_path: Path = None):
    """
    Draw the error-rate distribution and the total error count per model.

    The left subplot is a box plot of per-subject error rates
    ((1 - accuracy) * 100), limited to the subjects listed by the first model.
    The right subplot is a bar chart of the total number of wrong answers,
    labelled with the count and the overall error percentage.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name' and 'subject_results' (a list of dicts with
            'subject', 'accuracy', 'total' and 'correct').
        save_path: If given, the figure is also saved to this path at 300 DPI.
    """
    model_results = list(results.values())
    model_names = [r['model_name'] for r in model_results]
    all_subjects = [sr['subject'] for sr in model_results[0]['subject_results']]

    # Per-model error rates, held by position rather than by display name so
    # two models sharing a name cannot overwrite each other. Subjects a model
    # did not report are skipped.
    error_series = []
    for result in model_results:
        by_subject = {}
        for sr in result['subject_results']:
            by_subject.setdefault(sr['subject'], sr['accuracy'])  # first occurrence wins
        error_series.append([
            (1 - by_subject[subject]) * 100
            for subject in all_subjects
            if subject in by_subject
        ])

    # Cycle the palette so every model gets a colour
    palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
    colors = [palette[i % len(palette)] for i in range(len(model_results))]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Subplot 1: error-rate distribution (box plot)
    ax1 = axes[0]
    positions = np.arange(len(model_names)) + 1
    bp = ax1.boxplot(
        error_series,
        positions=positions,
        patch_artist=True,
        notch=True,
        showmeans=True
    )
    # Tick labels are set on the axes instead of via boxplot(labels=...),
    # which newer matplotlib releases deprecate in favour of tick_labels
    ax1.set_xticks(positions)
    ax1.set_xticklabels(model_names)

    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)

    ax1.set_ylabel('Error Rate (%)', fontsize=12, fontweight='bold')
    ax1.set_title('Error Rate Distribution', fontsize=14, fontweight='bold', pad=20)
    ax1.grid(axis='y', alpha=0.3)

    # Subplot 2: total error count
    ax2 = axes[1]
    total_errors = []
    total_questions = []

    for result in model_results:
        errors = sum(sr['total'] - sr['correct'] for sr in result['subject_results'])
        total = sum(sr['total'] for sr in result['subject_results'])
        total_errors.append(errors)
        total_questions.append(total)

    bars = ax2.bar(model_names, total_errors, color=colors, alpha=0.8, edgecolor='black')
    ax2.set_ylabel('Number of Errors', fontsize=12, fontweight='bold')
    ax2.set_title('Total Error Count', fontsize=14, fontweight='bold', pad=20)
    ax2.grid(axis='y', alpha=0.3)

    # Value labels: error count plus overall error percentage
    for bar, errors, total in zip(bars, total_errors, total_questions):
        height = bar.get_height()
        # A model with no questions would otherwise divide by zero
        error_pct = errors / total * 100 if total else 0.0
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                f'{errors}\n({error_pct:.1f}%)',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ 圖表已儲存: {save_path}")

    plt.show()


# Draw the error analysis chart and save it into CHARTS_DIR
plot_error_analysis(results, CHARTS_DIR / "05_error_analysis.png")

## 7. 生成完整評估報告

自動生成包含所有圖表和分析的 Markdown 報告 (HTML 版本將在下一節由此 Markdown 檔轉換而成)。

In [None]:
def generate_evaluation_report(results: dict, charts_dir: Path, output_file: Path):
    """
    Write a Markdown evaluation report that embeds the saved charts.

    The whole report is assembled in memory and written in one go, so a
    failure while reading ``results`` does not leave a half-written file.
    Image links are built from ``charts_dir`` relative to the report's own
    directory. The pairwise findings in section 6 compare the first two
    models in ``results``.

    Args:
        results: Mapping of model key -> result dict. Each result dict is read
            for 'model_name', 'overall_accuracy', 'category_accuracies' and
            'subject_results' (dicts with 'subject', 'accuracy', 'total',
            'correct').
        charts_dir: Directory containing the chart PNG files.
        output_file: Path of the Markdown file to write.

    Raises:
        IndexError: If ``results`` is empty.
    """
    import os  # local import: os is not among the notebook's imports

    model_results = list(results.values())
    model_names = [r['model_name'] for r in model_results]
    overall_accs = [r['overall_accuracy'] for r in model_results]
    first_subject_results = model_results[0]['subject_results']
    report_categories = ['STEM', 'Humanities', 'Social Science']

    def chart_link(file_name: str) -> str:
        """Return the Markdown link target for a chart, relative to the report."""
        target = Path(charts_dir) / file_name
        try:
            rel = os.path.relpath(target, start=Path(output_file).parent)
        except ValueError:
            # No relative path exists (e.g. different drives on Windows)
            return target.absolute().as_posix()
        rel = rel.replace(os.sep, '/')
        return rel if rel.startswith('.') else f"./{rel}"

    parts = []
    emit = parts.append

    # Report header
    emit("# C-Eval 模型評估完整報告\n\n")
    emit(f"**生成時間**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    emit("---\n\n")

    # 1. Executive summary
    emit("## 1. 執行摘要\n\n")

    if len(results) >= 2:
        best_idx = int(np.argmax(overall_accs))
        emit(f"- **最佳模型**: {model_names[best_idx]} ({overall_accs[best_idx]:.2%})\n")
        emit(f"- **準確率差距**: {(max(overall_accs) - min(overall_accs)):.2%}\n")

    # Subject and question counts are taken from the first model
    emit(f"- **評估學科數**: {len(first_subject_results)}\n")
    emit(f"- **總評估題數**: {sum(sr['total'] for sr in first_subject_results)}\n\n")

    # 2. Overall performance
    emit("## 2. 整體表現\n\n")
    emit("### 2.1 準確率對比\n\n")
    emit(f"![整體對比]({chart_link('01_overall_comparison.png')})\n\n")

    emit("### 2.2 詳細統計\n\n")
    emit("| 模型 | 整體準確率 | STEM | Humanities | Social Science |\n")
    emit("|------|-----------|------|------------|----------------|\n")

    for result in model_results:
        cats = result['category_accuracies']
        emit(f"| {result['model_name']} | {result['overall_accuracy']:.2%} | ")
        emit(f"{cats.get('STEM', 0):.2%} | ")
        emit(f"{cats.get('Humanities', 0):.2%} | ")
        emit(f"{cats.get('Social Science', 0):.2%} |\n")

    emit("\n")

    # 3. Subject performance
    emit("## 3. 學科表現分析\n\n")
    emit("### 3.1 學科表現熱力圖\n\n")
    emit(f"![學科熱力圖]({chart_link('02_subject_heatmap.png')})\n\n")

    emit("### 3.2 詳細學科對比\n\n")
    emit(f"![詳細對比]({chart_link('04_detailed_comparison.png')})\n\n")

    # 4. Radar chart
    emit("## 4. 多維度能力分析\n\n")
    emit(f"![能力雷達圖]({chart_link('03_radar_chart.png')})\n\n")

    # 5. Error analysis
    emit("## 5. 錯誤分析\n\n")
    emit(f"![錯誤分析]({chart_link('05_error_analysis.png')})\n\n")

    emit("### 5.1 錯誤統計\n\n")
    emit("| 模型 | 總錯誤數 | 錯誤率 | 最易錯學科 |\n")
    emit("|------|----------|--------|-----------|\n")

    for result in model_results:
        subject_results = result['subject_results']
        total_errors = sum(sr['total'] - sr['correct'] for sr in subject_results)
        total = sum(sr['total'] for sr in subject_results)
        # A model with no questions would otherwise divide by zero
        error_rate = total_errors / total if total else 0.0

        emit(f"| {result['model_name']} | {total_errors} | {error_rate:.2%} | ")
        if subject_results:
            # Lowest accuracy == highest error rate
            worst_subject = min(subject_results, key=lambda sr: sr['accuracy'])
            emit(f"{worst_subject['subject']} ({worst_subject['accuracy']:.2%}) |\n")
        else:
            emit("- |\n")

    emit("\n")

    # 6. Key findings
    emit("## 6. 關鍵發現與建議\n\n")
    emit("### 6.1 主要發現\n\n")

    if len(results) >= 2:
        # Pairwise findings use the first two models; slicing (rather than
        # unpacking the whole list) keeps this working for three or more
        model1, model2 = model_results[:2]

        # First occurrence of each subject in the second model
        second_by_subject = {}
        for sr in model2['subject_results']:
            second_by_subject.setdefault(sr['subject'], sr)

        # Subject with the largest accuracy gap between the two models
        max_diff = 0
        max_diff_subject = ""
        for sr1 in model1['subject_results']:
            sr2 = second_by_subject.get(sr1['subject'])
            if sr2 is None:
                continue
            diff = abs(sr1['accuracy'] - sr2['accuracy'])
            if diff > max_diff:
                max_diff = diff
                max_diff_subject = sr1['subject']

        emit(f"1. **整體表現**: {model_names[best_idx]} 以 {overall_accs[best_idx]:.2%} 的準確率領先\n")
        emit(f"2. **最大差異學科**: {max_diff_subject} (差距 {max_diff:.2%})\n")

        # Per-category winner; numbering continues from 3 so every item
        # gets its own list number
        for number, category in enumerate(report_categories, start=3):
            cat1 = model1['category_accuracies'].get(category, 0)
            cat2 = model2['category_accuracies'].get(category, 0)
            better_model = model1['model_name'] if cat1 > cat2 else model2['model_name']
            emit(f"{number}. **{category}**: {better_model} 表現較佳 ({max(cat1, cat2):.2%})\n")

        if len(model_results) > 2:
            emit(f"\n> 註: 學科與類別對比僅涵蓋前兩個模型 "
                 f"({model1['model_name']} 與 {model2['model_name']})\n")

    emit("\n### 6.2 改進建議\n\n")
    emit("1. **數據增強**: 針對錯誤率高的學科增加訓練數據\n")
    emit("2. **模型微調**: 考慮針對弱項學科進行專門微調\n")
    emit("3. **集成方法**: 可以考慮結合兩個模型的優勢\n")
    emit("4. **提示工程**: 優化提示詞以提高特定學科的表現\n\n")

    # 7. Appendix
    emit("## 7. 附錄\n\n")
    emit("### 7.1 詳細數據\n\n")
    emit("完整的評估數據和分析結果可在以下目錄找到:\n\n")
    emit("- 評估結果: `./results/`\n")
    emit("- 分析數據: `./analysis/`\n")
    emit("- 圖表檔案: `./charts/`\n\n")

    emit("---\n\n")
    emit("*本報告由 Lab 4.1 - OpenCompass 評估實戰自動生成*\n")

    # Single write once the whole report has been assembled
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\n✅ 評估報告已生成: {output_file}")


# Generate the Markdown report next to the notebook
report_file = Path("./EVALUATION_REPORT.md")
generate_evaluation_report(results, CHARTS_DIR, report_file)

# Completion banner
print("\n" + "=" * 60)
print("報告生成完成!")
print("=" * 60)

## 8. 生成 HTML 報告 (可選)

將 Markdown 報告轉換為 HTML 格式以便瀏覽器查看。

In [None]:
def markdown_to_html(md_file: Path, html_file: Path):
    """
    Convert a Markdown file into a standalone, styled HTML page.

    The conversion relies on the optional ``markdown`` package. If it is not
    installed, a hint is printed and nothing is written. Only the import is
    guarded, so errors raised while reading, converting or writing are
    reported as themselves rather than as a missing package.

    Args:
        md_file: Path of the Markdown file to read (UTF-8).
        html_file: Path of the HTML file to write (UTF-8).
    """
    try:
        import markdown
    except ImportError:
        print("⚠️  markdown 套件未安裝,跳過 HTML 生成")
        print("   安裝方式: pip install markdown")
        return

    # Read the Markdown source
    with open(md_file, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Convert to an HTML fragment
    html_content = markdown.markdown(
        md_content,
        extensions=['tables', 'fenced_code', 'codehilite']
    )

    # Wrap the fragment in a full page with inline CSS
    # (braces are doubled because this is an f-string)
    full_html = f"""<!DOCTYPE html>
<html lang="zh-TW">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>C-Eval 評估報告</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        h1, h2, h3 {{
            color: #333;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            background-color: white;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 12px;
            text-align: left;
        }}
        th {{
            background-color: #3498db;
            color: white;
        }}
        tr:nth-child(even) {{
            background-color: #f9f9f9;
        }}
        img {{
            max-width: 100%;
            height: auto;
            display: block;
            margin: 20px auto;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            border-radius: 8px;
        }}
        code {{
            background-color: #f4f4f4;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
        }}
        hr {{
            border: none;
            border-top: 2px solid #3498db;
            margin: 40px 0;
        }}
    </style>
</head>
<body>
    {html_content}
</body>
</html>
"""

    # Write the finished page
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(full_html)

    print(f"✅ HTML 報告已生成: {html_file}")


# Convert the Markdown report written above into an HTML page
html_file = Path("./EVALUATION_REPORT.html")
markdown_to_html(report_file, html_file)

## 9. 檔案總結

列出所有生成的檔案。

In [None]:
# Print a summary of every artefact this notebook produced
print("\n" + "=" * 60)
print("生成檔案總結")
print("=" * 60)

# Chart files: every PNG in CHARTS_DIR, sorted, with its size
print("\n📊 圖表檔案:")
chart_files = sorted(CHARTS_DIR.glob("*.png"))
for i, chart_file in enumerate(chart_files, 1):
    size = chart_file.stat().st_size / 1024  # bytes -> KB
    print(f"  {i}. {chart_file.name} ({size:.1f} KB)")

# Report files: listed only if they exist (the HTML report is skipped
# when the markdown package is unavailable)
print("\n📄 報告檔案:")
report_files = [
    Path("./EVALUATION_REPORT.md"),
    Path("./EVALUATION_REPORT.html")
]
for report in report_files:
    if report.exists():
        size = report.stat().st_size / 1024  # bytes -> KB
        print(f"  - {report.name} ({size:.1f} KB)")

# Locations of the other lab directories
print("\n📁 其他資料:")
print(f"  - 評估結果: {RESULTS_DIR}")
print(f"  - 分析數據: {ANALYSIS_DIR}")

print("\n" + "=" * 60)
print("✅ 所有視覺化和報告生成完成!")
print("=" * 60)

# Closing usage hints
print("\n💡 提示:")
print("  - 使用瀏覽器開啟 EVALUATION_REPORT.html 查看完整報告")
print("  - 所有圖表已儲存為高解析度 PNG (300 DPI)")
print("  - 可直接使用這些圖表於論文或簡報中")

## 📝 總結

在本 notebook 中,我們完成了:

1. ✅ 整體表現對比柱狀圖
2. ✅ 學科表現熱力圖
3. ✅ 多維度能力雷達圖
4. ✅ 學科詳細對比圖
5. ✅ 錯誤率分析圖
6. ✅ 完整評估報告 (Markdown/HTML)
7. ✅ 所有圖表匯出為 PNG 檔案

### Lab 4.1 完整流程回顧

1. **01-Setup.ipynb**: 環境配置與數據準備
2. **02-Evaluate.ipynb**: 執行模型評估
3. **03-Analyze.ipynb**: 深入結果分析
4. **04-Visualize_and_Report.ipynb**: 視覺化與報告生成 ✅

---

**下一步建議**:
- 嘗試評估更多模型 (Mistral, Gemma 等)
- 使用完整的 C-Eval 數據集 (52 個學科)
- 探索其他評估基準 (MMLU, AGIEval 等)
- 實驗不同的提示工程策略
- 分析模型在不同難度題目上的表現
