# 模型评估
本notebook用于评估以下三种已训练模型的性能（Top-1/Top-5准确率、模型大小和平均推理时间）：
1. 原始Keras模型
2. TFLite模型
3. 量化后的TFLite模型

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import time
import json
from pathlib import Path

## 1. 数据准备

# kagglehub is used below but is not among the notebook's imports; import it
# here so the download call does not fail with NameError.
import kagglehub

# Name of the Kaggle dataset to download
dataset_name = "huizecai/mushroom"

# Download the dataset through KaggleHub and keep the returned location
path = kagglehub.dataset_download(dataset_name)

# Show where the dataset files were saved
print("Path to dataset files:", path)

# Locations of the image directory and the label file inside the download
dataset_path = path + '/archive/data'
label_path = path + '/archive/label.txt'

## 2. 评估函数定义

def _saved_model_size_mb(model_path):
    """Return the on-disk size of ``model_path`` in MiB.

    A directory (for example a TensorFlow SavedModel) is measured as the sum
    of all regular files beneath it, because ``stat()`` on a directory only
    reports the size of the directory entry itself, not of its contents.
    """
    location = Path(model_path)
    if location.is_dir():
        size_bytes = sum(
            entry.stat().st_size
            for entry in location.rglob('*')
            if entry.is_file()
        )
    else:
        size_bytes = location.stat().st_size
    return size_bytes / (1024 * 1024)


def evaluate_model(model_path, test_ds):
    """Evaluate a saved Keras model on a test dataset.

    Args:
        model_path: Path of the saved model; either a single file or a
            model directory.
        test_ds: Iterable yielding ``(images, labels)`` batches. The labels
            are passed to ``TopKCategoricalAccuracy``, so they are assumed
            to be one-hot encoded -- confirm against the dataset pipeline.

    Returns:
        dict: ``top1_accuracy``, ``top5_accuracy``, ``model_size_mb`` and
        ``avg_inference_time_ms``. The inference time is averaged per
        *batch* (the TFLite evaluators in this notebook average per
        sample) and is ``0.0`` when ``test_ds`` yields no batches.
    """
    # Load the model
    model = keras.models.load_model(model_path)

    # Model size in MiB (handles both file and directory formats)
    model_size = _saved_model_size_mb(model_path)

    # Accuracy metrics accumulated over the whole dataset
    top1_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=1, name='top1_accuracy')
    top5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top5_accuracy')

    total_inference_time = 0.0
    num_batches = 0

    for images, labels in test_ds:
        # Time only the forward pass; perf_counter is monotonic and has
        # higher resolution than the wall clock.
        start_time = time.perf_counter()
        predictions = model(images, training=False)
        end_time = time.perf_counter()

        # Update the metrics
        top1_accuracy.update_state(labels, predictions)
        top5_accuracy.update_state(labels, predictions)

        total_inference_time += end_time - start_time
        num_batches += 1

    # Average forward-pass time per batch in milliseconds; an empty dataset
    # reports 0.0 instead of raising ZeroDivisionError.
    if num_batches:
        avg_inference_time = (total_inference_time / num_batches) * 1000
    else:
        avg_inference_time = 0.0

    # Collect all metrics
    metrics = {
        'top1_accuracy': float(top1_accuracy.result()),
        'top5_accuracy': float(top5_accuracy.result()),
        'model_size_mb': model_size,
        'avg_inference_time_ms': avg_inference_time
    }

    return metrics

def evaluate_tflite_model(tflite_path, test_ds):
    """Evaluate a float TFLite model on a test dataset, one sample at a time.

    Args:
        tflite_path: Path of the ``.tflite`` file.
        test_ds: Iterable yielding ``(images, labels)`` batches. Each image
            is cast to float32 and fed with a leading batch axis of 1. The
            labels are passed to ``TopKCategoricalAccuracy``, so they are
            assumed to be one-hot encoded -- confirm against the dataset
            pipeline.

    Returns:
        dict: ``top1_accuracy``, ``top5_accuracy``, ``model_size_mb`` and
        ``avg_inference_time_ms``. The inference time is averaged per
        sample and is ``0.0`` when ``test_ds`` yields no samples.
    """
    # Load the TFLite model
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()

    # Tensor indices do not change between invocations; look them up once.
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    # Model file size in MiB
    model_size = Path(tflite_path).stat().st_size / (1024 * 1024)

    # Accuracy metrics accumulated over the whole dataset
    top1_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=1, name='top1_accuracy')
    top5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top5_accuracy')
    total_inference_time = 0.0
    num_samples = 0

    for images, labels in test_ds:
        for image, label in zip(images, labels):
            # Add the batch axis and match the float input tensor
            input_data = np.expand_dims(image, axis=0).astype(np.float32)
            interpreter.set_tensor(input_index, input_data)

            # Time only the invoke() call; perf_counter is monotonic and
            # has higher resolution than the wall clock.
            start_time = time.perf_counter()
            interpreter.invoke()
            end_time = time.perf_counter()

            output_data = interpreter.get_tensor(output_index)

            # Wrap the label in a list so it gains a batch axis of 1,
            # matching the shape of the model output.
            top1_accuracy.update_state([label], output_data)
            top5_accuracy.update_state([label], output_data)
            total_inference_time += end_time - start_time
            num_samples += 1

    # Average invoke() time per sample in milliseconds; an empty dataset
    # reports 0.0 instead of raising ZeroDivisionError.
    if num_samples:
        avg_inference_time = (total_inference_time / num_samples) * 1000
    else:
        avg_inference_time = 0.0

    # Collect all metrics
    metrics = {
        'top1_accuracy': float(top1_accuracy.result()),
        'top5_accuracy': float(top5_accuracy.result()),
        'model_size_mb': model_size,
        'avg_inference_time_ms': avg_inference_time
    }

    return metrics

def evaluate_quantized_tflite_model(tflite_path, test_ds):
    """Evaluate a quantized TFLite model on a test dataset, one sample at a time.

    Integer input tensors (``uint8`` or ``int8``) with quantization
    parameters are fed ``round(x / scale + zero_point)`` clipped to the
    dtype's range; integer outputs are dequantized with
    ``(q - zero_point) * scale`` before the metrics are updated. Models
    whose input/output tensors are float are fed and read without scaling.

    Args:
        tflite_path: Path of the ``.tflite`` file.
        test_ds: Iterable yielding ``(images, labels)`` batches. Images are
            assumed to be in the real-valued range the model was trained
            on -- confirm against the dataset pipeline. The labels are
            passed to ``TopKCategoricalAccuracy``, so they are assumed to
            be one-hot encoded.

    Returns:
        dict: ``top1_accuracy``, ``top5_accuracy``, ``model_size_mb`` and
        ``avg_inference_time_ms``. The inference time is averaged per
        sample and is ``0.0`` when ``test_ds`` yields no samples.
    """
    # Load the TFLite model
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()

    # Tensor metadata is constant across invocations; read it once.
    input_info = interpreter.get_input_details()[0]
    output_info = interpreter.get_output_details()[0]
    input_index = input_info['index']
    output_index = output_info['index']
    input_dtype = input_info['dtype']
    output_dtype = output_info['dtype']
    input_scale, input_zero_point = input_info['quantization']
    output_scale, output_zero_point = output_info['quantization']

    # A scale of 0 means the tensor carries no quantization parameters, so
    # scaling is only applied to integer tensors with a non-zero scale.
    quantize_input = bool(np.issubdtype(input_dtype, np.integer)) and input_scale != 0
    dequantize_output = bool(np.issubdtype(output_dtype, np.integer)) and output_scale != 0
    if quantize_input:
        input_limits = np.iinfo(input_dtype)

    # Model file size in MiB
    model_size = Path(tflite_path).stat().st_size / (1024 * 1024)

    # Accuracy metrics accumulated over the whole dataset
    top1_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=1, name='top1_accuracy')
    top5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top5_accuracy')
    total_inference_time = 0.0
    num_samples = 0

    for images, labels in test_ds:
        for image, label in zip(images, labels):
            # Add the batch axis expected by the interpreter
            input_data = np.expand_dims(image, axis=0)
            if quantize_input:
                # Round to nearest and clip so the cast below neither
                # truncates toward zero nor wraps around on overflow.
                input_data = np.round(
                    input_data.astype(np.float32) / input_scale + input_zero_point
                )
                input_data = np.clip(input_data, input_limits.min, input_limits.max)
            # set_tensor requires the exact dtype of the input tensor
            input_data = input_data.astype(input_dtype)

            interpreter.set_tensor(input_index, input_data)

            # Time only the invoke() call; perf_counter is monotonic and
            # has higher resolution than the wall clock.
            start_time = time.perf_counter()
            interpreter.invoke()
            end_time = time.perf_counter()

            output_data = interpreter.get_tensor(output_index)

            # Map quantized scores back to real values
            if dequantize_output:
                output_data = (
                    output_data.astype(np.float32) - output_zero_point
                ) * output_scale

            # Wrap the label in a list so it gains a batch axis of 1,
            # matching the shape of the model output.
            top1_accuracy.update_state([label], output_data)
            top5_accuracy.update_state([label], output_data)
            total_inference_time += end_time - start_time
            num_samples += 1

    # Average invoke() time per sample in milliseconds; an empty dataset
    # reports 0.0 instead of raising ZeroDivisionError.
    if num_samples:
        avg_inference_time = (total_inference_time / num_samples) * 1000
    else:
        avg_inference_time = 0.0

    # Collect all metrics
    metrics = {
        'top1_accuracy': float(top1_accuracy.result()),
        'top5_accuracy': float(top5_accuracy.result()),
        'model_size_mb': model_size,
        'avg_inference_time_ms': avg_inference_time
    }

    return metrics

## 3. 模型评估

# Locations of the three model artifacts to evaluate.
# NOTE(review): `test_ds` is not defined in the visible part of this
# notebook -- confirm it is built from dataset_path/label_path before this
# cell runs.
keras_model_path = 'models/best_model'
tflite_model_path = 'models/model.tflite'
quantized_model_path = 'models/model_quantized.tflite'

# Original Keras model
keras_metrics = evaluate_model(keras_model_path, test_ds)
print('\nKeras模型评估结果:', json.dumps(keras_metrics, indent=2), sep='\n')

# Float TFLite model
tflite_metrics = evaluate_tflite_model(tflite_model_path, test_ds)
print('\nTFLite模型评估结果:', json.dumps(tflite_metrics, indent=2), sep='\n')

# Quantized TFLite model
quantized_metrics = evaluate_quantized_tflite_model(quantized_model_path, test_ds)
print('\n量化TFLite模型评估结果:', json.dumps(quantized_metrics, indent=2), sep='\n')

## 4. 性能对比可视化

import matplotlib.pyplot as plt

def plot_metrics_comparison(keras_metrics, tflite_metrics, quantized_metrics):
    """Draw a 2x2 grid of bar charts comparing the three models.

    Each argument is a metrics dict containing the keys ``top1_accuracy``,
    ``top5_accuracy``, ``model_size_mb`` and ``avg_inference_time_ms``; one
    subplot is drawn per key, with one bar per model. The figure is shown
    with ``plt.show()`` and nothing is returned.
    """
    # (metrics key, human-readable title) for each subplot, in draw order
    panels = (
        ('top1_accuracy', 'Top-1 Accuracy'),
        ('top5_accuracy', 'Top-5 Accuracy'),
        ('model_size_mb', 'Model Size (MB)'),
        ('avg_inference_time_ms', 'Inference Time (ms)'),
    )
    model_types = ['Keras', 'TFLite', 'Quantized TFLite']
    per_model = (keras_metrics, tflite_metrics, quantized_metrics)

    _fig, grid = plt.subplots(2, 2, figsize=(15, 10))

    # Flatten the 2x2 axes grid so it pairs up with the four panels
    for ax, (key, title) in zip(grid.ravel(), panels):
        heights = [model_metrics[key] for model_metrics in per_model]
        ax.bar(model_types, heights)
        ax.set_title(title)
        ax.set_ylabel(title)
        # Tilt the model names on the x axis
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()

# Draw the performance comparison charts for the three evaluated models
plot_metrics_comparison(keras_metrics, tflite_metrics, quantized_metrics)