# HOSVD 手寫辨識分析筆記本

本筆記本演示如何使用高階奇異值分解(HOSVD)進行手寫數字辨識。

## 1. 環境設置

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

# Make the project's local packages (data, models, utils) importable.
# The project root can be overridden with the HOSVD_PROJECT_ROOT environment
# variable so the notebook also runs on machines other than the original
# author's; when the variable is unset, the original absolute path is used.
PROJECT_ROOT = os.environ.get(
    'HOSVD_PROJECT_ROOT',
    '/Users/Benchen1981/Downloads/Google Drive/中興大學/2025-1-3 數據分析數學/Homework 2/Gemini/hosvd_handwriting_recognition',
)
# Keep the project root first on sys.path, but do not stack another copy in
# front each time this cell is re-run.
if not sys.path or sys.path[0] != PROJECT_ROOT:
    sys.path.insert(0, PROJECT_ROOT)

from data import load_data, DataPreprocessor
from models import HOSVDModel, ClassifierPipeline
from utils import (
    plot_digits, plot_confusion_matrix, plot_classification_metrics,
    plot_dimensionality_reduction, Metrics, ModelEvaluator
)

print("所有模塊已成功導入")

## 2. 數據加載

In [None]:
# Load the MNIST train/test split through the project's load_data helper
# (called with normalize=True).
print("正在加載MNIST數據集...")
mnist_splits = load_data('mnist', normalize=True)
X_train, y_train, X_test, y_test = mnist_splits

# Report array shapes, the number of distinct training labels and the value
# range of the training data.
print(f"訓練集形狀: {X_train.shape}")
print(f"測試集形狀: {X_test.shape}")
n_classes = len(np.unique(y_train))
print(f"類別數: {n_classes}")
value_min, value_max = X_train.min(), X_train.max()
print(f"數據範圍: [{value_min:.3f}, {value_max:.3f}]")

## 3. 數據預覽

In [None]:
# Preview the first 25 test samples with their labels in a 5x5 grid.
preview_images = X_test[:25]
preview_labels = y_test[:25]
fig = plot_digits(
    preview_images,
    preview_labels,
    n_rows=5,
    n_cols=5,
    figsize=(10, 10),
    title="MNIST測試集樣本",
)
plt.show()

## 4. 應用HOSVD分解

In [None]:
# Build the HOSVD model; fit_transform is applied to the training set and the
# same fitted model then transforms the test set.
n_components = 50
print(f"應用HOSVD分解，主成分數: {n_components}")

hosvd = HOSVDModel(n_components=n_components)
X_train_reduced = hosvd.fit_transform(X_train)
X_test_reduced = hosvd.transform(X_test)

# Shapes of the reduced feature arrays.
reduced_train_shape = X_train_reduced.shape
reduced_test_shape = X_test_reduced.shape
print(f"\n降維後的訓練集形狀: {reduced_train_shape}")
print(f"降維後的測試集形狀: {reduced_test_shape}")

# Diagnostics exposed by the model, queried one at a time right before each
# line is printed.
core_shape = hosvd.get_core_tensor_shape()
print(f"\n核心張量形狀: {core_shape}")
compression_ratio = hosvd.get_compression_ratio()
print(f"壓縮比: {compression_ratio:.4f}")
reconstruction_error = hosvd.get_reconstruction_error(X_train)
print(f"重建誤差: {reconstruction_error:.2f}%")

## 5. 訓練分類器

In [None]:
# Train a KNN pipeline (n_neighbors=5) on the HOSVD-reduced training features.
print("訓練KNN分類器...")
knn = ClassifierPipeline('knn', n_neighbors=5)
knn.fit(X_train_reduced, y_train)

# Score on the training split first, then on the test split.
train_acc, test_acc = (
    knn.score(X_train_reduced, y_train),
    knn.score(X_test_reduced, y_test),
)

print(f"訓練精度: {train_acc:.4f}")
print(f"測試精度: {test_acc:.4f}")

## 6. 模型評估

In [None]:
# Predict labels for the reduced test features.
y_pred = knn.predict(X_test_reduced)

# Hand true and predicted labels to the evaluator and fetch its metrics.
evaluator = ModelEvaluator(y_test, y_pred)
metrics = evaluator.get_metrics()

# Print one line per metric, formatted to four decimals.
print("分類指標:")
for metric_name, metric_value in metrics.items():
    report_line = f"  {metric_name}: {metric_value:.4f}"
    print(report_line)

## 7. 混淆矩陣

In [None]:
# Plot the confusion matrix held by the current `evaluator`.
confusion = evaluator.get_confusion_matrix()
fig = plot_confusion_matrix(confusion, figsize=(10, 8))
plt.show()

## 8. 降維可視化

In [None]:
# Visualise the test set using both the original and the reduced features.
# Per the original note this is a PCA projection; the helper's internals are
# not visible here, so confirm against utils.plot_dimensionality_reduction.
projection_inputs = (X_test, X_test_reduced, y_test)
fig = plot_dimensionality_reduction(*projection_inputs, figsize=(14, 5))
plt.show()

## 9. 比較不同分類器

In [None]:
# Classifiers to compare; all are trained on the same HOSVD-reduced features.
classifiers = {
    'KNN': ClassifierPipeline('knn', n_neighbors=5),
    'SVM': ClassifierPipeline('svm', kernel='rbf'),
    'RF': ClassifierPipeline('rf', n_estimators=100),
}

# Maps classifier name -> the metrics returned by ModelEvaluator.get_metrics().
results = {}
for name, clf in classifiers.items():
    print(f"訓練{name}...")
    clf.fit(X_train_reduced, y_train)

    # Use loop-local names: re-using `y_pred` / `evaluator` here would
    # overwrite the KNN results from section 6, so re-running the
    # confusion-matrix cell afterwards would silently show the last
    # classifier of this loop instead of KNN.
    clf_pred = clf.predict(X_test_reduced)
    clf_evaluator = ModelEvaluator(y_test, clf_pred)
    results[name] = clf_evaluator.get_metrics()

    print(f"  精度: {results[name]['accuracy']:.4f}")

print("\n完成所有分類器訓練")

## 10. 分類器性能比較

In [None]:
# Build the comparison table; transposing puts one classifier per row
# (assuming each entry of `results` is a metric-name -> value mapping).
df_results = pd.DataFrame(results).T
print(df_results)

# Grouped bar chart: one group per classifier, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(results))
width = 0.2
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']

for i, metric in enumerate(metrics_to_plot):
    values = [results[clf][metric] for clf in results]
    # Shift each metric's bars by one bar width within the group.
    ax.bar(x + i * width, values, width, label=metric)

ax.set_xlabel('分類器', fontsize=12)
ax.set_ylabel('分數', fontsize=12)
ax.set_title('分類器性能比較', fontsize=14)
# Centre each tick under its group of bars. The offset is derived from the
# number of plotted metrics (it equals width * 1.5 for four metrics), so it
# stays correct if metrics_to_plot is edited.
ax.set_xticks(x + width * (len(metrics_to_plot) - 1) / 2)
# Pass a concrete list of names rather than a dict view.
ax.set_xticklabels(list(results))
ax.legend(fontsize=11)
# Leave headroom above 1.0 so the tallest bars do not touch the frame.
ax.set_ylim([0, 1.1])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 11. 參數敏感性分析

In [None]:
# Sweep the number of HOSVD components and record the KNN test accuracy
# obtained for each setting.
components_range = [10, 20, 30, 40, 50, 75, 100]
accuracies = []

print("分析主成分數的影響...")
for n_comp in components_range:
    # Progress prefix; the accuracy is appended to the same output line below.
    print(f"  n_components={n_comp}", end=" ")

    # New decomposition for this setting.
    reducer = HOSVDModel(n_components=n_comp)
    train_features = reducer.fit_transform(X_train)
    test_features = reducer.transform(X_test)

    # New 5-neighbour KNN trained and scored on the reduced features.
    probe_knn = ClassifierPipeline('knn', n_neighbors=5)
    probe_knn.fit(train_features, y_train)
    score = probe_knn.score(test_features, y_test)

    accuracies.append(score)
    print(f"-> 精度: {score:.4f}")

print("完成")

## 12. 主成分數與精度的關係

In [None]:
# Plot test accuracy against the number of components from the sweep.
fig, ax = plt.subplots(figsize=(10, 6))

# Accuracy curve with a shaded area underneath it.
ax.plot(components_range, accuracies, 'o-', linewidth=2, markersize=8)
ax.fill_between(components_range, accuracies, alpha=0.3)

ax.set_title('主成分數與分類精度的關係', fontsize=14)
ax.set_xlabel('主成分數', fontsize=12)
ax.set_ylabel('測試精度', fontsize=12)
ax.grid(True, alpha=0.3)

# Annotate every point with its accuracy, placed slightly above the marker.
for n_comp_value, acc_value in zip(components_range, accuracies):
    point_label = f'{acc_value:.3f}'
    ax.text(n_comp_value, acc_value + 0.005, point_label,
            ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# Report the setting with the highest recorded accuracy.
best_index = np.argmax(accuracies)
best_accuracy = max(accuracies)
print(f"\n最佳主成分數: {components_range[best_index]}")
print(f"最高精度: {best_accuracy:.4f}")

## 13. 錯誤分類分析

In [None]:
# Rebuild the final pipeline (HOSVD with 50 components + 5-neighbour KNN) and
# predict the test set so the misclassified samples can be inspected.
hosvd_final = HOSVDModel(n_components=50)
X_train_final = hosvd_final.fit_transform(X_train)
X_test_final = hosvd_final.transform(X_test)

knn_final = ClassifierPipeline('knn', n_neighbors=5)
knn_final.fit(X_train_final, y_train)
y_pred_final = knn_final.predict(X_test_final)

# Boolean mask of misclassified test samples and their positions.
errors = y_pred_final != y_test
error_indices = np.where(errors)[0]

print(f"總錯誤數: {errors.sum()} / {len(y_test)}")
print(f"錯誤率: {errors.sum() / len(y_test) * 100:.2f}%")

# Show up to the first 20 misclassified samples in a 4x5 grid.
if len(error_indices) > 0:
    fig, axes = plt.subplots(4, 5, figsize=(12, 10))
    for ax, idx in zip(axes.flat, error_indices[:20]):
        # Each sample is reshaped to a 28x28 image for display.
        img = X_test[idx].reshape(28, 28)
        ax.imshow(img, cmap='gray')
        ax.set_title(f'True: {y_test[idx]}, Pred: {y_pred_final[idx]}', color='red')

    # Turn the axes off on every panel, not only the filled ones, so that
    # having fewer than 20 errors does not leave empty framed subplots.
    for ax in axes.flat:
        ax.axis('off')

    plt.suptitle('前20個錯誤分類樣本', fontsize=14)
    plt.tight_layout()
    plt.show()

## 14. 總結

In [None]:
# Take the best classifier from the section-9 comparison instead of
# hard-coding "KNN", so the summary cannot contradict the measured results.
best_name = max(results, key=lambda clf_name: results[clf_name]['accuracy'])
# Assumes 'accuracy' is a 0-1 fraction, as its ':.4f' display and the
# [0, 1.1] y-axis in the comparison chart suggest - confirm against
# ModelEvaluator.get_metrics.
best_accuracy = results[best_name]['accuracy']

print("="*80)
print("HOSVD 手寫辨識實驗總結")
print("="*80)
print("\n數據集: MNIST")
print(f"訓練集大小: {X_train.shape[0]}")
print(f"測試集大小: {X_test.shape[0]}")
print(f"\n原始特徵維度: {X_train.shape[1]}")
print(f"HOSVD降維後維度: {X_train_final.shape[1]}")
# Ratio of reduced feature count to original feature count.
print(f"壓縮比: {X_train_final.shape[1] / X_train.shape[1]:.4f}")
print(f"\n最佳分類器: {best_name}")
print(f"測試精度: {best_accuracy:.4f}")
# Error rate of the best classifier, as a percentage.
print(f"錯誤率: {(1 - best_accuracy) * 100:.2f}%")
print("\n實驗完成!")
print("="*80)