# MNIST 手寫辨識：從零實作 CNN（NumPy）與 Keras CNN 版本

這份 Notebook 包含兩個主要部分：
1. **只使用 pandas + numpy + 標準函式庫，從零實作簡易 CNN**（教學示範，效能不佳但可理解原理）。
2. **使用 TensorFlow Keras 的標準 CNN 模型**，適合實際訓練與應用。

你可以在 Colab 直接上傳 MNIST CSV 檔，或用 Keras 內建的 MNIST 資料集。

## 0. 基本環境設定

- 建議在 Google Colab 執行。
- 如果你使用 CSV 版本的 MNIST，請先把 `mnist_train.csv`、`mnist_test.csv` 上傳到 Colab，或放在 Google Drive 並在 1.1 節把 `train_path`、`test_path` 改成對應的路徑。

In [1]:
# If running on Colab, you can optionally mount Google Drive first:
# from google.colab import drive
# drive.mount('/content/drive')

import numpy as np
import pandas as pd
import math  # NOTE(review): not referenced in the visible cells - confirm it is still needed

# Seed NumPy's legacy global RNG; the later np.random.permutation /
# np.random.choice calls draw from this generator.
np.random.seed(42)
print("NumPy / Pandas 已載入完成")

NumPy / Pandas 已載入完成


## Part 1. 只用 pandas + numpy 的簡易 CNN

這一部分示範：
- 從 CSV 載入 MNIST
- 自行實作：卷積層、ReLU、MaxPooling、全連接層、Softmax + Cross-Entropy
- 使用簡單 SGD 訓練

> **提醒：** 這是教學示範，重點在看懂每一層的計算而不是執行效率，速度會比 Keras 版慢很多。

### 1.1 載入與前處理 MNIST CSV

假設：
- `mnist_train.csv`、`mnist_test.csv`
- 第一欄為 `label` (0–9)
- 其餘 784 欄為像素值（0–255）

In [2]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used in
# the next cell point inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# === Check that these file names and paths match where your CSV files live ===
train_path = "/content/drive/MyDrive/Colab Notebooks/Vibe Coding/Data/mnist_train.csv"  # e.g. "/content/mnist_train.csv"
test_path = "/content/drive/MyDrive/Colab Notebooks/Vibe Coding/Data/mnist_test.csv"  # e.g. "/content/mnist_test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Column 0 is the digit label; every remaining column is one pixel value.
y_train = train_df.iloc[:, 0].values
y_test = test_df.iloc[:, 0].values

# Training pixels: cast to float32, divide by 255 in place, then view each
# row as a single-channel 28x28 image -> (N, 1, 28, 28).
X_train = train_df.iloc[:, 1:].values.astype(np.float32)
X_train /= 255.0
X_train = X_train.reshape(-1, 1, 28, 28)

# Test pixels: same treatment.
X_test = test_df.iloc[:, 1:].values.astype(np.float32)
X_test /= 255.0
X_test = X_test.reshape(-1, 1, 28, 28)

num_classes = 10

def one_hot(y, num_classes=10):
    """Return the one-hot encoding of integer class labels.

    Args:
        y: 1-D integer array of class indices, one per sample.
        num_classes: Width of the encoding (number of classes).

    Returns:
        float32 array of shape (len(y), num_classes) with a single 1.0 per row.
    """
    sample_count = y.shape[0]
    row_index = np.arange(sample_count)
    encoded = np.zeros((sample_count, num_classes), dtype=np.float32)
    # Paired fancy indexing: row r gets its 1.0 in column y[r].
    encoded[row_index, y] = 1.0
    return encoded

# One-hot encode the integer labels of both splits (train first, then test).
y_train_oh, y_test_oh = (
    one_hot(labels, num_classes) for labels in (y_train, y_test)
)

print("X_train:", X_train.shape, "y_train:", y_train.shape)

Train shape: (60000, 785)
Test shape: (10000, 785)
X_train: (60000, 1, 28, 28) y_train: (60000,)


### 1.2 自行實作 CNN 各層

包含：
- `conv_forward` / `conv_backward`
- `relu_forward` / `relu_backward`
- `maxpool_forward` / `maxpool_backward`
- `linear_forward` / `linear_backward`
- `softmax_cross_entropy_loss`

In [4]:
### 卷積層（Forward）
def conv_forward(input_tensor, weight, bias, padding=1, stride=1):
    """Forward pass of a 2-D convolution layer (sliding-window dot product).

    The spatial sliding window is kept as an explicit loop so the operation
    stays easy to follow, but every window is evaluated for the whole batch
    and all filters at once instead of one scalar at a time.

    Args:
        input_tensor: Array of shape (batch_size, in_channels, height, width).
        weight: Square filters of shape
            (out_channels, in_channels, kernel_size, kernel_size).
        bias: Per-filter bias of shape (out_channels,).
        padding: Number of zero rows/columns added on each spatial border.
        stride: Step between consecutive window positions.

    Returns:
        Tuple ``(out, cache)``. ``out`` is a float32 array of shape
        (batch_size, out_channels, out_height, out_width). ``cache`` is
        ``(input_padded, weight, bias, padding, stride)``, the values
        ``conv_backward`` needs.

    Raises:
        ValueError: If the channel count of ``input_tensor`` differs from the
            channel count of ``weight``.
    """
    batch_size, in_channels, _, _ = input_tensor.shape
    out_channels, weight_channels, kernel_size, _ = weight.shape

    # Without this check a 1-channel operand would silently broadcast against
    # a multi-channel one and produce a meaningless result.
    if weight_channels != in_channels:
        raise ValueError(
            f"input has {in_channels} channels but weight expects {weight_channels}"
        )

    # Zero-pad only the two spatial axes so windows can cover border pixels.
    input_padded = np.pad(
        input_tensor,
        ((0, 0), (0, 0), (padding, padding), (padding, padding)),
        mode="constant",
    )
    _, _, padded_height, padded_width = input_padded.shape

    # Number of window positions that fit along each spatial axis.
    out_height = (padded_height - kernel_size) // stride + 1
    out_width = (padded_width - kernel_size) // stride + 1

    out = np.zeros((batch_size, out_channels, out_height, out_width), dtype=np.float32)

    for i in range(out_height):
        h_start = i * stride
        for j in range(out_width):
            w_start = j * stride
            # Window for every sample: (batch_size, in_channels, k, k).
            region = input_padded[:, :, h_start:h_start + kernel_size, w_start:w_start + kernel_size]
            # Contract (channel, row, col) of the window against every filter:
            # result is (batch_size, out_channels); bias broadcasts over batch.
            out[:, :, i, j] = np.tensordot(region, weight, axes=([1, 2, 3], [1, 2, 3])) + bias

    # Values required by the backward pass.
    cache = (input_padded, weight, bias, padding, stride)
    return out, cache


### 卷積層（Backward）
def conv_backward(dout, cache):
    """Backward pass of the convolution layer.

    As in the forward pass, the sliding window is an explicit loop over output
    positions, while the batch and filter dimensions are handled with array
    contractions instead of Python loops.

    Args:
        dout: Upstream gradient of shape
            (batch_size, out_channels, out_height, out_width).
        cache: ``(input_padded, weight, bias, padding, stride)`` as returned
            by ``conv_forward``.

    Returns:
        Tuple ``(d_input, d_weight, d_bias)`` of float32 arrays:
        ``d_input`` has the shape of the un-padded input, ``d_weight`` the
        shape of ``weight`` and ``d_bias`` the shape of ``bias``.
    """
    input_padded, weight, bias, padding, stride = cache
    _, _, kernel_size, _ = weight.shape
    _, _, out_height, out_width = dout.shape

    d_input_padded = np.zeros_like(input_padded, dtype=np.float32)
    d_weight = np.zeros_like(weight, dtype=np.float32)

    # Each filter's bias is added at every sample and every output position,
    # so its gradient is the sum of dout over those three axes.
    d_bias = np.zeros_like(bias, dtype=np.float32)
    d_bias += dout.sum(axis=(0, 2, 3))

    for i in range(out_height):
        h_start = i * stride
        h_end = h_start + kernel_size
        for j in range(out_width):
            w_start = j * stride
            w_end = w_start + kernel_size

            grad = dout[:, :, i, j]                                    # (batch, out_c)
            region = input_padded[:, :, h_start:h_end, w_start:w_end]  # (batch, in_c, k, k)

            # Weight gradient: window scaled by the upstream gradient, summed
            # over the batch -> (out_c, in_c, k, k).
            d_weight += np.tensordot(grad, region, axes=([0], [0]))

            # Input gradient: filters scaled by the upstream gradient, summed
            # over filters -> (batch, in_c, k, k), accumulated into the window.
            d_input_padded[:, :, h_start:h_end, w_start:w_end] += \
                np.tensordot(grad, weight, axes=([1], [0]))

    # Strip the padded border to recover the gradient of the original input.
    if padding > 0:
        d_input = d_input_padded[:, :, padding:-padding, padding:-padding]
    else:
        d_input = d_input_padded

    return d_input, d_weight, d_bias


### ReLU 層（Forward）
def relu_forward(input_tensor):
    """Apply ReLU element-wise: ``max(0, x)``.

    Returns:
        Tuple ``(out, cache)`` where ``cache`` is the untouched input; the
        backward pass uses its sign to decide where gradient flows.
    """
    activated = np.maximum(0, input_tensor)
    return activated, input_tensor


### ReLU 層（Backward）
def relu_backward(dout, cache):
    """Back-propagate through ReLU.

    Args:
        dout: Upstream gradient, same shape as the ReLU input.
        cache: The ReLU input saved by ``relu_forward``.

    Returns:
        ``dout`` where the input was strictly positive, zero elsewhere.
    """
    # Boolean gate: True (1) only where the forward input was > 0.
    passes_gradient = cache > 0
    return dout * passes_gradient


### MaxPool 2x2（Forward）
def maxpool_forward(input_tensor, pool_size=2, stride=2):
    """Forward pass of max pooling over ``pool_size`` x ``pool_size`` windows.

    Each window is reduced for all samples and channels at once; only the loop
    over window positions remains.

    Args:
        input_tensor: Array of shape (batch_size, channels, height, width).
        pool_size: Side length of the square pooling window.
        stride: Step between consecutive windows.

    Returns:
        Tuple ``(out, cache)``. ``out`` is float32 of shape
        (batch_size, channels, out_height, out_width). ``cache`` is
        ``(mask, pool_size, stride)`` where ``mask`` is a float32 array shaped
        like the input holding 1.0 at the position chosen as maximum in each
        window (the first maximum in row-major order on ties).

    Note:
        The single mask is shared by all windows, so the gradient routing
        derived from it is exact only when windows do not overlap
        (``stride >= pool_size``), as with the 2x2 / stride-2 defaults.
    """
    batch_size, channels, height, width = input_tensor.shape
    out_height = (height - pool_size) // stride + 1
    out_width = (width - pool_size) // stride + 1

    out = np.zeros((batch_size, channels, out_height, out_width), dtype=np.float32)
    mask = np.zeros_like(input_tensor, dtype=np.float32)  # marks arg-max positions

    # Index grids used to address one (row, col) per (sample, channel) pair.
    sample_idx = np.arange(batch_size)[:, None]
    channel_idx = np.arange(channels)[None, :]
    window_area = pool_size * pool_size

    for i in range(out_height):
        h_start = i * stride
        for j in range(out_width):
            w_start = j * stride
            window = input_tensor[:, :, h_start:h_start + pool_size, w_start:w_start + pool_size]
            # Flatten each window so one argmax gives the row-major winner.
            flat = window.reshape(batch_size, channels, window_area)

            out[:, :, i, j] = np.max(flat, axis=2)

            # Convert the flat winner index back to (row, col) inside the window.
            best = np.argmax(flat, axis=2)
            rows = best // pool_size
            cols = best % pool_size
            mask[sample_idx, channel_idx, h_start + rows, w_start + cols] = 1.0

    cache = (mask, pool_size, stride)
    return out, cache


### MaxPool 2x2（Backward）
def maxpool_backward(dout, cache):
    """Backward pass of max pooling.

    Args:
        dout: Upstream gradient of shape
            (batch_size, channels, out_height, out_width).
        cache: ``(mask, pool_size, stride)`` from ``maxpool_forward``; ``mask``
            has the shape of the pooling input and is 1.0 at arg-max positions.

    Returns:
        float32 array shaped like the pooling input, holding each window's
        upstream gradient at the positions flagged in ``mask``.

    Note:
        Because the mask is shared by all windows, the routing is exact only
        for non-overlapping windows (``stride >= pool_size``).
    """
    mask, pool_size, stride = cache
    _, _, out_height, out_width = dout.shape

    d_input = np.zeros_like(mask, dtype=np.float32)

    for i in range(out_height):
        h_start = i * stride
        h_end = h_start + pool_size
        for j in range(out_width):
            w_start = j * stride
            w_end = w_start + pool_size

            # Broadcast this window's gradient (one value per sample/channel)
            # over the window; the mask keeps it only at the arg-max position.
            d_input[:, :, h_start:h_end, w_start:w_end] += \
                dout[:, :, i, j][:, :, None, None] * mask[:, :, h_start:h_end, w_start:w_end]

    return d_input


### 全連接層（Forward）
def linear_forward(input_tensor, weight, bias):
    """Fully connected layer: ``input_tensor @ weight + bias``.

    Args:
        input_tensor: (batch_size, in_features)
        weight: (in_features, out_features)
        bias: (out_features,)

    Returns:
        Tuple ``(out, cache)`` with ``out`` of shape (batch_size, out_features)
        and ``cache = (input_tensor, weight, bias)`` for the backward pass.
    """
    projected = input_tensor @ weight
    return projected + bias, (input_tensor, weight, bias)


### 全連接層（Backward）
def linear_backward(dout, cache):
    """Back-propagate through the fully connected layer.

    Args:
        dout: Upstream gradient of shape (batch_size, out_features).
        cache: ``(input_tensor, weight, bias)`` saved by ``linear_forward``.

    Returns:
        Tuple ``(d_input, d_weight, d_bias)``: gradients with respect to the
        layer input, the weight matrix, and the bias (summed over the batch).
    """
    layer_input, layer_weight, _ = cache

    grad_bias = np.sum(dout, axis=0)
    grad_weight = layer_input.T @ dout
    grad_input = dout @ layer_weight.T

    return grad_input, grad_weight, grad_bias


### Softmax + Cross-Entropy（Forward + Backward 合併）
def softmax_cross_entropy_loss(logits, y_onehot):
    """Softmax followed by mean cross-entropy, with the gradient w.r.t. logits.

    Args:
        logits: (batch_size, num_classes) raw scores.
        y_onehot: (batch_size, num_classes) one-hot targets.

    Returns:
        Tuple ``(loss, d_logits, probs)``: the batch-mean loss, the gradient of
        that loss with respect to ``logits``, and the softmax probabilities.
    """
    sample_count = logits.shape[0]

    # Subtract the row maximum before exponentiating so exp() cannot overflow.
    row_max = np.max(logits, axis=1, keepdims=True)
    exponentials = np.exp(logits - row_max)
    normaliser = np.sum(exponentials, axis=1, keepdims=True)
    probs = exponentials / normaliser

    # The small epsilon keeps log() finite when a probability underflows to 0.
    log_likelihood = y_onehot * np.log(probs + 1e-9)
    loss = -np.sum(log_likelihood) / sample_count

    # Combined softmax + cross-entropy gradient: (p - y) / N.
    d_logits = (probs - y_onehot) / sample_count

    return loss, d_logits, probs


print("CNN 各基本層函式定義完成（含完整中文註解）")


CNN 各基本層函式定義完成（含完整中文註解）


### 1.3 建立簡易 CNN 模型（1 層 Conv + MaxPool + FC）

架構：
- Conv2D：輸入 (1, 28, 28) → 8 個 3×3 filter (padding=1)
- ReLU
- MaxPool2D：2×2, stride=2 → 特徵圖大小 14×14
- Flatten
- 全連接層：8×14×14 → 10 類別
- Softmax + Cross-Entropy

In [10]:
import numpy as np
# Model hyper-parameters and weight initialisation for the one-conv-layer CNN.
C_in = 1     # input channels (grayscale)
C_out = 8    # number of convolution filters
K = 3        # kernel side length
pad = 1
stride = 1

# Spatial size after the convolution, using the same formula conv_forward
# applies to a 28x28 input: (H + 2*pad - K) // stride + 1. Deriving it here
# (28 with the values above) keeps W2's shape correct if K, pad or stride
# are edited.
H_out_conv = (28 + 2 * pad - K) // stride + 1
W_out_conv = (28 + 2 * pad - K) // stride + 1
# 2x2 max-pool with stride 2 halves each spatial dimension.
H_pool = H_out_conv // 2
W_pool = W_out_conv // 2
D_flat = C_out * H_pool * W_pool  # 8 * 14 * 14 = 1568 with the values above

# Weights ~ N(0, 0.1) from a dedicated generator; biases start at zero.
rng = np.random.default_rng(42)
W1 = rng.normal(0, 0.1, size=(C_out, C_in, K, K)).astype(np.float32)
b1 = np.zeros(C_out, dtype=np.float32)
W2 = rng.normal(0, 0.1, size=(D_flat, num_classes)).astype(np.float32)
b2 = np.zeros(num_classes, dtype=np.float32)

def forward_pass(X):
    """Run conv -> ReLU -> 2x2 max-pool -> flatten -> linear on a batch.

    Reads the module-level parameters ``W1``, ``b1``, ``W2``, ``b2`` and the
    convolution settings ``pad`` and ``stride``.

    Args:
        X: Batch of images, first axis is the batch dimension.

    Returns:
        Tuple ``(logits, caches)`` where ``caches`` is
        ``(cache_conv, cache_relu, cache_pool, pooled_shape, cache_fc)`` in the
        order ``backward_pass`` unpacks them.
    """
    conv_out, conv_cache = conv_forward(X, W1, b1, padding=pad, stride=stride)
    activated, relu_cache = relu_forward(conv_out)
    pooled, pool_cache = maxpool_forward(activated, pool_size=2, stride=2)

    # Flatten everything except the batch axis; remember the pooled shape so
    # the backward pass can undo the reshape.
    pooled_shape = pooled.shape
    features = pooled.reshape(X.shape[0], -1)

    logits, fc_cache = linear_forward(features, W2, b2)

    return logits, (conv_cache, relu_cache, pool_cache, pooled_shape, fc_cache)

def backward_pass(dlogits, caches):
    """Back-propagate from the logits to the parameters of both layers.

    This function only computes gradients; it does not modify any parameter,
    so no ``global`` declaration is needed.

    Args:
        dlogits: Gradient of the loss with respect to the logits.
        caches: The cache tuple returned by ``forward_pass``.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``: gradients for the convolution weights
        and bias, then the fully connected weights and bias.
    """
    cache_conv, cache_relu, cache_pool, cache_flat, cache_fc = caches

    dflat, dW2, db2_ = linear_backward(dlogits, cache_fc)
    dpool = dflat.reshape(cache_flat)  # undo the flatten
    da1 = maxpool_backward(dpool, cache_pool)
    dz1 = relu_backward(da1, cache_relu)
    # The gradient with respect to the input images is not used by training.
    _, dW1, db1_ = conv_backward(dz1, cache_conv)

    return dW1, db1_, dW2, db2_

def accuracy(X, y_true, batch_size=256):
    """Return the fraction of samples whose arg-max logit equals the label.

    The data is evaluated in slices of ``batch_size`` samples through
    ``forward_pass``.

    Args:
        X: Input samples, first axis is the sample dimension.
        y_true: Integer labels aligned with ``X``.
        batch_size: Number of samples per forward pass.
    """
    hits = 0
    seen = 0
    for start in range(0, X.shape[0], batch_size):
        stop = start + batch_size
        labels = y_true[start:stop]
        scores, _ = forward_pass(X[start:stop])
        hits += np.sum(np.argmax(scores, axis=1) == labels)
        seen += labels.shape[0]
    return hits / seen

print("簡易 CNN 模型初始化完成")


簡易 CNN 模型初始化完成


### 1.4 訓練迴圈（示範）

- 為了節省時間，先只用部分訓練資料（例如前 10000 筆）。
- 可以自行把 `N_train_use` 改為全部樣本數。

In [11]:
# Training configuration
N_train_use = 10000  # change to len(X_train) to use the full training set
X_tr = X_train[:N_train_use]
y_tr = y_train[:N_train_use]
y_tr_oh = y_train_oh[:N_train_use]

learning_rate = 0.01
num_epochs = 3
batch_size = 64

# Mini-batch SGD: reshuffle every epoch, then take one gradient step per batch.
for epoch in range(num_epochs):
    # Apply the same random permutation to inputs and both label forms so
    # they stay aligned (drawn from NumPy's legacy global RNG).
    idx = np.random.permutation(N_train_use)
    X_tr = X_tr[idx]
    y_tr = y_tr[idx]
    y_tr_oh = y_tr_oh[idx]

    total_loss = 0.0
    num_batches = 0

    for i in range(0, N_train_use, batch_size):
        # The last slice may hold fewer than batch_size samples.
        X_batch = X_tr[i:i+batch_size]
        y_batch_oh = y_tr_oh[i:i+batch_size]

        logits, caches = forward_pass(X_batch)
        loss, dlogits, probs = softmax_cross_entropy_loss(logits, y_batch_oh)
        total_loss += loss
        num_batches += 1

        dW1, db1_, dW2, db2_ = backward_pass(dlogits, caches)

        # In-place SGD step on the module-level parameter arrays that
        # forward_pass reads.
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1_
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2_

    # Full evaluation passes over the training subset and the whole test set.
    train_acc = accuracy(X_tr, y_tr)
    test_acc = accuracy(X_test, y_test)

    # Report the mean per-batch loss together with both accuracies.
    print(f"Epoch {epoch+1}/{num_epochs} "
          f"Loss = {total_loss/num_batches:.4f}, "
          f"Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

KeyboardInterrupt: 

---
## Part 2. 使用 TensorFlow Keras 的 CNN

這一部分改用 TensorFlow Keras：
- 直接載入 Keras 內建 MNIST
- 建立標準 CNN 模型
- 編譯與訓練
- 評估與簡單預測


### 2.1 載入與前處理 MNIST（Keras 內建）

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

print("TensorFlow 版本：", tf.__version__)

# Load the MNIST splits that ship with Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Cast to float32, divide by 255, and append a trailing channel axis so the
# images have shape (N, 28, 28, 1).
x_train = (x_train.astype('float32') / 255.0)[..., tf.newaxis]
x_test = (x_test.astype('float32') / 255.0)[..., tf.newaxis]

num_classes = 10
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)

TensorFlow 版本： 2.19.0
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
x_train shape: (60000, 28, 28, 1)
y_train shape: (60000,)


### 2.2 建立 CNN 模型

In [None]:
# Two conv/pool stages followed by a small dense classifier head.
# The input shape is declared with an explicit Input layer: passing
# `input_shape=` to the first layer of a Sequential model makes Keras 3
# emit a UserWarning recommending an Input object instead.
model = models.Sequential([
    layers.Input(shape=(28, 28, 1)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### 2.3 編譯與訓練

In [None]:
# Integer labels are used directly, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train for 5 epochs with a batch size of 128, holding out 10% of the
# training data for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_split=0.1,
)

# Final check on the held-out test split.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f"Test accuracy: {test_acc:.4f}")

Epoch 1/5
422/422 ━━━━━━━━━━━━━━━━━━━━ 45s 100ms/step - accuracy: 0.8089 - loss: 0.6057 - val_accuracy: 0.9785 - val_loss: 0.0725
Epoch 2/5
422/422 ━━━━━━━━━━━━━━━━━━━━ 80s 96ms/step - accuracy: 0.9782 - loss: 0.0681 - val_accuracy: 0.9848 - val_loss: 0.0538
Epoch 3/5
422/422 ━━━━━━━━━━━━━━━━━━━━ 41s 95ms/step - accuracy: 0.9856 - loss: 0.0457 - val_accuracy: 0.9872 - val_loss: 0.0440
Epoch 4/5
422/422 ━━━━━━━━━━━━━━━━━━━━ 42s 98ms/step - accuracy: 0.9891 - loss: 0.0356 - val_accuracy: 0.9847 - val_loss: 0.0495
Epoch 5/5
422/422 ━━━━━━━━━━━━━━━━━━━━ 81s 94ms/step - accuracy: 0.9920 - loss: 0.0270 - val_accuracy: 0.9902 - val_loss: 0.0358
Test accuracy: 0.9893


### 2.4 預測與範例輸出

In [None]:
import numpy as np

# Draw five distinct test indices and compare predictions with the labels.
idx = np.random.choice(len(x_test), size=5, replace=False)
x_sample, y_true = x_test[idx], y_test[idx]

# Each row of the prediction holds class probabilities; take the arg-max.
y_pred_prob = model.predict(x_sample)
y_pred = y_pred_prob.argmax(axis=1)

print("真實標籤：", y_true)
print("預測結果：", y_pred)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step
真實標籤： [6 5 4 2 9]
預測結果： [6 5 4 2 9]
