<a href="https://colab.research.google.com/github/ben12345qq/-DIT5411-CHUNG-SIU-KEUNG/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Download

In [None]:
!pip install opencv-python  # For manual augmentation if needed, but we'll use TensorFlow's built-in



In [None]:
# Clone the repo containing the split zip files
!git clone https://github.com/chenkenanalytic/handwritting_data_all.git

# Navigate to the directory
%cd handwritting_data_all

# Concatenate the split zip files into one
!cat all_data.zip* > all_data.zip

# Unzip with Big5 encoding to avoid garbled filenames
!unzip -O big5 all_data.zip -d /content/dataset

# The data is now in /content/dataset/cleaned_data/
# Each subfolder is named after a Chinese character, containing ~50 image files (e.g., 1.png, 2.png, etc.)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
  inflating: /content/dataset/cleaned_data/9924/頊_0.png  
  inflating: /content/dataset/cleaned_data/9924/頊_1.png  
  inflating: /content/dataset/cleaned_data/9924/頊_10.png  
  inflating: /content/dataset/cleaned_data/9924/頊_11.png  
  inflating: /content/dataset/cleaned_data/9924/頊_12.png  
  inflating: /content/dataset/cleaned_data/9924/頊_13.png  
  inflating: /content/dataset/cleaned_data/9924/頊_14.png  
  inflating: /content/dataset/cleaned_data/9924/頊_15.png  
  inflating: /content/dataset/cleaned_data/9924/頊_16.png  
  inflating: /content/dataset/cleaned_data/9924/頊_17.png  
  inflating: /content/dataset/cleaned_data/9924/頊_18.png  
  inflating: /content/dataset/cleaned_data/9924/頊_19.png  
  inflating: /content/dataset/cleaned_data/9924/頊_2.png  
  inflating: /content/dataset/cleaned_data/9924/頊_20.png  
  inflating: /content/dataset/cleaned_data/9924/頊_21.png  
  inflating: /content/dataset/cleaned_data/9924/頊_22.png  
  inflating: /content/da

# Process Data


In [None]:
import os
import shutil

data_dir = '/content/dataset/cleaned_data'
train_dir = '/content/train'
test_dir = '/content/test'

# Images per class that go to the training set; everything after that is test data.
TRAIN_PER_CLASS = 40
# File extensions treated as images (compared case-insensitively).
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def sample_sort_key(filename):
    """Sort key that orders sample files by their numeric index.

    Files are named like ``頊_0.png`` or ``12.png``. A plain string sort would
    put ``頊_10.png`` before ``頊_2.png``, so "the first 40 files" would not be
    samples 0..39. Names without a numeric index sort after all numbered
    files, alphabetically.
    """
    stem = os.path.splitext(filename)[0]
    index = stem.rpartition('_')[2]  # text after the last '_' (whole stem if there is none)
    if index.isdecimal():
        return (0, int(index), filename)
    return (1, 0, filename)


def split_dataset(src_dir, train_root, test_root, n_train=TRAIN_PER_CLASS):
    """Copy every class folder of ``src_dir`` into a train part and a test part.

    For each subfolder, the first ``n_train`` images (in sample-index order) are
    copied to ``train_root/<class>`` and the remaining ones to
    ``test_root/<class>``. Both class folders are always created, so the two
    trees expose the same set of classes.

    Returns:
        (n_classes, n_without_test): the number of class folders processed and
        how many of them had no image left over for the test set.
    """
    os.makedirs(train_root, exist_ok=True)
    os.makedirs(test_root, exist_ok=True)

    n_classes = 0
    n_without_test = 0
    for char_folder in sorted(os.listdir(src_dir)):
        char_path = os.path.join(src_dir, char_folder)
        if not os.path.isdir(char_path):
            continue

        images = sorted(
            (f for f in os.listdir(char_path) if f.lower().endswith(IMAGE_EXTENSIONS)),
            key=sample_sort_key,
        )

        # (destination folder, files) for the two parts of this class
        parts = (
            (os.path.join(train_root, char_folder), images[:n_train]),
            (os.path.join(test_root, char_folder), images[n_train:]),
        )
        for dest_dir, files in parts:
            os.makedirs(dest_dir, exist_ok=True)
            for img in files:
                shutil.copy(os.path.join(char_path, img), os.path.join(dest_dir, img))

        n_classes += 1
        if len(images) <= n_train:
            n_without_test += 1

    return n_classes, n_without_test


n_classes, n_without_test = split_dataset(data_dir, train_dir, test_dir)
print(f"Split {n_classes} classes; {n_without_test} of them have no test images "
      f"({TRAIN_PER_CLASS} or fewer samples).")

In [None]:
!pip install tensorflow



# Data Augmentation and Model Training


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
# --- Configuration -----------------------------------------------------------
# Side length in pixels that every image is resized to (smaller saves RAM/GPU
# memory; adjust to your hardware).
IMG_SIZE = 64
# Number of images per batch for both generators.
BATCH_SIZE = 32
# Seed for the generator's shuffling and random transforms, so that a
# "Restart & Run All" sees the same batches.
SEED = 42

# --- Training data: normalisation + random augmentation ------------------------
# The generator does not create extra files. It applies a fresh random
# transform to each image every time it is drawn, so the number of distinct
# augmented samples seen per class is 40 x (number of training epochs).
train_datagen = ImageDataGenerator(
    rescale=1./255,              # Scale pixel values to [0, 1]
    rotation_range=15,           # Random rotation of up to +/-15 degrees
    # NOTE: shear_range is a shear ANGLE IN DEGREES, not a fraction, so 0.2 is
    # a barely visible shear. Raise it if stronger shearing is wanted.
    shear_range=0.2,
    zoom_range=0.2,              # Random zoom in the range [0.8, 1.2]
    width_shift_range=0.1,       # Horizontal shift of up to 10% of the width
    height_shift_range=0.1,      # Vertical shift of up to 10% of the height
    horizontal_flip=False        # No flip for characters (direction matters)
)

# --- Test data: normalisation only ---------------------------------------------
test_datagen = ImageDataGenerator(rescale=1./255)

# Load train data with augmentation; one subfolder of train_dir per class
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    color_mode='grayscale',  # Images are loaded as single-channel
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True,
    seed=SEED
)

# Load test data; kept in directory order so predictions line up with file names
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    color_mode='grayscale',
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False
)

# Both generators derive label indices from their own folder listing. If the
# listings differ, the one-hot labels mean different characters in train and
# test and every accuracy figure below is meaningless.
if train_generator.class_indices != test_generator.class_indices:
    raise ValueError("train_dir and test_dir do not contain the same class folders")

# Number of classes
NUM_CLASSES = len(train_generator.class_indices)
print(f"Detected {NUM_CLASSES} classes.")


# Number of passes over the training set for every model.
# Each epoch shows every training image once with a new random transform, so
# 5 epochs x 40 images gives the ~200 augmented samples per class this
# experiment aims for. The recorded single-epoch run ended at chance level
# (loss ~9.48 = ln(13065), accuracy ~1/13065), i.e. one epoch was not enough
# for any of the models to start learning. An epoch took roughly 20-25 minutes
# in that run, so lower this value for a quick smoke test.
EPOCHS = 5

# Shape of one sample: IMG_SIZE x IMG_SIZE pixels, one grayscale channel.
INPUT_SHAPE = (IMG_SIZE, IMG_SIZE, 1)


def conv_block(filters, batch_norm=False):
    """Return the layers of one stage: Conv2D(3x3, ReLU) -> [BatchNorm] -> MaxPool(2x2)."""
    block = [tf.keras.layers.Conv2D(filters, (3, 3), activation='relu')]
    if batch_norm:
        block.append(tf.keras.layers.BatchNormalization())
    block.append(tf.keras.layers.MaxPooling2D(2, 2))
    return block


def build_cnn(conv_filters, dense_units, batch_norm=False, dropout=0.0):
    """Build an (uncompiled) Sequential CNN classifier over NUM_CLASSES classes.

    Args:
        conv_filters: filter count of each convolution stage, in order.
        dense_units: width of the hidden dense layer after flattening.
        batch_norm: insert BatchNormalization after every convolution.
        dropout: dropout rate applied after the hidden dense layer (0 = none).
    """
    # An explicit Input object replaces `input_shape=` on the first Conv2D,
    # which is what triggers the Keras warning seen in the recorded output.
    layers = [tf.keras.Input(shape=INPUT_SHAPE)]
    for filters in conv_filters:
        layers.extend(conv_block(filters, batch_norm))
    layers.append(tf.keras.layers.Flatten())
    layers.append(tf.keras.layers.Dense(dense_units, activation='relu'))
    if dropout > 0:
        layers.append(tf.keras.layers.Dropout(dropout))
    layers.append(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))
    return tf.keras.Sequential(layers)


def train_and_evaluate(model, label):
    """Compile, train and evaluate `model`; print and return its test metrics.

    steps_per_epoch / validation_steps are left unset on purpose: for a
    finite generator Keras runs each epoch until the generator is exhausted,
    which equals len(generator) batches.

    NOTE(review): the test set doubles as validation data and is also used to
    pick the best model below, so the reported accuracy of the winner is
    optimistically biased. Carve a validation split out of train_dir if an
    unbiased test estimate is needed.

    Returns:
        (history, test_loss, test_accuracy)
    """
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=test_generator
    )
    test_loss, test_acc = model.evaluate(test_generator)
    print(f"{label} Test Accuracy: {test_acc * 100:.2f}%")
    return history, test_loss, test_acc


# Model 1: Simple CNN (two convolution stages, 512-unit dense layer)
model1 = build_cnn(conv_filters=(32, 64), dense_units=512)
history1, test_loss1, test_acc1 = train_and_evaluate(model1, "Model 1")

# Model 2: Deeper CNN (three convolution stages, 1024-unit dense layer)
model2 = build_cnn(conv_filters=(32, 64, 128), dense_units=1024)
history2, test_loss2, test_acc2 = train_and_evaluate(model2, "Model 2")

# Model 3: Model 1 with regularization (BatchNormalization + Dropout 0.5)
model3 = build_cnn(conv_filters=(32, 64), dense_units=512, batch_norm=True, dropout=0.5)
history3, test_loss3, test_acc3 = train_and_evaluate(model3, "Model 3")

# Compare and select best (1-based model number; ties go to the earliest model)
accuracies = [test_acc1, test_acc2, test_acc3]
best_model = accuracies.index(max(accuracies)) + 1
print(f"Best model: Model {best_model} with {max(accuracies) * 100:.2f}% accuracy")

Found 522127 images belonging to 13065 classes.
Found 162550 images belonging to 13065 classes.
Detected 13065 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


[1m16317/16317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1556s[0m 95ms/step - accuracy: 2.9975e-05 - loss: 9.5139 - val_accuracy: 6.7671e-05 - val_loss: 9.4852
[1m5080/5080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 47ms/step - accuracy: 5.7891e-05 - loss: 9.4838
Model 1 Test Accuracy: 0.01%
[1m16317/16317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1365s[0m 83ms/step - accuracy: 4.4822e-05 - loss: 9.5143 - val_accuracy: 6.7671e-05 - val_loss: 9.4865
[1m5080/5080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 46ms/step - accuracy: 9.7156e-05 - loss: 9.4865
Model 2 Test Accuracy: 0.01%
[1m16317/16317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1290s[0m 79ms/step - accuracy: 7.4096e-05 - loss: 9.5234 - val_accuracy: 5.5368e-05 - val_loss: 9.4860
[1m5080/5080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 45ms/step - accuracy: 2.3045e-05 - loss: 9.4859
Model 3 Test Accuracy: 0.01%
Best model: Model 1 with 0.01% accuracy
