In [6]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import joblib
from joblib import dump
import cv2
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [7]:
data_path = './data_hand_writing/'
data_training = np.load(data_path + 'data_train.npz')

images = data_training['images']
labels = data_training['labels']

images_train, images_val, labels_train, labels_val = train_test_split(images, labels, test_size=0.2, random_state=42)

In [8]:
pca = PCA(n_components=64, random_state=0)
images_train_pca = pca.fit_transform(images_train)
images_val_pca = pca.transform(images_val)
images_pca = pca.transform(images)

In [9]:
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(images_train, labels_train)
y_pred = svm_model.predict(images_val)
accuracy = accuracy_score(labels_val, y_pred)
print(f'Accuracy on test set: {accuracy}')
cross_val_accuracy = cross_val_score(svm_model, images, labels, cv=5)
print(f'Cross-validation accuracy: {cross_val_accuracy.mean()}')

Accuracy on test set: 0.9082177161152615
Cross-validation accuracy: 0.9833401745888406


In [10]:
svm_model2 = SVC(kernel='linear', C=1.0, random_state=42)
svm_model2.fit(images_train_pca, labels_train)
y_pred = svm_model2.predict(images_val_pca)
accuracy = accuracy_score(labels_val, y_pred)
print(f'Accuracy on test set: {accuracy}')
cross_val_accuracy = cross_val_score(svm_model2, images_pca, labels, cv=5)
print(f'Cross-validation accuracy: {cross_val_accuracy.mean()}')
dump(svm_model2, data_path + 'svm_model.joblib')

Accuracy on test set: 0.9146211312700107
Cross-validation accuracy: 0.983768437183592


['./data_hand_writing/svm_model.joblib']

In [11]:
def process_image(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

    if img is not None:
        img_resized = cv2.resize(img, (28, 28))
        _, optimal_thresh = cv2.threshold(img_resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # cv2.imshow('image', optimal_thresh)
        # cv2.waitKey(0)  # Đợi cho đến khi một phím được nhấn
        # cv2.destroyAllWindows()

        return optimal_thresh.flatten()
    
label_unicode_path = data_path + "label_unicode.csv"

data = pd.read_csv(label_unicode_path)
label_unicode_dict = pd.Series(data.Unicode.values, index=data.Label).to_dict()

print(label_unicode_dict)

{1: 'U+4E00', 2: 'U+4E07', 3: 'U+4E0B', 4: 'U+4E8C', 5: 'U+4ECA', 6: 'U+56DB', 7: 'U+5973', 8: 'U+5B66', 9: 'U+5B89', 10: 'U+5C71', 11: 'U+5DE6', 12: 'U+5E74', 13: 'U+5E97', 14: 'U+5F8C', 15: 'U+624B', 16: 'U+65B0', 17: 'U+65E5', 18: 'U+66F8', 19: 'U+6765', 20: 'U+6BCD', 21: 'U+6BCE', 22: 'U+6C34', 23: 'U+706B', 24: 'U+767E', 25: 'U+793E', 26: 'U+7A7A', 27: 'U+8033', 28: 'U+82B1', 29: 'U+884C', 30: 'U+897F', 31: 'U+898B', 32: 'U+8A00', 33: 'U+8A9E', 34: 'U+8AAD', 35: 'U+8CB7', 36: 'U+8ECA', 37: 'U+9053', 38: 'U+9577', 39: 'U+9593', 40: 'U+96E8', 41: 'U+96FB', 42: 'U+98DF', 43: 'U+98F2', 44: 'U+99C5', 45: 'U+9AD8', 46: 'U+9B5A'}


In [18]:
path = './test_image/6.jpg'

img = process_image(path)
img = pca.transform([img])

predict = svm_model2.predict(img)
print(predict, label_unicode_dict[predict[0]], chr(int(label_unicode_dict[predict[0]][2:], 16)))

image_path = cv2.imread(path)
cv2.imshow('image', image_path)
cv2.waitKey(0) 
cv2.destroyAllWindows()

[43] U+98F2 飲
