# digits 분류하기

### (1) 필요한 모듈 import

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
import numpy as np

### (2) 데이터 준비

In [2]:
# Load the bundled handwritten-digits dataset and split it into the
# feature matrix (one row per image) and the label vector.
digits = load_digits()
digits_data, digits_label = digits.data, digits.target

In [3]:
# Show (n_samples, n_features) of the feature matrix.
print(digits_data.shape)

(1797, 64)


In [4]:
# List the fields available on the loaded dataset object.
print(digits.keys())

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])


In [5]:
# List the names of the 64 feature columns (one per pixel, named pixel_<i>_<j>).
print(digits.feature_names)

['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']


In [6]:
# Show the class labels the models have to predict (the digits 0-9).
print(digits.target_names)

[0 1 2 3 4 5 6 7 8 9]


In [7]:
# Print the free-text description that ships with the dataset.
print(digits.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

### (3) train, test 데이터 분리

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.2,
    random_state=7,
)

In [9]:
# Display the shapes of the training features and training labels.
x_train.shape, y_train.shape

((1437, 64), (1437,))

In [10]:
# Normalization
# Derive the scale factor from the training split only and apply that same
# factor to both splits. Dividing the test split by its own maximum would
# scale the two splits differently whenever their maxima differ, and would
# let a statistic of the held-out data leak into preprocessing.
pixel_max = np.max(x_train)
x_train_norm, x_test_norm = x_train / pixel_max, x_test / pixel_max

### (4) 모델 학습 및 예측

In [11]:
# Decision Tree
# Fit on the normalized training split (fit returns the estimator itself),
# then report per-class metrics on the held-out split.
decision_tree = DecisionTreeClassifier(random_state=32).fit(x_train_norm, y_train)
decision_tree_y_pred = decision_tree.predict(x_test_norm)

print(classification_report(y_test, decision_tree_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.81      0.81      0.81        42
           2       0.79      0.82      0.80        40
           3       0.79      0.91      0.85        34
           4       0.83      0.95      0.89        37
           5       0.90      0.96      0.93        28
           6       0.84      0.93      0.88        28
           7       0.96      0.82      0.89        33
           8       0.88      0.65      0.75        43
           9       0.78      0.78      0.78        32

    accuracy                           0.86       360
   macro avg       0.86      0.86      0.86       360
weighted avg       0.86      0.86      0.85       360



In [12]:
# RandomForest
# Seeded ensemble of trees; fit on the normalized training split and
# evaluate on the held-out split.
random_forest = RandomForestClassifier(random_state=32).fit(x_train_norm, y_train)
random_forest_y_pred = random_forest.predict(x_test_norm)

print(classification_report(y_test, random_forest_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      1.00      0.97        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       0.93      1.00      0.96        37
           5       0.90      0.96      0.93        28
           6       1.00      0.96      0.98        28
           7       0.94      0.97      0.96        33
           8       1.00      0.84      0.91        43
           9       0.94      0.94      0.94        32

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.97      0.96      0.96       360



In [13]:
# SVM
# Support vector classifier with the library's default settings.
svm_model = svm.SVC().fit(x_train_norm, y_train)
svm_y_pred = svm_model.predict(x_test_norm)

print(classification_report(y_test, svm_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      1.00      0.98        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.93      1.00      0.97        28
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        33
           8       1.00      0.93      0.96        43
           9       1.00      0.97      0.98        32

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



In [14]:
# SGD
# Seed the classifier like the tree-based models above: SGD shuffles the
# training data between epochs, so without a fixed random_state the scores
# change on every run and the comparison below is not reproducible.
# Scores from a seeded run can differ slightly from an unseeded one.
sgd_model = SGDClassifier(random_state=32)
sgd_model.fit(x_train_norm, y_train)
sgd_y_pred = sgd_model.predict(x_test_norm)

print(classification_report(y_test, sgd_y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        43
           1       0.92      0.86      0.89        42
           2       0.93      1.00      0.96        40
           3       0.89      0.97      0.93        34
           4       0.97      0.97      0.97        37
           5       0.90      1.00      0.95        28
           6       0.96      0.96      0.96        28
           7       0.97      0.97      0.97        33
           8       1.00      0.84      0.91        43
           9       0.94      0.94      0.94        32

    accuracy                           0.95       360
   macro avg       0.95      0.95      0.95       360
weighted avg       0.95      0.95      0.95       360



In [15]:
# Logistic Regression
# max_iter is set explicitly to 256 to give the solver a larger iteration budget.
logistic_model = LogisticRegression(max_iter=256).fit(x_train_norm, y_train)
logistic_y_pred = logistic_model.predict(x_test_norm)

print(classification_report(y_test, logistic_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.91      0.93      0.92        42
           2       0.95      1.00      0.98        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.90      1.00      0.95        28
           6       1.00      0.93      0.96        28
           7       0.97      1.00      0.99        33
           8       0.95      0.86      0.90        43
           9       0.94      0.91      0.92        32

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360



In [16]:
# Overall evaluation
# For digit recognition, what matters is how many of the samples that truly
# belong to a class were predicted as that class, so recall was judged the
# key metric and only its average is printed for each model.
# Note: the support-weighted average of per-class recall is numerically the
# same as overall accuracy.

recall_lines = (
    ('Decision Tree       : {}', decision_tree_y_pred),
    ('Random Forest       : {}', random_forest_y_pred),
    ('SVM                 : {}', svm_y_pred),
    ('SGD                 : {}', sgd_y_pred),
    ('Logistic Regression : {}', logistic_y_pred),
)
for line_template, predictions in recall_lines:
    print(line_template.format(recall_score(y_test, predictions, average='weighted')))

Decision Tree       : 0.8555555555555555
Random Forest       : 0.9638888888888889
SVM                 : 0.9888888888888889
SGD                 : 0.9472222222222222
Logistic Regression : 0.9611111111111111


In [17]:
# Conclusion: SVM is the best-suited model (highest weighted recall, about 0.989, among the five classifiers).