In [1]:
# 參數設定與導入需要套件
seed = 0    # 亂數種子數
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score

In [2]:
# 載入手寫數字資料集
X_digits, y_digits = load_digits(return_X_y=True)  #數值特徵  數字類別

# 特徵標準化(scale/StandardScaler)
data = scale(X_digits)

# 取出資料集的數字類別數
n_digits = len(np.unique(y_digits))
n_samples, n_features = data.shape

print("n_digits: %d, n_samples: %d, n_features: %d"
      % (n_digits, n_samples, n_features))

n_digits: 10, n_samples: 1797, n_features: 64


In [3]:
# 進行 PCA 降維後再做 K-Means，除以下參數設定外，其餘為預設值
# kmeans: init='k-means++', n_clusters=n_digits, n_init=10, random_state=seed
# PCA: n_components=(1~10), random_state=seed
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10, 
                random_state=seed)
for i in range(1,11):
    reduced_data = PCA(n_components=i, random_state=seed).fit_transform(data)
    kmeans.fit(reduced_data)
    print('PCA(n_components=%d)+KMeans Silhouette=%.4f' % (i,silhouette_score(data, 
                                                            kmeans.labels_, 
                                                            metric='euclidean')), 
                                                            end='')
    print(' Accuracy=%.4f' % accuracy_score(y_digits, kmeans.labels_))

PCA(n_components=1)+KMeans Silhouette=0.0005 Accuracy=0.1041
PCA(n_components=2)+KMeans Silhouette=0.0599 Accuracy=0.1441
PCA(n_components=3)+KMeans Silhouette=0.0973 Accuracy=0.0139
PCA(n_components=4)+KMeans Silhouette=0.1226 Accuracy=0.0334
PCA(n_components=5)+KMeans Silhouette=0.1269 Accuracy=0.0050
PCA(n_components=6)+KMeans Silhouette=0.1278 Accuracy=0.2014
PCA(n_components=7)+KMeans Silhouette=0.1271 Accuracy=0.0412
PCA(n_components=8)+KMeans Silhouette=0.1407 Accuracy=0.0089
PCA(n_components=9)+KMeans Silhouette=0.1460 Accuracy=0.0551
PCA(n_components=10)+KMeans Silhouette=0.1449 Accuracy=0.0723


##### 請填入n_components參數是多少時模型的輪廓係數為最大?請填入此時模型的輪廓係數？
##### n_components=10, Silhouette=0.1449

##### 請填入n_components參數是多少時模型的準確率為最大?請填入此時模型的準確率（四捨五入取至小數點後第四位）？
##### n_components=6, Accuracy=0.2014