In [28]:
import numpy as np
from sklearn.datasets import load_digits

In [29]:
# Load the digits dataset as a feature matrix plus its vector of target labels.
data, labels = load_digits(return_X_y=True)

# Problem dimensions: rows/columns of the feature matrix, and the number of
# distinct target values (used later as the number of clusters).
n_samples, n_features = data.shape
n_digits = len(np.unique(labels))

In [30]:
# One-line summary of the dataset dimensions.
print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features: {n_features}"
)

# digits: 10; # samples: 1797; # features: 64


In [31]:
# Sanity check: array shapes followed by the derived dimensions, one per line.
print(data.shape, labels.shape, n_samples, n_features, n_digits, sep="\n")

(1797, 64)
(1797,)
1797
64
10


In [32]:
from time import time

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [45]:
def bench_k_marks(kmeans, name, data, labels, sample_size=300, random_state=0):
    """
    Benchmark to evaluate the KMeans initialization methods.

    Fits a ``StandardScaler -> kmeans`` pipeline on ``data`` and prints one
    tab-separated row containing: the name, the fit time, the inertia, five
    label-based clustering metrics and the silhouette score.

    Parameters
    ----------
    kmeans : KMeans instance
        Estimator already configured with the initialization to benchmark.
        It is fitted as the last step of the pipeline.
    name : str
        Label printed in the first column (padded to 9 characters).
    data : array-like
        Samples to cluster, as accepted by the scikit-learn pipeline.
    labels : array-like
        Ground-truth labels, used by the metrics that compare against the
        predicted cluster labels.
    sample_size : int or None, default=300
        Forwarded to ``metrics.silhouette_score``: size of the subsample the
        silhouette is estimated on.
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``metrics.silhouette_score`` so that the subsample, and
        therefore the reported silhouette, is the same on every run.

    Returns
    -------
    None
        The result row is printed, not returned.
    """
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0

    fitted_kmeans = estimator[-1]
    results = [name, fit_time, fitted_kmeans.inertia_]

    # Metrics that require only the true labels and the estimator labels.
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, fitted_kmeans.labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset; it is computed on the
    # `data` passed in, not on the scaled copy the pipeline clusters.
    # Seeding the subsample keeps this column reproducible between runs.
    results += [
        metrics.silhouette_score(
            data,
            fitted_kmeans.labels_,
            metric="euclidean",
            sample_size=sample_size,
            random_state=random_state,
        )
    ]

    # Show the results as a single tab-separated row.
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))

In [46]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [50]:
# Table header for the rows printed by bench_k_marks.
print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")

# The two stochastic seeding strategies differ only in `init`; each gets
# 4 restarts and a fixed seed, and is reported under its init name.
for init_method in ("k-means++", "random"):
    kmeans = KMeans(init=init_method, n_clusters=n_digits, n_init=4, random_state=0)
    bench_k_marks(kmeans=kmeans, name=init_method, data=data, labels=labels)

# Initialization from the PCA components: the starting centers are given
# explicitly, so a single run (n_init=1) is used.
pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_marks(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	0.059s	69662	0.680	0.719	0.699	0.570	0.695	0.173
random   	0.035s	69707	0.675	0.716	0.694	0.560	0.691	0.183
PCA-based	0.013s	72686	0.636	0.658	0.647	0.521	0.643	0.140


In [51]:
import matplotlib.pyplot as plt

# Project the data onto its first 2 principal components and cluster the
# projected points.
reduced_data = PCA(n_components=2).fit_transform(data)
# NOTE(review): no random_state is passed here, unlike the benchmark runs, so
# the fitted clusters may differ between executions.
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02 # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, build a mesh that covers the projected
# data, padded by 1 on every side; each mesh point is meant to be assigned a color.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:,0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1 , reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
