This notebook illustrates how k-means can be used as a preprocessing step to improve the performance of a classifier on images of the digits 0-9.
There are 1797 images of size 8x8, each flattened to a vector of size 64.


In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
# Load the digits dataset as a (features, labels) pair instead of a Bunch object.
# Per the introduction above, each row of X_digits is one flattened 8x8 image.
X_digits, y_digits = load_digits(return_X_y=True)

The images are transformed into an n-dimensional space (try n = 10, 15 and 20), in which each image is described by its distances to the n cluster centers.
Note that the original dimension is 64.

In [3]:
n_clusters = 20  # It would seem natural to choose 10 clusters, but some digits can be written in different ways...

# Settings shared by both classifiers, so that the baseline and the pipeline
# differ only in the k-means preprocessing step.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before upgrading.
logreg_params = dict(multi_class="ovr", solver="lbfgs", max_iter=5000)

# Per-trial accuracy gains. Collected in a list and converted to an array once,
# rather than re-allocating an array with np.append on every iteration.
gains = []

# The seed drives the train/test split, k-means and the classifier, so results
# vary from trial to trial; we average over several trials.
for random_state in range(0, 10):

    X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=random_state)

    # Baseline: direct classification using Logistic Regression on the raw pixels
    log_reg = LogisticRegression(random_state=random_state, **logreg_params)
    log_reg.fit(X_train, y_train)
    before = log_reg.score(X_test, y_test)

    # Use k-means before classifying with Logistic Regression: inside the
    # pipeline, KMeans replaces each image by its distances to the cluster centers.
    # n_init is set explicitly because its default differs between scikit-learn
    # releases (and some releases warn when it is left unset).
    pipeline = Pipeline([
        ("kmeans", KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)),
        ("logreg", LogisticRegression(random_state=random_state, **logreg_params)),
    ])
    pipeline.fit(X_train, y_train)
    after = pipeline.score(X_test, y_test)

    # Compare the results
    print('test {} improved accuracy by {:.4f}'.format(random_state, after - before))
    print('     accuracy is now {:.4f}'.format(after))
    gains.append(after - before)

# Keep `improvement` as a numpy array of per-trial gains.
improvement = np.array(gains)
print('average improvement = {:.4f}'.format(np.mean(improvement)))
# Note: for these 8x8 images the improvement is modest
# (about 0.02 on average in the recorded run), but it is not zero.

test 0 improved accuracy by 0.0244
     accuracy is now 0.9756
test 1 improved accuracy by 0.0044
     accuracy is now 0.9778
test 2 improved accuracy by 0.0178
     accuracy is now 0.9644
test 3 improved accuracy by 0.0533
     accuracy is now 0.9844
test 4 improved accuracy by 0.0222
     accuracy is now 0.9867
test 5 improved accuracy by 0.0133
     accuracy is now 0.9778
test 6 improved accuracy by 0.0156
     accuracy is now 0.9689
test 7 improved accuracy by 0.0289
     accuracy is now 0.9733
test 8 improved accuracy by 0.0178
     accuracy is now 0.9800
test 9 improved accuracy by 0.0222
     accuracy is now 0.9778
average improvement = 0.0220
