# Comparison of dimensionality reduction on digits dataset

We'll now revisit all the dimensionality reduction algorithms we've covered so far and compare them on a common dataset.

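The dataset we'll use is scikit-learn's built-in collection of 8×8 handwritten digit images (64 features per sample), a small-scale counterpart to the commonly used [dataset of handwritten digits](http://yann.lecun.com/exdb/mnist/) known as MNIST.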

In [68]:
# Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Gael Varoquaux
# License: BSD 3 clause (C) INRIA 2011

# Prints the module-level docstring.
# NOTE(review): presumably carried over from a standalone script; confirm it
# is still meaningful when this runs as a notebook cell.
print(__doc__)
from time import time

import ipywidgets
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)
import seaborn as sns
# Use seaborn's 'notebook' context with the 'white' style for every plot below.
sns.set(context='notebook', style='white')

# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
# Start from a clean slate by closing any figures that are still open.
plt.close('all')

# Load the digits dataset and expose the feature matrix and labels under the
# short names used throughout the rest of the notebook.
digits = datasets.load_digits()
X, y = digits.data, digits.target
n_samples, n_features = X.shape
# NOTE(review): not referenced in the cells visible here; presumably used by
# neighbour-based methods elsewhere -- confirm before removing.
n_neighbors = 30

# One 'husl' colour per distinct label; a label's colour is PALETTE[label].
# Alternative palette (disabled):
#     sns.diverging_palette(220, 20, n=10, center='dark')
PALETTE = sns.color_palette('husl', n_colors=len(np.unique(y)))

# Per-sample colours, in dataset order, for the clustermap row annotations.
row_colors = [PALETTE[label] for label in y]

#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    """Plot a 2-D embedding of the digits as coloured labels plus thumbnails.

    Each column of ``X`` is rescaled to the range [0, 1]. Every sample is then
    drawn as its digit label, coloured with ``PALETTE[label]``, and a thumbnail
    of the digit image is added wherever no other thumbnail sits close by.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        Embedded coordinates. Row ``i`` must describe sample ``i`` of the
        module-level ``digits`` dataset, because the labels and thumbnail
        images are looked up there by row index.
    title : str, optional
        Title of the figure; no title is set when ``None``.
    """
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero span; dividing by it would turn the whole
    # column into NaN. Use a span of 1 so such a column maps to 0 instead.
    span[span == 0] = 1
    X = (X - x_min) / span

    n_points = X.shape[0]

    plt.figure()
    ax = plt.subplot(111)
    for i in range(n_points):
        # Take text and colour from the same label source so they cannot
        # disagree if the global ``y`` is ever rebound.
        label = int(digits.target[i])
        plt.text(X[i, 0], X[i, 1], str(label),
                 color=PALETTE[label],
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        # Seed with the far corner of the unit square so the first distance
        # check has a point to compare against.
        shown_images = np.array([[1., 1.]])
        for i in range(n_points):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)


#----------------------------------------------------------------------
# Plot images of the digits
# Tile the first n_img_per_row ** 2 digits into one large image. Every digit
# gets a 10x10 cell; its 8x8 pixels start one pixel in from the cell's
# top-left corner, which leaves a blank gutter between neighbouring digits.
n_img_per_row = 20
img = np.zeros((10 * n_img_per_row,) * 2)
for k in range(n_img_per_row ** 2):
    i, j = divmod(k, n_img_per_row)
    top, left = 10 * i + 1, 10 * j + 1
    img[top:top + 8, left:left + 8] = X[k].reshape((8, 8))

# Show the mosaic without ticks or axis spines.
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset')
sns.despine(bottom=True, left=True)

In [71]:
# Menu entries for the method dropdown, written as "<family>: <algorithm>".
# explore_clustering dispatches on the "Clustering" prefix and on the
# trailing algorithm name, so the exact spelling matters.
methods = (
    ['Clustering: {}'.format(linkage)
     for linkage in ('complete', 'single', 'average', 'ward', 'centroid')]
    + ['Matrix decomposition: {}'.format(name) for name in ('PCA', 'ICA')]
    + ['Manifold learning: {}'.format(name) for name in ('MDS', 't-SNE')]
)


def explore_clustering(method, metric='euclidean'):
    """Run one unsupervised method on the digits dataset and plot the result.

    Parameters
    ----------
    method : str
        One of the entries of ``methods``, written "<family>: <algorithm>".
        Entries starting with "Clustering" draw a seaborn clustermap using
        the trailing word as the linkage method. Any other entry fits a 2-D
        embedding (PCA, ICA, MDS or t-SNE, chosen by the trailing name) and
        draws it with ``plot_embedding``.
    metric : str, optional
        Distance metric handed to the clustermap. It is ignored by the
        embedding methods, and is replaced by 'euclidean' for the 'ward' and
        'centroid' linkages, which are only defined for Euclidean distances.

    Raises
    ------
    ValueError
        If ``method`` is neither a clustering entry nor one of the four
        known embedding methods.
    """
    # Keep the full menu label for the plot title.
    fullname = str(method)
    if method.startswith('Clustering'):
        linkage = method.split()[-1]
        if linkage in ('ward', 'centroid'):
            # These linkages reject any other metric, so fall back rather
            # than fail on an otherwise valid dropdown combination.
            metric = 'euclidean'
        t0 = time()
        g = sns.clustermap(digits.data, row_colors=row_colors, method=linkage,
                           metric=metric, xticklabels=[], yticklabels=[])
        g.fig.suptitle('{} of the digits (time {:.2f}s)'.format(fullname, time()-t0))

    else:
        n_components = 2
        max_iter = 100
        random_state = 0
        n_init = 1
        if method.endswith('PCA'):
            estimator = decomposition.PCA(n_components=n_components)
        elif method.endswith('ICA'):
            estimator = decomposition.FastICA(max_iter=max_iter, n_components=n_components,
                                              random_state=random_state)
        elif method.endswith('MDS'):
            estimator = manifold.MDS(n_components=n_components, n_init=n_init,
                                     max_iter=max_iter, random_state=random_state)
        elif method.endswith('t-SNE'):
            estimator = manifold.TSNE(n_components=n_components, init='pca',
                                      random_state=random_state)
        else:
            # Fail with a clear message instead of an unbound ``estimator``.
            raise ValueError('Unknown method: {!r}'.format(fullname))

        t0 = time()
        smushed = estimator.fit_transform(digits.data)
        title = "{} embedding of the digits (time {:.2f}s)".format(fullname, time() - t0)
        plot_embedding(smushed, title)

    # Plot a legend by hand: one labelled placeholder bar per digit colour.
    fig, ax = plt.subplots(figsize=(1, 1))
    # A label's colour is PALETTE[label], so the palette index is the digit.
    for digit, color in enumerate(PALETTE):
        ax.bar(0, 0, color=color, label=digit)
    # Remove the placeholder rectangles from the axes. Assigning to
    # ``ax.patches`` is not possible on recent matplotlib, so remove each one.
    for patch in list(ax.patches):
        patch.remove()
    ax.legend(loc='center')
    ax.axis('off')


# Interactive front end: each dropdown is passed under the keyword it is
# meant to control. The trailing semicolon keeps the notebook from echoing
# the call's return value.
ipywidgets.interact(
    explore_clustering,
    metric=ipywidgets.Dropdown(
        options=['euclidean', 'cityblock'],
        value='euclidean',
        description='Distance metric',
    ),
    method=ipywidgets.Dropdown(
        options=methods,
        value='Matrix decomposition: PCA',
        description='Unsupervised learning method',
    ),
);

## Quiz 2.5.1

While you're playing with the dropdown menus, work on this [quiz](https://docs.google.com/forms/d/1Mx0rASlRzEi2BTFI1RxdAoQdAeTueTL5xZdtzAv8Kns/viewform). Some of the methods (especially the clustering ones) take a while to compute and plot, so be patient.