## Dimensionality reduction on features with nonlinear relationships

Dimensionality reduction is most often performed with principal component analysis (PCA). However, because PCA is a linear projection, it often fails to capture the structure of high-dimensional data whose features are nonlinearly related to one another. This exercise compares two-component PCA with two nonlinear techniques, t-SNE and Isomap, by how well each one separates images of handwritten digits into distinct clusters in a two-dimensional embedding.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import Isomap
from sklearn.datasets import load_digits

In [None]:
# Fetch the handwritten-digit dataset and preview a few of its images
digits = load_digits()
fig, axes = plt.subplots(
    nrows=2,
    ncols=5,
    figsize=(10, 5),
    subplot_kw=dict(xticks=(), yticks=()),  # no tick marks on any panel
)
# zip() stops as soon as the 2x5 grid is full, so only the first ten
# images are drawn, one per panel.
for ax, img in zip(axes.flat, digits.images):
    ax.imshow(img, cmap=cm.gray)

In [None]:
# Define function to plot clusters of digits
def display_2d_component_names(model, selected, dataobj):
    """Plot a 2-D embedding, drawing each sample as its class name.

    Every sample whose target label appears in ``selected`` is rendered as
    text at the position given by the first two columns of ``model``.
    Samples with any other label are skipped.

    Parameters
    ----------
    model : 2-D array
        Embedded coordinates, indexed as ``model[i, 0]`` (x) and
        ``model[i, 1]`` (y) for sample ``i``.
    selected : sequence of hashable labels
        Target labels to draw. Each label is coloured by its position in
        this sequence, so distinct selected labels get distinct colours
        (until the ten-colour palette wraps around). An empty sequence
        produces an empty, labelled figure.
    dataobj : object
        Must expose ``target`` (one label per row of ``model``) and
        ``target_names`` (indexable by those labels), e.g. the object
        returned by ``load_digits()``.

    Returns
    -------
    None
        The figure is created through ``matplotlib.pyplot`` as a side effect.
    """
    colors = [
        "#476A2A",
        "#7851B8",
        "#BD3430",
        "#4A2D4E",
        "#875525",
        "#A83683",
        "#4E655E",
        "#853541",
        "#3A3120",
        "#535D8E",
    ]

    # Colour by position within `selected` rather than by `label % len(selected)`:
    # the modulo form maps different labels of a subset (e.g. 2 and 4) onto the
    # same colour, divides by zero for an empty selection, and can run past the
    # end of the palette. Wrapping on len(colors) keeps the lookup in range.
    label_colors = {
        label: colors[position % len(colors)]
        for position, label in enumerate(selected)
    }

    plt.figure(figsize=(14, 14))
    # The upper limits get one extra unit of room; presumably this keeps text
    # placed at the largest coordinates inside the axes.
    plt.xlim(model[:, 0].min(), model[:, 0].max() + 1)
    plt.ylim(model[:, 1].min(), model[:, 1].max() + 1)

    for i, label in enumerate(dataobj.target):
        if label not in label_colors:
            continue
        plt.text(
            model[i, 0],
            model[i, 1],
            str(dataobj.target_names[label]),
            color=label_colors[label],
            fontdict={'weight': 'bold', 'size': 9},
        )

    plt.xlabel("model feature 0")
    plt.ylabel("model feature 1")

In [None]:
# Fit a two-component PCA on the digits, project them, and plot the result
pca = PCA(n_components=2)
# fit() returns the estimator itself, so the projection can be chained on it
digits_pca = pca.fit(digits.data).transform(digits.data)
display_2d_component_names(
    model=digits_pca,
    selected=tuple(range(10)),  # draw all ten digit classes, 0 through 9
    dataobj=digits,
)

In [None]:
# Embed the digits with t-SNE (fixed seed for repeatable output) and plot them
tsne = TSNE(random_state=42)
digits_tsne = tsne.fit_transform(digits.data)
display_2d_component_names(
    model=digits_tsne,
    selected=tuple(range(10)),  # draw all ten digit classes, 0 through 9
    dataobj=digits,
)

In [None]:
# Embed the digits with a 10-neighbour, two-component Isomap and plot them
iso = Isomap(n_neighbors=10, n_components=2)
iso.fit(digits.data)
# Project the same samples through the fitted model
digits_iso = iso.transform(digits.data)
display_2d_component_names(
    model=digits_iso,
    selected=tuple(range(10)),  # draw all ten digit classes, 0 through 9
    dataobj=digits,
)