# Showcase of Dimensionality Reduction

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import make_swiss_roll

# Generate 1000 noisy swiss-roll samples; `t` is the second array returned by
# make_swiss_roll and is used to colour the points in the plots.
X, t = make_swiss_roll(
    n_samples=1000,
    noise=0.2,
    random_state=42,
)

In [None]:
# Display the generated sample array.
X

In [None]:
# 3-D scatter of the swiss-roll samples, coloured by t.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(projection="3d")

ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t)

ax.set_title("Swiss Roll in Ambient Space")
# Fix the camera angle used for this view.
ax.view_init(elev=12, azim=-66)

In [None]:
# 2-D projection of the swiss roll onto columns 0 and 1 of X, coloured by t.
fig, ax = plt.subplots(figsize=(6, 4))

ax.scatter(X[:, 0], X[:, 1], c=t)

# Label the axes so this projection can be told apart from the other
# coordinate-pair views, which share the same title.
ax.set_xlabel("X[:, 0]")
ax.set_ylabel("X[:, 1]")
ax.set_title("Swiss Roll in Ambient Space")

In [None]:
# 2-D projection of the swiss roll onto columns 0 and 2 of X, coloured by t.
fig, ax = plt.subplots(figsize=(6, 4))

ax.scatter(X[:, 0], X[:, 2], c=t)

# Label the axes so this projection can be told apart from the other
# coordinate-pair views, which share the same title.
ax.set_xlabel("X[:, 0]")
ax.set_ylabel("X[:, 2]")
ax.set_title("Swiss Roll in Ambient Space")

In [None]:
# 2-D projection of the swiss roll onto columns 1 and 2 of X, coloured by t.
fig, ax = plt.subplots(figsize=(6, 4))

ax.scatter(X[:, 1], X[:, 2], c=t)

# Label the axes so this projection can be told apart from the other
# coordinate-pair views, which share the same title.
ax.set_xlabel("X[:, 1]")
ax.set_ylabel("X[:, 2]")
ax.set_title("Swiss Roll in Ambient Space")

In [None]:
from sklearn import manifold

In [None]:
# Embed the swiss roll in two dimensions with Locally Linear Embedding
# (12 neighbours per point) and plot the result coloured by t.
lle_embedder = manifold.LocallyLinearEmbedding(
    n_neighbors=12,
    n_components=2,
    random_state=42,
)
X_lle = lle_embedder.fit_transform(X)

fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot()

ax.scatter(X_lle[:, 0], X_lle[:, 1], c=t)
ax.set_title("Locally Linear Embedding of Swiss Roll")

In [None]:
# Embed the swiss roll in two dimensions with t-SNE (PCA initialisation,
# automatic learning rate) and plot the result coloured by t.
tsne_embedder = manifold.TSNE(
    n_components=2,
    learning_rate="auto",
    init="pca",
    random_state=42,
)
X_tsne = tsne_embedder.fit_transform(X)

fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot()

ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=t)
ax.set_title("t-Distributed Stochastic Neighbor Embedding of Swiss Roll")

In [None]:
# Embed the swiss roll in two dimensions with multidimensional scaling and
# plot the result coloured by t.
mds_embedder = manifold.MDS(n_components=2, random_state=42)
X_mds = mds_embedder.fit_transform(X)

fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot()

ax.scatter(X_mds[:, 0], X_mds[:, 1], c=t)
ax.set_title("Multidimensional Scaling (MDS) of Swiss Roll")

In [None]:
# Embed the swiss roll in two dimensions with Isomap (library defaults apart
# from the output dimension) and plot the result coloured by t.
iso_embedder = manifold.Isomap(n_components=2)
X_iso = iso_embedder.fit_transform(X)

fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot()

ax.scatter(X_iso[:, 0], X_iso[:, 1], c=t)
ax.set_title("Isomap of Swiss Roll")

# Example: Dimensionality Reduction and Visualization of Digits


In [None]:
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.model_selection
import sklearn.metrics

We use the toy digit dataset provided by scikit-learn.

In [None]:
# Load scikit-learn's bundled digits dataset; the returned object exposes
# .data, .target and .DESCR, which the following cells read.
d = sklearn.datasets.load_digits()

In [None]:
# Print the dataset's description text.
print(d.DESCR)

In [None]:
# x holds the feature rows, y the corresponding digit labels.
x, y = d.data, d.target

In [None]:
# Inspect the dimensions of the feature array.
x.shape

In [None]:
# Inspect the dimensions of the label array.
y.shape

In [None]:
# Show the raw feature values of the first sample.
x[0]

In [None]:
# Show the label of the first sample.
y[0]

Each sample consists of 64 features, one for each pixel value of an 8x8 image array. We can reshape a sample into an 8x8 array in order to visualize it.

In [None]:
# Turn the flat feature vector at index 4 back into an 8x8 grid and draw it.
sample = x[4].reshape((8, 8))
plt.imshow(sample, cmap="binary")

In [None]:
# Show the first 100 digits in a 10x10 grid.  A larger square figure with
# hidden axes keeps the 8x8 thumbnails legible; at the default figure size
# the tick labels of 100 subplots crowd out the images.
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
for ax, row in zip(axes.flat, x[:100]):
    sample = row.reshape(8, 8)
    ax.imshow(sample, cmap='binary')
    ax.axis('off')

In [None]:
from sklearn.manifold import TSNE

# Reduce the digit feature vectors to two dimensions with t-SNE.
tsne = TSNE(
    n_components=2,
    random_state=42,
)
x_reduced = tsne.fit_transform(x)

In [None]:
# Scatter the 2-D embedding, colouring every point by its digit label, and
# attach a colorbar as the legend for the label colours.
fig, ax = plt.subplots(figsize=(8, 5))
points = ax.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y, cmap="jet")
ax.axis('off')
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
import plothelp

In [None]:
# plot_digits comes from the local plothelp module; presumably it draws the
# 2-D embedding annotated with the digit labels -- confirm against plothelp.
plothelp.plot_digits(x_reduced, y)
plt.show()

In [None]:
from sklearn.decomposition import PCA
import time

# Time a 2-component PCA projection of x and plot it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_pca_reduced = PCA(n_components=2, random_state=42).fit_transform(x)
t1 = time.perf_counter()

print(f"PCA took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_pca_reduced, y)
plt.show()

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

# Time a 2-component Locally Linear Embedding of x and plot it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_lle_reduced = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(x)
t1 = time.perf_counter()

print(f"LLE took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_lle_reduced, y)
plt.show()

## Full MNIST dimensionality reduction for viz

In [None]:
from sklearn.datasets import fetch_openml

import numpy as np
import matplotlib as mpl
import plothelp
import time

from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.pipeline import Pipeline

In [None]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML.  as_frame=False
# asks for array output rather than a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [None]:
# Cast the labels to unsigned 8-bit integers so they can be compared with
# integer digits later (presumably they arrive as strings -- confirm).
mnist.target = mnist.target.astype(np.uint8)

In [None]:
# Pick a reproducible random subset of m samples from the first 60000 rows.
m = 10000

np.random.seed(42)
shuffled = np.random.permutation(60000)
idx = shuffled[:m]

x, y = mnist['data'][idx], mnist['target'][idx]

In [None]:
# Inspect the dimensions of the selected subset.
x.shape

In [None]:
# Reduce the selected MNIST subset to two dimensions with t-SNE.
tsne = TSNE(
    n_components=2,
    random_state=42,
)
x_reduced = tsne.fit_transform(x)

In [None]:
# Scatter the 2-D embedding, colouring every point by its digit label, and
# attach a colorbar as the legend for the label colours.
fig, ax = plt.subplots(figsize=(8, 5))
points = ax.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y, cmap="jet")
ax.axis('off')
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Plot only the digits 2, 3 and 5 from the t-SNE embedding, each in the
# colour that "jet" assigns to digit / 9.
plt.figure(figsize=(8, 5))
# mpl.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap is the supported way to look a colormap up by name.
cmap = plt.get_cmap("jet")
for digit in (2, 3, 5):
    mask = y == digit
    # c takes a one-element list so the RGBA tuple is read as a single colour.
    plt.scatter(x_reduced[mask, 0], x_reduced[mask, 1], c=[cmap(digit / 9)])
plt.axis('off')
plt.show()

### Principal Component Analysis (PCA)

In [None]:
# Time a 2-component PCA projection of x and plot it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_pca_reduced = PCA(n_components=2, random_state=42).fit_transform(x)
t1 = time.perf_counter()

print(f"PCA took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_pca_reduced, y)
plt.show()

### Locally Linear Embedding (LLE)

In [None]:
# Time a 2-component Locally Linear Embedding of x and plot it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_lle_reduced = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(x)
t1 = time.perf_counter()

print(f"LLE took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_lle_reduced, y)
plt.show()

### PCA + LLE

In [None]:
# Chain PCA (n_components=0.95, i.e. keep enough components to explain 95% of
# the variance) with a 2-component LLE, then time and plot the result.
pca_lle = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
])

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_pca_lle_reduced = pca_lle.fit_transform(x)
t1 = time.perf_counter()

print(f"PCA+LLE took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_pca_lle_reduced, y)
plt.show()

### Multidimensional Scaling (MDS)

In [None]:
# Time a 2-component MDS embedding on only the first m samples of x and plot
# it.  The subset size is set before the timer starts so that only the fit
# itself is measured.
m = 2000

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(x[:m])
t1 = time.perf_counter()

# Report the sizes actually used rather than hard-coded numbers.
print(f"MDS took {t1 - t0:.1f}s (on just {m:,} MNIST images instead of {len(x):,}).")

plothelp.plot_digits(x_mds_reduced, y[:m])
plt.show()

### PCA + MDS

In [None]:
# Chain PCA (n_components=0.95, i.e. keep enough components to explain 95% of
# the variance) with a 2-component MDS, fitted on the first m samples only.
pca_mds = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("mds", MDS(n_components=2, random_state=42)),
])

m = 2000

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_pca_mds_reduced = pca_mds.fit_transform(x[:m])
t1 = time.perf_counter()

print(f"PCA+MDS took {t1 - t0:.1f}s (on {m:,} MNIST images).")

# Slice the labels with the same m as the features so the two always match.
plothelp.plot_digits(x_pca_mds_reduced, y[:m])
plt.show()

### Linear Discriminant Analysis (LDA)

In [None]:
# Time a 2-component Linear Discriminant Analysis projection of x and plot
# it.  Unlike the other reducers in this notebook, LDA is fitted with the
# labels y.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_lda_reduced = LinearDiscriminantAnalysis(n_components=2).fit_transform(x, y)
t1 = time.perf_counter()

print(f"LDA took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_lda_reduced, y, figsize=(12,12))
plt.show()

### t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
# Time a 2-component t-SNE embedding of x and plot it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_tsne_reduced = TSNE(n_components=2, random_state=42).fit_transform(x)
t1 = time.perf_counter()

print(f"t-SNE took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_tsne_reduced, y)
plt.show()

### PCA + t-SNE

In [None]:
# Chain PCA (n_components=0.95, i.e. keep enough components to explain 95% of
# the variance) with a 2-component t-SNE, then time and plot the result.
pca_tsne = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("tsne", TSNE(n_components=2, random_state=42)),
])

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
x_pca_tsne_reduced = pca_tsne.fit_transform(x)
t1 = time.perf_counter()

print(f"PCA+t-SNE took {t1 - t0:.1f}s.")

plothelp.plot_digits(x_pca_tsne_reduced, y)
plt.show()