# Dimensionality Reduction

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import numpy as np
import umap

In [None]:
%matplotlib inline
sns.set()
sns.set(rc={"figure.figsize": (15, 12)})

In [None]:
# Seed passed to the t-SNE and UMAP estimators below so their embeddings are repeatable.
RANDOM_STATE = 42

## Defining plotting functions

In [None]:
def plot_2d(x, y, c, title, xlabel=None, ylabel=None):
    """Draw a 2-D scatter plot coloured by ``c`` and display it.

    Args:
        x: Horizontal coordinates of the points.
        y: Vertical coordinates of the points.
        c: Per-point values handed to ``plt.scatter`` as colours; the colormap
            is built with one entry per distinct value found in ``c``.
        title: Text shown above the plot.
        xlabel: Optional x-axis label; the axis stays unlabelled when None.
        ylabel: Optional y-axis label; the axis stays unlabelled when None.
    """
    sns.set_style("darkgrid")

    # Discrete colormap sized to the number of distinct values in c.
    palette = sns.color_palette('deep', n_colors=len(np.unique(c)))
    cmap = ListedColormap(palette.as_hex())

    plt.scatter(x, y, c=c, cmap=cmap, s=70)
    plt.title(title, fontsize=20, y=1.03)

    # Axis labels are opt-in so existing four-argument calls render unchanged.
    if xlabel is not None:
        plt.xlabel(xlabel, fontsize=16)
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize=16)
    plt.show()

In [None]:
def plot_3d(x, y, z, c, title, xlabel=None, ylabel=None, zlabel=None):
    """Draw a 3-D scatter plot coloured by ``c`` and display it.

    Args:
        x: Coordinates along the first axis.
        y: Coordinates along the second axis.
        z: Coordinates along the third axis.
        c: Per-point values handed to ``scatter`` as colours; the colormap
            is built with one entry per distinct value found in ``c``.
        title: Text shown above the plot.
        xlabel: Optional x-axis label; omitted when None.
        ylabel: Optional y-axis label; omitted when None.
        zlabel: Optional z-axis label; omitted when None.
    """
    sns.set_style('whitegrid')

    # Discrete colormap sized to the number of distinct values in c.
    palette = sns.color_palette('deep', n_colors=len(np.unique(c)))
    cmap = ListedColormap(palette.as_hex())

    # Always open a new figure: asking for figure number 1 would reuse an
    # already-open figure 1 (ignoring figsize) and stack another axes on it.
    fig = plt.figure(figsize=(15, 12))
    ax = fig.add_subplot(111, projection="3d")

    ax.scatter(x, y, z, c=c, cmap=cmap, s=40)
    ax.set_title(title, fontsize=20, y=1.03)

    # Axis labels are opt-in so existing five-argument calls render unchanged.
    label_size = 14
    labelled_axes = (
        (ax.set_xlabel, xlabel),
        (ax.set_ylabel, ylabel),
        (ax.set_zlabel, zlabel),
    )
    for set_label, text in labelled_axes:
        if text is not None:
            set_label(text, fontsize=label_size)

    # Tick values are hidden on all three axes.
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_ticklabels([])
    plt.show()

## Create Dataset

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 1500 samples, 6 features (2 of them
# informative), one cluster per class. Seeded with the notebook-wide
# RANDOM_STATE so the data and the reducers below share a single seed.
X, y = make_classification(
    n_samples=1500,
    n_features=6,
    n_informative=2,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=RANDOM_STATE,
)

# A 3-D scatter can show only three columns, so the first three of the six
# feature columns are plotted, coloured by class label.
plot_3d(
    x=X[:, 0],
    y=X[:, 1],
    z=X[:, 2],
    c=y,
    title="Visualization of dataset",
)

### 2D Plotting with PCA

In [None]:
# Fit PCA on X and keep each sample's scores on the first two components.
pca = PCA(n_components=2)
points = pca.fit_transform(X)

In [None]:
# Scatter the two PCA score columns, coloured by the labels in y.
plot_2d(points[:, 0], points[:, 1], c=y, title='X visualized with PCA')

### 2D plotting with t-SNE

#### How to use t-SNE Effectively
https://distill.pub/2016/misread-tsne/

In [None]:
# The iteration budget is left at TSNE's default of 1000. Passing it as
# `n_iter=1000` breaks on recent scikit-learn, where the keyword was renamed
# to `max_iter` (1.5) and the old name removed (1.7).
tsne = TSNE(n_components=2, random_state=RANDOM_STATE, perplexity=30)
points = tsne.fit_transform(X)

In [None]:
# Scatter the two t-SNE embedding columns, coloured by the labels in y.
plot_2d(points[:, 0], points[:, 1], c=y, title='X visualized with t-SNE')

t-SNE minimizes the Kullback-Leibler (KL) divergence between the high-dimensional probability distribution and the low-dimensional probability distribution. A lower KL divergence indicates a better embedding.

In [None]:
# KL divergence of the fitted t-SNE model; as the cell's last expression it is shown as the output.
tsne.kl_divergence_

### 2D plotting with UMAP

In [None]:
# Named `umap_model` rather than `map` so Python's built-in `map` is not
# shadowed for the rest of the notebook session.
umap_model = umap.UMAP(
    n_neighbors=10,
    n_components=2,
    n_epochs=5000,
    random_state=RANDOM_STATE,
)
points = umap_model.fit_transform(X)

In [None]:
# Scatter the two UMAP embedding columns, coloured by the labels in y.
plot_2d(points[:, 0], points[:, 1], c=y, title='X visualized with UMAP')

### 3D plotting with PCA

In [None]:
# Fit PCA on X and keep each sample's scores on the first three components.
pca = PCA(n_components=3)
points = pca.fit_transform(X)

In [None]:
# 3-D scatter of the three PCA score columns, coloured by the labels in y.
plot_3d(
    points[:, 0], points[:, 1], points[:, 2],
    c=y, title="X dataset visualized with PCA",
)

### 3D plotting with t-SNE

In [None]:
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed the
# old name, so pick whichever keyword the installed version accepts.
tsne_iter_kwarg = (
    "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
)
tsne = TSNE(
    n_components=3,
    random_state=RANDOM_STATE,
    perplexity=30,
    **{tsne_iter_kwarg: 5000},
)
points = tsne.fit_transform(X)

In [None]:
# 3-D scatter of the three t-SNE embedding columns, coloured by the labels in y.
plot_3d(
    points[:, 0], points[:, 1], points[:, 2],
    c=y, title="X dataset visualized with t-SNE",
)

### 3D plotting with UMAP

In [None]:
# Named `umap_model` rather than `map` so Python's built-in `map` is not
# shadowed for the rest of the notebook session.
umap_model = umap.UMAP(
    n_neighbors=30,
    n_components=3,
    n_epochs=5000,
    random_state=RANDOM_STATE,
)
points = umap_model.fit_transform(X)

In [None]:
# 3-D scatter of the three UMAP embedding columns, coloured by the labels in y.
plot_3d(
    points[:, 0], points[:, 1], points[:, 2],
    c=y, title="X dataset visualized with UMAP",
)

### Criteria for choosing the number of PCA components

In [None]:
# When n_components is not set, all the components are kept.
pca = PCA()
points = pca.fit_transform(X)

#### Kaiser Criterion

In [None]:
def scree_plot(model=None):
    """Plot the eigenvalue of every principal component (scree plot).

    A horizontal reference line is drawn at eigenvalue 1, the cut-off used by
    the Kaiser criterion.

    Args:
        model: Fitted estimator exposing ``explained_variance_``. When
            omitted, the notebook-level ``pca`` object is used, which keeps
            the original zero-argument call working.
    """
    from matplotlib.ticker import MaxNLocator

    fitted = pca if model is None else model

    # Everything is drawn through the Axes object so the plot does not depend
    # on which figure pyplot currently considers active.
    ax = plt.figure().gca()
    ax.plot(fitted.explained_variance_)
    # Component indices are integers, so fractional ticks are suppressed.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_xlabel('Principal Component')
    ax.set_ylabel('Eigenvalue')
    # NOTE(review): the eigenvalue > 1 rule is usually applied to standardized
    # features; confirm whether X should be scaled before fitting.
    ax.axhline(y=1, linewidth=1, color='r', alpha=0.5)
    ax.set_title('Scree Plot of PCA: Component Eigenvalues')
    plt.show()


scree_plot()

#### Explained Variance

In [None]:
# Fraction of the total variance explained by each principal component.
exp_var_pca = pca.explained_variance_ratio_

# Running total of those fractions, drawn as the step curve below.
# Despite its name, this variable holds cumulative variance *ratios*,
# not cumulative eigenvalues.
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

# Bars show the per-component share; the step line shows the cumulative share.
plt.bar(
    range(len(exp_var_pca)),
    exp_var_pca,
    alpha=0.5,
    align='center',
    label='Individual explained variance',
)
plt.step(
    range(len(cum_sum_eigenvalues)),
    cum_sum_eigenvalues,
    where='mid',
    label='Cumulative explained variance',
)
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
# Reference line at 90% cumulative explained variance.
plt.axhline(y=0.9, linewidth=1, color='r', alpha=0.5)
plt.legend(loc='best')
plt.tight_layout()
plt.show()

### t-SNE at different levels of perplexity

In [None]:
import inspect
from time import time

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import NullFormatter

from sklearn import datasets, manifold

n_samples = 150
n_components = 2
perplexities = [5, 30, 50, 100]

# One panel for the raw curve plus one per perplexity value, so the grid
# follows the list above instead of a separately hard-coded column count.
(fig, subplots) = plt.subplots(1, len(perplexities) + 1, figsize=(20, 4))

# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed the
# old name, so pick whichever keyword the installed version accepts.
tsne_iter_kwarg = (
    "max_iter"
    if "max_iter" in inspect.signature(manifold.TSNE).parameters
    else "n_iter"
)

# Another example using s-curve
# NOTE: this rebinds X, which previously held the make_classification data.
X, color = datasets.make_s_curve(n_samples, random_state=0)

# Left-most panel: the original curve, using columns 0 and 2 of X.
ax = subplots[0]
ax.scatter(X[:, 0], X[:, 2], c=color)
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())

# Remaining panels: one t-SNE embedding per perplexity, timed individually.
for i, perplexity in enumerate(perplexities):
    ax = subplots[i + 1]

    t0 = time()
    tsne = manifold.TSNE(
        n_components=n_components,
        init="random",
        random_state=0,
        perplexity=perplexity,
        learning_rate="auto",
        **{tsne_iter_kwarg: 300},
    )
    Y = tsne.fit_transform(X)
    t1 = time()
    print("S-curve, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))

    ax.set_title("Perplexity=%d" % perplexity)
    ax.scatter(Y[:, 0], Y[:, 1], c=color)
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.axis("tight")


## Loading the IRIS dataset

In [None]:
# Load IRIS, create a pandas DataFrame, and display the first 10 rows of the dataset


### Plotting first two components

In [None]:
# Plot the first 2 components with the given plotting function 


### 2D Plotting with PCA

In [None]:
# Fit PCA


In [None]:
# Plot the results


### 2D plotting with t-SNE

In [None]:
# Fit with t-SNE


In [None]:
# Plot


### 2D plotting with UMAP

In [None]:
# Fit with UMAP


In [None]:
# Plot


### 3D plotting with PCA

In [None]:
# Fit with PCA the first 3 components


In [None]:
# Plot in 3D with the given function 


### 3D plotting with t-SNE

In [None]:
# Fit with t-SNE in 3D


In [None]:
# Plot in 3D


### 3D plotting with UMAP

In [None]:
# Fit UMAP in 3D


In [None]:
# Plot in 3D


## Load MNIST Dataset

In [None]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# data from sklearn datasets


# Extract data & target from the dataset


In [None]:
# Reshape the pixel data into 28x28


### 2D Plotting with PCA

In [None]:
# Fit with PCA in 2D


In [None]:
# Plot in 2D


### 2D plotting with t-SNE

In [None]:
# Fit with t-SNE in 2D


In [None]:
# Plot in 2D


### 2D plotting with UMAP

In [None]:
# Fit with UMAP in 2D


In [None]:
# Plot in 2D


### 3D plotting with PCA

In [None]:
# Fit with PCA in 3D


In [None]:
# Plot in 3D


### 3D plotting with t-SNE

In [None]:
# Fit with t-SNE in 3D


In [None]:
# Plot in 3D


### 3D plotting with UMAP

In [None]:
# Fit with UMAP in 3D


In [None]:
# Plot in 3D
