In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from time import time
from matplotlib import offsetbox
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline
# Initialise plotly's offline notebook mode up front; later cells draw their
# figures with plotly.offline.iplot().
plotly.offline.init_notebook_mode(connected=True)
# Default size applied to matplotlib figures created later in the notebook.
matplotlib.rcParams["figure.figsize"] = [10, 8]

# Iris dataset shared by the plotly cells below.
iris = datasets.load_iris()

# Indices of the three iris.data columns shown in the 3-D views.
dims = [0, 2, 3]

# Axis-label names, indexed the same way as the columns of iris.data.
columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

In [None]:
# DATA: 3 dimensions
# Scatter the three selected raw features; each marker is colored by its class label.
x_col, y_col, z_col = dims
trace = go.Scatter3d(
    x=iris.data[:, x_col],
    y=iris.data[:, y_col],
    z=iris.data[:, z_col],
    mode='markers',
    marker={'color': iris.target},
)
data = [trace]

# Each scene axis is titled with the name of the feature plotted along it.
scene = {
    'xaxis': {'title': columns[x_col]},
    'yaxis': {'title': columns[y_col]},
    'zaxis': {'title': columns[z_col]},
}
layout = go.Layout(title='Iris dataset:', scene=scene)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# PCA on selected dims
# PC1, PC2 and PC3 can be read off the previous plot: they are the orthogonal
# axes along which the points have the most, second-most and least variance.
selected = iris.data[:, dims]
x_pca = PCA(n_components=3).fit_transform(selected)

# Rows of the transpose are the individual principal-component columns.
pc1, pc2, pc3 = x_pca.T
trace = go.Scatter3d(
    x=pc1,
    y=pc2,
    z=pc3,
    mode='markers',
    marker={'color': iris.target},
)
data = [trace]

scene = {
    axis: {'title': label}
    for axis, label in zip(('xaxis', 'yaxis', 'zaxis'), ('PC1', 'PC2', 'PC3'))
}
layout = go.Layout(title='Iris dataset:', scene=scene)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# PCA on all dims
# PCA is a simple, effective way to reduce dimensions: every column of
# iris.data goes in and three principal components come out.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(iris.data)

marker = {'color': iris.target}  # one color per class label
trace = go.Scatter3d(
    x=x_pca[:, 0],
    y=x_pca[:, 1],
    z=x_pca[:, 2],
    mode='markers',
    marker=marker,
)
data = [trace]

scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
layout = go.Layout(title='Iris dataset:', scene=scene)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# T-SNE
# We really only need 2 dimensions to cluster.
# "It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler
# divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data."
# -sklearn
#
# t-SNE's optimisation is stochastic, so without a seed the embedding (and the
# plot) changes on every run. random_state=0 matches the seed used by the
# digits t-SNE cell and makes Restart & Run All reproduce this figure.
x_tsne = TSNE(n_components=2, random_state=0).fit_transform(iris.data)
trace = go.Scatter(x=x_tsne[:, 0],
                   y=x_tsne[:, 1],
                   mode='markers',
                   marker=dict(color=iris.target))  # color markers by class label
data = [trace]

layout = go.Layout(title='Iris dataset:',
                   xaxis={'title': 't-SNE 1'},
                   yaxis={'title': 't-SNE 2'})

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# t-SNE again, this time embedding the iris data into 3 dimensions.
# The seed is fixed because t-SNE's optimisation is stochastic: without it the
# embedding differs on every run. random_state=0 matches the digits t-SNE cell.
x_tsne = TSNE(n_components=3, random_state=0).fit_transform(iris.data)
trace = go.Scatter3d(x=x_tsne[:, 0],
                     y=x_tsne[:, 1],
                     z=x_tsne[:, 2],
                     mode='markers',
                     marker=dict(color=iris.target))  # color markers by class label
data = [trace]

layout = go.Layout(title='Iris dataset:',
                   scene=dict(
                       xaxis={'title': 't-SNE 1'},
                       yaxis={'title': 't-SNE 2'},
                       zaxis={'title': 't-SNE 3'}
                   ))

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
## Digits
# Code adapted from: 
# https://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#sphx-glr-auto-examples-manifold-plot-lle-digits-py
# Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Gael Varoquaux
# License: BSD 3 clause (C) INRIA 2011

In [None]:
# Digits dataset used by the remaining cells, which read digits.data,
# digits.target and digits.images.
digits = datasets.load_digits()

In [None]:
# NOTE(review): n_neighbors is not referenced by any visible cell; presumably a
# leftover from the adapted scikit-learn example. Confirm before removing.
n_neighbors = 30


#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, y, title=None, images=None, min_dist=4e-3):
    """Plot a 2-D embedding as colored text labels with image thumbnails.

    Every column of ``X`` is min-max scaled before drawing. Each sample is
    drawn as the text ``str(y[i])`` at its scaled position, and a thumbnail
    ``images[i]`` is added for samples that are not too close to a thumbnail
    that has already been placed.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Embedding coordinates (the notebook passes 2-component PCA / t-SNE
        output).
    y : sequence
        Per-sample labels; ``y[i] / 10.`` selects the text color from
        ``plt.cm.Set1``.
    title : str, optional
        Axes title; no title is set when ``None``.
    images : sequence, optional
        Thumbnail image for each sample, indexed like ``X``. Defaults to the
        notebook-level ``digits.images``, which is what the function always
        used before this parameter existed.
    min_dist : float, optional
        Squared distance, in scaled coordinates, below which a sample counts
        as too close to an existing thumbnail and gets none of its own.

    Returns
    -------
    None
        The figure is created through pyplot as a side effect.
    """
    X = np.asarray(X)
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero range; dividing by it would turn every
    # position into NaN. Dividing by 1 instead maps such a column to 0.
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    if images is None:
        # Backward-compatible default: the digits dataset loaded by the notebook.
        images = digits.images

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        ax.text(X[i, 0], X[i, 1], str(y[i]),
                color=plt.cm.Set1(y[i] / 10.),
                fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        # Seed the list of placed thumbnails with the (1, 1) corner so the
        # distance computation below always has at least one row.
        shown_images = np.array([[1., 1.]])
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < min_dist:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)

    # Hide the ticks: the scaled coordinates carry no meaning of their own.
    ax.set_xticks([])
    ax.set_yticks([])
    if title is not None:
        ax.set_title(title)


#----------------------------------------------------------------------
# Plot images of the digits
# The first n_img_per_row**2 samples are tiled row by row into one canvas.
# Every sample gets a 10x10 cell and its 8x8 image is pasted at offset 1
# inside that cell.
n_img_per_row = 10
side = 10 * n_img_per_row
img = np.zeros((side, side))
for k in range(n_img_per_row * n_img_per_row):
    row, col = divmod(k, n_img_per_row)
    top, left = 10 * row + 1, 10 * col + 1
    img[top:top + 8, left:left + 8] = digits.data[k].reshape((8, 8))

plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset')

In [None]:
#----------------------------------------------------------------------
# Projection on to the first 2 principal components

print("Computing PCA projection")
t0 = time()
X_pca = PCA(n_components=2).fit_transform(digits.data)
# Elapsed time is taken before plotting starts, so it covers the PCA step only.
elapsed = time() - t0
pca_title = "Principal Components projection of the digits (time %.2fs)" % elapsed
plot_embedding(X_pca, digits.target, pca_title)

In [None]:
#----------------------------------------------------------------------
# t-SNE embedding of the digits dataset
print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, init='pca', random_state=0)

# The timer starts after the estimator is built and stops before plotting,
# so it measures fit_transform alone.
t0 = time()
X_tsne = tsne.fit_transform(digits.data)
elapsed = time() - t0

tsne_title = "t-SNE embedding of the digits (time %.2fs)" % elapsed
plot_embedding(X_tsne, digits.target, tsne_title)

plt.show()