In [1]:
import pandas as pd
import numpy as np
from umap.umap_ import UMAP
import sklearn.datasets as sd
import altair as alt

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Lift Altair's default row limit so the larger DataFrames built in this notebook can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Instantiate the shared transformers used by the embedding cells.
pca = PCA(n_components=2)
# t-SNE and UMAP are stochastic: pin the seed so a fresh "Restart & Run All"
# reproduces the same embeddings (and therefore the same plots).
tsne = TSNE(random_state=42)
umap = UMAP(random_state=42)

# Shared standardiser; it is re-fitted on whatever data each caller passes in.
scaler = StandardScaler()

In [3]:
# Load the breast-cancer dataset: X holds the feature matrix, y the class names.
breast_cancer = sd.load_breast_cancer()
X = breast_cancer['data']

# Integer class code -> human-readable class name.
y_mapping = dict(zip([0, 1], breast_cancer['target_names']))

# One-column frame of labels, with the integer codes replaced by their names.
y = pd.DataFrame({'target': breast_cancer['target']})
y['target'] = y['target'].map(y_mapping)

In [4]:
# Display the names of the feature columns in the loaded dataset.
breast_cancer['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
def embed(data, transformer, log=False, scale=False, names=['x', 'y']):
    """Run ``data`` through ``transformer`` and return the result as a DataFrame.

    Parameters
    ----------
    data : array-like
        Input passed (after optional preprocessing) to ``transformer.fit_transform``.
    transformer : object
        Anything exposing ``fit_transform``.
    log : bool
        When equal to ``True``, ``np.log`` is applied to ``data`` first.
    scale : bool
        When equal to ``True``, ``data`` is passed through the module-level
        ``scaler.fit_transform`` (after the optional log step).
    names : list of str
        Column labels for the returned frame.

    Returns
    -------
    pandas.DataFrame
        The transformed data with columns ``names``.
    """
    prepared = np.log(data) if log == True else data
    if scale == True:
        prepared = scaler.fit_transform(prepared)
    return pd.DataFrame(transformer.fit_transform(prepared), columns=names)

In [6]:
def scree_plot(data, title="", bar_size=30):
    """Build a stacked-bar scree plot of PCA explained variance for ``data``.

    A PCA with all components is fitted to ``data``. Each component's bar is
    stacked from two parts: its own explained-variance ratio
    ("Single Component") and the ratio explained by all preceding components
    ("Cumulative"), so the full bar height is the running total.

    Parameters
    ----------
    data : array-like
        Passed directly to ``PCA().fit``; scale it beforehand if required.
    title : str
        Appended to "Scree Plot of " in the chart title. When empty, the
        title is just "Scree Plot".
    bar_size : number
        Accepted for backward compatibility with existing callers; it is not
        currently applied to the chart.

    Returns
    -------
    alt.Chart
        Chart with a default width of 300 (callers may override via
        ``.properties``).
    """
    evr = np.asarray(PCA().fit(data).explained_variance_ratio_, dtype=float)

    df = pd.DataFrame({
        "Components": [f"C{i}" for i in range(1, len(evr) + 1)],
        "Single Component": evr,
        # Variance already explained *before* this component:
        # running total minus the component's own share (single O(n) pass).
        "Cumulative": np.cumsum(evr) - evr,
    })
    # Components contributing no variance are left out of the plot.
    df = df.loc[df["Single Component"] > 0]
    df = df.melt(id_vars=["Components"], var_name="Share", value_name="Variance")

    # Avoid a dangling "Scree Plot of " when no title was supplied.
    chart_title = "Scree Plot of " + title if title else "Scree Plot"

    return alt.Chart(df, title=chart_title).mark_bar().encode(
        # Sorting by the stacked total keeps components in C1, C2, ... order
        # (a plain lexical sort would put C10 before C2).
        x=alt.X('Components:N', sort='y'),
        y=alt.Y('sum(Variance)', title="Explained Variance"),
        color="Share"
    ).properties(width=300)

In [7]:
# Scree plot of the standardised breast-cancer features, widened for readability.
scree_plot(
    scaler.fit_transform(X),
    title="Breast Cancer Dataset",
    bar_size=60,
).properties(width=600)

In [8]:
# Two-dimensional embeddings of the standardised breast-cancer data,
# one per method; column names carry the method so they can be joined later.
bc_pca_scaled = embed(
    X, pca, scale=True,
    names=['x_pca_scaled', 'y_pca_scaled'],
)
bc_tsne_scaled = embed(
    X, tsne, scale=True,
    names=['x_tsne_scaled', 'y_tsne_scaled'],
)
bc_umap_scaled = embed(
    X, umap, scale=True,
    names=['x_umap_scaled', 'y_umap_scaled'],
)

In [9]:
# Join the three embeddings and the class labels column-wise into one frame for plotting.
bc_embeddings = pd.concat(
    [bc_pca_scaled, bc_tsne_scaled, bc_umap_scaled, y],
    axis="columns",
)

In [10]:
# Scatter of the PCA embedding, coloured by class, with grid lines hidden.
(
    alt.Chart(bc_embeddings, title="Breast Cancer PCA Embedding")
    .mark_circle()
    .encode(x='x_pca_scaled', y='y_pca_scaled', color='target:N')
    .configure_axis(grid=False)
)

In [11]:
# Scatter of the t-SNE embedding, coloured by class. Titled like the PCA chart
# so each figure is identifiable on its own.
alt.Chart(bc_embeddings, title="Breast Cancer t-SNE Embedding").mark_circle().encode(
    x='x_tsne_scaled',
    y='y_tsne_scaled',
    color='target:N'
).configure_axis(
    grid=False
)

In [12]:
# Scatter of the UMAP embedding, coloured by class. Titled like the PCA chart
# so each figure is identifiable on its own.
alt.Chart(bc_embeddings, title="Breast Cancer UMAP Embedding").mark_circle().encode(
    x='x_umap_scaled',
    y='y_umap_scaled',
    color='target:N'
).configure_axis(
    grid=False
)

In [13]:
# Load the digits dataset; this rebinds X, y_mapping and y from the previous dataset.
digits = sd.load_digits()
X = digits['data']

# Position in target_names -> the name stored at that position.
y_mapping = dict(enumerate(digits['target_names']))

# One-column frame of labels, mapped through y_mapping.
y = pd.DataFrame({'target': digits['target']})
y['target'] = y['target'].map(y_mapping)

In [14]:
# Scree plot of the standardised digits features, enlarged to fit the many components.
scree_plot(
    scaler.fit_transform(X),
    title="Digits Dataset",
    bar_size=0.1,
).properties(height=350, width=700)

In [15]:
# Embed the standardised digits with PCA and UMAP, then join with the labels.
d_pca_scaled = embed(X, pca, scale=True, names=['x_pca_scaled', 'y_pca_scaled'])
# UMAP is stochastic: fix the seed so the plotted embedding is reproducible across runs.
d_umap_scaled = embed(
    X,
    UMAP(n_neighbors=20, min_dist=.5, random_state=42),
    scale=True,
    names=['x_umap_scaled', 'y_umap_scaled'],
)
d_embeddings = pd.concat([d_pca_scaled, d_umap_scaled, y], axis=1)

In [16]:
# 600x600 scatter of the digits PCA embedding, coloured by digit, grid hidden.
(
    alt.Chart(d_embeddings, title="Digits PCA Embedding")
    .mark_circle()
    .encode(x='x_pca_scaled', y='y_pca_scaled', color='target:N')
    .properties(width=600, height=600)
    .configure_axis(grid=False)
)

In [17]:
# 600x600 scatter of the digits UMAP embedding, coloured by digit, grid hidden.
(
    alt.Chart(d_embeddings, title="Digits UMAP Embedding")
    .mark_circle()
    .encode(x='x_umap_scaled', y='y_umap_scaled', color='target:N')
    .properties(width=600, height=600)
    .configure_axis(grid=False)
)

In [28]:
# Grid of UMAP hyper-parameters to explore interactively.
n_min, n_max, n_step = 30, 300, 30
min_dist_range = [0, 0.001, .1, .5, 1]

# Range slider bound to the `n_neighbors` field; initialised to the smallest value.
slider_neighbors = alt.binding_range(min=n_min, max=n_max, step=n_step)
select_neighbors = alt.selection_single(
    name="neighbors",
    fields=['n_neighbors'],
    bind=slider_neighbors,
    init={'n_neighbors': n_min},
)

# Drop-down (select binding) over the `min_dist` field; initialised to the first option.
slider_mindist = alt.binding_select(options=min_dist_range)
select_mindist = alt.selection_single(
    name="min_dist",
    fields=['min_dist'],
    bind=slider_mindist,
    init={'min_dist': min_dist_range[0]},
)

In [27]:
def embed(data, transformer, log=False, scale=False, names=['x', 'y'], n=None, min_dist=None, target=None):
    """Run ``data`` through ``transformer`` and return a labelled DataFrame.

    Parameters
    ----------
    data : array-like
        Input passed (after optional preprocessing) to ``transformer.fit_transform``.
    transformer : object
        Anything exposing ``fit_transform``.
    log : bool
        Apply ``np.log`` to ``data`` before anything else.
    scale : bool
        Pass ``data`` through the module-level ``scaler.fit_transform``
        (after the optional log step).
    names : list of str
        Column labels for the transformed coordinates.
    n : optional
        If given, stored in an ``n_neighbors`` column on every row.
    min_dist : optional
        If given, stored in a ``min_dist`` column on every row. ``0`` is a
        legitimate value and is recorded.
    target : optional
        If given, stored in a ``target`` column. May be a scalar or an
        array-like with one entry per row; a pandas Series is aligned on its
        index by pandas.

    Returns
    -------
    pandas.DataFrame
        Transformed coordinates plus whichever optional columns were supplied.
    """
    if log:
        data = np.log(data)
    if scale:
        data = scaler.fit_transform(data)
    df = pd.DataFrame(transformer.fit_transform(data), columns=names)

    # Compare against None rather than relying on truthiness: a value of 0
    # must still be recorded, and array-likes have no unambiguous truth value.
    if n is not None:
        df["n_neighbors"] = n
    if min_dist is not None:
        df["min_dist"] = min_dist
    if target is not None:
        df["target"] = target
    return df

In [None]:
# Pre-compute one UMAP embedding per (min_dist, n_neighbors) combination and
# stack them into a single long frame that the interactive chart can filter.
# pd.concat needs a flat sequence of DataFrames, so both loops live inside
# one list comprehension.
umap_slider = pd.concat(
    [
        embed(
            X,
            UMAP(n_neighbors=n, min_dist=m),
            scale=True,
            names=['x_umap_scaled', 'y_umap_scaled'],
            n=n,
        # Attach min_dist and the labels here so every row carries them
        # whatever their value (including min_dist == 0); .to_numpy() makes
        # the label assignment positional.
        ).assign(min_dist=m, target=y['target'].to_numpy())
        for m in min_dist_range
        for n in range(n_min, n_max + n_step, n_step)
    ],
    ignore_index=True,
)

In [26]:
# Interactive scatter of the pre-computed UMAP embeddings. The chart reads the
# `n_neighbors`, `min_dist` and `target` columns of `umap_slider`; the two bound
# selections choose which (n_neighbors, min_dist) slice is shown.
alt.Chart(umap_slider, title="Digits UMAP Embedding").mark_circle().encode(
    x='x_umap_scaled',
    y='y_umap_scaled',
    color='target:N'
).properties(
    width=600,
    height=600
).configure_axis(
    grid=False
).add_selection(
    select_neighbors, select_mindist
).transform_filter(
    # One filter per selection: chaining the filters keeps only rows matching both.
    select_neighbors
).transform_filter(
    select_mindist
)

In [23]:
# NOTE(review): same values as the min_dist_range assigned in the slider-setup cell;
# this re-assignment looks redundant — confirm before removing.
min_dist_range = [0, 0.001, .1, .5, 1]

(1797, 64)

In [None]:
# Scratch cell: constructs a default-configured UMAP and displays it; the instance is not stored.
UMAP()