In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

# Notebook-wide plotting defaults: white background, notebook-scaled text,
# and a 14x10 default figure size for every matplotlib figure.
sns.set(
    style="white",
    context="notebook",
    rc={"figure.figsize": (14, 10)},
)

# Penguins

Load penguins dataset.

Features include:
1. Culmen length
2. Culmen depth
3. Flipper length
4. Body mass

In [None]:
# Source CSV for the penguins table; the URL embeds a commit hash in its path.
PENGUINS_CSV_URL = (
    "https://github.com/allisonhorst/palmerpenguins/raw/"
    "5b5891f01b52ae26ad8cb9755ec93672f49328a8/data/penguins_size.csv"
)

penguins = pd.read_csv(PENGUINS_CSV_URL)

# Preview the first rows.
penguins.head()

Drop every row that contains a NaN, then count how many penguins remain for each species.

In [None]:
# Discard any row with at least one missing value.
penguins = penguins.dropna(axis=0, how="any")

# Number of remaining rows per species.
penguins["species_short"].value_counts()

Visualise the features pairwise, with points coloured by species.  This makes an otherwise 4-dimensional feature space easier to inspect.

In [None]:
# Scatterplot matrix of the numeric columns, one colour per species.
sns.pairplot(data=penguins, hue="species_short")

UMAP pipeline: reduce dimensionality

Note: UMAP is stochastic. When I re-ran it without a fixed `random_state`, each run gave a different 'rotation' of the embedding, but the gist of it was the same each time.

In [None]:
import umap

# Instantiate the reducer with a fixed seed so the embedding (and any plot
# built from it) is identical on every run. Without a seed, UMAP's stochastic
# optimisation produces a differently rotated layout each time.
reducer = umap.UMAP(random_state=42)

# Pull out the quantitative data (i.e. the four features) as a NumPy array.
feature_columns = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
penguin_data = penguins[feature_columns].values

# Standardise each feature (zero mean, unit variance) so that body mass in
# grams does not dominate the millimetre-scale measurements.
scaled_penguin_data = StandardScaler().fit_transform(penguin_data)

# Fit the reducer to the features and transform them in one step.
# The result has one row per penguin and two columns, because UMAP embeds
# into 2 dimensions by default; pass `n_components=k` to `umap.UMAP` to embed
# into k dimensions instead (e.g. 3 for a 3-D plot).
embedding = reducer.fit_transform(scaled_penguin_data)

Plot the two-dimensional embedding, coloured by species.

In [None]:
# Fixed species -> palette-slot mapping, so each species always gets the
# same colour from seaborn's current palette.
species_codes = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}
palette = sns.color_palette()  # looked up once rather than once per point

# Draw one scatter per species so every colour gets a legend entry.
# Rows of `embedding` are in the same order as rows of `penguins`, so a
# boolean mask built from the DataFrame selects the matching points.
for species_name, code in species_codes.items():
    in_species = (penguins.species_short == species_name).to_numpy()
    plt.scatter(
        embedding[in_species, 0],
        embedding[in_species, 1],
        color=palette[code],
        label=species_name,
    )

# Equal axis scaling: distances in the embedding are not distorted on screen.
plt.gca().set_aspect('equal', 'datalim')
plt.legend(title='Species')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('UMAP projection of the Penguin dataset', fontsize=24);

# Digits

In [None]:
# Load scikit-learn's bundled digits dataset.
# The cells below read its `images`, `data` and `target` attributes.
digits = load_digits()

In [None]:
# Preview part of the dataset: each sample is a small image, so draw the
# first 400 of them as a 20 x 20 grid of thumbnails.
n_rows, n_cols = 20, 20
fig, ax_array = plt.subplots(n_rows, n_cols)
axes = ax_array.flatten()

for idx in range(axes.size):
    # Reversed grey colormap: high pixel values are drawn dark.
    axes[idx].imshow(digits.images[idx], cmap='gray_r')

# Strip ticks and frames from every panel, then pack the panels tightly.
plt.setp(axes, xticks=[], yticks=[], frame_on=False)
plt.tight_layout(h_pad=0.5, w_pad=0.01)

In [None]:
# First approach: a scatterplot matrix of the raw features.
# The features are the intensities (white <--> black) of the 64 pixels,
# i.e. 64 features in total.
# Only 10 of them (columns 1-10) are plotted here, and even so it is hard to
# tell from these scatterplots which features matter most.
pixel_subset = digits.data[:, 1:11]
digit_labels = pd.Series(digits.target).map('Digit {}'.format)

digits_df = pd.DataFrame(pixel_subset).assign(digit=digit_labels)
sns.pairplot(digits_df, hue='digit', palette='Spectral')

In [None]:
# Build a seeded reducer and fit it on all 64 pixel features.
reducer = umap.UMAP(random_state=42)
reducer.fit(digits.data)

# Obtain the 2-D coordinates of the same samples from the fitted reducer.
embedding = reducer.transform(digits.data)
x_coords = embedding[:, 0]
y_coords = embedding[:, 1]

# Scatter the embedding, colouring each point by its class (i.e. digit).
plt.scatter(x_coords, y_coords, c=digits.target, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')

# Discrete colourbar: boundaries at -0.5, 0.5, ..., 9.5 give one band per
# digit, with a tick centred on each of 0..9.
band_edges = np.arange(11) - 0.5
colourbar = plt.colorbar(boundaries=band_edges)
colourbar.set_ticks(np.arange(10))

plt.title('UMAP projection of the Digits dataset', fontsize=24);