In [None]:
import os
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import DataLoader
from torchvision.datasets import FashionMNIST
from torchvision import datasets, transforms


import utils as utils

### Download the Fashion MNIST dataset, create a dataloader for it

In [None]:
# Root directory handed to FashionMNIST (download target).
DATA_DIR = './fashionMNIST/'

# The image pipeline consists of a single ToTensor step.
transform = transforms.Compose([transforms.ToTensor()])

# Training split of Fashion MNIST, fetched into DATA_DIR (download=True).
mnist_train = FashionMNIST(
    root=DATA_DIR,
    train=True,
    transform=transform,
    download=True,
)

# Small batches: this loader only feeds the preview grid in the next cell.
dl = DataLoader(
    mnist_train,
    batch_size=64,
    num_workers=utils.get_num_cpus(),
)

Display a sample of the Fashion MNIST images on a grid so we can see what the data looks like.

In [None]:
# Label index -> class name, as listed on the Fashion MNIST website:
# https://github.com/zalandoresearch/fashion-mnist
# The position of each name in the list is its integer label.
classes = dict(enumerate([
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle Boot',
]))

# Show images from the loader in a grid that is 8 columns wide.
utils.display_grid_data(dl, classes, ncols=8)

Reinitialize the dataloader so we can read all the data, and get the data and labels into numpy arrays.

In [None]:
# One batch spanning the whole training set, so a single pass over the loader
# yields every image and label. The batch size is taken from the dataset
# itself rather than hard-coding its length.
dl = DataLoader(mnist_train, batch_size=len(mnist_train), num_workers=utils.get_num_cpus())

In [None]:
# Collect every batch the loader yields. Accumulating (rather than
# overwriting on each iteration) keeps all samples even if the loader is
# configured with a batch size smaller than the dataset.
data_batches = []
label_batches = []
for batch, lab in dl:
    # Flatten each image to one row; the row count follows the batch size.
    data_batches.append(batch.detach().numpy().reshape(len(batch), -1))
    label_batches.append(lab.detach().numpy())

# mnist_data: one flattened image per row; mnist_labels: matching labels.
mnist_data = np.concatenate(data_batches)
mnist_labels = np.concatenate(label_batches)

Use UMAP to create the embedding.  Note that we use all of the Fashion MNIST training data here to create the embedding.  Subsampling the data before this step would give a different embedding.

In [None]:
import umap

# Fit UMAP on every flattened training image and keep the resulting
# coordinates. No random_state is passed, so the embedding can differ
# between runs.
reducer = umap.UMAP(verbose=True)
standard_embedding = reducer.fit_transform(mnist_data)

Plot the embedding in a non-interactive plot.

In [None]:
sns.set(style='white', rc={'figure.figsize': (10, 8)})

# Explicit figure/axes so the scatter and its colorbar are tied together
# without relying on pyplot's "current" state.
fig, ax = plt.subplots()
points = ax.scatter(
    standard_embedding[:, 0],
    standard_embedding[:, 1],
    c=mnist_labels.astype(int),
    s=0.1,
    cmap='Spectral',
)
ax.set(
    title='Fashion MNIST UMAP embedding (full training set)',
    xlabel='UMAP 1',
    ylabel='UMAP 2',
)

# Discrete colorbar: one bin per class, centred on the integer label.
# Ticks show the class name (from the `classes` map defined earlier in the
# notebook) instead of the raw label index.
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11) - 0.5)
cbar.set_ticks(np.arange(10))
cbar.set_ticklabels([classes[i] for i in range(10)]);

The plot is pretty busy.  Subsample the data to 10000 points so we can inspect it a little more easily.

In [None]:
# Draw the subsample from a seeded generator so the same points are selected
# on every Restart & Run All. (The UMAP coordinates themselves can still
# differ between runs, because the embedding is fit without a fixed
# random_state.)
SUBSAMPLE_SEED = 0
N_SUBSAMPLE = 10000
rng = np.random.default_rng(SUBSAMPLE_SEED)

# Indices of N_SUBSAMPLE distinct rows (sampling without replacement).
idx = rng.choice(standard_embedding.shape[0], N_SUBSAMPLE, replace=False)

# Apply the same row selection to the embedding, the pixels and the labels
# so the three arrays stay aligned.
reduced_embedding = standard_embedding[idx, :]
reduced_data = mnist_data[idx, :]
reduced_labels = mnist_labels[idx]

In [None]:
# Explicit figure/axes so the scatter and its colorbar are tied together
# without relying on pyplot's "current" state.
fig, ax = plt.subplots()
points = ax.scatter(
    reduced_embedding[:, 0],
    reduced_embedding[:, 1],
    c=reduced_labels.astype(int),
    s=0.1,
    cmap='Spectral',
)
ax.set(
    title='Fashion MNIST UMAP embedding (random subsample)',
    xlabel='UMAP 1',
    ylabel='UMAP 2',
)

# Discrete colorbar: one bin per class, centred on the integer label.
# Ticks show the class name (from the `classes` map defined earlier in the
# notebook) instead of the raw label index.
cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(11) - 0.5)
cbar.set_ticks(np.arange(10))
cbar.set_ticklabels([classes[i] for i in range(10)]);

It's a pretty cool plot.  You can really see the separation of the classes in the Fashion MNIST dataset.  You can also see that there are some points in the clusters that don't belong.  But you know what would make this plot really rock?  Zoom, pan, and the ability to see the images when we hover over a point.  The bokeh library to the rescue.  First, reshape the data back into images rather than flat vectors.

In [None]:
# Restore the 28x28 image layout of each row. -1 lets numpy infer the number
# of samples, so this cell does not have to repeat the subsample size.
reduced_data = np.reshape(reduced_data, (-1, 28, 28))

In [None]:
#%load_ext autoreload
%autoreload 2

In [None]:
# Hand the subsampled coordinates, images, labels and the class-name map to
# the project helper that draws the interactive plot.
# NOTE(review): the helper lives in utils and is not shown here; positional
# argument order is kept exactly as in the original call.
utils.plot_interactive_embedding(
    reduced_embedding,
    reduced_data,
    reduced_labels,
    classes,
    title='Fashion MNIST UMAP Embedding',
)

This is a much nicer plot!  As you hover your mouse over the points, the example images pop up.  This lets you see which cluster is which, and it is helpful for analysis.  On the right-hand side of the plot are the "tools" that bokeh offers.  You can toggle a tool by clicking it once to enable it and again to disable it.  To zoom in using the scroll wheel, click the "Wheel Zoom" tool to enable it.  To pan, enable the panning tool and then left-click and drag to move the plot.

Try zooming into one of the clusters and finding an example that doesn't belong.