# Preliminaries

## Load

In [1]:
%matplotlib notebook

In [141]:
import numpy as np
import sklearn as skl
import scipy as sp
import scipy.cluster.hierarchy
from sklearn import datasets

In [3]:
from skimage import color
from colorsys import rgb_to_hsv

In [137]:
import os
import sys

# Directory that holds the local helper modules (protoclust, colonyplotting).
# The default is the original author's location; set the PNRI_HELPERS_DIR
# environment variable to run the notebook on a machine with another layout.
HELPERS_DIR = os.environ.get('PNRI_HELPERS_DIR',
                             '/home/andy/Documents/Research/pnri/Helpers/')
# Re-running this cell must not keep appending the same entry to sys.path.
if HELPERS_DIR not in sys.path:
    sys.path.append(HELPERS_DIR)

import importlib
pcl = importlib.import_module('protoclust')
cplt = importlib.import_module('colonyplotting')

In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## Reload helper modules

In [29]:
# Re-import both helper modules so that edits made to their source files
# take effect without restarting the kernel.
pcl = importlib.reload(pcl)
cplt = importlib.reload(cplt)

## Utilities

In [8]:
def labelXYZ(ax, xlabel, ylabel, zlabel=None, params=None):
    """Set the axis labels of ``ax`` and return it.

    Parameters
    ----------
    ax : axes-like object
        Must provide ``set_xlabel`` and ``set_ylabel``; ``set_zlabel`` is
        only called when ``zlabel`` is given.
    xlabel, ylabel : str
        Label text for the x and y axes.
    zlabel : str, optional
        Label text for the z axis.  Skipped when falsy, so the function
        also works on 2-D axes.
    params : mapping, optional
        Keyword arguments forwarded to every ``set_*label`` call
        (e.g. ``{'fontsize': 14}``).  ``None`` means no extra arguments.

    Returns
    -------
    The same ``ax`` object, to allow chaining.
    """
    # A fresh dict per call: avoids a shared mutable default and accepts
    # an explicit ``params=None``.
    label_kwargs = dict(params) if params else {}
    ax.set_xlabel(xlabel, **label_kwargs)
    ax.set_ylabel(ylabel, **label_kwargs)
    if zlabel:
        ax.set_zlabel(zlabel, **label_kwargs)
    return ax

# Example 1: Faces

In [103]:
# Olivetti faces data set: X holds one flattened image per row,
# Y holds the dataset's target labels.
faces = skl.datasets.fetch_olivetti_faces()
X, Y = faces.data, faces.target
n, d = X.shape  # d = 64*64 pixels per flattened image

## Inspect SVD

In [57]:
# Singular value decomposition of the column-centered data matrix.
X_centered = X - X.mean(axis=0)
U, S, Vt = np.linalg.svd(X_centered)

In [11]:
# Show the leading right singular vector as a 64x64 image.
# np.linalg.svd returns the right singular vectors as the ROWS of Vt, so the
# first one is Vt[0].  Vt[:, 0] would instead gather the first coefficient
# of every singular vector, which is not an image-space direction.
fig, ax = plt.subplots(1, figsize=[5,3])
ax.imshow(Vt[0].reshape(64,64))
cplt.clean_ax(ax)

<IPython.core.display.Javascript object>

In [11]:
# Singular-value spectrum on a log10 scale, one '+' marker per value.
fig, ax = plt.subplots(1, figsize=[4, 3])
ranks = np.arange(len(S))
ax.scatter(ranks, np.log10(S), marker='+')
ax.set_ylim([0, 2.2])

<IPython.core.display.Javascript object>

(0, 2.2)

In [14]:
# Project the faces onto the first three right singular vectors and colour
# every point by its label.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
pcX = X @ Vt.T[:, :3]
# plt.get_cmap is used instead of mpl.cm.get_cmap, which newer Matplotlib
# releases deprecate and then remove.
colors, cnorm = [plt.get_cmap('tab10'), np.max(Y)]
# A single scatter call with one RGBA row per point replaces one call (and
# one artist) per sample.  depthshade=False keeps the label colours from
# being dimmed according to depth.
ax.scatter(pcX[:, 0], pcX[:, 1], pcX[:, 2],
           color=colors(Y / cnorm), depthshade=False)
labelXYZ(ax, 'A','B','C', {'fontsize': 14});

<IPython.core.display.Javascript object>

## An example of protoclust

In [139]:
# Pick up the latest on-disk version of the protoclust helper before using it.
pcl = importlib.reload(pcl)

In [140]:
def euclidean_distance(x, y):
    """Euclidean (L2) distance between two equally shaped vectors."""
    return np.sqrt(np.sum(np.square(x - y)))

# Cluster the face images with the Euclidean distance as dissimilarity.
available_indices, clustering_data, Z = pcl.protoclust(X, euclidean_distance)

In [None]:
# Dendrogram of Z, truncated so that only the top p=5 levels are drawn.
fig = plt.figure()
scipy.cluster.hierarchy.dendrogram(Z, truncate_mode='level', p=5);

In [130]:
# Turn each of the three components of clustering_data into a NumPy array.
clustering, clustering_centers, clustering_distances = map(np.array, clustering_data)

In [118]:
# Centers selected by the index set available_indices[-6]
# (the recorded output lists six of them).
clustering_centers[available_indices[-6]]

array([396, 288, 206, 366, 151, 249])

In [124]:
nk = 9  # which entry (counted from the end) of available_indices to display

# Member lists and centers selected by that entry.
kindices = available_indices[-nk]
kclust = clustering[kindices]
kclust_center = clustering_centers[kindices]

# plt.get_cmap is used instead of mpl.cm.get_cmap, which newer Matplotlib
# releases deprecate and then remove.
colors, cnorm = [plt.get_cmap('tab10'), nk]

pcX = X @ Vt.T[:, :3]

# One RGBA row per sample.  Samples that belong to none of the selected
# clusters stay gray; if a sample appears in several, the last one wins
# (same rule as scanning the clusters in order for every point).
point_colors = np.tile(mpl.colors.to_rgba('gray'), (len(pcX), 1))
for a, aclust in enumerate(kclust):
    point_colors[np.asarray(list(aclust), dtype=int)] = colors(a / cnorm)

# Centers are drawn as large black crosses instead of coloured dots.
is_center = np.zeros(len(pcX), dtype=bool)
is_center[np.asarray(kclust_center, dtype=int)] = True

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Two scatter calls in total instead of one call per sample;
# depthshade=False keeps the cluster colours from being dimmed by depth.
ax.scatter(pcX[~is_center, 0], pcX[~is_center, 1], pcX[~is_center, 2],
           color=point_colors[~is_center], depthshade=False)
ax.scatter(pcX[is_center, 0], pcX[is_center, 1], pcX[is_center, 2],
           color='k', s=500, marker='x', depthshade=False)

labelXYZ(ax, 'A','B','C', {'fontsize': 14});

<IPython.core.display.Javascript object>

Reference: [SciPy `scipy.cluster.hierarchy.dendrogram` documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html#scipy.cluster.hierarchy.dendrogram)

In [None]:
# One panel per cluster center.  The grid is sized from the number of
# centers (three per row): a fixed 2x3 grid raises IndexError as soon as
# there are more than six centers, e.g. with nk = 9.
n_centers = len(kclust_center)
ncols = 3
nrows = max(1, -(-n_centers // ncols))  # ceiling division
fig, ax = plt.subplots(nrows, ncols, squeeze=False)
fig.subplots_adjust(wspace=0, hspace=0)
panels = ax.flatten()
for i, row in enumerate(kclust_center):
    im = color.gray2rgb(X[row].reshape(64,64))
    # colors(...) yields RGBA; drop the alpha channel before converting to HSV.
    c_hsv = rgb_to_hsv(*colors(i/cnorm)[:-1])
    # Hue and 75% of the saturation of the cluster colour go to cplt.colorize.
    panels[i].imshow(cplt.colorize(im, c_hsv[0], .75*c_hsv[1]))
    cplt.clean_ax(panels[i])
# Hide grid cells that have no center to show.
for unused in panels[n_centers:]:
    unused.set_axis_off()

# Example 2: Words

In [96]:
from scipy.io import loadmat

In [97]:
# Load the 'grolier' matrix from the .mat file and convert it to a dense array.
words = loadmat('/home/andy/Downloads/grolier15276.mat')
grolier = words['grolier']
X = grolier.toarray()
n, d = X.shape

In [92]:
# Display the number of rows and columns of the dense word matrix X.
n,d

(15276, 30991)

In [98]:
def pairwise_dissimilarity(x,y):
    """Cosine dissimilarity ``1 - cos(x, y)`` between two vectors.

    Parameters
    ----------
    x, y : array-like of equal shape
        The two vectors to compare.

    Returns
    -------
    float
        ``1 - <x, y> / (|x| * |y|)``.  When either vector has zero norm the
        cosine is undefined; the similarity is then taken to be 0 and 1.0 is
        returned (this includes comparing two all-zero vectors), instead of
        producing ``nan`` together with a RuntimeWarning.
    """
    scale = np.linalg.norm(x) * np.linalg.norm(y)
    if scale == 0:
        # A nan here would poison every comparison in the clustering.
        return 1.0
    return 1 - np.sum(x*y)/scale

In [102]:
# Unpack three values, matching the protoclust call in the faces example
# (run after the latest reload of the helper); the conversion cell that
# follows reads clustering_data.
# NOTE(review): confirm that verbose=True does not change the return shape.
available_indices, clustering_data, Z = pcl.protoclust(X,  pairwise_dissimilarity, verbose=True)

Compute distance matrix...


KeyboardInterrupt: 

In [None]:
# Turn each of the three components of clustering_data into a NumPy array.
clustering, clustering_centers, clustering_distances = map(np.array, clustering_data)