
<br>
===========================================================<br>
Hierarchical clustering: structured vs unstructured ward<br>
===========================================================<br>
This example builds a swiss roll dataset and runs<br>
hierarchical clustering on the positions of its points.<br>
For more information, see :ref:`hierarchical_clustering`.<br>
In a first step, the hierarchical clustering is performed without connectivity<br>
constraints on the structure and is solely based on distance, whereas in<br>
a second step the clustering is restricted to the k-Nearest Neighbors<br>
graph: it's a hierarchical clustering with a structure prior.<br>
Some of the clusters learned without connectivity constraints do not<br>
respect the structure of the swiss roll and extend across different folds of<br>
the manifold. On the contrary, when imposing connectivity constraints,<br>
the clusters form a nice parcellation of the swiss roll.<br>


Authors : Vincent Michel, 2010<br>
          Alexandre Gramfort, 2010<br>
          Gael Varoquaux, 2010<br>
License: BSD 3 clause

In [None]:
# Echo the module-level ``__doc__`` (the example's description).
# NOTE(review): in this notebook-style rendering the description appears as
# text above rather than as a docstring, so this may not print it -- confirm.
print(__doc__)

In [None]:
import time as time
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_swiss_roll

#############################################################################<br>
Generate data (swiss roll dataset)

In [None]:
# Sample the swiss roll: 1500 points with a small amount of noise.
n_samples = 1500
noise = 0.05
# `noise` is passed by keyword: it is a keyword-only parameter in recent
# scikit-learn releases, and the keyword form also works on older ones.
# The second return value is not needed here.
X, _ = make_swiss_roll(n_samples, noise=noise)
# Make it thinner: halve the second coordinate of every sample.
X[:, 1] *= .5

#############################################################################<br>
Compute clustering

In [None]:
# Unstructured run: Ward agglomerative clustering into 6 clusters with no
# connectivity graph. The timer covers estimator construction and fitting.
print("Compute unstructured hierarchical clustering...")
st = time.time()
ward = AgglomerativeClustering(linkage='ward', n_clusters=6)
ward.fit(X)
elapsed_time = time.time() - st

# One cluster label per sample, reused by the plotting cell.
label = ward.labels_

print("Elapsed time: %.2fs" % elapsed_time)
print("Number of points: %i" % label.size)

#############################################################################<br>
Plot result

In [None]:
# 3D scatter of the samples, one colour per cluster label.
fig = plt.figure()
# Create the 3D axes through the figure so they are attached to it; calling
# the Axes3D constructor directly no longer adds the axes to the figure on
# recent Matplotlib, which leaves the figure empty. The mplot3d import at
# the top of the file registers the '3d' projection on older Matplotlib.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(7, -80)
for l in np.unique(label):
    # Colour: the label scaled by (largest label + 1) and looked up in the
    # jet colormap. The builtin float is used because the `np.float` alias
    # has been removed from NumPy.
    ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2],
               color=plt.cm.jet(float(l) / np.max(label + 1)),
               s=20, edgecolor='k')
plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)

#############################################################################<br>
Define the structure A of the data: here, a 10-nearest-neighbors graph.

In [None]:
# Build the k-nearest-neighbors graph of the samples (10 neighbors per
# sample, include_self=False). It is passed as the `connectivity`
# constraint to the structured clustering that follows.
from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)

#############################################################################<br>
Compute clustering

In [None]:
# Structured run: the same Ward clustering into 6 clusters, but merges are
# constrained by the `connectivity` graph. The timer covers estimator
# construction and fitting.
print("Compute structured hierarchical clustering...")
st = time.time()
ward = AgglomerativeClustering(linkage='ward', n_clusters=6,
                               connectivity=connectivity)
ward.fit(X)
elapsed_time = time.time() - st

# One cluster label per sample, reused by the plotting cell.
label = ward.labels_

print("Elapsed time: %.2fs" % elapsed_time)
print("Number of points: %i" % label.size)

#############################################################################<br>
Plot result

In [None]:
# 3D scatter of the samples, one colour per cluster label.
fig = plt.figure()
# Create the 3D axes through the figure so they are attached to it; calling
# the Axes3D constructor directly no longer adds the axes to the figure on
# recent Matplotlib, which leaves the figure empty. The mplot3d import at
# the top of the file registers the '3d' projection on older Matplotlib.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(7, -80)
for l in np.unique(label):
    # Colour: the label scaled by (largest label + 1) and looked up in the
    # jet colormap.
    ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2],
               color=plt.cm.jet(float(l) / np.max(label + 1)),
               s=20, edgecolor='k')
plt.title('With connectivity constraints (time %.2fs)' % elapsed_time)

In [None]:
# Display the figures created by the plotting cells.
plt.show()