# Hierarchical Clustering Dendrogram

In [None]:
import numpy as np

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
import scipy.stats as stats

## Create some data
We draw 10 points from each of two 2-D Gaussian distributions.

In [None]:
# Means and covariance matrices of the two 2-D Gaussians
mu1, mu2 = [1, 1], [-1, -1]
S1 = [[2.0, 0], [0, 0.5]]
S2 = [[2.0, 0.3], [0.3, 0.5]]

# Frozen distributions, each one created with its own fixed seed
rv1 = stats.multivariate_normal(mean=mu1, cov=S1, seed=1)
rv2 = stats.multivariate_normal(mean=mu2, cov=S2, seed=2)

# Open a new figure; the scatter plot below is drawn on it
fig2 = plt.figure()

# Draw 10 random points from each distribution
samples1 = rv1.rvs(size=10)
samples2 = rv2.rvs(size=10)

# Dataset: the points of the first Gaussian followed by those of the second
X = np.vstack((samples1, samples2))
# Labels: 0 for points of the first Gaussian, 1 for points of the second
Y = np.repeat([0.0, 1.0], [len(samples1), len(samples2)])

# One scatter call per Gaussian (column 0 on x, column 1 on y)
plt.scatter(*samples1.T)
plt.scatter(*samples2.T)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# Tag every point with its row index in X
for i, (x_coord, y_coord) in enumerate(X):
    plt.annotate(i, (x_coord, y_coord))

plt.show()

In [None]:
# Compare three linkage criteria side by side on the same dataset X.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# (axes, linkage method, panel title) for each panel, left to right
panels = [
    (ax1, 'single', 'single'),
    (ax2, 'complete', 'complete'),
    (ax3, 'ward', 'Ward'),
]
for panel_ax, method, title in panels:
    # Z is rebound on every pass, so it ends up holding the Ward linkage
    Z = linkage(X, method)
    dendrogram(Z, ax=panel_ax)
    panel_ax.set_title(title)

# Only the leftmost panel carries the y-axis label
ax1.set_ylabel('Distance')

plt.tight_layout()
plt.show()

In [None]:
# Dendrogram of X built with single linkage
Z = linkage(X, method='single')
dendrogram(Z)

# Label both axes of the current axes in one call
plt.gca().set(xlabel='Index of point', ylabel='Distance')
plt.show()

In [None]:
# Dendrogram of X built with Ward linkage
Z = linkage(X, method='ward')
dendrogram(Z)

# Label both axes of the current axes in one call
plt.gca().set(xlabel='Index of point', ylabel='Distance')
plt.show()

In [None]:
# Dendrogram of X built with complete linkage
Z = linkage(X, method='complete')
dendrogram(Z)

# Label both axes of the current axes in one call
plt.gca().set(xlabel='Index of point', ylabel='Distance')
plt.show()

## Another distribution of points

In [None]:
# Gaussian distribution parameters: both covariances are diagonal, with a
# smaller variance on feature 1 (0.1) than on feature 2 (0.5)
mu1 = [1, 1]
S1 = [[0.1, 0], [0, 0.5]]
mu2 = [-1, -1]
S2 = [[0.1, 0], [0, 0.5]]

# frozen distributions, each one created with its own fixed seed
rv1 = stats.multivariate_normal(mu1, S1, seed=1)
rv2 = stats.multivariate_normal(mu2, S2, seed=2)

# open a new figure; the scatter plot below is drawn on it
fig2 = plt.figure()


# draw 10 random points from each distribution
samples1 = rv1.rvs(10)
samples2 = rv2.rvs(10)

# dataset is the set of all points
X = np.concatenate((samples1, samples2))
# labels: 0 for points of the first Gaussian, 1 for points of the second
Y = np.concatenate((np.zeros(len(samples1)), np.ones(len(samples2))))

# plot the points, one scatter call per Gaussian
plt.scatter(samples1[:, 0], samples1[:, 1])
plt.scatter(samples2[:, 0], samples2[:, 1])
# same axis labels as the scatter plot of the first dataset
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# tag every point with its row index in X
for i, point in enumerate(X):
    plt.annotate(i, (point[0], point[1]))

plt.show()

In [None]:
# Compare single, complete and Ward linkage side by side on the dataset X.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# One dendrogram per panel; each title is paired with its linkage method.
methods = ('single', 'complete', 'ward')
titles = ('single', 'complete', 'Ward')
for panel_ax, method, title in zip((ax1, ax2, ax3), methods, titles):
    # Z is rebound on every pass, so it ends up holding the Ward linkage
    Z = linkage(X, method)
    dendrogram(Z, ax=panel_ax)
    panel_ax.set_title(title)

# Label the y-axis of the leftmost panel, as in the figure for the first
# dataset (the label was missing here).
ax1.set_ylabel('Distance')

plt.tight_layout()
plt.show()

In [None]:
# Single-linkage tree of the dataset X
Z = linkage(X, method='single')
# 50 evenly spaced distance thresholds between 0 and 2
distance = np.linspace(start=0, stop=2, num=50)
# Number of distinct flat-cluster labels obtained at each threshold
nb_clusters = [
    np.unique(fcluster(Z, t=threshold, criterion='distance')).size
    for threshold in distance
]

In [None]:
# Number of clusters as a function of the distance threshold
plt.plot(distance, nb_clusters)
plt.gca().set(xlabel='Distance threshold', ylabel='Nb of clusters')
plt.show()

Let us display the matrix of pairwise Euclidean distances between the first 10 points.

In [None]:
from scipy.spatial.distance import pdist,squareform

In [None]:
# function for displaying the matrix in a nice way
# from https://dev.to/callas1900/how-to-display-latex-matrix-as-output-of-a-code-cell-40ck
from IPython.display import display, Math

def print_matrix(array):
    """Display *array* as a typeset LaTeX matrix in the notebook output.

    Parameters
    ----------
    array : iterable of rows
        Each row is either a sequence of numbers or a single number.
        See ``matrix_to_latex`` for the exact formatting rules.
    """
    display(Math(matrix_to_latex(array)))


def matrix_to_latex(array):
    """Return the LaTeX ``bmatrix`` source for *array*.

    Rows holding several entries are written with 2 decimals; rows holding
    a single entry (column vectors, or the scalars of a 1-D input) are
    written with 3 decimals, as the original helper did.

    Parameters
    ----------
    array : iterable of rows
        Each row is either a sequence of numbers or a single number.

    Returns
    -------
    str
        The matrix source, one line per row, with entries separated by
        ``&`` and every row closed by the LaTeX row terminator.
    """
    rows = []
    for line in array:
        # atleast_1d lets a bare scalar act as a one-entry row, so 1-D
        # input no longer fails on len(scalar).
        cells = np.atleast_1d(line)
        fmt = '%.3f' if len(cells) == 1 else '%.2f'
        # Joining with '&' avoids the dangling separator (and the empty
        # extra column it creates) at the end of every row.
        rows.append(' & '.join(fmt % element for element in cells))
    # Every row ends with a LaTeX row break followed by a real newline
    # (not the two literal characters backslash + n).
    body = ''.join(' %s \\\\\n' % row for row in rows)
    return '\\begin{bmatrix} \n%s\\end{bmatrix}' % body


In [None]:
# Pairwise Euclidean distances between the first 10 rows of X, as a square matrix
d = squareform(pdist(X[:10], metric='euclidean'))

In [None]:
# Render the distance matrix d with the print_matrix helper
print_matrix(d)