<a href="https://colab.research.google.com/github/bmreiniger/datascience.stackexchange/blob/master/SO78064402_AgglomerativeClustering_intermediate_clusters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.datasets import load_iris

# Load the iris data as pandas objects: X holds the feature columns and
# y holds the class label of each sample.
X, y = load_iris(
    as_frame=True,
    return_X_y=True,
)



In [None]:
# Sweep the number of clusters: fit a Ward-linkage agglomerative clustering
# for each k, keep its labels under the key 'AHC_k<k>', and print two internal
# validity scores (silhouette and Davies-Bouldin) for comparison.
cont_std_ = {}
for k in [2, 3, 4, 5]:
    # NOTE(review): AgglomerativeClustering exposes no random_state, so this
    # seed is probably unnecessary -- kept to leave the global RNG state as is.
    np.random.seed(123456)  # for reproducibility
    model = AgglomerativeClustering(linkage='ward', n_clusters=k)
    model.fit(X)
    cont_std_[f'AHC_k{k}'] = model.labels_

    # The results get their own names so they do not shadow the sklearn
    # functions `silhouette_score` / `davies_bouldin_score`.
    silhouette = metrics.silhouette_score(X, model.labels_, metric='euclidean')
    print(f'silhouette at k={k}: {silhouette}')

    davies_bouldin = metrics.davies_bouldin_score(X, model.labels_)
    print(f'davies bouldin at k={k}: {davies_bouldin}')


silhouette at k=2: 0.6867350732769781
davies bouldin at k=2: 0.38275284210068616
silhouette at k=3: 0.5543236611296426
davies bouldin at k=3: 0.6562564540642021
silhouette at k=4: 0.48896708575546993
davies bouldin at k=4: 0.7952637917518272
silhouette at k=5: 0.48438258927906036
davies bouldin at k=5: 0.8204166609646473


In [None]:
# Display the label arrays recorded by the sweep (keys 'AHC_k2' ... 'AHC_k5').
cont_std_

{'AHC_k2': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'AHC_k3': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
        2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
        2, 0, 0, 2, 2, 2, 0,

In [None]:
# Shape of the merge table: one row per merge, two node indices per row.
model.children_.shape

(149, 2)

In [None]:
# Number of merges that remain after dropping the last 5 rows of the merge table.
# NOTE(review): with 150 samples, 144 applied merges leave 6 clusters, not 5 --
# confirm whether `[:-4]` was intended here.
len(model.children_[:-5])

144

In [None]:
# Peek at the first five merges (each row is the pair of node indices joined).
model.children_[:5, :]

array([[101, 142],
       [  7,  39],
       [  0,  17],
       [  9,  34],
       [128, 132]])

In [None]:
# Refit with an explicit cluster count instead of reusing the loop variable `k`
# left over from the sweep cell, so this cell does not depend on hidden state.
# compute_full_tree=True makes sure `children_` records every merge up to the
# root, which the tree walk in the following cells relies on.
model = AgglomerativeClustering(linkage='ward', n_clusters=5, compute_full_tree=True)
model.fit(X)

In [None]:
# Walk the merge table to materialise the member samples of every tree node.
# Node ids 0..len(X)-1 are the samples themselves; each merge appends one new
# node, so merge number `stage` ends up at position len(X) + stage in `nodes`.
n_nodes = len(X) + len(model.children_)
nodes = [[sample] for sample in range(len(X))]
# merged_at_stage[node] = merge step at which `node` was absorbed into a parent;
# -1 means the node has not been merged into anything.
merged_at_stage = np.full(n_nodes, -1, dtype=int)
for stage, (left, right) in enumerate(model.children_):
    nodes.append(nodes[left] + nodes[right])
    merged_at_stage[[left, right]] = stage

In [None]:
# For each node index, the merge step at which it was absorbed (-1 = never merged).
merged_at_stage

array([  2,  18,  34,  13,   6,  64,  57,   1,  12,   3,   5,  52,  26,
        65,  97,  97,  84,   2,  64,   7,  51,   7, 120,  29,  82,  32,
        29,  14,  14,   8,   8,  51,  67,  67,   3,  85,  74,   6,  12,
         1,  21, 127,  55,  43,  89,  18,  33,  13,   5,  23,  45,  46,
        45,  30,  40,  59,  46,   9,  40, 104, 102,  77, 107,  19,  86,
        20,  28,  53,  44,  39,  54,  77,  79,  70,  31,  20,  75,  88,
        38,  86,  10,  10,  15,  63,  28,  87,  61,  44,  27,  30,  59,
        19,  15,   9,  25,  16,  16,  31,  83,  25, 110,   0,  92,  49,
        69,  48, 129,  47, 111, 116,  35,  90,  24,  56, 101,  76,  11,
        80,  96, 109,  36,  73,  48,  22,  62,  68,  22,  17,   4,  68,
        47,  80,   4,  63, 124, 116,  37,  11,  17,  24,  41,  42,   0,
        36,  41,  42,  72,  35,  37,  71,  56,  23,  21,  32,  69,  74,
        78,  33,  52,  83,  39,  49,  55,  34,  50,  53,  27,  54,  26,
        38,  95,  50,  72,  58,  93,  66,  60,  66,  98,  43,  9

In [None]:
# Sanity check: one entry per node, i.e. the original samples plus one node per merge.
len(merged_at_stage), len(X)+len(model.children_)

(299, 299)

In [None]:
N_CLUSTERS = 5

# Cutting the tree into N_CLUSTERS clusters is the same as applying only the
# first (len(X) - N_CLUSTERS) merges of `model.children_`.
n_merges_applied = len(X) - N_CLUSTERS

# A node is one of the clusters at that cut when it already exists and has not
# yet been absorbed into a parent.
final_nodes = [
    nodes[i]
    for i, stage in enumerate(merged_at_stage)
    if (
        # the node hasn't already been merged with another; the sentinel -1 marks
        # a node that is never merged (the root), which must count as "not merged"
        # so that N_CLUSTERS = 1 returns the single all-samples cluster
        (stage < 0 or stage >= n_merges_applied)
        # the node has already been created: samples are nodes 0..len(X)-1 and
        # merge j creates node len(X) + j
        and i < len(X) + n_merges_applied
    )
]
len(final_nodes)

5

In [None]:
# Sanity check: the selected nodes' member counts should add up to len(X).
sum(len(node) for node in final_nodes)

150

In [None]:
# Show the true iris labels of the samples inside each reconstructed cluster,
# one printed array per cluster, to eyeball how pure the clusters are.
for members in final_nodes:
    true_labels = y[members].to_numpy()
    print(true_labels)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2]
[2 2 2 2 2 2 2 2 2 2 2 2]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1]
[2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1
 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
import pandas as pd

# Flatten the per-cluster member lists into one cluster id per sample.
# -1 is the placeholder for any sample not covered by a selected node.
y_pred = pd.Series(-1, index=range(len(X)))
for cluster_id, members in enumerate(final_nodes):
    y_pred.loc[members] = cluster_id
y_pred

0      4
1      4
2      4
3      4
4      4
      ..
145    0
146    3
147    0
148    0
149    3
Length: 150, dtype: int64

In [None]:
# Cluster sizes of the reconstructed labelling.
y_pred.value_counts()

4    50
3    38
2    26
0    24
1    12
dtype: int64

In [None]:
from sklearn.metrics import silhouette_score

# Score the labelling rebuilt from the merge tree; it should reproduce the
# silhouette printed for k=5 in the sweep if the reconstruction is right.
reconstructed_silhouette = silhouette_score(X, y_pred)
reconstructed_silhouette

0.48438258927906036