Agglomerative clustering without scikit-learn

In [1]:
import numpy as np

def hierarchical_clustering(data, n_clusters):
    """Cluster points bottom-up using single-linkage agglomeration.

    Every point starts as its own cluster. The two clusters whose closest
    members are nearest to each other (Euclidean distance) are merged
    repeatedly until only ``n_clusters`` clusters remain. Ties are broken in
    favour of the pair with the smallest row index, then the smallest column
    index, in the distance matrix.

    Args:
        data: Array-like whose first axis indexes the points. Any trailing
            axes are flattened into a single feature vector per point.
        n_clusters: Number of clusters to stop at; must be at least 1. When
            it is not smaller than the number of points, every point is
            returned as its own cluster.

    Returns:
        A list of clusters, each a list of row indices into ``data``.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1, or if a merge is
            required but no finite distance is left (for example because
            ``data`` contains NaN or infinite values).
    """
    def calculate_distance_matrix(points):
        """Return pairwise Euclidean distances with an infinite diagonal."""
        flat = points.reshape(len(points), -1)
        # Broadcasting builds all pairwise differences at once; this needs
        # O(n^2 * d) temporary memory.
        deltas = flat[:, np.newaxis, :] - flat[np.newaxis, :, :]
        distance_matrix = np.linalg.norm(deltas, axis=-1)
        np.fill_diagonal(distance_matrix, np.inf)  # Avoid self-linkage
        return distance_matrix

    def merge_clusters(cluster_map, distance_matrix):
        """Merge the closest pair of active clusters in place."""
        # Only the strict upper triangle is searched. Its indices are in
        # row-major order and argmin returns the first minimum, so ties
        # resolve to the lowest (row, column) pair.
        candidates = distance_matrix[upper_rows, upper_cols]
        best = np.argmin(candidates)
        if not np.isfinite(candidates[best]):
            raise ValueError(
                "no finite distance between the remaining clusters; "
                "check data for NaN or infinite values"
            )
        x, y = int(upper_rows[best]), int(upper_cols[best])

        # Slot y keeps the merged cluster, slot x is retired.
        cluster_map[y] = cluster_map[x] + cluster_map[y]
        cluster_map.pop(x)

        # Single linkage: the distance to the merged cluster is the smaller
        # of the distances to its two parts.
        merged = np.minimum(distance_matrix[x], distance_matrix[y])
        distance_matrix[y, :] = merged
        distance_matrix[:, y] = merged
        distance_matrix[y, y] = np.inf
        # An all-infinite row/column can never be selected again.
        distance_matrix[x, :] = np.inf
        distance_matrix[:, x] = np.inf

        return cluster_map, distance_matrix

    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    points = np.asarray(data)
    if not np.issubdtype(points.dtype, np.inexact):
        # Work in floating point so that subtracting unsigned integers
        # cannot wrap around.
        points = points.astype(float)

    n_points = len(points)
    cluster_map = {i: [i] for i in range(n_points)}
    if n_points <= n_clusters:
        return list(cluster_map.values())

    upper_rows, upper_cols = np.triu_indices(n_points, k=1)
    distance_matrix = calculate_distance_matrix(points)

    while len(cluster_map) > n_clusters:
        cluster_map, distance_matrix = merge_clusters(cluster_map, distance_matrix)

    return list(cluster_map.values())

# Demo input: three points on the line x = 1 and three on the line x = 10.
data = np.array([
    [1, 2], [1, 4], [1, 0],
    [10, 2], [10, 4], [10, 0],
])

# Request two clusters from the hand-written implementation and show them.
n_clusters = 2
clusters = hierarchical_clustering(data, n_clusters)
print("Clusters:", clusters)

Clusters: [[0, 1, 2], [3, 4, 5]]


Agglomerative clustering with scikit-learn

In [2]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Same six demo points: three on the line x = 1 and three on the line x = 10.
data = np.array([
    [1, 2], [1, 4], [1, 0],
    [10, 2], [10, 4], [10, 0],
])

# Fit a two-cluster Ward-linkage model and read back one label per point.
n_clusters = 2
hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
labels = hierarchical.fit(data).labels_

print("Clusters:", labels)

Clusters: [1 1 1 0 0 0]
