In [2]:
import numpy as np
import pandas as pd

In [3]:
class Dissimilarity:
    """Pairwise dissimilarity matrix over the rows of a sample array.

    The full ``(n, n)`` matrix is computed once in the constructor and stored
    in ``self.dmat``. The ``d_*`` methods reduce a sub-block of that matrix to
    a single group-to-group distance (complete / single / average linkage).
    """

    def __init__(self, x:np.ndarray, distance=2):
        """Compute and store the pairwise distance matrix ``self.dmat``.

        Parameters
        ----------
        x : np.ndarray
            Samples, one per row (indexed as ``x[i]`` and broadcast as a
            2-D array).
        distance : int, float or callable, default 2
            A number is used as the order ``ord`` of the vector norm applied
            to ``x[i] - x[j]`` (2 -> Euclidean, 1 -> Manhattan,
            ``float('inf')`` -> Chebyshev). A callable is invoked as
            ``distance(x[i], x[j])`` for every pair ``i < j`` and the result
            is mirrored, so it is assumed to be symmetric.

        Raises
        ------
        ValueError
            If ``distance`` is neither a number nor a callable.
        """
        if isinstance(distance, (int, float)):
            # Broadcasting yields diffs[i, j, :] = x[i] - x[j].
            diffs = x[:, np.newaxis, :] - x
            # `ord` must be passed explicitly; without it the norm is always
            # Euclidean and the `distance` argument would be ignored.
            self.dmat = np.linalg.norm(diffs, ord=distance, axis=2)
        elif callable(distance):
            x_len = x.shape[0]
            dmat = np.zeros((x_len, x_len))
            # Only the strict upper triangle is evaluated; the diagonal stays
            # at zero and the lower triangle is filled by symmetry.
            for i in range(x_len - 1):
                for j in range(i + 1, x_len):
                    dmat[i, j] = dmat[j, i] = distance(x[i], x[j])
            self.dmat = dmat
        else:
            raise ValueError('unknown distance:'+str(distance))

    def d(self, g1, g2):
        """Return the block of ``dmat`` with rows ``g1`` and columns ``g2``.

        ``g1`` and ``g2`` are index sequences (e.g. lists of sample indices);
        the result has shape ``(len(g1), len(g2))``.
        """
        return self.dmat[g1,:][:,g2]

    def d_complete(self, g1, g2):
        """Complete linkage: largest pairwise distance between the groups."""
        return self.d(g1, g2).max()

    def d_single(self, g1, g2):
        """Single linkage: smallest pairwise distance between the groups."""
        return self.d(g1, g2).min()

    def d_average(self, g1, g2):
        """Average linkage: mean of all pairwise distances between the groups."""
        return self.d(g1, g2).mean()

class AgglomerativeClustering:
    """Bottom-up hierarchical clustering driven by a precomputed dissimilarity.

    Every sample starts in its own singleton cluster; each call to ``step``
    fuses the two clusters that are closest under the selected linkage.
    ``self.clusters`` always holds the current partition as lists of sample
    indices.
    """

    def __init__(self, dissim:Dissimilarity, linkage='complete'):
        """Set up singleton clusters and pick the linkage function.

        ``linkage`` must be one of ``'complete'``, ``'single'`` or
        ``'average'``; any other key raises ``KeyError`` from the lookup.
        """
        self.distance = dissim
        linkage_table = dict(
            complete=dissim.d_complete,
            single=dissim.d_single,
            average=dissim.d_average,
        )
        self.linkage = linkage_table[linkage]

        n_samples = dissim.dmat.shape[0]
        self.clusters = [[idx] for idx in range(n_samples)]

    def step(self):
        """Merge the closest pair of clusters.

        Returns ``False`` when fewer than two clusters remain. Otherwise
        returns ``(merged_cluster, distance)``, where ``merged_cluster`` is
        the surviving list inside ``self.clusters`` (extended in place) and
        ``distance`` is the linkage value at which the merge happened.
        """
        n_groups = len(self.clusters)
        if n_groups < 2:
            return False

        # Seed the search with the first pair; on ties the earliest pair in
        # (i, j) scan order wins because the comparison below is strict.
        best_pair = (0, 1)
        best_dist = self.linkage(self.clusters[0], self.clusters[1])

        for a, left in enumerate(self.clusters):
            for b in range(a + 1, n_groups):
                candidate = self.linkage(left, self.clusters[b])
                if candidate < best_dist:
                    best_dist, best_pair = candidate, (a, b)

        keep, drop = best_pair
        # drop > keep, so removing it does not shift the survivor's position.
        absorbed = self.clusters.pop(drop)
        survivor = self.clusters[keep]
        survivor.extend(absorbed)
        return survivor, best_dist

    def gini(self, label: np.ndarray):
        """Size-weighted Gini impurity of ``label`` over the current clusters.

        For each cluster the impurity ``1 - sum(p_k ** 2)`` of its label
        frequencies is weighted by the cluster's share of ``label.size``;
        the weighted impurities are summed.
        """
        total = label.size

        def weighted_impurity(members):
            picked = label[members]
            freqs = pd.Series(picked).value_counts(normalize=True).to_numpy()
            return (1 - np.sum(freqs ** 2)) * picked.size / total

        return sum(weighted_impurity(members) for members in self.clusters)

In [4]:
# Load the iris data set; with return_X_y=True the call yields a pair that is
# unpacked here as X (the samples to cluster) and Y (the labels scored later).
from sklearn.datasets import load_iris
X,Y = load_iris(return_X_y=True)

In [6]:
# Single-linkage agglomerative clustering on X with the default (Euclidean)
# dissimilarity. Each step() merges the two closest clusters, so the loop
# runs until exactly 3 clusters remain.
# NOTE(review): 3 is presumably chosen to match the number of iris classes - confirm.
agc = AgglomerativeClustering(Dissimilarity(X), linkage='single')
while len(agc.clusters) > 3:
    agc.step()
# Size-weighted Gini impurity of the labels Y within the final clusters
# (0 would mean every cluster contains a single label); shown as cell output.
agc.gini(Y)

0.32653061224489793