In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.datasets import load_iris

In [2]:
dataset = load_iris()
iris_x = pd.DataFrame(dataset.data, columns=dataset.feature_names)
iris_y = pd.DataFrame(dataset.target, columns=["target"])
iris_y.target.value_counts().sort_index()

0    50
1    50
2    50
Name: target, dtype: int64

In [3]:
from sklearn.preprocessing import RobustScaler

In [4]:
sc = RobustScaler()
iris_x_scaled = sc.fit_transform(iris_x)

In [5]:
from sklearn.cluster import AgglomerativeClustering

In [6]:
model = AgglomerativeClustering(n_clusters=3,
                                affinity="euclidean",
                                linkage="ward").fit(iris_x_scaled)
labels = model.labels_

In [7]:
cluster, counts = np.unique(labels, return_counts=True)
dict(zip(cluster, counts))

{0: 49, 1: 62, 2: 39}

In [8]:
actual, counts = iris_y.target.unique(), iris_y.target.value_counts().sort_index().values
dict(zip(actual, counts))

{0: 50, 1: 50, 2: 50}

In [9]:
from sklearn.metrics import accuracy_score, recall_score, silhouette_score, homogeneity_score, completeness_score, v_measure_score, adjusted_mutual_info_score

In [10]:
acc = round(accuracy_score(iris_y.target, labels),3)
rec = round(recall_score(iris_y.target, labels,average="macro"),3)
sil = round(silhouette_score(iris_x_scaled, labels,metric='sqeuclidean'),3)
homg = round(homogeneity_score(iris_y.target, labels),3)
comp = round(completeness_score(iris_y.target, labels),3)
vms = round(v_measure_score(iris_y.target, labels),3)
mis = round(adjusted_mutual_info_score(iris_y.target, labels),3)

In [11]:
print("The accuracy score for the model is: {}".format(acc))
print("The recall score for the model is: {}".format(rec))
print("The silhouette score for the model is: {}".format(sil))
print("The homogeneity score for the model is: {}".format(homg))
print("The completeness score for the model is: {}".format(homg))
print("The V-measure score for the model is: {}".format(vms))
print("The Adjusted Mutual Information score for the model is: {}".format(mis))

The accuracy score for the model is: 0.8
The recall score for the model is: 0.8
The silhouette score for the model is: 0.629
The homogeneity score for the model is: 0.607
The completeness score for the model is: 0.607
The V-measure score for the model is: 0.612
The Adjusted Mutual Information score for the model is: 0.607


https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/

 - __Search for best affinity & linkage parameter__
 - __ward method does not work with l2,l2__

In [12]:
affinity = ["euclidean", "l1", "l2", "manhattan", "cosine"]
linkage = ["complete", "average", "single"]
acc_score = []
vms_score = []
method = []
for i in affinity:
    for j in linkage:
        model = AgglomerativeClustering(n_clusters=3,
                                        affinity=i,
                                        linkage=j).fit(iris_x_scaled)
        labels = model.labels_
        acc = round(accuracy_score(iris_y.target, labels),3)
        vms = round(v_measure_score(iris_y.target, labels),3)
        iter_n = i+"-"+j
        acc_score.append(acc)
        vms_score.append(vms)
        method.append(iter_n)

In [13]:
results = np.array((method, acc_score, vms_score)).T
results = pd.DataFrame(results, columns=["Method", "Accuracy", "V_Measure"])
results

Unnamed: 0,Method,Accuracy,V_Measure
0,euclidean-complete,0.547,0.395
1,euclidean-average,0.68,0.674
2,euclidean-single,0.0,0.72
3,l1-complete,0.02,0.68
4,l1-average,0.687,0.713
5,l1-single,0.0,0.72
6,l2-complete,0.547,0.395
7,l2-average,0.68,0.674
8,l2-single,0.0,0.72
9,manhattan-complete,0.02,0.68
