In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.datasets import load_digits

In [2]:
dataset = load_digits()
digits_x, digits_y = dataset.data, dataset.target
digits_x.shape, digits_y.shape

((1797, 64), (1797,))

In [13]:
actual, counts = np.unique(digits_y), np.bincount(digits_y)
dict(zip(actual, counts))

{0: 178,
 1: 182,
 2: 177,
 3: 183,
 4: 181,
 5: 182,
 6: 181,
 7: 179,
 8: 174,
 9: 180}

In [15]:
from sklearn.cluster import Birch

In [17]:
model = Birch(n_clusters=10, compute_labels=True, threshold=0.5, branching_factor=50).fit(digits_x)
labels = model.labels_

In [18]:
cluster, counts = np.unique(labels, return_counts=True)
dict(zip(cluster, counts))

{0: 181, 1: 317, 2: 197, 3: 196, 4: 191, 5: 178, 6: 181, 7: 178, 8: 80, 9: 98}

In [19]:
from sklearn.metrics import accuracy_score, recall_score, silhouette_score, homogeneity_score, completeness_score, v_measure_score, adjusted_mutual_info_score

In [22]:
acc = round(accuracy_score(digits_y, labels),3)
rec = round(recall_score(digits_y, labels,average="macro"),3)
sil = round(silhouette_score(digits_x, labels,metric='sqeuclidean'),3)
homg = round(homogeneity_score(digits_y, labels),3)
comp = round(completeness_score(digits_y, labels),3)
vms = round(v_measure_score(digits_y, labels),3)
mis = round(adjusted_mutual_info_score(digits_y, labels),3)

In [23]:
print("The accuracy score for the model is: {}".format(acc))
print("The recall score for the model is: {}".format(rec))
print("The silhouette score for the model is: {}".format(sil))
print("The homogeneity score for the model is: {}".format(homg))
print("The completeness score for the model is: {}".format(homg))
print("The V-measure score for the model is: {}".format(vms))
print("The Adjusted Mutual Information score for the model is: {}".format(mis))

The accuracy score for the model is: 0.194
The recall score for the model is: 0.194
The silhouette score for the model is: 0.289
The homogeneity score for the model is: 0.858
The completeness score for the model is: 0.858
The V-measure score for the model is: 0.868
The Adjusted Mutual Information score for the model is: 0.867


 - Birch can handle large datasets
 - User does not have specify number of clusters in the beginning
 - Birch may not perform well in a datasets having excessive number of features

In [28]:
threshold = np.linspace(0.10, 1.0, 10, endpoint=False)
mi = []
v = []
for i in threshold:
    model = Birch(n_clusters=10,
                 threshold=i,
                 branching_factor=50,
                 compute_labels=True).fit(digits_x)
    labels = model.labels_
    mis = round(adjusted_mutual_info_score(digits_y, labels),3)
    vms = round(v_measure_score(digits_y, labels),3)
    mi.append(mis)
    v.append(vms)

In [29]:
mi

[0.867, 0.867, 0.867, 0.867, 0.867, 0.867, 0.867, 0.867, 0.867, 0.867]

In [30]:
v

[0.868, 0.868, 0.868, 0.868, 0.868, 0.868, 0.868, 0.868, 0.868, 0.868]