In [25]:
# Importing libraries
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, Birch, MeanShift, estimate_bandwidth 
from sklearn.datasets import load_boston, load_digits, load_iris
from sklearn.metrics.cluster import normalized_mutual_info_score

In [4]:
# Loading the data from sklearn datasets

# Boston dataset
boston_dataset = load_boston()
print('Shape of Boston dataset: ' + str(boston_dataset.data.shape))

# Digits dataset
digits_dataset = load_digits()
print('Shape of Digits dataset: ' + str(digits_dataset.data.shape))

# Iris dataset
iris_dataset = load_iris()
print('Shape of Iris dataset: ' + str(iris_dataset.data.shape))

Shape of Boston dataset: (506, 13)
Shape of Digits dataset: (1797, 64)
Shape of Iris dataset: (150, 4)


In [72]:
# KMeans for Boston dataset
kmeans_boston = KMeans(n_clusters=30, random_state=0, init="k-means++").fit(boston_dataset.data)
km_y_pred_boston = kmeans_boston.labels_
km_centers_boston = kmeans_boston.cluster_centers_

# Accuracy for Boston dataset
print('Accuracy for Boston dataset with KMeans:')
normalized_mutual_info_score(boston_dataset.target, km_y_pred_boston)

Accuracy for Boston dataset with KMeans:


0.55959778366226098

In [17]:
# Mini Batch KMeans for Boston dataset
mb_kmeans_boston = MiniBatchKMeans(n_clusters=30, compute_labels=True, random_state=0).fit(boston_dataset.data)
mb_y_pred_boston = mb_kmeans_boston.labels_
mb_centers_boston = mb_kmeans_boston.cluster_centers_

# Accuracy for Boston dataset
print('Accuracy for Boston dataset with Mini Batch KMeans:')
normalized_mutual_info_score(boston_dataset.target, mb_y_pred_boston)

Accuracy for Boston dataset with Mini Batch KMeans:


0.62450965470326036

In [88]:
# Agglomerative Clustering for Boston dataset
agg_boston = AgglomerativeClustering(n_clusters=30, linkage="ward", affinity="euclidean").fit(boston_dataset.data)
agg_y_pred_boston = agg_boston.labels_

# Accuracy for Boston dataset with Agglomerative Clustering
print('Accuracy for Boston dataset with Agglomerative Clustering')
normalized_mutual_info_score(boston_dataset.target, agg_y_pred_boston)

Accuracy for Boston dataset with Agglomerative Clustering


0.54975352364950936

In [97]:
# Birch clustering for Boston dataset
birch_boston = Birch(threshold=0.5, branching_factor=50, n_clusters=None, compute_labels=True).fit(boston_dataset.data)
birch_y_pred_boston = birch_boston.labels_

# Accuracy for Boston dataset with Birch Clustering
print('Accuracy for Boston dataset with Birch Clustering')
normalized_mutual_info_score(boston_dataset.target, birch_y_pred_boston)

Accuracy for Boston dataset with Birch Clustering


0.91510354070503575

In [56]:
# Mean Shift for Boston dataset
bandwidth = estimate_bandwidth(iris_dataset.data, quantile=0.35, n_samples=100)
ms_boston = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(boston_dataset.data)
ms_y_pred_boston = ms_boston.labels_

# Accuracy for Boston dataset with Mean Shift Clustering
print('Accuracy for Boston dataset with Mean Shift Clustering')
normalized_mutual_info_score(boston_dataset.target, ms_y_pred_boston)

Accuracy for Boston dataset with Mean Shift Clustering


  " using data points as seeds." % bin_size)


0.91342589530607665

In [71]:
# KMeans for Digits dataset
kmeans_digits = KMeans(n_clusters=10, random_state=0, init="k-means++").fit(digits_dataset.data)
y_pred_digits = kmeans_digits.labels_
centers_digits = kmeans_digits.cluster_centers_

# Accuracy for Digits dataset with KMeans
print('Accuracy for Digits dataset with KMeans:')
normalized_mutual_info_score(digits_dataset.target, y_pred_digits)

Accuracy for Digits dataset with KMeans:


0.74689035546457949

In [6]:
# Mini Batch KMeans for Digits dataset
mb_kmeans_digits = MiniBatchKMeans(n_clusters=10, compute_labels=True, random_state=0).fit(digits_dataset.data)
mb_y_pred_digits = mb_kmeans_digits.labels_
mb_centers_digits = mb_kmeans_digits.cluster_centers_

# Accuracy for Digits dataset with Mini Batch KMeans
print('Accuracy for Digits dataset with Mini Batch KMeans:')
normalized_mutual_info_score(digits_dataset.target, mb_y_pred_digits)

Accuracy for Digits dataset with Mini Batch KMeans:


0.70971262593334838

In [40]:
# Agglomerative Clustering for Digits dataset
agg_digits = AgglomerativeClustering(n_clusters=10, linkage="ward", affinity="euclidean").fit(digits_dataset.data)
agg_y_pred_digits = agg_digits.labels_

# Accuracy for Digits dataset with Agglomerative Clustering
print('Accuracy for Digits dataset with Agglomerative Clustering')
normalized_mutual_info_score(digits_dataset.target, agg_y_pred_digits)

Accuracy for Boston dataset with Agglomerative Clustering


0.86823716803658135

In [98]:
# Birch clustering for Digits dataset
birch_digits = Birch(threshold=0.5, branching_factor=50, n_clusters=10, compute_labels=True).fit(digits_dataset.data)
birch_y_pred_digits = birch_digits.labels_

# Accuracy for Boston dataset with Birch Clustering
print('Accuracy for Digits dataset with Birch Clustering')
normalized_mutual_info_score(digits_dataset.target, birch_y_pred_digits)

Accuracy for Digits dataset with Birch Clustering


0.86823716803658135

In [51]:
# Mean Shift for Digits dataset
ms_digits = MeanShift(bandwidth=10.0, bin_seeding=True).fit(digits_dataset.data)
ms_y_pred_digits = ms_digits.labels_

# Accuracy for Digits dataset with Mean Shift Clustering
print('Accuracy for Digits dataset with Mean Shift Clustering')
normalized_mutual_info_score(digits_dataset.target, ms_y_pred_digits)

  " using data points as seeds." % bin_size)


Accuracy for Digits dataset with Mean Shift Clustering


0.55517928748170609

In [69]:
# KMeans for Iris dataset
kmeans_iris = KMeans(n_clusters=3, random_state=0, init="k-means++").fit(iris_dataset.data)
y_pred_iris = kmeans_iris.labels_
centers_iris = kmeans_iris.cluster_centers_

# Accuracy for Iris dataset with KMeans
print('Accuracy for Iris dataset with KMeans:')
normalized_mutual_info_score(iris_dataset.target, y_pred_iris)

Accuracy for Iris dataset with KMeans:


0.75820572781941964

In [70]:
# Mini Batch KMeans for Iris dataset
mb_kmeans_iris = MiniBatchKMeans(n_clusters=3, compute_labels=True, random_state=0).fit(iris_dataset.data)
mb_y_pred_iris = mb_kmeans_iris.labels_
mb_centers_iris = mb_kmeans_iris.cluster_centers_

# Accuracy for Iris dataset with Mini Batch KMeans
print('Accuracy for Iris dataset with Mini Batch KMeans:')
normalized_mutual_info_score(iris_dataset.target, mb_y_pred_iris)

Accuracy for Iris dataset with Mini Batch KMeans:


0.75820572781941964

In [57]:
# Agglomerative Clustering for Iris dataset
agg_iris = AgglomerativeClustering(n_clusters=3, linkage="average", affinity="l2").fit(iris_dataset.data)
agg_y_pred_iris = agg_iris.labels_

# Accuracy for Iris dataset with Agglomerative Clustering
print('Accuracy for Iris dataset with Agglomerative Clustering')
normalized_mutual_info_score(iris_dataset.target, agg_y_pred_iris)

Accuracy for Iris dataset with Agglomerative Clustering


0.80575367113055041

In [92]:
# Birch clustering for Iris dataset
birch_iris = Birch(threshold=0.2, branching_factor=50, n_clusters=3, compute_labels=True).fit(iris_dataset.data)
birch_y_pred_iris = birch_iris.labels_

# Accuracy for Iris dataset with Birch Clustering
print('Accuracy for Iris dataset with Birch Clustering')
normalized_mutual_info_score(iris_dataset.target, birch_y_pred_iris)

Accuracy for Iris dataset with Birch Clustering


0.79806742053659263

In [61]:
# Mean Shift for Iris dataset
bandwidth = estimate_bandwidth(iris_dataset.data, quantile=0.3, n_samples=200)
ms_iris = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(iris_dataset.data)
ms_y_pred_iris = ms_iris.labels_

# Accuracy for Iris dataset with Mean Shift Clustering
print('Accuracy for Iris dataset with Mean Shift Clustering')
normalized_mutual_info_score(iris_dataset.target, ms_y_pred_iris)

Accuracy for Iris dataset with Mean Shift Clustering


0.72492716567676763