In [1]:
# Uncomment and run the lines below if these packages are missing or outdated
#!pip install kemlglearn --upgrade
#!pip install scikit-learn --upgrade

# Consensus Clustering

In [2]:
%matplotlib notebook
from sklearn.datasets import load_iris,  make_moons, make_circles
from sklearn.metrics import adjusted_mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from kemlglearn.datasets import make_blobs
from sklearn.cluster import KMeans
from kemlglearn.cluster.consensus import SimpleConsensusClustering
import numpy as np
from numpy.random import normal

import warnings
warnings.filterwarnings('ignore')

# Load the iris dataset once: `data` is the feature matrix and `labels` the
# ground-truth class of each sample (used only to score the clusterings).
data, labels = load_iris(return_X_y=True)

The `kemlglearn` library has an implementation of a simple consensus algorithm based on the co-association matrix.

The base clustering algorithm is K-means. The consensus estimator has the following parameters:

* `n_clusters` = Number of clusters in the final consensus clustering
* `n_clusters_base` = Number of clusters used by each base clustering
* `n_components` = Number of base clusterings combined in the consensus
* `ncb_rand` = Whether the number of clusters of each component is chosen randomly in the interval [2..`n_clusters`]

We will start by applying consensus clustering to the iris dataset and comparing it with a single K-means run.

Feel free to experiment with the parameters of the consensus to see if there is any improvement with respect to the default parameter values.



We will start with the iris dataset. In this case there is a slight improvement in the AMI with respect to the ground truth when using the consensus, which combines 30 clusterings with 10 clusters each. Obviously, in a real application we will not have the ground truth and we will need to explore the hyperparameters of the consensus clustering method, but we can use internal quality measures to assess the quality of the clustering.

In [3]:
# Compare a single K-means run with the consensus of 30 base clusterings
# (10 clusters each) on the current `data`; both are scored against the
# ground-truth `labels` with the adjusted mutual information (AMI).
nc = 3

# K-means is stochastic: a fixed random_state makes its score reproducible
# when the notebook is re-run from a fresh kernel.
km = KMeans(n_clusters=nc, random_state=0)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=10, n_components=30, ncb_rand=False)

lkm = km.fit_predict(data)

# NOTE(review): no seed argument is passed to SimpleConsensusClustering here;
# it is assumed to draw from NumPy's global RNG, so that RNG is seeded right
# before fitting -- confirm against the kemlglearn implementation.
np.random.seed(0)
cons.fit(data)
lcons = cons.labels_

print('K-M AMI =', adjusted_mutual_info_score(labels, lkm))
print('SCC AMI =', adjusted_mutual_info_score(labels, lcons))

K-M AMI = 0.7551191675800484
SCC AMI = 0.7954205025674187


In [4]:
# Side-by-side scatter plots of the first two features of `data`, coloured by
# the ground truth, the K-means labels and the consensus labels.
# Cluster ids are arbitrary, so the same group may get a different colour
# in each panel; compare the shapes of the groups, not the colours.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
panels = [('Ground Truth', labels), ('K-means', lkm), ('Simple Consensus', lcons)]
for ax, (title, assignment) in zip(axes, panels):
    # Draw on the explicit Axes instead of relying on pyplot's "current axes".
    ax.scatter(data[:, 0], data[:, 1], c=assignment)
    ax.set_title(title)

<IPython.core.display.Javascript object>

Now we will apply the consensus clustering to two clusters of different sizes and densities, where K-means usually has difficulties. We can experiment with the parameters of the consensus but also with the characteristics of the dataset; in this case we will see what happens with a particular dataset and parameters.

In [5]:
# Two blobs of different size and density (presumably 50 tight points around
# (1, 1) and 200 spread-out points around (0, 0) -- per the argument names of
# kemlglearn's make_blobs). The fixed random_state keeps the dataset stable.
data, labels = make_blobs(
    n_samples=[50, 200],
    n_features=2,
    centers=[[1, 1], [0, 0]],
    cluster_std=[0.1, 0.4],
    random_state=2,
)

In [6]:
# Compare a single K-means run with the consensus of 50 base clusterings
# (20 clusters each) on the current `data`; both are scored against the
# ground-truth `labels` with the adjusted mutual information (AMI).
nc = 2

# K-means is stochastic: a fixed random_state makes its score reproducible
# when the notebook is re-run from a fresh kernel.
km = KMeans(n_clusters=nc, random_state=0)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=20, n_components=50, ncb_rand=False)

lkm = km.fit_predict(data)

# NOTE(review): no seed argument is passed to SimpleConsensusClustering here;
# it is assumed to draw from NumPy's global RNG, so that RNG is seeded right
# before fitting -- confirm against the kemlglearn implementation.
np.random.seed(0)
cons.fit(data)
lcons = cons.labels_

print('K-M AMI =', adjusted_mutual_info_score(labels, lkm))
print('SCC AMI  =', adjusted_mutual_info_score(labels, lcons))

K-M AMI = 0.6977028093054155
SCC AMI  = 0.8929632461306622


We can see a clear difference in the quality of the consensus clustering; notice that the base clusterings have 20 clusters each. Usually a large number of clusters has advantages because we group the data at a certain granularity, and mixing all the partitions allows us to discover more clearly how the data is distributed.

In [7]:
# Side-by-side scatter plots of the two features of `data`, coloured by
# the ground truth, the K-means labels and the consensus labels.
# Cluster ids are arbitrary, so the same group may get a different colour
# in each panel; compare the shapes of the groups, not the colours.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
panels = [('Ground Truth', labels), ('K-means', lkm), ('Simple Consensus', lcons)]
for ax, (title, assignment) in zip(axes, panels):
    # Draw on the explicit Axes instead of relying on pyplot's "current axes".
    ax.scatter(data[:, 0], data[:, 1], c=assignment)
    ax.set_title(title)

<IPython.core.display.Javascript object>

Now we will apply consensus clustering to elongated clusters, which are also difficult for K-means.

In [8]:
# Two elongated Gaussian clusters centred at x = -0.5 and x = +0.5:
# narrow along x (std v1) and wide along y (std v2).
sc1 = 100  # number of samples in the first cluster
v1 = 0.1   # standard deviation along x (passed as `scale`, not a variance)
sc2 = 100  # number of samples in the second cluster
v2 = 0.9   # standard deviation along y

# A local, seeded generator keeps the dataset identical across re-runs
# without touching NumPy's global random state.
rng = np.random.default_rng(0)

data = np.zeros((sc1 + sc2, 2))
data[:sc1, 0] = rng.normal(loc=-0.5, scale=v1, size=sc1)
data[:sc1, 1] = rng.normal(loc=0.0, scale=v2, size=sc1)
data[sc1:, 0] = rng.normal(loc=0.5, scale=v1, size=sc2)
data[sc1:, 1] = rng.normal(loc=0.0, scale=v2, size=sc2)

# Ground truth: 0 for the first sc1 rows, 1 for the remaining sc2 rows.
labels = np.zeros(sc1 + sc2)
labels[sc1:] = 1

In [9]:
# Compare a single K-means run with the consensus of 150 base clusterings
# (30 clusters each) on the current `data`; both are scored against the
# ground-truth `labels` with the adjusted mutual information (AMI).
nc = 2

# K-means is stochastic: a fixed random_state makes its score reproducible
# when the notebook is re-run from a fresh kernel.
km = KMeans(n_clusters=nc, random_state=0)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=30, n_components=150, ncb_rand=False)

lkm = km.fit_predict(data)

# NOTE(review): no seed argument is passed to SimpleConsensusClustering here;
# it is assumed to draw from NumPy's global RNG, so that RNG is seeded right
# before fitting -- confirm against the kemlglearn implementation.
np.random.seed(0)
cons.fit(data)
lcons = cons.labels_

print('K-M AMI =', adjusted_mutual_info_score(labels, lkm))
print('SCC AMI  =', adjusted_mutual_info_score(labels, lcons))

K-M AMI = -0.0033766715369541404
SCC AMI  = 0.4185407002995434


The results will not always be good, but they will be more consistent than those of K-means. Some parameter experimentation will be required, though.

In [10]:
# Side-by-side scatter plots of the two features of `data`, coloured by
# the ground truth, the K-means labels and the consensus labels.
# Cluster ids are arbitrary, so the same group may get a different colour
# in each panel; compare the shapes of the groups, not the colours.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
panels = [('Ground Truth', labels), ('K-means', lkm), ('Simple Consensus', lcons)]
for ax, (title, assignment) in zip(axes, panels):
    # Draw on the explicit Axes instead of relying on pyplot's "current axes".
    ax.scatter(data[:, 0], data[:, 1], c=assignment)
    ax.set_title(title)

<IPython.core.display.Javascript object>

This is the two rings dataset, where K-means cannot generate the true clusters.

In [11]:
# Two noisy concentric rings; `factor` is the inner-to-outer scale ratio.
# The fixed random_state keeps the dataset identical across re-runs.
data, labels = make_circles(
    n_samples=400,
    factor=0.3,
    noise=0.1,
    random_state=4,
)

In [12]:
# Compare a single K-means run with the consensus of 50 base clusterings
# (20 clusters each) on the current `data`; both are scored against the
# ground-truth `labels` with the adjusted mutual information (AMI).
nc = 2

# K-means is stochastic: a fixed random_state makes its score reproducible
# when the notebook is re-run from a fresh kernel.
km = KMeans(n_clusters=nc, random_state=0)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=20, n_components=50, ncb_rand=False)

lkm = km.fit_predict(data)

# NOTE(review): no seed argument is passed to SimpleConsensusClustering here;
# it is assumed to draw from NumPy's global RNG, so that RNG is seeded right
# before fitting -- confirm against the kemlglearn implementation.
np.random.seed(0)
cons.fit(data)
lcons = cons.labels_

print('K-M AMI =', adjusted_mutual_info_score(labels, lkm))
print('SCC AMI  =', adjusted_mutual_info_score(labels, lcons))

K-M AMI = -0.0013617681496602037
SCC AMI  = 0.9772422365106967


Consensus clustering is able to recover the true clusters almost perfectly.

In [13]:
# Side-by-side scatter plots of the two features of `data`, coloured by
# the ground truth, the K-means labels and the consensus labels.
# Cluster ids are arbitrary, so the same group may get a different colour
# in each panel; compare the shapes of the groups, not the colours.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
panels = [('Ground Truth', labels), ('K-means', lkm), ('Simple Consensus', lcons)]
for ax, (title, assignment) in zip(axes, panels):
    # Draw on the explicit Axes instead of relying on pyplot's "current axes".
    ax.scatter(data[:, 0], data[:, 1], c=assignment)
    ax.set_title(title)

<IPython.core.display.Javascript object>

This is the two moons dataset; it is more difficult than the previous ones.

In this case it is difficult to find good parameters that separate well both classes.

In [14]:
# Two interleaving noisy half-circles. A fixed random_state is passed, as for
# the other generated datasets in this notebook, so that re-running the
# notebook produces the same points and therefore comparable scores.
data, labels = make_moons(n_samples=250, noise=0.1, random_state=0)

In [15]:
# Compare a single K-means run with the consensus of 150 base clusterings
# (15 clusters each) on the current `data`; both are scored against the
# ground-truth `labels` with the adjusted mutual information (AMI).
nc = 2

# K-means is stochastic: a fixed random_state makes its score reproducible
# when the notebook is re-run from a fresh kernel.
km = KMeans(n_clusters=nc, random_state=0)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=15, n_components=150, ncb_rand=False)

lkm = km.fit_predict(data)

# NOTE(review): no seed argument is passed to SimpleConsensusClustering here;
# it is assumed to draw from NumPy's global RNG, so that RNG is seeded right
# before fitting -- confirm against the kemlglearn implementation.
np.random.seed(0)
cons.fit(data)
lcons = cons.labels_

print('K-M AMI =', adjusted_mutual_info_score(labels, lkm))
print('SCC AMI  =', adjusted_mutual_info_score(labels, lcons))

K-M AMI = 0.1770350337952665
SCC AMI  = 0.3408133125573172


In [17]:
# Side-by-side scatter plots of the two features of `data`, coloured by
# the ground truth, the K-means labels and the consensus labels.
# Cluster ids are arbitrary, so the same group may get a different colour
# in each panel; compare the shapes of the groups, not the colours.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
panels = [('Ground Truth', labels), ('K-means', lkm), ('Simple Consensus', lcons)]
for ax, (title, assignment) in zip(axes, panels):
    # Draw on the explicit Axes instead of relying on pyplot's "current axes".
    ax.scatter(data[:, 0], data[:, 1], c=assignment)
    ax.set_title(title)

<IPython.core.display.Javascript object>