In [None]:
from sklearn import cluster
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import logging, sys

%matplotlib inline

# Build the formatter first so the handler can be fully configured in one place.
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')

# Single handler that writes every record to STDERR.
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Configure the root logger: emit everything from DEBUG upwards ...
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# ... and replace (rather than append to) whatever handlers were installed
# before, so re-running this cell never duplicates log lines.
logger.handlers = [handler]

# Iris dataset using kmeans, agglomerative clustering (agnes), and DBSCAN

In [None]:
# The file has no header row, so the column names are supplied here:
# four measurements followed by the species label.
iris = pd.read_csv(
    'iris.data',
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'species',
    ],
    header=None,
)

In [None]:
iris.species = iris.species.astype('category')

In [None]:
iris['cats'] = iris.species.cat.codes

### First, a 5D visualization

In [None]:
def plot_iris_group(iris, group, ax):
    """Scatter the iris measurements on ``ax``, one marker shape per group.

    Sepal length, sepal width and petal length are mapped to the x, y and z
    positions; petal width is mapped to colour through the ``'hot'`` colormap.

    Parameters
    ----------
    iris : pandas.DataFrame
        Must contain ``sepal_length``, ``sepal_width``, ``petal_length``,
        ``petal_width`` and the column named by ``group``.
    group : str
        Name of the column to group by. Every distinct value is drawn with
        its own marker; markers repeat once the three shapes are used up.
    ax : axes-like object
        Anything exposing ``scatter(x, y, z, ...)``; in this notebook a
        matplotlib axes created with ``projection='3d'``.

    Returns
    -------
    The ``ax`` that was passed in, to allow chaining.
    """
    markers = ['o', '^', 's']

    # Use one colour scale for all groups. Without explicit limits every
    # scatter call normalises on its own subset, so the same colour would
    # mean a different petal width in each group.
    vmin = iris.petal_width.min()
    vmax = iris.petal_width.max()

    for index, (_, members) in enumerate(iris.groupby(group)):
        ax.scatter(
            members.sepal_length,
            members.sepal_width,
            members.petal_length,
            c=members.petal_width,
            # Name the colormap directly: plt.hot() returns None and only
            # worked by changing the global default colormap as a side effect.
            cmap='hot',
            vmin=vmin,
            vmax=vmax,
            # Cycle through the markers so a fourth (or later) group is still
            # drawn instead of being silently dropped.
            marker=markers[index % len(markers)],
        )
    return ax

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
plot_iris_group(iris, 'species', ax)

> One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.

The bottom row of circles is setosa, which is clearly distinct. The boundary between the other two species is much less clear.

In [None]:
iris_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [None]:
# KMeans starts from random centroids: fix the seed so cluster ids (and every
# table/plot derived from them) are identical on each Restart & Run All.
# n_init is pinned to 10 restarts so the result does not depend on which
# scikit-learn default is in effect.
kmeans_iris = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)

In [None]:
iris['kmeans'] = kmeans_iris.fit_predict(iris[iris_attributes])

In [None]:
agnes = cluster.AgglomerativeClustering(n_clusters=3)

In [None]:
iris['agnes'] = agnes.fit_predict(iris[iris_attributes])

In [None]:
dbscan = cluster.DBSCAN(eps=.5, min_samples=4)

In [None]:
iris['dbscan'] = dbscan.fit_predict(iris[iris_attributes])
iris['dbscan'] = iris['dbscan'].replace(-1, np.NaN)

### What cluster is each species?

In [None]:
iris.groupby('species').mean()

### Where do the algorithms differ?

In [None]:
# Show the first rows where the three cluster-id columns are not all equal.
# NOTE(review): each algorithm numbers its clusters independently, so differing
# ids do not by themselves mean the algorithms disagree - confirm by aligning
# the ids (e.g. against `species`) before reading much into this table.
# Rows whose `dbscan` value is NaN (noise) always match, since NaN != x is True.
iris.query('kmeans != agnes | kmeans != dbscan | agnes != dbscan').head()

### A graph of each cluster

In [None]:
# One 3D panel per labelling: the true species first, then each algorithm.
grouping_columns = ['species', 'kmeans', 'agnes', 'dbscan']
fig, ax = plt.subplots(
    len(grouping_columns),
    figsize=(5, 20),
    subplot_kw={'projection': '3d'},
)
for panel, column in zip(ax, grouping_columns):
    panel.set_title(column)
    plot_iris_group(iris, column, panel)
plt.show()

---

# BUPA dataset using kmeans, agglomerative clustering (agnes), and DBSCAN

In [None]:
bupa = pd.read_csv(
    'bupa.data',
    header=None,
    names=[
        'mcv',       # mean corpuscular volume
                     # normal range: 80-96 fL/red cell
        'alkphos',   # alkaline phosphatase
        'sgpt',      # alanine aminotransferase
        'sgot',      # aspartate aminotransferase
        'gammagt',   # gamma-glutamyl transpeptidase
        'drinks',    # number of half-pint equivalents of alcoholic beverages drunk per day
        'selector',  # field used to split data into two sets
    ],
)
bupa = bupa.drop('selector', axis=1)  # their split isn't relevant for us

# Convert half-pints to standard drinks. A half-pint is 8 oz and a standard
# "drink" is 12 oz, so n half-pints are n * 8 / 12 drinks. The previous factor
# of 1.5 (= 12 / 8) applied the ratio in the wrong direction.
bupa['drinks'] = bupa['drinks'] * 8 / 12

In [None]:
bupa_attributes = ['mcv', 'alkphos', 'sgpt', 'sgot', 'gammagt']

In [None]:
# One stacked panel per blood-test attribute, each plotted against `drinks`.
fig, axes = plt.subplots(len(bupa_attributes), figsize=(5, 20))
for ax, attr in zip(axes, bupa_attributes):
    ax.scatter(bupa[attr], bupa['drinks'])
    ax.set(xlabel=attr, ylabel='drinks')

In [None]:
# Seeded so the two BUPA clusters keep the same ids (and plot colours) on every
# run; n_init is pinned to 10 restarts so the result does not depend on which
# scikit-learn default is in effect.
bupa_kmeans = cluster.KMeans(2, n_init=10, random_state=0)

In [None]:
bupa['kmeans'] = bupa_kmeans.fit_predict(bupa[bupa_attributes])

In [None]:
bupa_agnes = cluster.AgglomerativeClustering(2)

In [None]:
bupa['agnes'] = bupa_agnes.fit_predict(bupa[bupa_attributes])

In [None]:
# not a good idea to use dbscan on this dataset

In [None]:
colors = ['r', 'b']  # kept for compatibility; the plots below colour via the 'viridis' colormap
models = ['kmeans', 'agnes']

# For every attribute draw one figure with a panel per clustering model,
# colouring each point by the cluster that model assigned to it.
for attr in bupa_attributes:
    fig, axes = plt.subplots(len(models), figsize=(5, 10))
    for ax, model in zip(axes, models):
        ax.scatter(bupa[attr], bupa['drinks'], c=bupa[model], cmap='viridis')
        ax.set(title=model, xlabel=attr, ylabel='drinks')
    plt.show()

### Not so great

---

### Let's try generated data

In [None]:
from sklearn import datasets

In [None]:
import kmeans, agnes, dbscan

In [None]:
from sklearn import metrics

In [None]:
def get_metrics(df, labels, i_labels):
    """Score one clustering and return the results as a pandas Series.

    Parameters
    ----------
    df : the samples that were clustered (passed to the silhouette score).
    labels : reference labels for the samples.
    i_labels : labels inferred by the clustering under evaluation.

    Returns
    -------
    pandas.Series indexed by 'ARI', 'NMI', 'Homogeneity', 'Completeness'
    and 'Silhouette', in that order.
    """
    # Scores that compare the inferred labels against the reference labels.
    against_reference = [
        ('ARI', metrics.adjusted_rand_score),
        ('NMI', metrics.normalized_mutual_info_score),
        ('Homogeneity', metrics.homogeneity_score),
        ('Completeness', metrics.completeness_score),
    ]
    scores = {name: scorer(labels, i_labels) for name, scorer in against_reference}

    # The silhouette needs the samples themselves rather than reference labels.
    scores['Silhouette'] = metrics.silhouette_score(df, i_labels, metric='euclidean')

    return pd.Series(scores)

In [None]:
evaluation = pd.DataFrame()

In [None]:
# Two interleaving half-moons. Everything after n_samples is passed by keyword:
# scikit-learn >= 1.0 rejects these arguments when given positionally.
circles, c_labels = datasets.make_moons(
    n_samples=400,
    shuffle=True,
    noise=0.001,
    random_state=43,
)

In [None]:
circles += 1,1
circles *= 3

In [None]:
blobs, b_labels = datasets.make_blobs(n_samples=600, random_state=31)

In [None]:
blobs -= 1, 1

In [None]:
merged = np.concatenate((circles, blobs))

# Both generators number their classes from 0, so shift the blob labels past
# the moon labels. Without the offset moon 0 / blob 0 and moon 1 / blob 1 would
# share a ground-truth label, and the supervised metrics (ARI, NMI, ...) would
# be computed against 3 classes instead of the 5 groups actually present.
labels = np.concatenate((c_labels, b_labels + c_labels.max() + 1))

In [None]:
y = [i[1] for i in merged]
x = [i[0] for i in merged]

In [None]:
df = pd.DataFrame({'x': x, 'y' : y})

In [None]:
plt.scatter(df.x, df.y)

scikit-learn's KMeans implementation:

In [None]:
# Reference run. Seeded (and with the number of restarts pinned to 10) so the
# plot and the metrics recorded from `k.labels_` are reproducible across runs.
k = cluster.KMeans(5, n_init=10, random_state=0)
k.fit(df)
plt.scatter(df.x, df.y, c=k.labels_, cmap='prism')

In [None]:
evaluation['KMeans_ref'] = get_metrics(df, labels, k.labels_)

Our implementation:

In [None]:
k = kmeans.KMeans(5)
k.Fit(df)
plt.scatter(df.x, df.y, c=k.GetLabels(df.shape[0]), cmap='prism')

In [None]:
evaluation['KMeans_impl'] = get_metrics(df, labels, k.GetLabels(df.shape[0]))

In [None]:
evaluation

scikit-learn's AGNES (agglomerative clustering) implementation:

In [None]:
a = cluster.AgglomerativeClustering(5, linkage='complete')
a.fit(df)
plt.scatter(df.x, df.y, c=a.labels_ ,cmap='prism')

In [None]:
evaluation['Agnes_ref'] = get_metrics(df, labels, a.labels_)

Our Agnes implementation

In [None]:
a = agnes.Agnes(5, 'complete')
a.Fit(df)
plt.scatter(df.x, df.y, c=a.GetLabels(df.shape[0]), cmap='prism')

In [None]:
evaluation['Agnes_impl'] = get_metrics(df, labels, a.GetLabels(df.shape[0]))

In [None]:
evaluation

scikit-learn's DBSCAN implementation:

In [None]:
# Reference run. `min_samples` is keyword-only in scikit-learn >= 1.0, so the
# positional form DBSCAN(.6, 6) raises TypeError there; name both arguments.
d = cluster.DBSCAN(eps=.6, min_samples=6)
d.fit(df)
plt.scatter(df.x, df.y, c=d.labels_, cmap='prism', label='DBSCAN')

Our DBSCAN implementation

In [None]:
dbs = dbscan.DBSCAN(.6,6)
dbs.Fit(df)
plt.scatter(df.x, df.y, c=dbs.GetLabels(df.shape[0]), cmap='prism', label='DBSCAN')