In [None]:
from sklearn import cluster
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import logging, sys

%matplotlib inline

# Root logger: let every record from DEBUG upwards through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Build the formatter first, then the STDERR handler that uses it.
# The handler keeps its default level, so it emits whatever the logger passes on.
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Replace any handlers already attached so the STDERR handler is the only one.
logger.handlers = [handler]

---

### Let's try generated data

In [None]:
from sklearn import datasets

In [None]:
import kmeans, agnes, dbscan

In [None]:
from sklearn import metrics

We define a function to evaluate various cluster performance metrics. All except Silhouette compare against the "correct" labels, which are gathered from the data generator.
You can read more about each below:
* [ARI](https://en.wikipedia.org/wiki/Rand_index)
* [NMI](https://en.wikipedia.org/wiki/Mutual_information)
* [Homogeneity](http://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure)
* [Completeness](http://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure)
* [Silhouette](https://en.wikipedia.org/wiki/Silhouette_(clustering))

In [None]:
def get_metrics(df, labels, i_labels):
    """Score one clustering result with several cluster-quality metrics.

    Parameters
    ----------
    df
        The data the clustering was computed on; only used for the
        Silhouette score (Euclidean distance).
    labels
        Reference ("correct") label of every point.
    i_labels
        Label assigned to every point by the clustering being evaluated.

    Returns
    -------
    pandas.Series
        One value per metric, keyed by 'ARI', 'NMI', 'Homogeneity',
        'Completeness' and 'Silhouette'.
    """
    # The first four scores compare the two labelings with each other;
    # Silhouette is computed from the data and the assigned labels alone.
    scores = {
        'ARI': metrics.adjusted_rand_score(labels, i_labels),
        'NMI': metrics.normalized_mutual_info_score(labels, i_labels),
        'Homogeneity': metrics.homogeneity_score(labels, i_labels),
        'Completeness': metrics.completeness_score(labels, i_labels),
        'Silhouette': metrics.silhouette_score(df, i_labels, metric='euclidean'),
    }

    return pd.Series(scores)

In [None]:
# Empty results table; each clustering run below adds one column of metric scores.
evaluation = pd.DataFrame()

Next we'll generate the dataset. We'll make a few blobs with some half-moons in there to throw off traditional clustering algorithms.

In [None]:
# Two interleaving half-moons, 400 points in total, shuffled, with a tiny amount of noise.
# Every option after n_samples is keyword-only in current scikit-learn, so the
# arguments are passed by name; passing them positionally raises a TypeError.
circles, c_labels = datasets.make_moons(n_samples=400, shuffle=True, noise=0.001, random_state=43)

In [None]:
# Shift both coordinates by +1, then scale everything by 3
# (presumably to place the moons next to the blobs -- confirm against the plot).
# NOTE: both statements modify `circles` in place, so re-running this cell
# applies the shift and the scaling a second time.
circles += 1,1
circles *= 3

In [None]:
# 600 blob points with a fixed seed; the number of blobs and of features is left
# at make_blobs' defaults.
blobs, b_labels = datasets.make_blobs(n_samples=600, random_state=31)

In [None]:
# Shift the blobs by -1 on both axes.
# NOTE: this modifies `blobs` in place, so re-running the cell shifts them again.
blobs -= 1, 1

In [None]:
# Stack the moon points on top of the blob points to form a single dataset.
merged = np.concatenate((circles, blobs))
# Both generators number their classes starting at 0, so concatenating the raw
# labels would make a moon and a blob share the same id and the reference-based
# metrics would be scored against the wrong ground truth. Offset the blob
# labels past the moon labels so every generated cluster keeps its own id.
labels = np.concatenate((c_labels, b_labels + c_labels.max() + 1))

In [None]:
# Pull each coordinate column out of the merged point array.
x = list(merged[:, 0])
y = list(merged[:, 1])

In [None]:
# Two-column frame of point coordinates, used by every clustering run below.
df = pd.DataFrame(dict(x=x, y=y))

Below is what the dataset looks like

In [None]:
# Unlabelled view of the raw points.
plt.scatter(df['x'], df['y'])

scikit-learn's KMeans implementation

In [None]:
# Reference run: scikit-learn's k-means with 5 clusters.
# k-means is randomly initialised, so the seed is fixed to keep the plot and the
# scores in the evaluation table reproducible across runs. n_init is pinned
# explicitly so the result does not depend on the installed version's default.
k = cluster.KMeans(n_clusters=5, n_init=10, random_state=0)
k.fit(df)
plt.scatter(df.x, df.y, c=k.labels_, cmap='prism')

In [None]:
# Score the scikit-learn k-means labels and store them as the 'KMeans_ref' column.
evaluation['KMeans_ref'] = get_metrics(df, labels, k.labels_)

Our implementation:

In [None]:
# Same clustering, this time with our own KMeans implementation.
k2 = kmeans.KMeans(5)
k2.Fit(df)
# GetLabels requires a size; here that is the number of points, i.e. one label per row.
plt.scatter(df['x'], df['y'], c=k2.GetLabels(len(df)), cmap='prism')

In [None]:
# Score our k-means labels (one per row of df) and store them as the 'KMeans_impl' column.
evaluation['KMeans_impl'] = get_metrics(df, labels, k2.GetLabels(len(df)))

In [None]:
# Display the metrics collected so far (one column per clustering run).
evaluation

KMeans also has some extra methods associated with it. You may wish to get the cluster centroids for plotting, or be able to predict a new data point

In [None]:
# scikit-learn exposes the fitted centroids as an attribute.
print('Sk-learn clusters centroids')
print(k.cluster_centers_)

print()

# Our C++ implementation fills a caller-supplied array instead:
# allocate an empty [n_centroids, n_attributes] matrix, pass it to
# GetClusters to be populated, then print the populated array.
print('C++ cluster centroids')
centroids = np.empty((5, df.shape[1]))
k2.GetClusters(centroids)
print(centroids)


In [None]:
# Use the first row of the data as the point to classify.
tup = np.array(df.iloc[0])

print('Tuple: {}'.format(tup))

# predicting in sk-learn: the point is reshaped into a single-row 2-D array
# and the one resulting label is taken from the returned array
print('sklearn predicts:')
print(k.predict(tup.reshape(1, -1))[0])

# predicting in c++: Predict is handed the tuple as-is
print('c++ predicts:')
print(k2.Predict(tup))

In [None]:
# while the c++ call looks simpler than sklearn's, it can only predict a single tuple at a time

tups = np.array(df.iloc[[0, 1, 2, 3]])
print('Tuples:')
print(tups)

# predicting multiple tuples in sk-learn: one call for the whole batch
print('sk-learn predicts:')
print(k.predict(tups))

# predicting multiple tuples in c++: one Predict call per row
print('c++ predicts:')
print([k2.Predict(row) for row in tups])

scikit-learn's Agnes (agglomerative clustering) implementation

In [None]:
# Reference run: scikit-learn's agglomerative clustering, 5 clusters, complete linkage.
a = cluster.AgglomerativeClustering(n_clusters=5, linkage='complete')
a.fit(df)
plt.scatter(df['x'], df['y'], c=a.labels_, cmap='prism')

In [None]:
# Score the scikit-learn agglomerative labels and store them as the 'Agnes_ref' column.
evaluation['Agnes_ref'] = get_metrics(df, labels, a.labels_)

Our Agnes implementation

In [None]:
# Same clustering, this time with our own Agnes implementation.
a2 = agnes.Agnes(5, 'complete')
a2.Fit(df)
# GetLabels requires a size; here that is the number of points, i.e. one label per row.
plt.scatter(df['x'], df['y'], c=a2.GetLabels(len(df)), cmap='prism')

In [None]:
# Score our Agnes labels (one per row of df) and store them as the 'Agnes_impl' column.
evaluation['Agnes_impl'] = get_metrics(df, labels, a2.GetLabels(len(df)))

In [None]:
# Display the metrics table again, now including the Agnes columns.
evaluation

Our Agnes implementation comes with the ability to infer the clusters. While the algorithm is not perfect, it produces acceptable results

Compare with bogus values for n_clusters:

In [None]:
# Deliberately request far too many clusters (50) and plot the labels as given.
a2 = agnes.Agnes(50, 'complete')
a2.Fit(df)
plt.scatter(df['x'], df['y'], c=a2.GetLabels(len(df)), cmap='prism')

In [None]:
# Same fitted model, but coloured by the labels InferLabels returns.
plt.scatter(df['x'], df['y'], c=a2.InferLabels(len(df)), cmap='prism')

Agnes also comes with methods to view the cluster hierarchy. `PrintDotGraph()` outputs a block of code suitable for compiling into a graph, such as with graphviz. scikit-learn does not currently have functionality to easily explore the hierarchy structure.

We recommend using http://www.webgraphviz.com/ to preview hierarchies

In [None]:
# fitting with a subset to reduce graph complexity;
# the sample is seeded so the same 100 points (and therefore the same
# printed hierarchy) are produced on every run
a2 = agnes.Agnes(5, 'complete')
a2.Fit(df.sample(n=100, random_state=0))

# PrintDotGraph takes parameters that get inserted
# into the beginning of the graph declaration
# These should be used for formatting
opts = '''splines=False;
node [margin=0 fontcolor=blue fontsize=32 width=0.5 shape=circle style=filled];
'''
print(a2.PrintDotGraph(opts))

scikit-learn's DBSCAN implementation

In [None]:
# Reference run: scikit-learn's DBSCAN with a neighbourhood radius of 0.6 and
# at least 6 samples per neighbourhood.
# min_samples is keyword-only in current scikit-learn, so both parameters are
# passed by name; the positional form raises a TypeError.
d = cluster.DBSCAN(eps=.6, min_samples=6)
d.fit(df)
plt.scatter(df.x, df.y, c=d.labels_, cmap='prism', label='DBSCAN')

Our DBSCAN implementation

In [None]:
# Same clustering, this time with our own DBSCAN implementation (same two arguments).
dbs = dbscan.DBSCAN(.6, 6)
dbs.Fit(df)
# GetLabels requires a size; here that is the number of points, i.e. one label per row.
plt.scatter(df['x'], df['y'], c=dbs.GetLabels(len(df)), cmap='prism', label='DBSCAN')