<a href="https://colab.research.google.com/github/cagBRT/Clustering-Intro/blob/master/Intro_to_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Introduction to Clustering**

An overview of the following clustering techniques:

- Affinity Propagation
- Agglomerative Clustering
- BIRCH
- DBSCAN
- K-Means
- Mini-Batch K-Means
- Mean Shift
- OPTICS
- Spectral Clustering
- Gaussian Mixture Model

Each algorithm offers a different approach to the challenge of discovering natural groups in data.

There is no best clustering algorithm, and no easy way to find the best algorithm for your data without using controlled experiments.

In [0]:
# Optional setup: uncomment the next line to install scikit-learn if the
# import below fails in this environment.
#!pip install scikit-learn
# Import the package and print its version string so the environment that
# produced the notebook's outputs is recorded in the cell output.
import sklearn
print(sklearn.__version__)

In [0]:
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification

**Create a Synthetic Dataset**<br>
The dataset has two distinct clusters. <br>

Can the clustering algorithms identify the two clusters?


In [0]:
# Build the synthetic dataset that the clustering cells in this notebook reuse:
# 1000 samples, two informative features, one cluster per class, and a fixed
# seed so the same points are generated on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=4,
)
# Draw one scatter series per class label (0, then 1).
for class_value in (0, 1):
    # boolean mask selecting the rows that carry this class label
    in_class = y == class_value
    pyplot.scatter(X[in_class, 0], X[in_class, 1])
# render the figure
pyplot.show()

**Plot function**<br>
A helper that draws one scatter series per cluster label.

In [0]:
def plot_function(clusters, yhat,X):
  """Add one scatter series per cluster label to the current pyplot figure.

  Args:
    clusters: iterable of cluster labels to draw, one scatter call per label.
    yhat: per-sample cluster labels, compared element-wise against each label
      (assumed to be a NumPy array aligned row-for-row with X -- confirm
      against callers).
    X: 2-D sample array; column 0 is plotted on the x-axis and column 1 on
      the y-axis.

  Returns:
    None. The function only issues pyplot.scatter calls; the caller is
    responsible for calling pyplot.show().
  """
  for cluster in clusters:
    # Boolean mask of the samples assigned to this cluster. Indexing with the
    # mask yields plain 1-D coordinate arrays; indexing with the tuple that
    # where() returns yields (1, n) 2-D arrays instead.
    in_cluster = yhat == cluster
    pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])

K-Means

In [0]:
# k-means clustering
from sklearn.cluster import KMeans
# X is the synthetic dataset created in the dataset cell above.
# define the model: two clusters, matching the two clusters in the dataset.
# random_state fixes the centroid initialisation so a fresh
# "Restart & Run All" reproduces the same figure; n_init is passed explicitly
# because its default differs between scikit-learn releases.
model = KMeans(n_clusters=2, n_init=10, random_state=4)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()

Mini-Batch K-Means

In [0]:
# mini-batch k-means clustering
from sklearn.cluster import MiniBatchKMeans
# X is the synthetic dataset created in the dataset cell above.
# define the model: two clusters, matching the two clusters in the dataset.
# random_state fixes both the centroid initialisation and the mini-batch
# sampling so the figure is reproducible; n_init is passed explicitly because
# its default differs between scikit-learn releases.
model = MiniBatchKMeans(n_clusters=2, n_init=3, random_state=4)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()

Gaussian Mixture Model

In [0]:
# gaussian mixture clustering
from sklearn.mixture import GaussianMixture
# X is the synthetic dataset created in the dataset cell above.
# define the model: two mixture components, matching the two clusters in the
# dataset. random_state fixes the parameter initialisation so the fitted
# components (and the colours in the figure) are the same on every run.
model = GaussianMixture(n_components=2, random_state=4)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()

Birch

In [0]:
# BIRCH clustering
from sklearn.cluster import Birch

# Reuses the X array built in the dataset cell.
# Build the estimator with the notebook's chosen threshold and a target of
# two clusters, then fit it (fit returns the estimator itself).
model = Birch(
    threshold=0.01,
    n_clusters=2,
).fit(X)
# Label every sample with the fitted model.
yhat = model.predict(X)
# Distinct labels present in the prediction.
clusters = unique(yhat)
# One scatter series per label, then render the figure.
plot_function(clusters=clusters, yhat=yhat, X=X)
pyplot.show()

Affinity Propagation

In [0]:
# affinity propagation clustering
from sklearn.cluster import AffinityPropagation

# X is the synthetic dataset created in the dataset cell above.
# damping is the one hyperparameter this notebook tunes; try values from 0.5
# up to (but not including) 1. For reference, an older constructor signature:
#   AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15,
#                       copy=True, preference=None, affinity='euclidean',
#                       verbose=False)
# random_state fixes the algorithm's random starting state so the figure is
# reproducible across runs.
model = AffinityPropagation(damping=0.9, random_state=4)

# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()

Agglomerative Clustering

In [0]:
# Agglomerative (hierarchical) clustering
from sklearn.cluster import AgglomerativeClustering

# Reuses the X array built in the dataset cell.
# Ask for two clusters; every other setting is left at the library default.
model = AgglomerativeClustering(
    n_clusters=2,
)
# Fit and label the samples in a single call.
yhat = model.fit_predict(X)
# Distinct labels present in the result.
clusters = unique(yhat)
# One scatter series per label, then render the figure.
plot_function(clusters=clusters, yhat=yhat, X=X)
pyplot.show()

DBSCAN

In [0]:
# DBSCAN clustering
from sklearn.cluster import DBSCAN

# Reuses the X array built in the dataset cell.
# Configure the estimator with the notebook's chosen neighbourhood radius
# (eps) and minimum neighbourhood size (min_samples).
model = DBSCAN(
    eps=0.30,
    min_samples=9,
)
# Fit and label the samples in a single call.
yhat = model.fit_predict(X)
# Distinct labels present in the result.
clusters = unique(yhat)
# One scatter series per label, then render the figure.
plot_function(clusters=clusters, yhat=yhat, X=X)
pyplot.show()

Mean Shift

In [0]:
# Mean Shift clustering
from sklearn.cluster import MeanShift

# Reuses the X array built in the dataset cell.
# No arguments are passed, so every setting is the library default.
model = MeanShift()
# Fit and label the samples in a single call.
yhat = model.fit_predict(X)
# Distinct labels present in the result.
clusters = unique(yhat)
# One scatter series per label, then render the figure.
plot_function(clusters=clusters, yhat=yhat, X=X)
pyplot.show()

OPTICS

In [0]:
# optics clustering
from sklearn.cluster import OPTICS
# X is the synthetic dataset created in the dataset cell above.
# define the model.
# This cell previously also passed eps=0.8, but OPTICS only reads eps when
# cluster_method="dbscan"; under the default "xi" extraction the value was
# silently ignored. The no-op argument is removed and the extraction method is
# spelled out, so the call shows what actually drives the result (the labels
# are unchanged). To cluster with a distance cut-off instead, use
# OPTICS(min_samples=10, cluster_method="dbscan", eps=0.8).
model = OPTICS(min_samples=10, cluster_method="xi")
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()


Spectral Clustering

In [0]:
# spectral clustering
from sklearn.cluster import SpectralClustering
# X is the synthetic dataset created in the dataset cell above.
# define the model: two clusters, matching the two clusters in the dataset.
# random_state seeds the stochastic steps of the algorithm so the labels
# (and the colours in the figure) are the same on every run.
model = SpectralClustering(n_clusters=2, random_state=4)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
plot_function(clusters, yhat,X)
# show the plot
pyplot.show()