![](algos.PNG)

##### Centroid-based :
###### cluster represented by a central reference vector which may not be part of the original data eg. KMeans 
##### Hierarchical : 
###### connectivity-based clustering built on the idea that points are more related to nearby points than to points further away, e.g. agglomerative and Birch clustering
##### Distribution-based : 
###### built on statistical distribution models: the objects of a cluster are those most likely to belong to the same distribution, e.g. Gaussian Mixture Models
##### Density-based
###### create clusters from areas which have a higher density of Data points 

###### download the Data from :https://www.kaggle.com/uciml/iris?select=Iris.csv

In [1]:
#importing needed libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [2]:
from sklearn import metrics 

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read the dataset: skip the file's first line and apply our own column names
column_names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
iris_data = pd.read_csv('iris.csv', skiprows=1, names=column_names)
iris_data.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Shuffle the dataset: sampling the whole frame (frac=1) returns its rows in
# random order; the index is then rebuilt, discarding the old one
shuffled_rows = iris_data.sample(frac=1)
iris_data = shuffled_rows.reset_index(drop=True)
iris_data.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.4,3.7,1.5,0.2,Iris-setosa
1,5.8,2.6,4.0,1.2,Iris-versicolor
2,6.0,2.9,4.5,1.5,Iris-versicolor
3,5.2,2.7,3.9,1.4,Iris-versicolor
4,6.9,3.1,5.1,2.3,Iris-virginica


In [6]:
# Convert the class column (target variable) from string to numeric form

from sklearn import preprocessing
label_encod = preprocessing.LabelEncoder()
class_as_text = iris_data['class'].astype(str)
iris_data['class'] = label_encod.fit_transform(class_as_text)
iris_data.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.4,3.7,1.5,0.2,0
1,5.8,2.6,4.0,1.2,1
2,6.0,2.9,4.5,1.5,1
3,5.2,2.7,3.9,1.4,1
4,6.9,3.1,5.1,2.3,2


In [8]:
# Keep only the feature columns by dropping the target column
features = iris_data.drop(columns='class')
features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,5.4,3.7,1.5,0.2
1,5.8,2.6,4.0,1.2
2,6.0,2.9,4.5,1.5
3,5.2,2.7,3.9,1.4
4,6.9,3.1,5.1,2.3


In [10]:
# Store the labels in a separate Series object
labels = iris_data['class']
labels.sample(n=5)

44    0
66    2
59    0
83    1
11    1
Name: class, dtype: int32

In [56]:
# Apply the clustering model we specify to our dataset,
# then score and evaluate that model against the known labels

def build_model(clustering_model, data, labels):
    """Fit a clustering model on ``data`` and print its evaluation scores.

    clustering_model -- callable that receives ``data`` and returns a
                        fitted estimator exposing ``labels_``
    data             -- features passed straight to ``clustering_model``
    labels           -- reference labels the cluster assignments are
                        scored against

    Prints a header row, a separator, and one row holding the
    homogeneity, completeness, V-measure, adjusted Rand and adjusted
    mutual information scores (three decimals, tab separated).
    Returns nothing.
    """
    fitted = clustering_model(data)

    print('homog\tcomp\tv-meas\tARI\tAMI')
    print('-' * 40)

    assigned = fitted.labels_
    # Order matters: it must match the header printed above
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    print('\t'.join('%.3f' % score(labels, assigned) for score in scorers))

In [57]:
# K-means clustering estimator
def k_means(data, n_clusters=3, max_iter=1000):
    """Fit KMeans on ``data`` and return the fitted estimator.

    n_clusters -- number of clusters passed to KMeans
    max_iter   -- iteration limit passed to KMeans
    """
    estimator = KMeans(n_clusters=n_clusters, max_iter=max_iter)
    return estimator.fit(data)

In [58]:
# Score K-means against the true classes
build_model(clustering_model=k_means, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.751	0.765	0.758	0.730	0.755


### Agglomerative Clustering:
***
###### -bottom up  hierarchical clustering technique which recursively merges pairs of clusters starting with single point clusters 
###### -efficient on large datasets with many clusters
###### -computationally intensive
###### -builds a tree representation for data points and merge points close to one another
###### - the default linkage criterion is WARD ; minimizes the variances of clusters being merged 

In [59]:
# Agglomerative clustering estimator
def agglomerative(data, n_clusters=3):
    """Fit AgglomerativeClustering on ``data`` and return the fitted estimator.

    n_clusters -- number of clusters passed to AgglomerativeClustering
    """
    estimator = AgglomerativeClustering(n_clusters=n_clusters)
    return estimator.fit(data)


In [60]:
# Score agglomerative clustering against the true classes
build_model(clustering_model=agglomerative, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.761	0.780	0.770	0.731	0.767


### DBSCAN Clustering
##### Density-Based Spatial Clustering of Applications with Noise
***

###### works well on large datasets for moderate number of clusters
###### DBSCAN focuses on areas where there is a high density of points; the resulting clusters can be of any shape and size
###### points with few near neighbors are marked as outliers
##### Main Parameters to consider for DBSCAN are : 
###### 1. eps : maximum distance between two points for them to be considered neighbors (points closer than this are neighbors)
###### 2. min_samples : number of points to form a dense region  

In [61]:
def dbscan(data, eps=0.45, min_samples=4):
    """Fit DBSCAN on ``data`` and return the fitted estimator.

    eps         -- neighborhood distance passed to DBSCAN
    min_samples -- minimum sample count passed to DBSCAN
    """
    estimator = DBSCAN(eps=eps, min_samples=min_samples)
    return estimator.fit(data)

In [62]:
# Score DBSCAN against the true classes
build_model(clustering_model=dbscan, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.577	0.609	0.593	0.508	0.584


### Mean-Shift Clustering 
***

###### uses a kernel function  applied to each point 
###### no need to specify number of clusters 
###### uses density function to handle even complex non-linear data ( pixels)
###### computationally very intensive ( O(N²) N is number of data points ) 

In [63]:
def mean_shift(data, bandwidth=0.85):
    """Fit MeanShift on ``data`` and return the fitted estimator.

    bandwidth -- bandwidth value passed to MeanShift
    """
    estimator = MeanShift(bandwidth=bandwidth)
    return estimator.fit(data)

In [64]:
# Score mean-shift against the true classes
build_model(clustering_model=mean_shift, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.760	0.772	0.766	0.744	0.763


### BIRCH Clustering
##### Balanced Iterative Reducing and Clustering using Hierarchies
***
###### Consider Birch for Large Dataset with many clusters 
###### Detects and removes outliers 
###### incrementally processes incoming data and updates clusters( Online clustering algorithm )
###### very effective at handling noise and outliers 
###### very memory and time efficient 

In [67]:
def birch(data, n_clusters=3):
    """Fit Birch on ``data`` and return the fitted estimator.

    n_clusters -- number of clusters passed to Birch
    """
    estimator = Birch(n_clusters=n_clusters)
    return estimator.fit(data)

In [68]:
# Score BIRCH against the true classes
build_model(clustering_model=birch, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.778	0.804	0.791	0.732	0.788


### Affinity Propagation Clustering 
***
###### consider when you have a small Dataset with MANY clusters
###### work well with uneven cluster sizes and manifold shapes
###### does not need the number of clusters to be specified

In [70]:
def afp(data, damping=0.6, max_iter=1000):
    """Fit AffinityPropagation on ``data`` and return the fitted estimator.

    damping  -- extent to which the current value is maintained relative
                to the incoming values
    max_iter -- iteration limit passed to AffinityPropagation
    """
    estimator = AffinityPropagation(damping=damping, max_iter=max_iter)
    return estimator.fit(data)

In [71]:
# Score affinity propagation against the true classes
build_model(clustering_model=afp, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.851	0.492	0.623	0.437	0.612


##### the homogeneity score is very high while completeness is relatively low: every cluster contains members of the same class, but the members of one class do not all lie in the same cluster, which is the reason for the low completeness score

 


### Mini-batch K-Means clustering
***
##### K-means clustering on random subsets of data rather than the entire dataset 
###### used for large datasets and moderate number of clusters
###### even cluster sizes and flat surfaces
###### Perform K-means on randomly sample subsets ( mini-batches)

In [76]:
def mini_batch_kmeans(data, n_clusters=3, max_iter=1000):
    """Fit MiniBatchKMeans on ``data`` and return the fitted estimator.

    n_clusters -- number of clusters passed to MiniBatchKMeans
    max_iter   -- iteration limit passed to MiniBatchKMeans

    The mini-batch size is fixed at 20.
    """
    estimator = MiniBatchKMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        batch_size=20,
    )
    return estimator.fit(data)

In [77]:
# Score mini-batch K-means against the true classes
build_model(clustering_model=mini_batch_kmeans, data=features, labels=labels)

homog	comp	v-meas	ARI	AMI
----------------------------------------
0.736	0.747	0.742	0.716	0.739


***

### Spectral Clustering 
##### using Precomputed similarity matrix 

###### -Consider spectral clustering when you have a small dataset and few clusters
###### -Simple to implement, intuitive results, Even cluster size, Fine manifolds
###### - relies on distances between points 
###### -Creates an affinity matrix of input data 
###### -input can be precomputed similarity matrix 
###### -Dimensionality reduction is followed by pairwise similarity measurements
###### -DBSCAN is a special case of spectral clustering

In [1]:
# similarity matrix to feed to the model 
from sklearn.cluster import SpectralClustering

In [2]:
# Similarity levels used to fill the precomputed similarity matrix:
#   SS - Self-Similarity: similarity of a data point with itself
#   IS - Intra-cluster Similarity: between points within the same cluster
#   LS - Low Similarity: between points in different clusters
SS, IS, LS = 1000, 10, 0.01

In [3]:
# Build the similarity matrix for 9 data points:
# one row and one column per data point, each entry telling how similar
# the row's point is to the column's point.
# The points form three consecutive groups of three: the main diagonal
# holds SS, pairs within the same group hold IS, every other pair holds LS.
_group_of_point = [0, 0, 0, 1, 1, 1, 2, 2, 2]
Similarity_matrix = [
    [SS if row == col else (IS if row_group == col_group else LS)
     for col, col_group in enumerate(_group_of_point)]
    for row, row_group in enumerate(_group_of_point)
]

In [4]:
# Instantiate the spectral clustering object and fit it on the matrix above.
# The affinity values are already available (the similarity matrix is the
# input), so affinity = 'precomputed'
spectral_model = SpectralClustering(n_clusters=3, affinity='precomputed')
spectral_model = spectral_model.fit(Similarity_matrix)


In [5]:
# cluster labels that the fitted spectral clustering model assigned to the
# data points
spectral_model.labels_

array([2, 2, 2, 0, 0, 0, 1, 1, 1])

###### we can see that the first 3 points are assigned to the same cluster (label 2), the next 3 points to the cluster labeled 0, and the last 3 points to the cluster labeled 1