In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Clustering technique algorithm Libraries

In [2]:
from sklearn import metrics
#Centroid based cluster
from sklearn.cluster import KMeans
#Hierarchical (maximum connectivity, tree based) cluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
#Density based cluster
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import AffinityPropagation
#Improve the performance of KMeans clustering
from sklearn.cluster import MiniBatchKMeans

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")


# Exploring Data

In [3]:
# Location of the iris CSV file.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of the dataset before running the notebook.
IRIS_CSV_PATH = "C:\\Users\\admin\\Dropbox\\DS\\6.building-clustering-models-scikit-learn\\datasets\\iris.csv"

# Column names applied to the frame; the first line of the file is skipped
# (skiprows=1) and these names are used instead.
IRIS_COLUMNS = ['seplength', 'sepwidth', 'petlength', 'petwidth', 'class']

iris_df = pd.read_csv(IRIS_CSV_PATH, skiprows=1, names=IRIS_COLUMNS)

In [4]:
# Preview the first five rows of the loaded frame.
iris_df.head(n=5)

Unnamed: 0,seplength,sepwidth,petlength,petwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Shuffle all rows (frac=1 keeps every row) and rebuild a clean 0..n-1 index.
# A fixed seed makes the row order the same on every run, so the previews
# shown below can be reproduced.
iris_df=iris_df.sample(frac=1,random_state=42).reset_index(drop=True)

In [6]:
# Preview the first five rows after shuffling.
iris_df.head(n=5)

Unnamed: 0,seplength,sepwidth,petlength,petwidth,class
0,4.4,3.0,1.3,0.2,Iris-setosa
1,6.5,3.0,5.5,1.8,Iris-virginica
2,6.5,3.0,5.2,2.0,Iris-virginica
3,5.8,2.8,5.1,2.4,Iris-virginica
4,6.8,3.2,5.9,2.3,Iris-virginica


In [7]:
# Dimensions of the frame as (number of rows, number of columns).
iris_df.shape

(150, 5)

# Preprocessing data

In [8]:
from sklearn import preprocessing

# Replace the textual class names with integer codes. The column is cast to
# str first so the encoder always receives string values.
label_encoding = preprocessing.LabelEncoder()
iris_df['class'] = label_encoding.fit_transform(
    iris_df['class'].astype(str)
)

# Show the first five encoded labels.
iris_df['class'].head(n=5)

0    0
1    2
2    2
3    2
4    2
Name: class, dtype: int32

# 1. Store only features(4) in iris_df_features. Drop class column

In [9]:
# Keep only the four measurement columns; the class column is left out so
# the clustering models never see the true labels.
iris_df_features = iris_df.drop(columns='class')

# Random preview of five feature rows.
iris_df_features.sample(n=5)

Unnamed: 0,seplength,sepwidth,petlength,petwidth
63,5.1,3.5,1.4,0.3
3,5.8,2.8,5.1,2.4
12,6.8,3.0,5.5,2.1
74,5.2,4.1,1.5,0.1
142,7.2,3.0,5.8,1.6


# 2.Store class separately in iris_df_labels 

In [10]:
# The encoded class column on its own, used as ground truth when scoring.
iris_df_labels = iris_df.loc[:, 'class']

# Random preview of five labels.
iris_df_labels.sample(n=5)

102    1
65     0
68     1
67     2
96     2
Name: class, dtype: int32

# Build & Train Different Clustering Models & Evaluate Scores
# Helper Function

In [11]:
def buildmodel(clustering_model,data,labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``. It must return an object that
        exposes the predicted cluster assignments as ``labels_``.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Ground-truth labels that the predicted clusters are compared with.

    Returns
    -------
    None
        The scores are printed: homogeneity, completeness, V-measure,
        adjusted Rand index, adjusted mutual information and silhouette.
        The silhouette column shows ``nan`` when the score is undefined for
        the clustering that was found.
    """
    model=clustering_model(data)
    predicted=model.labels_

    # The silhouette coefficient is only defined when the number of distinct
    # labels is between 2 and n_samples - 1; outside that range
    # silhouette_score raises ValueError (e.g. a model that puts every point
    # in a single cluster). Report nan instead of aborting the whole table.
    n_found=len(set(predicted))
    if 2 <= n_found <= len(predicted) - 1:
        silhouette=metrics.silhouette_score(data, predicted)
    else:
        silhouette=float('nan')

    print('homo \t compl \t v-measure \t ARI \t AMI \t silhouette')
    print(50 * '-')

    print('%.3f \t %.3f \t %.3f \t %.3f \t %.3f \t %.3f'
          %(metrics.homogeneity_score(labels, predicted),
            metrics.completeness_score(labels, predicted),
            metrics.v_measure_score(labels, predicted),
            metrics.adjusted_rand_score(labels, predicted),
            metrics.adjusted_mutual_info_score(labels, predicted),
            silhouette))

# Reusable helper function setup

# 1.KMeans Function

In [22]:
def k_means(data, n_clusters=3, max_iter=1000, random_state=None):
    """Fit a KMeans model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of clusters passed to ``KMeans``.
    max_iter : int, default 1000
        Iteration limit passed to ``KMeans``.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. ``None`` keeps the unseeded behaviour;
        pass an integer to get the same clustering on every run.

    Returns
    -------
    KMeans
        The fitted estimator.
    """
    model = KMeans(n_clusters=n_clusters,
                   max_iter=max_iter,
                   random_state=random_state).fit(data)

    return model

# Build & Train model

In [13]:
# Score K-Means (with the defaults of k_means) against the true classes.
buildmodel(clustering_model=k_means,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.751 	 0.765 	 0.758 	 0.730 	 0.748 	 0.553


# 2. Agglomerative Function

In [14]:
def agglomerative(data,n_clusters=3):
    """Fit AgglomerativeClustering on ``data`` and return the fitted estimator.

    ``n_clusters`` is the number of clusters requested from the estimator.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return AgglomerativeClustering(n_clusters=n_clusters).fit(data)

# Build & Train model

In [15]:
# Score agglomerative clustering (with the defaults of agglomerative).
buildmodel(clustering_model=agglomerative,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.761 	 0.780 	 0.770 	 0.731 	 0.758 	 0.554


# 3. DBSCAN - Density-Based Spatial Clustering of Applications with Noise

In [16]:
def dbscan(data,eps=0.45,min_samples=4):
    """Fit DBSCAN on ``data`` and return the fitted estimator.

    ``eps`` and ``min_samples`` are forwarded unchanged to ``DBSCAN``.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return DBSCAN(eps=eps,min_samples=min_samples).fit(data)

# Build & Train model

In [17]:
# Score DBSCAN (with the defaults of dbscan) against the true classes.
buildmodel(clustering_model=dbscan,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.577 	 0.609 	 0.593 	 0.508 	 0.569 	 0.372


# 3. Mean shift clustering

In [18]:
def meanshift_fn(data,bandwidth=0.85):
    """Fit MeanShift on ``data`` and return the fitted estimator.

    ``bandwidth`` is forwarded unchanged to ``MeanShift``.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return MeanShift(bandwidth=bandwidth).fit(data)
    

# Build & Train model

In [19]:
# Score mean shift (with the defaults of meanshift_fn) against the true classes.
buildmodel(clustering_model=meanshift_fn,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.760 	 0.772 	 0.766 	 0.744 	 0.757 	 0.551


# 4. BIRCH clustering - Balanced Iterative Reducing and Clustering using Hierarchies

In [20]:
def birch_fn(data,n_clusters=3):
    """Fit Birch on ``data`` and return the fitted estimator.

    ``n_clusters`` is forwarded unchanged to ``Birch``.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return Birch(n_clusters=n_clusters).fit(data)

# Build and Train Model

In [21]:
# Score BIRCH (with the defaults of birch_fn) against the true classes.
buildmodel(clustering_model=birch_fn,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.738 	 0.782 	 0.759 	 0.674 	 0.735 	 0.545


# 5. Affinity Propagation Clustering

In [28]:
def affinity_propagation(data,damping=0.6,max_iter=1000):
    """Fit AffinityPropagation on ``data`` and return the fitted estimator.

    ``damping`` and ``max_iter`` are forwarded unchanged to
    ``AffinityPropagation``.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return AffinityPropagation(damping=damping,max_iter=max_iter).fit(data)

# Build and Train Model

In [29]:
# Score affinity propagation (with the defaults of affinity_propagation).
buildmodel(clustering_model=affinity_propagation,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.851 	 0.492 	 0.623 	 0.437 	 0.480 	 0.349


# 6.Mini Batch KMeans Clustering

In [30]:
def mini_batch_kmeans(data,n_clusters=3,max_iter=1000,batch_size=20,random_state=None):
    """Fit MiniBatchKMeans on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of clusters passed to ``MiniBatchKMeans``.
    max_iter : int, default 1000
        Iteration limit passed to ``MiniBatchKMeans``.
    batch_size : int, default 20
        Mini-batch size passed to ``MiniBatchKMeans``. The default is the
        value that used to be hard-coded here.
    random_state : int or None, default None
        Seed forwarded to ``MiniBatchKMeans``. ``None`` keeps the unseeded
        behaviour; pass an integer to get the same clustering on every run.

    Returns
    -------
    MiniBatchKMeans
        The fitted estimator.
    """
    model=MiniBatchKMeans(n_clusters=n_clusters,
                          max_iter=max_iter,
                          batch_size=batch_size,
                          random_state=random_state)
    model.fit(data)
    return model

In [31]:
# Score mini-batch K-Means (with the defaults of mini_batch_kmeans).
buildmodel(clustering_model=mini_batch_kmeans,
           data=iris_df_features,
           labels=iris_df_labels)

homo 	 compl 	 v-means 	 ARI 	 AMI 	 silhouette
--------------------------------------------------
0.736 	 0.747 	 0.742 	 0.716 	 0.733 	 0.551


# 7. Spectral Clustering

In [32]:
from sklearn.cluster import SpectralClustering

In [40]:
# Self-similarity: the similarity value of a data point with itself.
# NOTE(review): presumably an entry of a similarity matrix for the spectral
# clustering section; the use is not visible in this part of the notebook.
SS=1000

In [41]:
# Intra-cluster similarity: the similarity value for two points that belong
# to the same cluster.
IS=10

In [42]:
# Low similarity: the similarity value for two points that lie in different
# clusters.
LS=0.01