In [1]:
# Import necessary libraries
import numpy as np  # NumPy is a library for working with large, multi-dimensional arrays and matrices
import matplotlib.pyplot as plt  # Matplotlib is a library for creating visualizations of data
import pandas as pd  # Pandas is a library for working with tabular data
from datetime import datetime  # datetime is a module for working with dates and times
import seaborn as sns  # Seaborn is a library for creating more advanced statistical plots


In [2]:
# Load the seeds data from a CSV file into a Pandas dataframe.
# "seeds.csv" is a relative path, so it is resolved against the kernel's working directory.
# low_memory=False asks pandas to parse the file in one pass rather than in chunks,
# which avoids mixed-type inference within a column.
df = pd.read_csv("seeds.csv", low_memory=False)

In [3]:
# Display the loaded dataframe (pandas abbreviates long frames to their first and last rows)
df

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,groove_length
0,15.26,14.84,0.871,5.763,3.312,2.221,5.220
1,14.88,14.57,0.881,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.895,5.324,3.379,2.259,4.805
4,16.14,14.99,0.903,5.658,3.562,1.355,5.175
...,...,...,...,...,...,...,...
205,12.19,13.20,0.878,5.137,2.981,3.631,4.870
206,11.23,12.88,0.851,5.140,2.795,4.325,5.003
207,13.20,13.66,0.888,5.236,3.232,8.315,5.056
208,11.84,13.21,0.852,5.175,2.836,3.598,5.044


In [4]:
# Import the normalize function from the preprocessing module of scikit-learn
from sklearn.preprocessing import normalize

# Rescale the data with normalize(). With its default arguments (norm="l2", axis=1) this
# scales each ROW (sample) to unit Euclidean length; it does not standardize the columns.
# NOTE(review): if per-feature scaling was the intent, a column-wise scaler is needed instead — confirm.
X = normalize(df)

# Wrap the normalized array in a new dataframe that reuses the original column names
df2 = pd.DataFrame(X, columns=df.columns)


In [5]:
# Display the normalized dataframe built from the output of normalize()
df2

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,groove_length
0,0.662718,0.644478,0.037826,0.250278,0.143835,0.096455,0.226696
1,0.664107,0.650272,0.039320,0.247880,0.148755,0.045434,0.221191
2,0.656884,0.647690,0.041601,0.243217,0.153395,0.124068,0.221796
3,0.649097,0.653788,0.041976,0.249696,0.158475,0.105947,0.225355
4,0.682446,0.633821,0.038181,0.239237,0.150612,0.057293,0.218814
...,...,...,...,...,...,...,...
205,0.612738,0.663507,0.044133,0.258215,0.149842,0.182515,0.244794
206,0.583269,0.668968,0.044200,0.266964,0.145168,0.224634,0.259848
207,0.593783,0.614476,0.039945,0.235534,0.145387,0.374038,0.227437
208,0.600600,0.670095,0.043219,0.262509,0.143860,0.182513,0.255864


In [6]:
# Import the KMeans, DBSCAN, and AffinityPropagation clustering algorithms from scikit-learn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AffinityPropagation

# Fixed seed for the stochastic estimators, so that Restart Kernel -> Run All reproduces
# the same cluster labels and the same evaluation scores on every run.
RANDOM_STATE = 42

# Create a KMeans clustering model with 8 clusters.
# KMeans picks its initial centroids at random, so it is seeded explicitly.
km = KMeans(n_clusters=8, random_state=RANDOM_STATE)

# Create a DBSCAN clustering model with an epsilon value of 0.015 and a minimum number of
# samples per cluster of 2. No seed is passed: DBSCAN has no random_state parameter.
dbscan = DBSCAN(eps=0.015, min_samples=2)

# Create an AffinityPropagation clustering model.
# Its starting state is drawn from a random number generator, so it is seeded as well.
aff = AffinityPropagation(random_state=RANDOM_STATE)


In [7]:
# Fit the KMeans model to the original (un-normalized) data in df
km.fit(df)

# Fit the DBSCAN model to the normalized data in df2
# (note: this is a different input from the one the other two models are fit on)
dbscan.fit(df2)

# Fit the AffinityPropagation model to the original (un-normalized) data in df.
# As the last expression in the cell, the fitted estimator is what the cell displays.
aff.fit(df)


AffinityPropagation()

In [8]:
# Cluster index assigned by the fitted KMeans model to each row of df
km.labels_


array([4, 4, 0, 0, 4, 0, 0, 0, 4, 4, 2, 0, 0, 0, 0, 0, 5, 4, 4, 3, 0, 0,
       4, 7, 4, 4, 7, 7, 0, 0, 7, 2, 0, 0, 4, 4, 4, 2, 0, 5, 0, 0, 7, 2,
       0, 0, 4, 0, 0, 4, 0, 2, 0, 0, 4, 4, 0, 4, 4, 7, 7, 7, 7, 0, 7, 7,
       4, 0, 4, 7, 2, 2, 2, 1, 2, 2, 2, 6, 6, 2, 2, 6, 6, 1, 1, 1, 1, 1,
       6, 6, 1, 1, 1, 6, 6, 2, 1, 1, 1, 1, 2, 1, 6, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 6, 6, 1, 6, 1, 1, 1, 6, 1, 2, 1, 2, 6, 6, 1, 1, 2, 1, 1,
       2, 2, 2, 0, 2, 4, 4, 2, 5, 5, 5, 5, 3, 5, 7, 3, 7, 3, 3, 5, 3, 3,
       3, 5, 3, 3, 3, 3, 7, 3, 3, 3, 5, 7, 3, 3, 3, 3, 5, 5, 3, 3, 3, 3,
       3, 3, 5, 7, 3, 3, 3, 3, 5, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 5, 3,
       7, 7, 5, 7, 3, 5, 3, 3, 3, 5, 3, 5])

In [9]:
# Cluster index assigned by the fitted DBSCAN model to each row of df2;
# the label -1 marks samples that DBSCAN classified as noise
dbscan.labels_

array([ 0,  1,  2,  2,  3,  2,  4,  2,  5,  5,  6,  7,  8,  2,  2, -1, -1,
        3, -1,  9,  2,  2, 10, -1, -1, 10,  8, -1,  2,  8, -1,  2,  8, 11,
        0,  5, -1, 12,  2, -1,  2, 11, 13,  6,  2,  7,  1,  2,  2,  0,  8,
       -1,  8,  4,  7,  0,  2,  1,  0, -1, -1, -1,  8,  9, -1, 13,  7, 11,
        7, 14, 15, 16, 16, 17, 15, 16, 15, 18, -1, 12, -1, -1, 18, 19, 17,
       17, 19, 15, -1, 18, 19, 17, 17, -1, -1, 15, 17, 17, -1, 17, 16, 17,
       20, 17, 17, 17, 17, 15, 17, 19, 17, 17, 17, -1, 18, 17, 20, 17, 17,
       17, -1, 15, 16, 17, -1, 20, -1, 17, 19, -1, 17, 17, -1, 16, -1, -1,
       15, 21, 21, 15, 22, -1, -1, 23, 22, -1, -1, 24, 14, 25, 23, 26, 22,
       27, 22, -1, 27, 22, 22, 28, 14, -1, 22, 24, 26, -1, 22, 27, 22, 27,
       -1, 26, 22, 28, 25, -1, 28, 29, -1, -1, 22, 22, 22, 23, -1, 22, -1,
       22, -1, 28, 28, 22, 27, 29, 24, -1, -1, -1, 14, -1, 23, 13, 22, -1,
        9,  9, 22, -1, 27, 23], dtype=int64)

In [10]:
# Cluster index assigned by the fitted AffinityPropagation model to each row of df
aff.labels_

array([ 1,  1,  0,  0,  1,  0,  0,  0,  1,  1,  0,  1,  0,  0,  0,  0,  8,
        1,  1,  8,  0,  0,  1,  7,  1,  1, 10,  7,  0,  0,  7,  0,  0,  0,
        1,  1,  1,  2,  0,  6,  0,  0,  7,  2,  0,  1,  1,  0,  0,  1,  0,
        2,  0,  0,  1,  1,  0,  1,  1,  7,  7,  7, 10,  8,  7,  7,  1,  0,
        1, 10,  2,  2,  2,  5,  2,  2,  2,  3,  3,  2,  2,  3,  3,  4,  5,
        4,  4,  5,  3,  3,  4,  5,  5,  3,  5,  2,  5,  4,  4,  4,  2,  4,
        5,  5,  5,  4,  5,  2,  5,  4,  4,  5,  4,  3,  3,  4,  5,  5,  5,
        5,  3,  5,  2,  5,  1,  5,  5,  4,  4,  2,  4,  5,  2,  2,  2,  0,
        2,  1,  1,  2,  8,  6,  6,  8,  8,  9,  7,  8, 10,  9,  9,  6,  8,
       10, 10,  9, 10,  8,  8,  9, 10, 10,  8,  8,  9,  7,  8, 10, 10, 10,
        9,  9, 10,  9,  9,  9,  9,  9,  9,  7,  9,  8,  8,  9,  6, 10,  8,
       10,  9,  9,  9, 10, 10,  9, 10,  8,  8,  8, 10,  7,  8,  7, 10,  6,
       10, 10, 10,  6, 10,  8], dtype=int64)

The silhouette score is a measure of how similar an object is to its own cluster compared to other clusters. It ranges from -1 to 1, with a high score indicating that the object is well-matched to its own cluster and poorly-matched to neighboring clusters.

To calculate the silhouette score, you need to have a dataset that has been clustered into groups. Then, for each object in the dataset, you can calculate its silhouette score as follows:

1. Calculate the average distance between the object and all other objects in the same cluster. This is known as the object's intra-cluster distance, $a$.
2. Calculate the average distance between the object and all objects in the nearest neighboring cluster. This is known as the object's inter-cluster distance, $b$.
3. Calculate the silhouette score for the object as the difference between the inter-cluster distance and the intra-cluster distance, divided by the maximum of the two distances: $s = \dfrac{b - a}{\max(a, b)}$.
The silhouette score for the entire dataset is then calculated as the mean of the silhouette scores for all the objects in the dataset.

The silhouette score can be a useful tool for evaluating the results of clustering algorithms and for selecting the optimal number of clusters for a dataset. However, it is important to note that the silhouette score is sensitive to the scale of the distances and is not always reliable for datasets with non-convex shapes or with clusters of very different sizes.

In [11]:
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient of the KMeans labels, measured on df (the data KMeans was fit on)
silhouette_score(df,km.labels_)

0.36119154788878327

A silhouette score of about 0.36 is intermediate, falling between a score of 0 and a score of 1. This could indicate that the clusters in your dataset are somewhat distinct, but there may be some overlap or fuzzy boundaries between the clusters.

It is important to note that the silhouette score is sensitive to the scale of the distances and is not always reliable for datasets with non-convex shapes or with clusters of very different sizes. Therefore, it is important to consider the context and characteristics of your dataset when interpreting the silhouette score.


In [12]:
# Mean silhouette coefficient of the DBSCAN labels, measured on df2 (the data DBSCAN was fit on).
# NOTE(review): dbscan.labels_ contains -1 (noise). silhouette_score treats every distinct label,
# including -1, as a cluster, so all noise points are scored as one group — confirm this is intended.
silhouette_score(df2,dbscan.labels_)

0.051893570770423565

A silhouette score of 0.051893570770423565 is relatively low, indicating that the clusters in your dataset may not be well-separated or distinct. This could mean that there is significant overlap between the clusters or that the objects within a cluster are not similar to each other.

In [13]:
# Mean silhouette coefficient of the AffinityPropagation labels, measured on df
silhouette_score(df,aff.labels_)

0.32408630302078106

The Davies-Bouldin index, also known as the Davies-Bouldin score or DBI, is a measure of the compactness and separation of the clusters in a dataset. It ranges from 0 to infinity, with a low score indicating that the clusters are well-separated and compact, and a high score indicating that the clusters are overlapping or scattered.

To calculate the Davies-Bouldin index, you need to have a dataset that has been clustered into groups. Then, for each cluster, you can calculate the average distance between the objects in the cluster and the centroid of the cluster, which is the mean position of the objects in the cluster. This is known as the scatter of the cluster.

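Then, for each pair of clusters $i$ and $j$, you can calculate a similarity $R_{ij} = (s_i + s_j) / d_{ij}$, where $s_i$ and $s_j$ are the scatters of the two clusters and $d_{ij}$ is the distance between their centroids. For each cluster, take the largest $R_{ij}$ over all other clusters, i.e. the value for its most similar neighbor. The Davies-Bouldin index is then calculated as the mean of these per-cluster maxima.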

The Davies-Bouldin index can be a useful tool for evaluating the results of clustering algorithms and for selecting the optimal number of clusters for a dataset. However, it is sensitive to the scale of the distances and can be affected by the presence of outliers or skewed distributions.

In [14]:
from sklearn.metrics import davies_bouldin_score
# Davies-Bouldin index of the KMeans labels, measured on df (lower is better)
davies_bouldin_score(df,km.labels_)

0.9403321967833498

In [15]:
# Davies-Bouldin index of the DBSCAN labels, measured on df2 (lower is better).
# NOTE(review): the -1 noise label in dbscan.labels_ is passed through as if it were a cluster — confirm this is intended.
davies_bouldin_score(df2,dbscan.labels_)

1.5505483824660646

In [16]:
# Davies-Bouldin index of the AffinityPropagation labels, measured on df (lower is better)
davies_bouldin_score(df,aff.labels_)

0.972372889285975

It is not uncommon for different evaluation metrics to yield different results when assessing the quality of clusters in a dataset. This can be due to the different assumptions and characteristics of the metrics, as well as the context and characteristics of the dataset itself.

In this case, however, the two metrics agree. The k-means result has the highest silhouette score (about 0.36, versus 0.32 for affinity propagation and 0.05 for DBSCAN) and also the lowest Davies-Bouldin index (about 0.94, versus 0.97 for affinity propagation and 1.55 for DBSCAN). Both metrics therefore favor k-means, with affinity propagation a close second and DBSCAN last. Note that the k-means and affinity propagation scores were computed on the original data, while the DBSCAN scores were computed on the normalized data, so the DBSCAN figures are not directly comparable with the other two.

The silhouette score measures the similarity of an object to its own cluster compared to other clusters, and assigns a score ranging from -1 to 1 based on this comparison. A high silhouette score indicates that the object is well-matched to its own cluster and poorly-matched to neighboring clusters.

On the other hand, the Davies-Bouldin index measures the compactness and separation of the clusters by calculating the average distance between the objects in a cluster and the centroid of the cluster, and the distance between the centroids of pairs of clusters. For each cluster it takes the worst-case ratio of combined scatter to centroid distance, and it assigns a score based on the mean of these values, with a low score indicating that the clusters are well-separated and compact, and a high score indicating that the clusters are overlapping or scattered.

Although the silhouette score and the Davies-Bouldin index measure cluster quality in different ways, here they rank the three algorithms in the same order. They will not always agree, so it is important to consider the context and characteristics of your dataset when choosing an evaluation metric, and to use multiple metrics if possible to get a more comprehensive view of the cluster quality.