In [1]:
import pandas as pd
from sklearn.metrics import silhouette_score

In [2]:
# Read the modelling dataset from its relative path into a DataFrame.
# The cells below expect it to contain one cluster-label column per algorithm.
df = pd.read_csv(filepath_or_buffer="data/model.csv")

In [3]:
# Feature matrix: every column of df except the five cluster-assignment
# columns (one per algorithm evaluated below).
cluster_label_columns = [
    'Kmeans_cluster',
    'AG_Cluster',
    'DBSCAN_Cluster',
    'GMM_Cluster',
    'kmedoids_Cluster',
]
X = df.drop(columns=cluster_label_columns)

### Evaluating silhouette score for all the models

In [4]:
# Silhouette score of the KMeans assignment, computed over the feature matrix X
kmeans_labels = df['Kmeans_cluster']
kmeans_score = silhouette_score(X, kmeans_labels)

In [5]:
# Silhouette score of the Agglomerative assignment, computed over X
aglomerative_score = silhouette_score(X, labels=df.loc[:, 'AG_Cluster'])

In [6]:
# Silhouette score of the DBSCAN assignment.
# Rows labelled -1 (noise) are filtered out of both the features and the
# labels, so the score is computed on the clustered points only.
dbscan_labels = df['DBSCAN_Cluster']
mask = dbscan_labels.ne(-1)
dbscan_score = silhouette_score(X.loc[mask], dbscan_labels.loc[mask])

In [7]:
# Silhouette score of the GMM assignment, computed over X
gmm_score = silhouette_score(X=X, labels=df['GMM_Cluster'])

In [8]:
# Silhouette score of the K-Medoids assignment, computed over X
kmedoids_labels = df.loc[:, 'kmedoids_Cluster']
kmedoids_score = silhouette_score(X, kmedoids_labels)

### Counting the clusters formed

In [9]:
# Number of distinct labels for the four algorithms that have no noise label.
kmeans_clusters, aglomerative_clusters, gmm_clusters, kmedoids_clusters = (
    df[label_column].nunique()
    for label_column in ('Kmeans_cluster', 'AG_Cluster', 'GMM_Cluster', 'kmedoids_Cluster')
)

# DBSCAN marks noise points with the label -1. That label is not a real
# cluster, so it is excluded from the count of clusters formed.
dbscan_labels = df['DBSCAN_Cluster']
is_noise = dbscan_labels == -1
# .any() is a boolean that counts as 1 when a noise label exists, else 0.
dbscan_clusters = len(dbscan_labels.unique()) - is_noise.any()

# How many points DBSCAN labelled as noise.
noise_points = is_noise.sum()

### Comparing the scores

In [10]:
# Summary table: one row per clustering algorithm, with its silhouette score,
# the number of clusters it formed, and (for DBSCAN only) the noise-point count.
evaluation = pd.DataFrame({
    'Algorithms': ['KMeans', 'Agglomerative', 'DBSCAN', 'GMM', 'K-Medoids'],
    'Silhouette Score': [kmeans_score, aglomerative_score, dbscan_score, gmm_score, kmedoids_score],
    'Clusters Formed': [kmeans_clusters, aglomerative_clusters, dbscan_clusters, gmm_clusters, kmedoids_clusters],
    # Only DBSCAN has a noise label, so the other rows are missing values.
    # A nullable Int64 column keeps the count an integer and the column numeric,
    # instead of mixing 'N/A' strings with a number in an object column.
    'Noise Points (DBSCAN)': pd.array(
        [pd.NA, pd.NA, noise_points, pd.NA, pd.NA], dtype='Int64'
    ),
})
evaluation

Unnamed: 0,Algorithms,Silhouette Score,Clusters Formed,Noise Points (DBSCAN)
0,KMeans,0.227152,4,
1,Agglomerative,0.501186,3,
2,DBSCAN,0.262103,3,622.0
3,GMM,0.227152,4,
4,K-Moedoids,0.218994,4,


Although K-Means is commonly used, Agglomerative clustering produced the highest silhouette score for this dataset.

This suggests that the world development indicators are better described by a hierarchical grouping than by the roughly spherical clusters that K-Means assumes.

Therefore, Agglomerative clustering better captures the natural grouping of countries.