In [1]:
import pandas as pd
import numpy as np

rfm = pd.read_csv("../data/processed/rfm_features.csv", index_col=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,UniqueProducts,TotalQuantity,AvgOrderValue,Cluster
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346.0,326,0.693147,11.253955,0.693147,11.214735,11.253955,1
12347.0,2,2.079442,8.368925,4.644391,7.80751,3.206047,1
12348.0,75,1.609438,7.494564,3.135494,7.758761,4.077122,1
12349.0,19,0.693147,7.472245,4.304065,6.448889,3.221912,2
12350.0,310,0.693147,5.815324,2.890372,5.288267,3.028712,0


In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [3]:
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels_km = kmeans.fit_predict(rfm_scaled)

sil_km = silhouette_score(rfm_scaled, labels_km)
db_km = davies_bouldin_score(rfm_scaled, labels_km)

print("KMeans Silhouette:", sil_km)
print("KMeans DB Index:", db_km)

KMeans Silhouette: 0.38251802985619193
KMeans DB Index: 0.9813803315998918


In [4]:
hier = AgglomerativeClustering(n_clusters=3)
labels_hier = hier.fit_predict(rfm_scaled)

sil_hier = silhouette_score(rfm_scaled, labels_hier)
db_hier = davies_bouldin_score(rfm_scaled, labels_hier)

print("Hierarchical Silhouette:", sil_hier)
print("Hierarchical DB Index:", db_hier)

Hierarchical Silhouette: 0.37850722065722786
Hierarchical DB Index: 0.9643154461628921


In [5]:
dbscan = DBSCAN(eps=0.8, min_samples=5)
labels_db = dbscan.fit_predict(rfm_scaled)

# Remove noise (-1) for evaluation
mask = labels_db != -1

if len(set(labels_db[mask])) > 1:
    sil_db = silhouette_score(rfm_scaled[mask], labels_db[mask])
    db_db = davies_bouldin_score(rfm_scaled[mask], labels_db[mask])
else:
    sil_db = -1
    db_db = -1

print("DBSCAN Silhouette:", sil_db)
print("DBSCAN DB Index:", db_db)

DBSCAN Silhouette: 0.37222656416727706
DBSCAN DB Index: 0.753579469452091


In [6]:
gmm = GaussianMixture(n_components=3, random_state=42)
labels_gmm = gmm.fit_predict(rfm_scaled)

sil_gmm = silhouette_score(rfm_scaled, labels_gmm)
db_gmm = davies_bouldin_score(rfm_scaled, labels_gmm)

print("GMM Silhouette:", sil_gmm)
print("GMM DB Index:", db_gmm)

GMM Silhouette: 0.3907056624207143
GMM DB Index: 0.9706527326147493


In [7]:
comparison = pd.DataFrame({
    "Algorithm": ["KMeans", "Hierarchical", "DBSCAN", "GMM"],
    "Silhouette Score": [sil_km, sil_hier, sil_db, sil_gmm],
    "Davies-Bouldin Index": [db_km, db_hier, db_db, db_gmm]
})

comparison

Unnamed: 0,Algorithm,Silhouette Score,Davies-Bouldin Index
0,KMeans,0.382518,0.98138
1,Hierarchical,0.378507,0.964315
2,DBSCAN,0.372227,0.753579
3,GMM,0.390706,0.970653
