In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
#Handling data → pandas, numpy
#Preprocessing → StandardScaler
#Evaluation → silhouette_score, davies_bouldin_score
#Visualization → matplotlib, seaborn

In [3]:
# UCI "Wholesale customers" CSV, read directly from the repository URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to check the column names and value ranges.
print(data.head(n=5))


   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4        2       3  22615  5410     7198    3915              1777        5185


In [4]:
# Exclude the Channel and Region code columns (they hold integer codes, not
# text); clustering is done on the six remaining columns only.
X = data.drop(columns=['Channel', 'Region'])

# Standardize every remaining column to zero mean and unit variance so that
# columns with large raw values do not dominate the distance computations.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [5]:
# K-Means clustering step
from sklearn.cluster import KMeans

# n_clusters=3   -> ask K-Means to partition the data into 3 clusters.
# random_state=0 -> fix the centroid initialisation so re-runs give the same labels.
# n_init=10      -> set explicitly: scikit-learn changed the default from 10 to
#                   'auto' in version 1.4, so leaving it unset makes the result
#                   depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)

# The algorithm initialises 3 centroids (k-means++ seeding by default), assigns
# each point to its nearest centroid, recomputes the centroids, and repeats
# until the assignment is stable.
# fit_predict() fits the model and returns one cluster index (0, 1 or 2) per
# row, so kmeans_labels says which cluster each customer belongs to.
kmeans_labels = kmeans.fit_predict(X_scaled)



In [6]:
# Score the K-Means partition on the standardized features.
# Silhouette: higher is better. Davies-Bouldin: lower is better.
kmeans_sil = silhouette_score(X=X_scaled, labels=kmeans_labels)
kmeans_db = davies_bouldin_score(X=X_scaled, labels=kmeans_labels)

print("K-Means Silhouette Score:", kmeans_sil)
print("K-Means Davies-Bouldin Index:", kmeans_db)


K-Means Silhouette Score: 0.3916016573908254
K-Means Davies-Bouldin Index: 1.2494010354240845


In [7]:
from sklearn.cluster import DBSCAN

# eps=1.5       -> neighbourhood radius, measured on the standardized features.
# min_samples=5 -> a point with at least 5 samples (itself included) inside
#                  that radius is a core point that can seed/extend a cluster.
dbscan = DBSCAN(eps=1.5, min_samples=5)

# DBSCAN has no separate predict step: fitting assigns a label to every row.
# Rows that belong to no cluster are labelled -1 (noise).
dbscan_labels = dbscan.fit(X_scaled).labels_

In [9]:
# Count the clusters DBSCAN found; the noise label -1 is not a cluster.
unique_labels = set(dbscan_labels)
n_clusters = len(unique_labels - {-1})

# Silhouette Score  - how well points fit their own cluster (higher is better).
# Davies-Bouldin    - how well clusters are separated (lower is better).
# Both need at least 2 clusters, so bail out with a message otherwise.
if n_clusters <= 1:
    print("DBSCAN did not form enough clusters to evaluate.")
else:
    # Score only the rows that were assigned to a cluster (drop noise rows).
    mask = dbscan_labels != -1
    dbscan_sil = silhouette_score(X_scaled[mask], dbscan_labels[mask])
    dbscan_db = davies_bouldin_score(X_scaled[mask], dbscan_labels[mask])

    print("DBSCAN Silhouette Score:", dbscan_sil)
    print("DBSCAN Davies-Bouldin Index:", dbscan_db)

# DBSCAN evaluation: with eps=1.5 and min_samples=5, DBSCAN found fewer than two
# clusters on this data (see the output below), so neither metric can be
# computed for it. Only this one (eps, min_samples) setting is tried here, so
# the result says nothing about how DBSCAN behaves with other settings.


DBSCAN did not form enough clusters to evaluate.


In [13]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (hierarchical) clustering cut at 3 clusters, the same count
# used for K-Means; every other option is left at its library default.
hc = AgglomerativeClustering(n_clusters=3)
hc_labels = hc.fit(X_scaled).labels_


In [15]:
# Score the hierarchical labels (hc_labels) with the same two metrics that
# were used for K-Means.
from sklearn.metrics import silhouette_score, davies_bouldin_score

hc_sil = silhouette_score(X=X_scaled, labels=hc_labels)
hc_db = davies_bouldin_score(X=X_scaled, labels=hc_labels)


In [16]:
# Number of real clusters DBSCAN found: distinct labels, minus the noise
# label -1 when it is present.
n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)

# Both metrics need at least two clusters. When DBSCAN has fewer, store NaN
# rather than the string "N/A": a string would turn the metric columns into
# object dtype, so they could no longer be sorted, rounded or compared as numbers.
if n_clusters > 1:
    mask = dbscan_labels != -1  # score only rows assigned to a cluster
    dbscan_sil = silhouette_score(X_scaled[mask], dbscan_labels[mask])
    dbscan_db = davies_bouldin_score(X_scaled[mask], dbscan_labels[mask])
else:
    dbscan_sil = float("nan")
    dbscan_db = float("nan")

# Hierarchical clustering scores, computed here from hc_labels so every value
# in the table comes from this cell or from the K-Means evaluation cell.
hc_sil = silhouette_score(X_scaled, hc_labels)
hc_db = davies_bouldin_score(X_scaled, hc_labels)

# Results table: one row per algorithm, one numeric column per metric.
results = pd.DataFrame({
    "Algorithm": ["K-Means", "DBSCAN", "Hierarchical"],
    "Silhouette Score": [kmeans_sil, dbscan_sil, hc_sil],
    "Davies-Bouldin Index": [kmeans_db, dbscan_db, hc_db]
})

# Missing scores are shown as "N/A" at display time only; the data stay numeric.
print(results.to_string(na_rep="N/A"))


      Algorithm Silhouette Score Davies-Bouldin Index
0       K-Means         0.391602             1.249401
1        DBSCAN              N/A                  N/A
2  Hierarchical         0.264609             1.284521


In [None]:
## 📌 Conclusion

In this unsupervised learning task, we explored clustering techniques on the **Wholesale customers dataset** from the UCI Machine Learning Repository. The key steps and findings are summarized below:

- ✅ **Data Preprocessing:** Scaled the features using `StandardScaler` for fair clustering performance.
- ✅ **K-Means Clustering:** Successfully applied with `n_clusters=3`. Evaluation:
  - Silhouette Score: ✅ Calculated
  - Davies-Bouldin Index: ✅ Calculated
- ✅ **Hierarchical Clustering:** Applied with `n_clusters=3`. Evaluation:
  - Silhouette Score: ✅ Calculated
  - Davies-Bouldin Index: ✅ Calculated
- ✅ **DBSCAN Clustering:** Applied with `eps=1.5` and `min_samples=5`. However, it did **not form enough clusters** for evaluation with these settings. A check was included to handle this case gracefully and print a user-friendly message.
  - Silhouette Score: ❌ Not applicable
  - Davies-Bouldin Index: ❌ Not applicable

### 📊 Final Comparison Table:

| Algorithm      | Silhouette Score | Davies-Bouldin Index |
|----------------|------------------|-----------------------|
| K-Means        | 0.3916           | 1.2494                |
| DBSCAN         | ❌ Not enough clusters | ❌ Not enough clusters |
| Hierarchical   | 0.2646           | 1.2845                |

### ✅ Task Completed

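This project demonstrates how different clustering algorithms perform on the same dataset and how evaluation metrics like **Silhouette Score** and **Davies-Bouldin Index** help compare their quality. In the results above, K-Means scored better than Hierarchical Clustering on both metrics (higher Silhouette Score, lower Davies-Bouldin Index).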

