<a href="https://colab.research.google.com/github/anandaditya07/ML_LAB/blob/main/Experiment_11_Clustering_Analysis_with_DBSCAN_and_PCA_Evaluated_by_Silhouette_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Experiment 11 : Clustering Analysis with DBSCAN & PCA: Evaluated by Silhouette Score**



In [11]:
import pandas as pd
from matplotlib import pyplot as plt
import hashlib
import numpy as np
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

In [12]:
data = pd.read_csv('/content/sample_data/california_housing_train.csv') # Replaced with a sample dataset to resolve FileNotFoundError. Please upload 'bank_marketing_dataset.csv' if you wish to use the original data.
len_data = len(data)
# data = data.drop(columns=['duration']) # Column 'duration' is specific to 'bank_marketing_dataset.csv' and not present in the sample dataset.
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
for col in data.columns:
  if data[col].isna().any():
    mode = data[col].mode().iloc[0]
    print(f"{col} contains NAs, replacing with {mode}")
    data[col] = data[col].fillna(mode)

In [16]:
# The following code attempts to process columns that are not present in the current dataset (california_housing_train.csv).
# These columns ('default', 'housing', 'loan', 'subscribed') were part of the original 'bank_marketing_dataset.csv'.
# Commenting out this code to resolve the KeyError.

# for bin_col in ['default', 'housing', 'loan', 'subscribed']:
#     data[bin_col+'_bin'] = [1 if y == 'yes' else 0 for y in data[bin_col].values]
# data = data.drop(columns=['default', 'housing', 'loan', 'subscribed'])


In [17]:
cols_to_drop = []

for col in data.columns:
  if data[col].dtype == 'object':
    print(f'Column {col} has {data[col].nunique()} values among {len_data}')

    if data[col].nunique() < 25:
      print(f'One-hot encoding of {col}')
      one_hot_cols = pd.get_dummies(data[col])
      for ohc in one_hot_cols.columns:
        data[col + '_' + ohc] = one_hot_cols[ohc]
    else:
      print(f'Hashing of {col}')
      data[col + '_hash'] = data[col].apply(lambda row: int(hashlib.sha1((col + "_" + str(row)).encode('utf-8')).hexdigest(), 16) % len_data)

    cols_to_drop.append(col)

data = data.drop(columns=cols_to_drop)

In [18]:
corr = data.corr()
corr_top = corr.abs().unstack().sort_values(kind='quicksort')
corr_top = corr_top[corr_top > 0.9][corr_top < 1]

cols_to_drop = [corr_top.index[i][0] for i in range(0, len(corr_top), 2)]
print(f"Highly correlated features: {cols_to_drop}")
data = data.drop(columns=cols_to_drop)

Highly correlated features: ['population', 'households', 'latitude', 'total_rooms', 'total_bedrooms']


In [19]:
ss = StandardScaler()
data_scaled = pd.DataFrame(columns=data.columns,
                               data=ss.fit_transform(data))

In [20]:
data_scaled

Unnamed: 0,longitude,housing_median_age,median_income,median_house_value
0,2.619365,-1.079671,-1.252543,-1.210558
1,2.539569,-0.761872,-1.081483,-1.096745
2,2.494683,-0.920772,-1.170105,-1.048461
3,2.489696,-1.159121,-0.362600,-1.154514
4,2.489696,-0.682422,-1.026454,-1.222629
...,...,...,...,...
16995,-2.342963,1.859971,-0.799999,-0.826872
16996,-2.347950,0.588774,-0.715727,-1.106230
16997,-2.362912,-0.920772,-0.446663,-0.894125
16998,-2.362912,-0.761872,-0.997787,-1.047599


In [21]:
max_silhouette = -1 # maximum silhouette coefficient that we hsve reached
tol_iters = 0 # number of iterations during which the metric improvement didn't happen
algos = ['KMeans', 'DBSCAN'] # algo names (for output)
best_algo = '' # best algo name (for output)
best_k = 0 # best number of dimensions
early_stopping_iters = 5 # max number of iterations with no improvement

for k in range(2, data_scaled.shape[1], 2):
    print(f"PCA for {k} dimensions...")
    data_pca = PCA(n_components=k, random_state=42).fit_transform(data_scaled)
    print(f"Clustering with {k} dimensions and 2 clusters...")
    kmc_model = KMeans(n_clusters=2, random_state=42).fit(data_pca)
    print("KMeans fitted...")
    db_model = DBSCAN(n_jobs=-1).fit(data_pca)
    print("DBSCAN fitted...")

    kmc_sil = silhouette_score(data_pca, kmc_model.labels_, metric='euclidean', random_state=42)
    db_sil = silhouette_score(data_pca, db_model.labels_, metric='euclidean', random_state=42)

    no_improvement = True

    for score, i in zip([kmc_sil, db_sil], range(len(algos))):
        if score > max_silhouette:
            print(f"New max score for 2 clusters and {k} dimensions is {round(score, 5)} with {algos[i]}")
            no_improvement = False
            max_silhouette = score
            best_algo = algos[i]
            best_k = k
            tol_iters = 0

    if no_improvement:
        print(f"No improvement for {k} dimensions")
        tol_iters += 1

    if tol_iters == early_stopping_iters:
        print(f"Early stopping: {early_stopping_iters} iterations without improvement")
        break

print("***")
print(f"The final score is {round(max_silhouette, 5)} with {best_k} dimensions and {best_algo}")

PCA for 2 dimensions...
Clustering with 2 dimensions and 2 clusters...
KMeans fitted...
DBSCAN fitted...
New max score for 2 clusters and 2 dimensions is 0.40608 with KMeans
New max score for 2 clusters and 2 dimensions is 0.6454 with DBSCAN
***
The final score is 0.6454 with 2 dimensions and DBSCAN
