## **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**

In [23]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [1]:
## Function to tune the DBSCAN model by changing the epsilon and min_samples values
def tune_dbscan(data, eps_values=None, min_samples_values=None, exclude_noise=False):
    """Grid-search DBSCAN over ``eps`` and ``min_samples`` and return the best row.

    One DBSCAN model is fitted for every (eps, min_samples) pair, so the cost
    grows with the product of the two grid sizes.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix handed to ``DBSCAN.fit`` and ``silhouette_score``.
    eps_values : iterable of float, optional
        Candidate neighbourhood radii. Defaults to ``np.arange(.1, 10, .1)``.
    min_samples_values : iterable of int, optional
        Candidate ``min_samples`` values. Defaults to ``np.arange(2, 10, 1)``.
    exclude_noise : bool, default False
        With the default, the labels are passed to ``silhouette_score``
        unchanged, so the noise label ``-1`` is scored like an ordinary cluster.
        Set to True to score only the points that were assigned to a cluster
        (``data`` is converted with ``np.asarray`` for the row selection).

    Returns
    -------
    pandas.DataFrame
        At most one row (the highest silhouette score) with the columns
        "Eps", "Min Samples", "Number of Clusters", "Number of Noise Points"
        and "Silhouette Score". The row keeps its index label from the full
        results table. Combinations that cannot be scored hold NaN, and NaN
        sorts last.
    """
    # None sentinels keep the historical grids without sharing mutable defaults
    if eps_values is None:
        eps_values = np.arange(.1, 10, .1)
    if min_samples_values is None:
        min_samples_values = np.arange(2, 10, 1)

    results = []

    # loop through the combinations of eps and min_samples
    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit(data).labels_

            # noise points are labeled -1 and are not counted as a cluster
            noise_mask = labels == -1
            n_noise = int(noise_mask.sum())
            n_clusters = np.unique(labels).size - (1 if n_noise else 0)

            if exclude_noise:
                score_points = np.asarray(data)[~noise_mask]
                score_labels = labels[~noise_mask]
                n_labels = n_clusters
            else:
                score_points, score_labels = data, labels
                n_labels = n_clusters + (1 if n_noise else 0)

            # silhouette_score requires 2 <= n_labels <= n_points - 1; the second
            # check only matters for grids where every point can be its own cluster
            if n_clusters > 1 and n_labels < len(score_labels):
                silhouette = silhouette_score(score_points, score_labels,
                                              metric='euclidean', sample_size=None)
            else:
                silhouette = np.nan  # keeps the score column float even when nothing is scorable

            results.append([eps, min_samples, n_clusters, n_noise, silhouette])

    # put the results in a dataframe and keep only the best-scoring combination
    dbscan_results = pd.DataFrame(results, columns=["Eps", "Min Samples", "Number of Clusters",
                                                    "Number of Noise Points", "Silhouette Score"])
    return dbscan_results.sort_values('Silhouette Score', ascending=False).head(1)

In [25]:
## Load the cereal dataset from a CSV path given relative to the working directory
cereal = pd.read_csv('../Data/cereal.csv')

In [26]:
## Build the clustering feature table by removing the cereal name and manufacturer columns
data = cereal.drop(columns=['Cereal Name', 'Manufacturer'])

In [27]:
## Scale the features (StandardScaler centres each column and scales it to unit variance)
sd = StandardScaler()

## Wrap the scaled array in a DataFrame that keeps the column names and row index of `data`
data_scaled = pd.DataFrame(sd.fit_transform(data), columns=data.columns, index=data.index)

In [28]:
## Drop the Fat column from both the original and the standardized dataset.
## errors='ignore' keeps this cell re-runnable: without it a second execution
## raises KeyError because Fat has already been removed from both frames.
data = data.drop(columns=['Fat'], errors='ignore')
data_scaled = data_scaled.drop(columns=['Fat'], errors='ignore')

In [29]:
## Preview the unscaled feature table after the Fat column has been dropped
data

Unnamed: 0,Calories,Protein (g),Sugars,Vitamins and Minerals
0,70,4,6,25
1,120,3,8,0
2,70,4,5,25
3,50,4,0,25
4,110,2,8,25
...,...,...,...,...
69,110,2,3,25
70,110,1,12,25
71,100,3,3,25
72,100,3,3,25


In [44]:
## Grid-search DBSCAN hyperparameters on the unscaled features and show the best-scoring row
tuned_parm_orig = tune_dbscan(data)
tuned_parm_orig

Unnamed: 0,Eps,Min Samples,Number of Clusters,Number of Noise Points,Silhouette Score
144,1.9,2,12,20,0.317582


In [33]:
## Grid-search DBSCAN hyperparameters on the standardized features
tuned_parm_scaled = tune_dbscan(data_scaled)

In [49]:
## Fit the DBSCAN model on the unscaled features with the tuned hyperparameters
## (each value is read column-first so `Min Samples` keeps its integer dtype)
dbscan_orig = DBSCAN(
    eps=tuned_parm_orig['Eps'].iat[0],
    min_samples=tuned_parm_orig['Min Samples'].iat[0],
)
dbscan_orig.fit(data)

In [47]:
## The grid search on the standardized features already ran in an earlier cell and is
## expensive, so display its stored result here instead of repeating the identical search.
tuned_parm_scaled

In [50]:
## Fit the DBSCAN model on the standardized features with the tuned hyperparameters
## (each value is read column-first so `Min Samples` keeps its integer dtype)
dbscan_scaled = DBSCAN(
    eps=tuned_parm_scaled['Eps'].iat[0],
    min_samples=tuned_parm_scaled['Min Samples'].iat[0],
)
dbscan_scaled.fit(data_scaled)

In [51]:
## Silhouette score of the tuned clustering on the unscaled features.
## The labels are passed as-is, so points labeled -1 (noise) are scored as one more group.
silhouette_score(data,dbscan_orig.labels_,metric='euclidean',sample_size=None)

np.float64(0.31758190506869993)

In [52]:
## Silhouette score of the tuned clustering on the standardized features.
## The labels are passed as-is, so points labeled -1 (noise) are scored as one more group.
silhouette_score(data_scaled,dbscan_scaled.labels_,metric='euclidean',sample_size=None)

np.float64(0.43509036304297305)

In [53]:
## Cluster label assigned to each row by the unscaled-feature model (-1 marks noise points)
dbscan_orig.labels_

array([ 0, -1,  0, -1,  1,  1,  1, -1,  2,  2,  3, -1,  4,  1,  1,  5,  6,
        1,  1,  1, -1,  5, -1,  7,  1,  1,  7,  8,  1, -1,  1,  7,  5, -1,
        3,  1,  1, -1, -1,  5,  7,  1,  6, -1,  7,  4, -1, -1, -1, -1,  9,
       10, 10,  7, -1,  8,  7,  2,  5,  5, -1, 11, 11,  1, -1,  2, -1, -1,
        9,  5,  1,  6,  6,  1])

In [54]:
## Cluster label assigned to each row by the standardized-feature model (-1 marks noise points)
dbscan_scaled.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, -1,
        1,  0,  0,  0,  0,  0])

In [56]:
## Inspect the hyperparameters the unscaled-feature model was built with
dbscan_orig.get_params()

{'algorithm': 'auto',
 'eps': np.float64(1.9000000000000001),
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'min_samples': np.int64(2),
 'n_jobs': None,
 'p': None}

In [57]:
## Inspect the hyperparameters the standardized-feature model was built with
dbscan_scaled.get_params()

{'algorithm': 'auto',
 'eps': np.float64(1.9000000000000001),
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'min_samples': np.int64(4),
 'n_jobs': None,
 'p': None}