In [1]:
# Install necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import necessary dataset

df = pd.read_csv('C:/Users/ariji/OneDrive/Desktop/Data science & ML/Unsupervised Machine Learning/Data/cereal.csv')
df.head()

Unnamed: 0,Cereal Name,Manufacturer,Calories,Protein (g),Fat,Sugars,Vitamins and Minerals
0,100%_Bran,Nabisco,70,4,1,6,25
1,100%_Natural_Bran,Quaker Oats,120,3,5,8,0
2,All-Bran,Kelloggs,70,4,1,5,25
3,All-Bran_with_Extra_Fiber,Kelloggs,50,4,0,0,25
4,Almond_Delight,Ralston Purina,110,2,2,8,25


In [3]:
# drop the name and manufactuer column 

data = df[['Calories', 'Protein (g)', 'Fat', 'Sugars', 'Vitamins and Minerals']]
data.head()

Unnamed: 0,Calories,Protein (g),Fat,Sugars,Vitamins and Minerals
0,70,4,1,6,25
1,120,3,5,8,0
2,70,4,1,5,25
3,50,4,0,0,25
4,110,2,2,8,25


In [4]:
# 2. Standardize the remaining columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler_ft = scaler.fit_transform(data)
data_scaled = pd.DataFrame(scaler_ft, columns=data.columns)
data_scaled.head()

Unnamed: 0,Calories,Protein (g),Fat,Sugars,Vitamins and Minerals
0,-1.940286,1.387392,0.056478,-0.173586,-0.14927
1,0.789394,0.462464,4.235869,0.277129,-1.253871
2,-1.940286,1.387392,0.056478,-0.398944,-0.14927
3,-3.032158,1.387392,-0.988369,-1.525731,-0.14927
4,0.243458,-0.462464,1.101326,0.277129,-0.14927


In [5]:
# 1. Copy over the function that loops through multiple eps and min_samples values to fit multiple DBSCAN models
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def tune_dbscan(data):

    results = []

    # define a range of eps and min_samples values to loop through
    eps_values = np.arange(.1, 2, .1)
    min_samples_values = np.arange(2, 10, 1)

    # loop through the combinations of eps and min_samples
    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            dbscan.fit(data)
            labels = dbscan.labels_

            # count the number of clusters (excluding noise points labeled as -1)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            # count the number of noise points (labeled as -1)
            n_noise = list(labels).count(-1)

            # calculate the silhouette score (excluding noise points)
            if n_clusters > 1:  # silhouette score requires at least 2 clusters
                silhouette = silhouette_score(data, labels, metric='euclidean', sample_size=None)
            else:
                silhouette = None

            results.append([eps, min_samples, n_clusters, n_noise, silhouette])

    # put the results in a dataframe
    dbscan_results = pd.DataFrame(results, columns=["Eps", "Min Samples", "Number of Clusters",
                                                    "Number of Noise Points", "Silhouette Score"])
    return dbscan_results

In [None]:
# 2. Apply the function on both the original and standardized data sets

In [6]:
# tune dbscan on the data
dbscan_results_1 = tune_dbscan(data)
dbscan_results_1.head()

Unnamed: 0,Eps,Min Samples,Number of Clusters,Number of Noise Points,Silhouette Score
0,0.1,2,7,59,-0.302829
1,0.1,3,1,71,
2,0.1,4,0,74,
3,0.1,5,0,74,
4,0.1,6,0,74,


In [7]:
# tune dbscan on the scaled data
dbscan_results_2 = tune_dbscan(data_scaled)
dbscan_results_2.head()

Unnamed: 0,Eps,Min Samples,Number of Clusters,Number of Noise Points,Silhouette Score
0,0.1,2,7,59,-0.145006
1,0.1,3,1,71,
2,0.1,4,0,74,
3,0.1,5,0,74,
4,0.1,6,0,74,


In [None]:
# 3. Find the highest silhouette score and note down the eps and min_samples values

In [8]:
# top results for data
(dbscan_results_1.sort_values('Silhouette Score', ascending=False)
               .groupby('Silhouette Score')
               .head(1)).head()

Unnamed: 0,Eps,Min Samples,Number of Clusters,Number of Noise Points,Silhouette Score
136,1.8,2,12,23,0.230822
120,1.6,2,13,25,0.154347
145,1.9,3,6,35,0.085334
138,1.8,4,6,36,0.074636
96,1.3,2,13,34,0.051047


In [15]:
# top results for data_v2
(dbscan_results_2.sort_values('Silhouette Score', ascending=False)
               .groupby('Silhouette Score')
               .head(1)).head(10)

Unnamed: 0,Eps,Min Samples,Number of Clusters,Number of Noise Points,Silhouette Score
137,1.8,3,2,7,0.343322
129,1.7,3,2,8,0.34164
146,1.9,4,2,6,0.341601
138,1.8,4,2,8,0.34118
130,1.7,4,2,9,0.339193
106,1.4,4,2,12,0.331755
128,1.7,2,4,4,0.315605
136,1.8,2,4,3,0.311324
144,1.9,2,4,2,0.307593
120,1.6,2,5,6,0.242909


In [None]:
# The highest silhouette score is for eps = 1.8 and min_samples = 3 on the scaled data.

In [10]:
# 4. Fit a final DBSCAN model with those eps and min_samples values and view the labels
dbscan_final = DBSCAN(eps=1.8, min_samples=3)
dbscan_final.fit(data_scaled)

In [11]:
# view the cluster labels
dbscan_final.labels_

array([ 0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  1, -1,
        1,  0,  0,  0,  0,  0], dtype=int64)

In [12]:
# view the value counts
from collections import Counter

Counter(dbscan_final.labels_)

Counter({0: 63, -1: 7, 1: 4})

In [None]:
# Trying few more 

# Experiment 1 : eps = 1.7 , min_samples = 2
# Experiment 2 : eps = 1.8 , min_samples = 2
# Experiment 3 : eps = 1.9 , min_samples =2

In [16]:
# Experiment 1. Fit a final DBSCAN model with those eps and min_samples values and view the labels
dbscan_final = DBSCAN(eps=1.7, min_samples=2)
dbscan_final.fit(data_scaled)

In [17]:
# view the cluster labels
dbscan_final.labels_

array([ 0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  3,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  2,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  2,  3,
        2,  0,  0,  0,  0,  0], dtype=int64)

In [18]:
pd.Series(dbscan_final.labels_)

0     0
1    -1
2     0
3    -1
4     0
     ..
69    0
70    0
71    0
72    0
73    0
Length: 74, dtype: int64

In [19]:
# view the value counts
from collections import Counter

Counter(dbscan_final.labels_)

Counter({0: 62, -1: 4, 2: 4, 1: 2, 3: 2})

In [20]:
# Experiment 2. Fit a final DBSCAN model with those eps and min_samples values and view the labels
dbscan_final = DBSCAN(eps=1.8, min_samples=2)
dbscan_final.fit(data_scaled)

In [21]:
# view the cluster labels
dbscan_final.labels_

array([ 0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  2,  3,
        2,  0,  0,  0,  0,  0], dtype=int64)

In [22]:
pd.Series(dbscan_final.labels_)

0     0
1    -1
2     0
3    -1
4     0
     ..
69    0
70    0
71    0
72    0
73    0
Length: 74, dtype: int64

In [23]:
# view the value counts
from collections import Counter

Counter(dbscan_final.labels_)

Counter({0: 63, 2: 4, -1: 3, 1: 2, 3: 2})

In [24]:
# Experiment 3. Fit a final DBSCAN model with those eps and min_samples values and view the labels
dbscan_final = DBSCAN(eps=1.9, min_samples=2)
dbscan_final.fit(data_scaled)

In [25]:
# view the cluster labels
dbscan_final.labels_

array([ 0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  2,  3,
        2,  0,  0,  0,  0,  0], dtype=int64)

In [26]:
# view the value counts
from collections import Counter

Counter(dbscan_final.labels_)

Counter({0: 64, 2: 4, -1: 2, 1: 2, 3: 2})

In [None]:
# None of the cluster models created well distributed clusters , so DBSCAN is not a good choice for the dataset
# If you want to find out the outliers it is a good choice ( u can deep dive for patterns using the noice points )