Import Libraries

In [None]:
import pandas as pd                               # For dataframes
import matplotlib.pyplot as plt                   # For plotting data
import seaborn as sns                             # For plotting data
from sklearn.cluster import DBSCAN                # For DBSCAN
import numpy as np                                # For various computations
from sklearn.preprocessing import StandardScaler  # For standardizing data    
from sklearn.neighbors import NearestNeighbors    # For nearest neighbors

Load and prepare data

In [None]:
# Load the penguins dataset from the CSV file
df = pd.read_csv('data/penguins.csv')

# Pull the class column out into its own variable y
y = df['y']

# Keep only the remaining (feature) columns in df
df = df.drop(columns='y')

# Standardize every feature column. StandardScaler rescales each column
# to zero mean and unit variance; it does NOT map values to a 0-1 range.
scaled = StandardScaler().fit_transform(df)
df = pd.DataFrame(scaled, columns=df.columns)

# Preview the first 5 rows of the standardized data
df.head()

Calculate eps (epsilon neighborhood radius)

In [None]:
# Number of neighbors to query (also passed to DBSCAN as min_samples)
k = 7

# Build a nearest-neighbors model on the standardized data
nn = NearestNeighbors(n_neighbors=k)
nn.fit(df)

# For every row, get the distances to (and indices of) its k nearest rows
dist, ind = nn.kneighbors(df)

# Take column 1 of the distance matrix and sort it in ascending order.
# NOTE(review): because the query set is the training set, column 0 is
# presumably each point's distance to itself, so column 1 is the distance
# to the closest *other* point, not the k-th neighbor. Confirm this is the
# intended curve for choosing eps.
dist = np.sort(dist[:, 1])

# Plot the sorted distances; the bend ("knee") of this curve suggests eps
plt.plot(dist)

# Mark the chosen bend with a dashed red horizontal line at y = 0.6
plt.axhline(y=0.6, color='red', linestyle='--')

Examine the plot to find the "knee": the point where the curve changes direction sharply. Read the y-axis value at that knee point and use it as the epsilon (eps) value for DBSCAN.

In [None]:
# Cluster the standardized data with DBSCAN, using eps = 0.6 (the value
# marked on the distance plot) and k as the minimum number of samples
db = DBSCAN(eps=0.6, min_samples=k)
db.fit(df)

# Scatter plot of bill length vs. bill depth:
#   - color (hue) shows the class stored in y
#   - marker style shows the cluster label DBSCAN assigned to each row
sns.scatterplot(
    data=df,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue=y,
    style=db.labels_,
    palette=['orange', 'green', 'blue'],
)