## Testing scikit-learn's DBSCAN vs a custom implementation

In [1]:
# scikit-learn imports
# NOTE: make_blobs is imported from the public `sklearn.datasets` namespace.
# The old `sklearn.datasets.samples_generator` module was deprecated in
# scikit-learn 0.22 and removed in 0.24, so importing from it fails on
# current releases.
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN

# custom DBSCAN
from custom_dbscan import DB_SCAN

# plotting
import altair as alt
import pandas as pd

### First, on artificially generated clusters

In [2]:
# Sample 750 points from three Gaussian blobs (std 0.4) to cluster;
# the fixed random_state keeps the generated data reproducible.
centers = [
    [1, 1],
    [-1, -1],
    [1, -1],
]
X, true_labels = make_blobs(
    n_samples=750,
    centers=centers,
    cluster_std=0.4,
    random_state=0,
)

In [3]:
# Plotting frame: one row per sample holding its coordinates and the label
# returned by make_blobs.
df = pd.DataFrame(X, columns=['x', 'y']).assign(true_labels=true_labels)

In [4]:
# scikit-learn's DBSCAN: fit on the blob coordinates and keep the labels
sk_model = DBSCAN(eps=0.3, min_samples=10)
sk_labels = sk_model.fit_predict(X)
df['sk_labels'] = sk_labels

# custom DBSCAN, called with matching values (max_radius <-> eps,
# min_points <-> min_samples; presumably equivalent parameters -- confirm
# against custom_dbscan)
custom_model = DB_SCAN(max_radius=0.3, min_points=10)
custom_labels = custom_model.fit_predict(X)
df['custom_labels'] = custom_labels

In [5]:
# Display the blob frame: coordinates with the true, sklearn and custom labels side by side.
df

Unnamed: 0,x,y,true_labels,sk_labels,custom_labels
0,0.840220,1.148022,0,0,1
1,-1.154748,-1.204117,1,1,2
2,0.678636,0.724180,0,0,1
3,0.450783,-1.427097,2,2,3
4,1.492116,1.480952,0,0,1
...,...,...,...,...,...
745,0.852327,0.904248,0,0,1
746,-1.375152,-0.826757,1,1,2
747,0.267989,-1.278334,2,2,3
748,-0.886288,-0.302932,1,1,2


In [6]:
# Blob scatter plot coloured by the true labels.
true_color = alt.Color('true_labels', scale=alt.Scale(scheme='inferno'))
alt.Chart(df).mark_circle(size=60).encode(x='x', y='y', color=true_color)

In [7]:
# Blob scatter plot coloured by scikit-learn's DBSCAN labels.
sk_color = alt.Color('sk_labels', scale=alt.Scale(scheme='inferno'))
alt.Chart(df).mark_circle(size=60).encode(x='x', y='y', color=sk_color)

In [8]:
# Blob scatter plot coloured by the custom DBSCAN labels.
custom_color = alt.Color('custom_labels', scale=alt.Scale(scheme='inferno'))
alt.Chart(df).mark_circle(size=60).encode(x='x', y='y', color=custom_color)

### Then, on the Iris data

In [9]:
# Download the Iris data from the UCI repository. header=None tells pandas
# the file has no header row, so the column names are supplied explicitly.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'true_labels']
iris = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=iris_columns,
)

In [28]:
# Cluster on the first four columns only (the measurements); the label
# column is left out of the feature matrix.
iris_features = iris.iloc[:, :4].values

# scikit-learn's DBSCAN: fit on the measurements and keep the labels
sk_labels = DBSCAN(eps=0.525, min_samples=13).fit_predict(iris_features)
iris['sk_labels'] = sk_labels

# custom DBSCAN, called with matching values (max_radius <-> eps,
# min_points <-> min_samples; presumably equivalent parameters -- confirm
# against custom_dbscan)
custom_labels = DB_SCAN(max_radius=0.525, min_points=13).fit_predict(iris_features)
iris['custom_labels'] = custom_labels

In [29]:
# Display the iris frame: measurements with the true, sklearn and custom labels side by side.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,true_labels,sk_labels,custom_labels
0,5.1,3.5,1.4,0.2,Iris-setosa,0,1
1,4.9,3.0,1.4,0.2,Iris-setosa,0,1
2,4.7,3.2,1.3,0.2,Iris-setosa,0,1
3,4.6,3.1,1.5,0.2,Iris-setosa,0,1
4,5.0,3.6,1.4,0.2,Iris-setosa,0,1
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2,3
146,6.3,2.5,5.0,1.9,Iris-virginica,1,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2,3
148,6.2,3.4,5.4,2.3,Iris-virginica,-1,-1


In [30]:
# Iris scatter plot (sepal length vs petal width) coloured by the true labels.
iris_points = alt.Chart(iris).mark_circle(size=60)
iris_points.encode(
    x='sepal_length',
    y='petal_width',
    color=alt.Color('true_labels', scale=alt.Scale(scheme='inferno')),
)

In [31]:
# Iris scatter plot (sepal length vs petal width) coloured by scikit-learn's DBSCAN labels.
iris_points = alt.Chart(iris).mark_circle(size=60)
iris_points.encode(
    x='sepal_length',
    y='petal_width',
    color=alt.Color('sk_labels', scale=alt.Scale(scheme='inferno')),
)

In [32]:
# Iris scatter plot (sepal length vs petal width) coloured by the custom DBSCAN labels.
iris_points = alt.Chart(iris).mark_circle(size=60)
iris_points.encode(
    x='sepal_length',
    y='petal_width',
    color=alt.Color('custom_labels', scale=alt.Scale(scheme='inferno')),
)