# Clustering with DBSCAN

In [None]:
import pandas as pd
import numpy as np

First we make our fictional dataset.

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Synthetic dataset: 300 points scattered around 4 blob centers.
# x holds the point coordinates, y the index of the blob each point was
# drawn from; the fixed random_state makes the data identical on every run.
x, y = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

In [None]:
x[:10]

In [None]:
y

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Palette used to turn an integer cluster label into a plot color
# (10 entries, indexed by label).
ourcolors = [
    'red', 'blue', 'black', 'green', 'yellow',
    'magenta', 'orange', 'brown', 'grey', 'aqua',
]

In [None]:
# Scatter the points, colored by the blob each one was generated from.
plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[blob] for blob in y])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# The same points without label colors: the coordinates alone are what
# gets passed to the clustering model below.
plt.scatter(x[:, 0], x[:, 1])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

We create an object for our model by calling `DBSCAN` with values for `eps` (the maximum distance from a given point within which other instances count as its neighbors) and `min_samples` (the minimum number of samples, including the point itself, that a point's neighborhood must contain for the point to be considered a core point).

In [None]:
# kmeans = KMeans(n_clusters=4, n_init=10)
dbscan = DBSCAN(eps=0.1, min_samples=5)

We then call the `fit` method and pass in the data in which we want to search for clusters.

In [None]:
# kmeans.fit(x)
dbscan.fit(x)

In [None]:
x[[0]]

In [None]:
dbscan.core_sample_indices_

In [None]:
# kmeans.predict(x[[0]])
dbscan.predict(x[[0]])

# Will give an error!
# DBSCAN is a density-based clustering method that
# does not learn explicit cluster centers, so the fitted DBSCAN
# object offers no predict() for assigning new points to a cluster.

In [None]:
# DBSCAN cannot predict, but a new point can still be assigned by hand:
# find the nearest core point and, if it lies within eps of the new point,
# adopt that core point's cluster; otherwise mark the new point as noise.
# (An alternative would be to train KNN on the core points.)
#
# Neighbor identification can also be done with sklearn.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=1)

core_points = x[dbscan.core_sample_indices_]
# Cluster label of every core point, aligned row-for-row with core_points.
core_labels = dbscan.labels_[dbscan.core_sample_indices_]

new_label = -1  # Mark as noise unless a core point is close enough
# With a small eps there may be no core points at all; fitting the
# neighbor search on an empty array would raise, so skip it in that case.
if len(core_points) > 0:
    nn.fit(core_points)
    distances, indices = nn.kneighbors(x[[0]])
    if distances[0][0] <= dbscan.eps:
        new_label = core_labels[indices[0][0]]

new_label

In [None]:
# Color each point by its DBSCAN label. The modulo wraps labels onto the
# palette: the noise label -1 still selects the last color (as plain
# negative indexing did), and labels beyond the palette size reuse colors
# instead of raising IndexError.
plt.scatter(x[:,0],
            x[:,1],
            # color=[ourcolors[i] for i in kmeans.labels_])
            color=[ourcolors[i % len(ourcolors)] for i in dbscan.labels_])

In [None]:
# kmeans.labels_
dbscan.labels_

In [None]:
# kmeans.cluster_centers_
dbscan.cluster_centers_

# also an ERROR: the fitted DBSCAN object has no cluster_centers_ attribute,
# because there are no cluster centers with DBSCAN

In [None]:
# but there are core points
dbscan.core_sample_indices_[:10], dbscan.components_

In [None]:
import ipywidgets

In [None]:
# Refit with a larger neighborhood radius (eps=0.3) and display the label
# assigned to every point; "-1" is used for unclustered points.
dbscan = DBSCAN(eps=0.3, min_samples=5).fit(x)
dbscan.labels_

In [None]:
# def plotblobs(n):
    # kmeans = KMeans(n_clusters=n, n_init=10)
    # kmeans.fit(x)
    # plt.scatter(x[:,0], x[:,1], color=[ourcolors[i] for i in kmeans.labels_])

# Note: DBSCAN uses the label "-1" for unclustered points. After the
# "% 10" wrap those points share a palette slot with cluster 9, so this
# color mapping is a little sloppy if number of clusters > 9
def plotblobs(eps,min_samples):
    """Cluster the global points ``x`` with DBSCAN and scatter-plot the result.

    ``eps`` and ``min_samples`` are passed straight to ``DBSCAN``; every
    point is drawn in the palette color selected by its cluster label.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    model.fit(x)
    point_colors = [ourcolors[label % 10] for label in model.labels_]
    plt.scatter(x[:,0], x[:,1], color=point_colors)

# Interactive sliders: eps between 0.1 and 1.0, min_samples between 1 and 10.
ipywidgets.interact(plotblobs, eps=(0.1, 1.0), min_samples=(1, 10));

In [None]:
# def plotblobs(n):
    # kmeans = KMeans(n_clusters=n, n_init=10)
    # kmeans.fit(x)
    # plt.scatter(x[:,0], x[:,1], color=[ourcolors[i] for i in kmeans.labels_])

# Note: we need to remember that "-1" is used for unclustered points,
# so this color mapping is a little sloppy if the number of clusters
# reaches the palette size (the last color is then shared with the noise).
def plotblobs(eps,min_samples):
    """Cluster the global points ``x`` with DBSCAN and plot them, marking noise.

    ``eps`` and ``min_samples`` are passed straight to ``DBSCAN``. Every
    point is drawn in the palette color of its label; points labelled -1
    (unclustered) are additionally overdrawn with a red 'x'.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(x)
    labels = dbscan.labels_
    plt.scatter(x[:,0], x[:,1], color=[ourcolors[i % len(ourcolors)] for i in labels])
    # Overlay all noise points with a single scatter call selected by a
    # boolean mask, rather than one scatter call (one artist) per point.
    noise = labels == -1
    if noise.any():
        plt.scatter(x[noise,0], x[noise,1], marker='x', color='red')

ipywidgets.interact(plotblobs,eps=(0.1,1.0),min_samples=(1,10));

In [None]:
# def plotblobs(n):
    # kmeans = KMeans(n_clusters=n, n_init=10)
    # kmeans.fit(x)
    # plt.scatter(x[:,0], x[:,1], color=[ourcolors[i] for i in kmeans.labels_])

# Note: we need to remember that "-1" is used for unclustered points,
# so this color mapping is a little sloppy if the number of clusters
# reaches the palette size (the last color is then shared with the noise).
def plotblobs(eps,min_samples,metric='euclidean'):
    """Cluster the global points ``x`` with DBSCAN under a chosen metric and plot them.

    ``eps``, ``min_samples`` and ``metric`` are passed straight to
    ``DBSCAN``. Every point is drawn in the palette color of its label;
    points labelled -1 (unclustered) are additionally overdrawn with a
    red 'x'.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(x)
    labels = dbscan.labels_
    plt.scatter(x[:,0], x[:,1], color=[ourcolors[i % len(ourcolors)] for i in labels])
    # Overlay all noise points with a single scatter call selected by a
    # boolean mask, rather than one scatter call (one artist) per point.
    noise = labels == -1
    if noise.any():
        plt.scatter(x[noise,0], x[noise,1], marker='x', color='red')

# Sliders for eps (0.01 to 1.0 in steps of 0.02) and min_samples (1 to 10),
# plus a dropdown to switch the distance metric.
ipywidgets.interact(plotblobs,
                    eps=(0.01,1.0,0.02),
                    min_samples=(1,10),
                    metric=['euclidean','cosine']);

In [None]:
# def plotblobs(n):
    # kmeans = KMeans(n_clusters=n, n_init=10)
    # kmeans.fit(x)
    # plt.scatter(x[:,0], x[:,1], color=[ourcolors[i] for i in kmeans.labels_])

# Note: we need to remember that "-1" is used for unclustered points,
# so this color mapping is a little sloppy if the number of clusters
# reaches the palette size (the last color is then shared with the noise).
def plotblobs(eps,min_samples,metric='euclidean'):
    """Cluster a modified copy of the global points ``x`` and plot the result.

    Points with both coordinates positive get their second coordinate
    negated before clustering. ``eps``, ``min_samples`` and ``metric`` are
    passed straight to ``DBSCAN``. Every point is drawn in the palette
    color of its label; points labelled -1 (unclustered) are additionally
    overdrawn with a red 'x'.
    """
    # Work on a copy: a plain "xtmp = x" only aliases the global array, so
    # the flip below would permanently change x for every other cell.
    xtmp = x.copy()
    # Negate x1 for the points that have x0 > 0 and x1 > 0.
    flip = (xtmp[:,0] > 0) & (xtmp[:,1] > 0)
    xtmp[flip,1] = -xtmp[flip,1]
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(xtmp)
    labels = dbscan.labels_
    plt.scatter(xtmp[:,0], xtmp[:,1], color=[ourcolors[i % len(ourcolors)] for i in labels])
    # Overlay all noise points with a single scatter call selected by a
    # boolean mask, rather than one scatter call (one artist) per point.
    noise = labels == -1
    if noise.any():
        plt.scatter(xtmp[noise,0], xtmp[noise,1], marker='x', color='red')

# Sliders for eps (0.01 to 1.0 in steps of 0.02) and min_samples (1 to 10),
# plus a dropdown to switch the distance metric.
ipywidgets.interact(plotblobs,
                    eps=(0.01,1.0,0.02),
                    min_samples=(1,10),
                    metric=['euclidean','cosine']);

In [None]:
# Second dataset: three noisy straight lines (y = 2x, y = -x, y = x/3)
# sampled on the same 100 x-values between 5 and 10.
# NOTE: this rebinds the global name x, which the plotblobs cells above
# read; re-run the make_blobs cell before going back to those widgets.
x = np.linspace(5,10,100)
y1 = 2*x + np.random.normal(0,0.5,100)
y2 = -x + np.random.normal(0,0.5,100)
y3 = x/3 + np.random.normal(0,0.5,100)
# One row per point and exactly two feature columns (x, y). Extra all-zero
# columns would change neither euclidean nor cosine distances; they would
# only waste memory and time.
z = np.zeros([300,2])
z[:,0] = np.concatenate([x,x,x])
z[:,1] = np.concatenate([y1,y2,y3])

In [None]:
plt.plot(z[:,0], z[:,1],'ko')

In [None]:
def dplot(eps=0.3,min_samples=4):
    """Plot DBSCAN clusterings of the global ``z`` for two metrics side by side.

    The left panel uses the cosine metric, the right panel the euclidean
    metric; both use the same ``eps`` and ``min_samples``. Points are
    colored by cluster label, with labels wrapped onto the palette.
    """
    fig, axes = plt.subplots(1, 2)
    for axis, metric in zip(axes, ('cosine', 'euclidean')):
        model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
        # fit_predict returns the label array itself, whereas fit()
        # returns the estimator object.
        labels = model.fit_predict(z)
        axis.scatter(z[:,0], z[:,1], color=[ourcolors[i % len(ourcolors)] for i in labels])
        # Title each panel so the two metrics can be told apart.
        axis.set_title(metric)

ipywidgets.interact(dplot,eps=(0.01,1.0,0.02),min_samples=(1,10));