In [None]:
# Source : https://www.reneshbedre.com/blog/dbscan-python.html
# DBSCAN Video : https://www.youtube.com/watch?v=RDZUdRSDOok

# For clustering using DBSCAN, we are using a single-cell gene expression dataset 
# of Arabidopsis thaliana root cells processed by the 10x Genomics Cell Ranger pipeline. 
# The dataset has been preprocessed using the t-SNE dimensionality reduction technique. 
# Now, we will use the t-SNE embedding vectors to identify the clusters using DBSCAN.

In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.neighbors import NearestNeighbors

from collections import Counter

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN


import scipy.cluster.hierarchy as shc

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Load the t-SNE scores CSV over HTTP into a DataFrame.
tsne_scores_url = "https://reneshbedre.github.io/assets/posts/tsne/tsne_scores.csv"
df = pd.read_csv(tsne_scores_url)
# Alternative source: a local CSV (presumably an offline copy of the data — verify before use).
# df = pd.read_csv("SelfShiksha_ULB_FC89_DBSCAN.csv")

In [None]:
# Show the loaded DataFrame as this cell's output to inspect its columns and values.
df

In [None]:
# Choosing the eps parameter of DBSCAN: a common heuristic is to compute the
# k-nearest-neighbor (kNN) distances of the input dataset (how far every data
# point is from its k nearest neighbors) and read eps off their sorted values.
# NearestNeighbors needs an n_neighbors (number of neighbors) argument, which
# can be tied to the minPts value.

# n_neighbors = 5 because querying the same data the model was fitted on
# returns every point as its own nearest neighbor (the first distance column
# will be zeros).
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(df)
# Distances to, and indices of, the k nearest neighbors of every point.
neigh_dist, neigh_ind = nbrs.kneighbors(df)
# Sort the neighbor distances in ascending order.
# axis=0 sorts every column independently (down the rows), so each column
# becomes an ascending curve of the distances to that neighbor rank.
sort_neigh_dist = neigh_dist.copy()
sort_neigh_dist.sort(axis=0)

In [None]:
# Read the optimal eps off the k-NN distance plot by locating the "knee" of the curve.
# In the plot below the knee sits at approximately 2.5 (dashed line): points whose
# k-NN distance is below 2.5 belong to a cluster, while points above 2.5 are
# noise or outliers (noise points have a larger kNN distance).

# Column 4 is the distance to the 4th neighbor, since column 0 is the point itself.
k_dist = sort_neigh_dist[:, 4]

fig, ax = plt.subplots()
ax.plot(k_dist)
# Horizontal marker at the chosen eps value.
ax.axhline(y=2.5, linewidth=1, linestyle='dashed', color='k')
ax.set_xlabel("Sorted observations (4th NN)")
ax.set_ylabel("k-NN distance")
plt.show()

In [None]:
# Fit DBSCAN on the embedding with eps = 2.5 (the knee of the k-NN distance plot)
# and min_samples = 4.
dbscan_params = {"eps": 2.5, "min_samples": 4}
clusters = DBSCAN(**dbscan_params).fit(df)
# Cluster label assigned to every observation (shown as the cell output).
clusters.labels_

In [None]:
# Check the unique cluster labels found by DBSCAN.
set(clusters.labels_)
# A label of -1 marks noisy points that could not be assigned to any cluster.

In [None]:
# Count how many observations received each cluster label.
Counter(clusters.labels_)

In [None]:
# Scatter plot of the two t-SNE components, coloured by DBSCAN cluster label;
# legend="full" lists every label as its own legend entry.
p = sns.scatterplot(
    data=df,
    x="t-SNE-1",
    y="t-SNE-2",
    hue=clusters.labels_,
    legend="full",
    palette="deep",
)
# Anchor the legend's upper-right corner at (1.17, 1.2) in axes coordinates,
# i.e. beyond the top-right corner of the plotting area.
sns.move_legend(
    p,
    "upper right",
    title='Clusters',
    bbox_to_anchor=(1.17, 1.2),
)
plt.show()