# DBSCAN Debugging

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

# Load the aggregated feature table into a DataFrame; the path is relative,
# so it is resolved against the kernel's current working directory.
agg = pd.read_csv('data/outputs/agg_features.csv')

The following plot helps choose a good `eps` value by examining the distance to the 4th nearest neighbour.

In [None]:
# Feature columns: every column except the record identifier and any
# existing cluster labels.
sim_cols = [c for c in agg.columns if c not in {'record_id', 'cluster'}]
neigh = NearestNeighbors(n_neighbors=4)
neigh.fit(agg[sim_cols])
# Distances to the 4th nearest neighbour for each point.
# kneighbors() is called WITHOUT an argument on purpose: when the training
# data is passed back in explicitly, every point is returned as its own
# nearest neighbour (distance 0), so the last column would hold the 3rd
# neighbour distance instead of the 4th. With no argument the query point is
# not considered its own neighbour.
dists, _ = neigh.kneighbors()
fourth = dists[:, -1]
# Sort ascending so the knee of the curve is visible.
plt.plot(sorted(fourth))
plt.xlabel('Points sorted by distance')
plt.ylabel('4th NN distance')
plt.title('4th nearest neighbour curve')
plt.show()

Set `eps` to the distance at the knee of the curve above, i.e. the point where the sorted distances start to rise sharply.

In [None]:
# Reduce the feature columns to two principal components for a 2-D view.
pca = PCA(n_components=2)
X = pca.fit_transform(agg[sim_cols])

# Marker size is always 20; colour by cluster label only when the
# `cluster` column exists in `agg`.
scatter_kwargs = {'s': 20}
if 'cluster' in agg.columns:
    scatter_kwargs.update(c=agg['cluster'], cmap='tab10')

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], **scatter_kwargs)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA of aggregated features')
plt.show()

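If `agg` contains a `cluster` column, the points in the scatter plot are coloured by cluster label so that separability can be inspected visually; otherwise all points are drawn in a single colour.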