<a href="https://colab.research.google.com/github/aniket-alt/Clustering_Assignment/blob/main/Task(d)DBSCAN_with_PyCaret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task (d): DBSCAN Clustering with PyCaret

This notebook uses PyCaret, a high-level machine learning library, to apply DBSCAN (Density-Based Spatial Clustering of Applications with Noise). Unlike the previous methods, DBSCAN groups points by local density, so it can recover moon-shaped or ring-shaped clusters that K-Means would miss. It also automatically labels isolated points as outliers, which makes it a useful tool for cleaning messy datasets.

In [None]:
# Install PyCaret
!pip install pycaret scikit-learn matplotlib seaborn pandas numpy

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN, KMeans
from sklearn.datasets import make_moons, make_circles, make_blobs
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from pycaret.clustering import *
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib theme plus seaborn's colour cycle.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette="husl")

# Seed NumPy's global RNG so the random draws below are repeatable.
np.random.seed(seed=42)

print("✓ Libraries imported successfully")

In [None]:
# Generate four synthetic 2-D datasets that stress different aspects of DBSCAN.
np.random.seed(42)

# 1. Moon-shaped clusters (non-convex)
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)

# 2. Circular clusters (one ring nested inside the other)
X_circles, y_circles = make_circles(n_samples=300, factor=0.5, noise=0.05, random_state=42)

# 3. Blobs with noise
X_blobs, y_blobs = make_blobs(n_samples=300, centers=3, n_features=2,
                              cluster_std=0.6, random_state=42)
# Draw the background noise uniformly over the bounding box of the blobs.
# make_blobs places its centres anywhere inside its centre box (default
# (-10, 10)), so a fixed [-3, 3] square would cover only a small patch of the
# data range and behave like an extra dense clump rather than scattered noise.
n_noise_points = 30
noise_points = np.random.uniform(low=X_blobs.min(axis=0),
                                 high=X_blobs.max(axis=0),
                                 size=(n_noise_points, 2))
X_blobs = np.vstack([X_blobs, noise_points])
y_blobs = np.hstack([y_blobs, np.full(n_noise_points, -1.0)])  # -1 for noise

# 4. Varying density clusters (three Gaussian clouds with different spreads)
X_varied_density = np.vstack([
    np.random.randn(100, 2) * 0.3,
    np.random.randn(100, 2) * 0.8 + [3, 3],
    np.random.randn(100, 2) * 0.5 + [-3, 3]
])
y_varied_density = np.hstack([np.zeros(100), np.ones(100), np.ones(100) * 2])

# Standardize all datasets. fit_transform re-fits the scaler on every call, so
# each dataset is scaled with its own mean and standard deviation.
scaler = StandardScaler()
X_moons_scaled = scaler.fit_transform(X_moons)
X_circles_scaled = scaler.fit_transform(X_circles)
X_blobs_scaled = scaler.fit_transform(X_blobs)
X_varied_scaled = scaler.fit_transform(X_varied_density)

# Visualize all datasets, coloured by their generating labels
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

datasets = [
    (X_moons_scaled, y_moons, 'Moon-Shaped Clusters'),
    (X_circles_scaled, y_circles, 'Circular Clusters'),
    (X_blobs_scaled, y_blobs, 'Blobs with Noise'),
    (X_varied_scaled, y_varied_density, 'Varying Density Clusters')
]

for ax, (X, y, title) in zip(axes, datasets):
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis',
               s=50, alpha=0.6, edgecolors='black', linewidths=0.5)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Datasets generated and visualized")

In [None]:
def plot_k_distance(X, k=4, elbow_quantile=0.95):
    """
    Plot the sorted k-distance graph used to pick DBSCAN's eps parameter.

    For every sample the distance to its k-th nearest neighbour (not counting
    the sample itself) is computed; the distances are sorted and plotted. The
    elbow of that curve suggests a good eps value.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to analyse.
    k : int, default=4
        Which nearest neighbour to measure. Must be smaller than the number
        of samples, because a sample is not counted as its own neighbour.
    elbow_quantile : float, default=0.95
        Position on the sorted curve (as a fraction of the samples) used as a
        rough stand-in for the elbow. This is a heuristic, not a true elbow
        detection.

    Returns
    -------
    float
        The k-distance at the chosen quantile, i.e. the suggested eps.
    """
    # Calling kneighbors() without a query returns the neighbours of the fitted
    # points while excluding each point from its own neighbour list. Passing X
    # again would make column 0 the zero self-distance, so the last column
    # would only be the (k-1)-th real neighbour.
    distances, _ = NearestNeighbors(n_neighbors=k).fit(X).kneighbors()

    # Last column = distance to the k-th neighbour; sort ascending for the curve
    k_distances = np.sort(distances[:, -1])

    # Clamp so that elbow_quantile=1.0 still maps to a valid index
    elbow_idx = min(int(len(k_distances) * elbow_quantile), len(k_distances) - 1)
    suggested_eps = k_distances[elbow_idx]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(k_distances, linewidth=2)
    ax.set_ylabel(f'{k}-th Nearest Neighbor Distance', fontsize=12)
    ax.set_xlabel('Points sorted by distance', fontsize=12)
    ax.set_title(f'K-Distance Graph (k={k})\nElbow point suggests optimal eps',
                 fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Mark the suggested eps value on the curve
    ax.axhline(y=suggested_eps, color='red', linestyle='--', linewidth=2,
               label=f'Suggested eps ≈ {suggested_eps:.3f}')
    ax.legend(fontsize=11)
    fig.tight_layout()
    plt.show()

    return suggested_eps

# k-distance analysis: moon dataset
print("Analyzing Moon Dataset:")
suggested_eps_moons = plot_k_distance(X_moons_scaled, 4)
print(f"Suggested eps for Moon dataset: {suggested_eps_moons:.3f}\n")

# k-distance analysis: blobs dataset
print("Analyzing Blobs Dataset:")
suggested_eps_blobs = plot_k_distance(X_blobs_scaled, 4)
print(f"Suggested eps for Blobs dataset: {suggested_eps_blobs:.3f}")

In [None]:
# Cluster the moon dataset with DBSCAN and, for contrast, with K-Means
dbscan_moons = DBSCAN(eps=0.3, min_samples=5)
labels_moons_dbscan = dbscan_moons.fit_predict(X_moons_scaled)

kmeans_moons = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_moons_kmeans = kmeans_moons.fit_predict(X_moons_scaled)

# Silhouette for DBSCAN is computed on clustered points only (label -1 marks
# noise). It needs at least two clusters; otherwise the score falls back to 0.
mask_no_noise = labels_moons_dbscan != -1
if np.sum(mask_no_noise) > 0 and len(np.unique(labels_moons_dbscan[mask_no_noise])) > 1:
    silhouette_dbscan = silhouette_score(X_moons_scaled[mask_no_noise],
                                         labels_moons_dbscan[mask_no_noise])
else:
    silhouette_dbscan = 0

silhouette_kmeans = silhouette_score(X_moons_scaled, labels_moons_kmeans)

# Count clusters (noise label excluded) and noise points
n_clusters_dbscan = len(set(labels_moons_dbscan)) - (1 if -1 in labels_moons_dbscan else 0)
n_noise = int(np.sum(~mask_no_noise))

# Visualize: true labels | DBSCAN | K-Means
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# True labels
axes[0].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1],
               c=y_moons, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
axes[0].set_title('True Labels', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].grid(True, alpha=0.3)

# DBSCAN
axes[1].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1],
               c=labels_moons_dbscan, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
# Highlight noise points. The legend is only drawn when the labelled noise
# artist exists; calling legend() on an axes without labelled artists makes
# matplotlib log a "No artists with labels found" message.
if n_noise > 0:
    axes[1].scatter(X_moons_scaled[~mask_no_noise, 0],
                   X_moons_scaled[~mask_no_noise, 1],
                   c='red', marker='x', s=100, linewidths=2, label='Noise')
    axes[1].legend()
axes[1].set_title(f'DBSCAN\nClusters: {n_clusters_dbscan}, Noise: {n_noise}, Silhouette: {silhouette_dbscan:.3f}',
                 fontsize=11, fontweight='bold')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].grid(True, alpha=0.3)

# K-Means, with the fitted centroids marked
axes[2].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1],
               c=labels_moons_kmeans, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
axes[2].scatter(kmeans_moons.cluster_centers_[:, 0],
               kmeans_moons.cluster_centers_[:, 1],
               c='red', marker='X', s=200, edgecolors='black', linewidths=2)
axes[2].set_title(f'K-Means\nSilhouette: {silhouette_kmeans:.3f}',
                 fontsize=11, fontweight='bold')
axes[2].set_xlabel('Feature 1')
axes[2].set_ylabel('Feature 2')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("MOON DATASET RESULTS")
print("="*60)
print(f"DBSCAN: {n_clusters_dbscan} clusters, {n_noise} noise points")
print(f"DBSCAN Silhouette: {silhouette_dbscan:.4f}")
print(f"K-Means Silhouette: {silhouette_kmeans:.4f}")
print("\nDBSCAN successfully separates non-convex clusters!")
print("="*60)

In [None]:
# Cluster the nested-circles dataset with DBSCAN and, for contrast, K-Means
dbscan_circles = DBSCAN(eps=0.3, min_samples=5)
labels_circles_dbscan = dbscan_circles.fit_predict(X_circles_scaled)

kmeans_circles = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_circles_kmeans = kmeans_circles.fit_predict(X_circles_scaled)

# Count clusters (label -1 marks noise and is not a cluster) and noise points
noise_mask_circles = labels_circles_dbscan == -1
n_clusters_circles = len(set(labels_circles_dbscan)) - (1 if -1 in labels_circles_dbscan else 0)
n_noise_circles = int(np.sum(noise_mask_circles))

# Visualize: true labels | DBSCAN | K-Means
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

axes[0].scatter(X_circles_scaled[:, 0], X_circles_scaled[:, 1],
               c=y_circles, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
axes[0].set_title('True Labels', fontsize=12, fontweight='bold')
axes[0].grid(True, alpha=0.3)

axes[1].scatter(X_circles_scaled[:, 0], X_circles_scaled[:, 1],
               c=labels_circles_dbscan, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
# Draw the legend only when the labelled noise artist exists; legend() on an
# axes without labelled artists makes matplotlib log a "No artists with
# labels found" message.
if n_noise_circles > 0:
    axes[1].scatter(X_circles_scaled[noise_mask_circles, 0],
                   X_circles_scaled[noise_mask_circles, 1],
                   c='red', marker='x', s=100, linewidths=2, label='Noise')
    axes[1].legend()
axes[1].set_title(f'DBSCAN\nClusters: {n_clusters_circles}, Noise: {n_noise_circles}',
                 fontsize=11, fontweight='bold')
axes[1].grid(True, alpha=0.3)

axes[2].scatter(X_circles_scaled[:, 0], X_circles_scaled[:, 1],
               c=labels_circles_kmeans, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
axes[2].set_title('K-Means', fontsize=11, fontweight='bold')
axes[2].grid(True, alpha=0.3)

for ax in axes:
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()

print("DBSCAN handles nested circular clusters better than K-Means!")

In [None]:
# Wrap the scaled blob data in a DataFrame with named feature columns
feature_columns = ['Feature_1', 'Feature_2']
df_blobs = pd.DataFrame(X_blobs_scaled, columns=feature_columns)

print("Setting up PyCaret clustering environment...")
print("=" * 60)

# Initialise the PyCaret experiment. normalize=False because the features
# were already standardized when the datasets were generated.
setup_options = dict(normalize=False, session_id=42, verbose=False)
cluster_setup = setup(data=df_blobs, **setup_options)

print("✓ PyCaret setup complete")

In [None]:
# Compare multiple clustering models.
# pycaret.clustering does not provide compare_models() (that helper only
# exists in the supervised modules), so each candidate is fitted with
# create_model() and its metric grid is collected with pull().
print("\nComparing clustering models...\n")

candidate_ids = ['kmeans', 'hclust', 'birch', 'dbscan']
fitted_candidates = {}
metric_rows = {}
for model_id in candidate_ids:
    fitted_candidates[model_id] = create_model(model_id, verbose=False)
    # pull() returns the most recent score grid; it has a single row here
    metric_rows[model_id] = pull().iloc[0]

# One row per model, one column per metric
model_comparison = pd.DataFrame(metric_rows).T
print(model_comparison)

# Keep the original variable name: the candidate with the highest silhouette
best_model_id = model_comparison['Silhouette'].astype(float).idxmax()
best_models = fitted_candidates[best_model_id]
print(f"\nBest model by silhouette: {best_model_id}")

print("\n✓ Model comparison complete")

In [None]:
# Fit DBSCAN through PyCaret's create_model wrapper
print("Creating DBSCAN model with PyCaret...")
dbscan_pycaret = create_model('dbscan', num_clusters=None)

print("\n✓ DBSCAN model created")

# Attach the cluster assignment to every row, then summarise the label counts
result_df = assign_model(dbscan_pycaret)
cluster_counts = result_df['Cluster'].value_counts().sort_index()
print("\nCluster distribution:")
print(cluster_counts)

In [None]:
# Render PyCaret's built-in plots for the fitted DBSCAN model
print("\nGenerating visualizations...\n")

# 'cluster' draws the cluster scatter plot, 'distribution' the cluster sizes
for plot_kind in ('cluster', 'distribution'):
    plot_model(dbscan_pycaret, plot=plot_kind)

In [None]:
# Tune DBSCAN parameters.
# tune_model() is not available in pycaret.clustering in PyCaret 3.x, and the
# 2.x version expects a model ID string plus a supervised target column that
# exists in the setup data ('Cluster' is only added afterwards by
# assign_model). DBSCAN's own parameters (eps, min_samples) are therefore
# searched with a small explicit grid, scored by the silhouette value PyCaret
# reports for each fit.
# NOTE(review): a high silhouette can also come from a degenerate solution
# (e.g. most points flagged as noise), so inspect the cluster distribution
# printed below before trusting the winner.
print("Tuning DBSCAN parameters...\n")

tuning_records = []
tuned_dbscan = dbscan_pycaret  # fallback if no candidate yields a usable score
best_silhouette = -np.inf
for eps_candidate in (0.2, 0.3, 0.4, 0.5):
    for min_samples_candidate in (3, 5, 7, 10):
        # Extra keyword arguments are forwarded to the underlying estimator
        candidate = create_model('dbscan', eps=eps_candidate,
                                 min_samples=min_samples_candidate,
                                 verbose=False)
        candidate_silhouette = float(pull()['Silhouette'].iloc[0])
        tuning_records.append({'eps': eps_candidate,
                               'min_samples': min_samples_candidate,
                               'Silhouette': candidate_silhouette})
        # NaN scores compare False and are skipped automatically
        if candidate_silhouette > best_silhouette:
            best_silhouette = candidate_silhouette
            tuned_dbscan = candidate

tuning_summary = pd.DataFrame(tuning_records)
print(tuning_summary.sort_values('Silhouette', ascending=False).head())

print("\n✓ Parameter tuning complete")

# Evaluate tuned model
tuned_results = assign_model(tuned_dbscan)
print("\nTuned model cluster distribution:")
print(tuned_results['Cluster'].value_counts().sort_index())

In [None]:
# Sweep a grid of (eps, min_samples) pairs on the blobs dataset:
# one subplot and one table row per combination
eps_values = [0.2, 0.3, 0.4, 0.5]
min_samples_values = [3, 5, 7, 10]

fig, axes = plt.subplots(len(min_samples_values), len(eps_values),
                        figsize=(16, 14))

print("Testing parameter combinations...\n")
print(f"{'eps':<8} {'min_samples':<12} {'n_clusters':<12} {'n_noise':<10} {'silhouette':<12}")
print("="*60)

# ndenumerate walks the axes grid row by row: rows follow min_samples,
# columns follow eps
for (row, col), ax in np.ndenumerate(axes):
    min_samp = min_samples_values[row]
    eps = eps_values[col]

    grid_labels = DBSCAN(eps=eps, min_samples=min_samp).fit_predict(X_blobs_scaled)

    # Label -1 marks noise and is not counted as a cluster
    is_noise = grid_labels == -1
    n_clusters = len(np.unique(grid_labels[~is_noise]))
    n_noise = int(is_noise.sum())

    # Silhouette over clustered points only; needs at least two clusters
    if n_clusters > 1:
        silhouette = silhouette_score(X_blobs_scaled[~is_noise], grid_labels[~is_noise])
    else:
        silhouette = 0

    print(f"{eps:<8.2f} {min_samp:<12} {n_clusters:<12} {n_noise:<10} {silhouette:<12.4f}")

    # Scatter of all points coloured by label, with noise overdrawn as red crosses
    ax.scatter(X_blobs_scaled[:, 0], X_blobs_scaled[:, 1],
               c=grid_labels, cmap='viridis', s=30, alpha=0.6,
               edgecolors='black', linewidths=0.3)
    if n_noise > 0:
        ax.scatter(X_blobs_scaled[is_noise, 0],
                   X_blobs_scaled[is_noise, 1],
                   c='red', marker='x', s=50, linewidths=1.5)
    ax.set_title(f'eps={eps}, min_samples={min_samp}\n'
                 f'Clusters: {n_clusters}, Noise: {n_noise}',
                 fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Apply DBSCAN with a single global eps to the varying-density dataset
dbscan_varied = DBSCAN(eps=0.4, min_samples=5)
labels_varied = dbscan_varied.fit_predict(X_varied_scaled)

# Label -1 marks noise and is not counted as a cluster
noise_mask_varied = labels_varied == -1
n_clusters_varied = len(set(labels_varied)) - (1 if -1 in labels_varied else 0)
n_noise_varied = int(np.sum(noise_mask_varied))

# Visualize: true labels | DBSCAN
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# True labels
axes[0].scatter(X_varied_scaled[:, 0], X_varied_scaled[:, 1],
               c=y_varied_density, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
axes[0].set_title('True Labels\n(3 clusters with varying densities)',
                 fontsize=12, fontweight='bold')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].grid(True, alpha=0.3)

# DBSCAN results
axes[1].scatter(X_varied_scaled[:, 0], X_varied_scaled[:, 1],
               c=labels_varied, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)
# Draw the legend only when the labelled noise artist exists; legend() on an
# axes without labelled artists makes matplotlib log a "No artists with
# labels found" message.
if n_noise_varied > 0:
    axes[1].scatter(X_varied_scaled[noise_mask_varied, 0],
                   X_varied_scaled[noise_mask_varied, 1],
                   c='red', marker='x', s=100, linewidths=2, label='Noise')
    axes[1].legend()
axes[1].set_title(f'DBSCAN Results\nFound {n_clusters_varied} clusters, {n_noise_varied} noise points',
                 fontsize=12, fontweight='bold')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("VARYING DENSITY CHALLENGE")
print("="*60)
print("True clusters: 3")
print(f"DBSCAN found: {n_clusters_varied} clusters")
print(f"Noise points: {n_noise_varied}")
print("\nLimitation: DBSCAN uses global density threshold (eps)")
print("Solution: Consider HDBSCAN for varying density clusters")
print("="*60)

In [None]:
# Compare four clustering algorithms on the moon dataset

# Fit each method on the same scaled data
kmeans_comp = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_kmeans_comp = kmeans_comp.fit_predict(X_moons_scaled)

agg_comp = AgglomerativeClustering(n_clusters=2, linkage='ward')
labels_agg_comp = agg_comp.fit_predict(X_moons_scaled)

gmm_comp = GaussianMixture(n_components=2, random_state=42)
labels_gmm_comp = gmm_comp.fit_predict(X_moons_scaled)

dbscan_comp = DBSCAN(eps=0.3, min_samples=5)
labels_dbscan_comp = dbscan_comp.fit_predict(X_moons_scaled)

# Visualize comparison (2x3 grid; the sixth panel stays empty)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

methods = [
    (y_moons, 'True Labels'),
    (labels_kmeans_comp, 'K-Means'),
    (labels_agg_comp, 'Hierarchical'),
    (labels_gmm_comp, 'GMM'),
    (labels_dbscan_comp, 'DBSCAN')
]

for ax, (labels, title) in zip(axes, methods):
    ax.scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1],
               c=labels, cmap='viridis', s=50, alpha=0.6,
               edgecolors='black', linewidths=0.5)

    # Highlight noise for DBSCAN
    if title == 'DBSCAN' and -1 in labels:
        ax.scatter(X_moons_scaled[labels == -1, 0],
                   X_moons_scaled[labels == -1, 1],
                   c='red', marker='x', s=100, linewidths=2)

    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

# Hide last subplot
axes[-1].axis('off')

plt.tight_layout()
plt.show()

# Score every algorithm against the generating labels instead of printing a
# fixed verdict, so the summary always reflects what was actually computed.
# The adjusted Rand index is 1.0 for a perfect match and close to 0 for a
# random labelling; DBSCAN's noise label (-1) counts as its own group.
ARI_SUCCESS_THRESHOLD = 0.9  # heuristic cut-off for calling a result a success
algorithm_notes = [
    ('K-Means', labels_kmeans_comp, 'assumes spherical clusters'),
    ('Hierarchical', labels_agg_comp, 'depends on linkage, Ward used here'),
    ('GMM', labels_gmm_comp, 'assumes Gaussian distributions'),
    ('DBSCAN', labels_dbscan_comp, 'density-based, handles any shape'),
]

print("\n" + "="*60)
print("ALGORITHM COMPARISON ON NON-CONVEX DATA")
print("="*60)
for algorithm_name, predicted_labels, note in algorithm_notes:
    ari = adjusted_rand_score(y_moons, predicted_labels)
    verdict = '✓ Success' if ari >= ARI_SUCCESS_THRESHOLD else '✗ Fails'
    print(f"{algorithm_name + ':':<14}{verdict} (ARI = {ari:.3f}; {note})")
print("="*60)