# Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import hdbscan
from pathlib import Path
import os

def hdbscan_cluster(features_path, min_cluster_size=30, min_samples=None, output_dir=''):
    """
    Perform HDBSCAN clustering on standardized features.

    The features are loaded, z-score standardized, clustered, and a short
    report (cluster count, cluster sizes, mean membership probability per
    cluster) is printed.

    Args:
        features_path: Path to the ``.npy`` feature file.
        min_cluster_size: Minimum size of clusters (forwarded to HDBSCAN).
        min_samples: Number of samples in neighborhood for core points
            (forwarded to HDBSCAN as-is; ``None`` uses the library default).
        output_dir: Despite its name, the *file* path the cluster assignments
            are written to with ``np.save``. When empty, nothing is saved.

    Returns:
        Tuple ``(cluster_labels, probabilities, clusterer)`` where
        ``cluster_labels`` holds one label per sample (-1 marks noise),
        ``probabilities`` is ``clusterer.probabilities_`` and ``clusterer``
        is the fitted ``hdbscan.HDBSCAN`` instance.
    """
    # Load features
    X = np.load(features_path)

    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)

    # Fit HDBSCAN
    print(f"Fitting HDBSCAN with min_cluster_size={min_cluster_size}...")
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        prediction_data=True
    )

    cluster_labels = clusterer.fit_predict(X_scaled)
    probabilities = clusterer.probabilities_

    # Save cluster assignments. An empty path is skipped: np.save would
    # otherwise append ".npy" to it and drop a stray ".npy" file in the CWD.
    if output_dir:
        parent = os.path.dirname(output_dir)
        if parent:
            os.makedirs(parent, exist_ok=True)
        np.save(output_dir, cluster_labels)

    noise_mask = cluster_labels == -1
    cluster_ids = np.unique(cluster_labels[~noise_mask])

    # Print cluster sizes
    print(f"\nNumber of clusters found: {len(cluster_ids)}")
    print("\nCluster sizes:")
    print(f"Noise points: {np.sum(noise_mask)} sounds")
    for cluster_id in cluster_ids:
        print(f"Cluster {cluster_id}: {np.sum(cluster_labels == cluster_id)} sounds")

    # Print average probabilities per cluster
    print("\nAverage membership probabilities:")
    # np.mean of an empty slice emits a RuntimeWarning; report nan directly
    # when the run produced no noise points.
    noise_prob = np.mean(probabilities[noise_mask]) if noise_mask.any() else float('nan')
    print(f"Noise points: {noise_prob:.3f}")
    for cluster_id in cluster_ids:
        cluster_prob = np.mean(probabilities[cluster_labels == cluster_id])
        print(f"Cluster {cluster_id}: {cluster_prob:.3f}")

    return cluster_labels, probabilities, clusterer

In [2]:
def evaluate_hdbscan_clustering(features_path, labels_path, cluster_labels_path,
                                probabilities=None, viz_dir=''):
    """
    Evaluate HDBSCAN clustering results against ground-truth labels.

    Prints internal metrics (silhouette, Davies-Bouldin, Calinski-Harabasz)
    and external metrics (ARI, NMI) computed on non-noise points only, draws a
    row-normalized confusion matrix of true labels vs. cluster assignments,
    and prints the label composition of every cluster.

    Args:
        features_path: Path to the ``.npy`` feature file.
        labels_path: Path to the ``.npy`` file holding the true labels.
        cluster_labels_path: Path to the ``.npy`` file holding the cluster
            assignments (-1 marks noise).
        probabilities: Optional per-sample membership probabilities, indexed
            like the cluster assignments. When given, the mean probability is
            printed for every (cluster, label) pair.
        viz_dir: Despite its name, the *file* path the confusion-matrix figure
            is saved to. When empty, the figure is not saved.
    """
    # Load data
    X = np.load(features_path)
    y_true = np.load(labels_path)
    cluster_labels = np.load(cluster_labels_path)

    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)

    # Calculate metrics (excluding noise points)
    non_noise_mask = cluster_labels != -1
    n_non_noise = int(np.sum(non_noise_mask))
    if n_non_noise > 0:  # Only calculate if we have non-noise points
        X_clustered = X_scaled[non_noise_mask]
        assigned = cluster_labels[non_noise_mask]
        n_found = len(np.unique(assigned))

        # Internal Metrics
        # scikit-learn raises ValueError for these scores unless
        # 2 <= n_clusters <= n_samples - 1, so guard instead of crashing.
        if 2 <= n_found < n_non_noise:
            silhouette = silhouette_score(X_clustered, assigned)
            davies_bouldin = davies_bouldin_score(X_clustered, assigned)
            calinski_harabasz = calinski_harabasz_score(X_clustered, assigned)

            print("Internal Metrics (excluding noise):")
            print(f"Silhouette Score: {silhouette:.3f}")
            print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")
            print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f}")
        else:
            print("Internal Metrics (excluding noise): skipped "
                  f"(found {n_found} cluster(s), need at least 2)")

        # External Metrics
        ari = adjusted_rand_score(y_true[non_noise_mask], assigned)
        nmi = normalized_mutual_info_score(y_true[non_noise_mask], assigned)

        print("\nExternal Metrics (excluding noise):")
        print(f"Adjusted Rand Index: {ari:.3f}")
        print(f"Normalized Mutual Information: {nmi:.3f}")

    # Create confusion matrix (rows: true labels, columns: clusters incl. noise)
    unique_labels = np.unique(y_true)
    unique_clusters = np.unique(cluster_labels)
    confusion_matrix = np.zeros((len(unique_labels), len(unique_clusters)))

    for i, label in enumerate(unique_labels):
        for j, cluster in enumerate(unique_clusters):
            confusion_matrix[i, j] = np.sum((y_true == label) & (cluster_labels == cluster))

    # Normalize by row; every row belongs to a label that occurs in y_true,
    # so no row sum is zero.
    confusion_matrix_normalized = confusion_matrix / confusion_matrix.sum(axis=1, keepdims=True)

    # Plot confusion matrix
    plt.figure(figsize=(12, 8))
    sns.heatmap(confusion_matrix_normalized,
                annot=True,
                fmt='.2f',
                xticklabels=['Noise' if x == -1 else f'Cluster {x}'
                             for x in unique_clusters],
                yticklabels=unique_labels,
                cmap='YlOrRd')
    plt.title('Normalized Confusion Matrix:\nTrue Labels vs Cluster Assignments')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('True Label')

    # Save confusion matrix. os.makedirs('') raises, so only create the parent
    # directory when the path actually has one, and skip saving for an empty path.
    if viz_dir:
        parent = os.path.dirname(viz_dir)
        if parent:
            os.makedirs(parent, exist_ok=True)
        plt.savefig(viz_dir, dpi=300, bbox_inches='tight')
    plt.close()

    # Print cluster composition
    print("\nCluster Composition:")
    for cluster in unique_clusters:
        cluster_mask = cluster_labels == cluster
        cluster_size = np.sum(cluster_mask)
        if cluster == -1:
            print(f"\nNoise points:")
        else:
            print(f"\nCluster {cluster}:")

        for label in unique_labels:
            member_mask = cluster_mask & (y_true == label)
            count = np.sum(member_mask)
            percentage = (count / cluster_size) * 100

            if probabilities is not None:
                # A label absent from this cluster has no probabilities to
                # average; report nan without triggering numpy's
                # "Mean of empty slice" RuntimeWarning.
                avg_prob = np.mean(probabilities[member_mask]) if count else float('nan')
                print(f"{label}: {count} samples ({percentage:.1f}%), "
                      f"Avg Probability: {avg_prob:.3f}")
            else:
                print(f"{label}: {count} samples ({percentage:.1f}%)")

In [3]:
# Locations shared by every feature set
FEATURE_ROOT = '../../extracted_features'
CLUSTER_DIR = '../../cluster_assignments/hdbscan'
VIZ_DIR = '../../visualization/clustering_eval/hdbscan'

# min_cluster_size used for the baseline run; we can tune this parameter
MIN_CLUSTER_SIZE = 30

# Create directories (makedirs also creates missing parents, so one call per
# leaf directory is enough)
os.makedirs(CLUSTER_DIR, exist_ok=True)
os.makedirs(VIZ_DIR, exist_ok=True)

# (display name, feature file, label file, tag used in the output file names).
# File names are listed explicitly because the "_aug" suffix is not placed
# consistently across the extracted feature files.
_FEATURE_SET_SPECS = [
    ('MFCC Basic',
     'mfcc_features.npy', 'mfcc_labels.npy', 'mfcc'),
    ('MFCC + Envelope',
     'mfcc_env_features.npy', 'mfcc_env_labels.npy', 'mfcc_env'),
    ('MFCC Optimized',
     'mfcc_extracted_features.npy', 'mfcc_extracted_labels.npy', 'mfcc_extracted'),
    ('MFCC Basic Augmented',
     'mfcc_features_aug.npy', 'mfcc_labels_aug.npy', 'mfcc_aug'),
    ('MFCC + Envelope Augmented',
     'mfcc_env_aug_features.npy', 'mfcc_env_aug_labels.npy', 'mfcc_env_aug'),
    ('MFCC Optimized Augmented',
     'mfcc_extracted_aug_features.npy', 'mfcc_extracted_aug_labels.npy', 'mfcc_extracted_aug'),
]

# Run HDBSCAN for each feature set
feature_sets = [
    {
        'name': name,
        'features': f'{FEATURE_ROOT}/features/{features_file}',
        'labels': f'{FEATURE_ROOT}/labels/{labels_file}',
        'output': f'{CLUSTER_DIR}/hdbscan_{tag}_cluster.npy',
        'viz': f'{VIZ_DIR}/hdbscan_{tag}.png',
    }
    for name, features_file, labels_file, tag in _FEATURE_SET_SPECS
]

# Run clustering and evaluation for each feature set
for feature_set in feature_sets:
    print(f"\nProcessing {feature_set['name']}...")

    # Run HDBSCAN clustering; the labels are saved to feature_set['output']
    cluster_labels, probabilities, clusterer = hdbscan_cluster(
        features_path=feature_set['features'],
        min_cluster_size=MIN_CLUSTER_SIZE,
        output_dir=feature_set['output']
    )

    # Evaluate results (reloads the labels that were just written to disk)
    evaluate_hdbscan_clustering(
        features_path=feature_set['features'],
        labels_path=feature_set['labels'],
        cluster_labels_path=feature_set['output'],
        probabilities=probabilities,
        viz_dir=feature_set['viz']
    )


Processing MFCC Basic...
Fitting HDBSCAN with min_cluster_size=30...





Number of clusters found: 2

Cluster sizes:
Noise points: 5433 sounds
Cluster 0: 79 sounds
Cluster 1: 202 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.999
Cluster 1: 0.901
Internal Metrics (excluding noise):
Silhouette Score: 0.398
Davies-Bouldin Index: 1.095
Calinski-Harabasz Index: 179.754

External Metrics (excluding noise):
Adjusted Rand Index: 0.020
Normalized Mutual Information: 0.025

Cluster Composition:

Noise points:
hhc: 1383 samples (25.5%), Avg Probability: 0.000
hho: 997 samples (18.4%), Avg Probability: 0.000
kd: 1656 samples (30.5%), Avg Probability: 0.000
sd: 1397 samples (25.7%), Avg Probability: 0.000

Cluster 0:
hhc: 31 samples (39.2%), Avg Probability: 0.998
hho: 9 samples (11.4%), Avg Probability: 0.999
kd: 31 samples (39.2%), Avg Probability: 0.999
sd: 8 samples (10.1%), Avg Probability: 0.999

Cluster 1:
hhc: 84 samples (41.6%), Avg Probability: 0.900
hho: 3 samples (1.5%), Avg Probability: 0.834
kd: 89 samples (44.1%), Avg Probabi




Number of clusters found: 2

Cluster sizes:
Noise points: 5507 sounds
Cluster 0: 88 sounds
Cluster 1: 119 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.980
Cluster 1: 0.962
Internal Metrics (excluding noise):
Silhouette Score: 0.291
Davies-Bouldin Index: 1.377
Calinski-Harabasz Index: 97.779

External Metrics (excluding noise):
Adjusted Rand Index: 0.112
Normalized Mutual Information: 0.253

Cluster Composition:

Noise points:
hhc: 1372 samples (24.9%), Avg Probability: 0.000
hho: 997 samples (18.1%), Avg Probability: 0.000
kd: 1724 samples (31.3%), Avg Probability: 0.000
sd: 1414 samples (25.7%), Avg Probability: 0.000

Cluster 0:
hhc: 63 samples (71.6%), Avg Probability: 0.983
hho: 12 samples (13.6%), Avg Probability: 0.977
kd: 0 samples (0.0%), Avg Probability: nan
sd: 13 samples (14.8%), Avg Probability: 0.968

Cluster 1:
hhc: 63 samples (52.9%), Avg Probability: 0.976
hho: 0 samples (0.0%), Avg Probability: nan
kd: 52 samples (43.7%), Avg Probability:

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Number of clusters found: 2

Cluster sizes:
Noise points: 5136 sounds
Cluster 0: 101 sounds
Cluster 1: 477 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.958
Cluster 1: 0.987
Internal Metrics (excluding noise):
Silhouette Score: 0.196
Davies-Bouldin Index: 1.418
Calinski-Harabasz Index: 107.336

External Metrics (excluding noise):
Adjusted Rand Index: 0.025
Normalized Mutual Information: 0.033

Cluster Composition:

Noise points:
hhc: 1352 samples (26.3%), Avg Probability: 0.000
hho: 973 samples (18.9%), Avg Probability: 0.000
kd: 1428 samples (27.8%), Avg Probability: 0.000
sd: 1383 samples (26.9%), Avg Probability: 0.000

Cluster 0:
hhc: 45 samples (44.6%), Avg Probability: 0.961
hho: 1 samples (1.0%), Avg Probability: 0.911
kd: 51 samples (50.5%), Avg Probability: 0.958
sd: 4 samples (4.0%), Avg Probability: 0.936

Cluster 1:
hhc: 101 samples (21.2%), Avg Probability: 0.986
hho: 35 samples (7.3%), Avg Probability: 0.975
kd: 297 samples (62.3%), Avg Proba




Number of clusters found: 3

Cluster sizes:
Noise points: 4137 sounds
Cluster 0: 34 sounds
Cluster 1: 30082 sounds
Cluster 2: 31 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.999
Cluster 1: 0.991
Cluster 2: 1.000
Internal Metrics (excluding noise):
Silhouette Score: 0.143
Davies-Bouldin Index: 1.048
Calinski-Harabasz Index: 61.144

External Metrics (excluding noise):
Adjusted Rand Index: 0.001
Normalized Mutual Information: 0.005

Cluster Composition:

Noise points:
hhc: 806 samples (19.5%), Avg Probability: 0.000
hho: 1145 samples (27.7%), Avg Probability: 0.000
kd: 875 samples (21.2%), Avg Probability: 0.000
sd: 1311 samples (31.7%), Avg Probability: 0.000

Cluster 0:
hhc: 0 samples (0.0%), Avg Probability: nan
hho: 34 samples (100.0%), Avg Probability: 0.999
kd: 0 samples (0.0%), Avg Probability: nan
sd: 0 samples (0.0%), Avg Probability: nan

Cluster 1:
hhc: 8151 samples (27.1%), Avg Probability: 0.993
hho: 4875 samples (16.2%), Avg Probability: 0.987


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Number of clusters found: 3

Cluster sizes:
Noise points: 20084 sounds
Cluster 0: 55 sounds
Cluster 1: 109 sounds
Cluster 2: 14036 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.986
Cluster 1: 0.943
Cluster 2: 0.984
Internal Metrics (excluding noise):
Silhouette Score: 0.129
Davies-Bouldin Index: 1.290
Calinski-Harabasz Index: 144.508

External Metrics (excluding noise):
Adjusted Rand Index: 0.006
Normalized Mutual Information: 0.028

Cluster Composition:

Noise points:
hhc: 4519 samples (22.5%), Avg Probability: 0.000
hho: 3769 samples (18.8%), Avg Probability: 0.000
kd: 6201 samples (30.9%), Avg Probability: 0.000
sd: 5595 samples (27.9%), Avg Probability: 0.000

Cluster 0:
hhc: 0 samples (0.0%), Avg Probability: nan
hho: 0 samples (0.0%), Avg Probability: nan
kd: 0 samples (0.0%), Avg Probability: nan
sd: 55 samples (100.0%), Avg Probability: 0.986

Cluster 1:
hhc: 0 samples (0.0%), Avg Probability: nan
hho: 109 samples (100.0%), Avg Probability: 0.943
k

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Number of clusters found: 3

Cluster sizes:
Noise points: 20957 sounds
Cluster 0: 41 sounds
Cluster 1: 13241 sounds
Cluster 2: 45 sounds

Average membership probabilities:
Noise points: 0.000
Cluster 0: 0.998
Cluster 1: 0.997
Cluster 2: 0.989
Internal Metrics (excluding noise):
Silhouette Score: 0.004
Davies-Bouldin Index: 1.181
Calinski-Harabasz Index: 53.161

External Metrics (excluding noise):
Adjusted Rand Index: -0.000
Normalized Mutual Information: 0.007

Cluster Composition:

Noise points:
hhc: 5602 samples (26.7%), Avg Probability: 0.000
hho: 4097 samples (19.5%), Avg Probability: 0.000
kd: 5133 samples (24.5%), Avg Probability: 0.000
sd: 6125 samples (29.2%), Avg Probability: 0.000

Cluster 0:
hhc: 0 samples (0.0%), Avg Probability: nan
hho: 4 samples (9.8%), Avg Probability: 0.994
kd: 31 samples (75.6%), Avg Probability: 0.999
sd: 6 samples (14.6%), Avg Probability: 0.994

Cluster 1:
hhc: 3349 samples (25.3%), Avg Probability: 0.997
hho: 1953 samples (14.7%), Avg Probability

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [4]:
from tqdm import tqdm

def optimize_hdbscan(features_path, labels_path, min_sizes=(20, 30, 50, 100),
                     min_samples=(5, 10, 15, 20)):
    """
    Grid-search HDBSCAN's ``min_cluster_size`` and ``min_samples``.

    Every combination with ``min_samples <= min_cluster_size`` is fitted on the
    standardized features and scored with a weighted blend of silhouette, ARI,
    NMI (all on non-noise points) and the fraction of points that are not
    noise. The best combination and the top five configurations are printed.

    Args:
        features_path: Path to the ``.npy`` feature file.
        labels_path: Path to the ``.npy`` file holding the true labels.
        min_sizes: Iterable of ``min_cluster_size`` values to try.
        min_samples: Iterable of ``min_samples`` values to try.

    Returns:
        Dict with keys ``'min_cluster_size'`` and ``'min_samples'`` for the
        highest-scoring configuration, or ``None`` when no configuration
        produced at least two clusters.
    """
    # Load and preprocess data
    X = np.load(features_path)
    y = np.load(labels_path)
    X_scaled = StandardScaler().fit_transform(X)

    best_score = -np.inf
    best_params = None
    results = []

    print("Optimizing HDBSCAN parameters...")
    for min_size in tqdm(min_sizes, desc="Testing min_cluster_sizes"):
        for min_sample in min_samples:
            # Skip invalid combinations
            if min_sample > min_size:
                continue

            # prediction_data is not requested: the fitted model is discarded
            # after scoring, so the extra cached data would never be used.
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_size,
                min_samples=min_sample
            )

            cluster_labels = clusterer.fit_predict(X_scaled)

            non_noise_mask = cluster_labels != -1
            clustered = cluster_labels[non_noise_mask]
            n_clusters = len(np.unique(clustered))

            # silhouette_score needs at least two clusters among the non-noise
            # points; this also covers the all-noise case.
            if n_clusters < 2:
                continue

            # Calculate metrics
            silhouette = silhouette_score(X_scaled[non_noise_mask], clustered)
            ari = adjusted_rand_score(y[non_noise_mask], clustered)
            nmi = normalized_mutual_info_score(y[non_noise_mask], clustered)

            # Calculate noise ratio
            noise_ratio = np.sum(~non_noise_mask) / len(cluster_labels)

            # Combined score (adjust weights as needed)
            score = (0.3 * silhouette +
                     0.3 * ari +
                     0.2 * nmi +
                     0.2 * (1 - noise_ratio))  # Penalize high noise ratio

            results.append({
                'min_cluster_size': min_size,
                'min_samples': min_sample,
                'silhouette': silhouette,
                'ari': ari,
                'nmi': nmi,
                'noise_ratio': noise_ratio,
                'n_clusters': n_clusters,
                'score': score
            })

            if score > best_score:
                best_score = score
                best_params = {
                    'min_cluster_size': min_size,
                    'min_samples': min_sample
                }

    if best_params is None:
        print("\nNo parameter combination produced at least two clusters.")
        return None

    # Print results
    print("\nBest parameters:")
    print(f"min_cluster_size: {best_params['min_cluster_size']}")
    print(f"min_samples: {best_params['min_samples']}")

    # Sort results by score and print top 5
    results.sort(key=lambda x: x['score'], reverse=True)
    print("\nTop 5 configurations:")
    for rank, result in enumerate(results[:5], start=1):
        print(f"\nRank {rank}:")
        print(f"min_cluster_size: {result['min_cluster_size']}")
        print(f"min_samples: {result['min_samples']}")
        print(f"Number of clusters: {result['n_clusters']}")
        print(f"Silhouette Score: {result['silhouette']:.3f}")
        print(f"ARI: {result['ari']:.3f}")
        print(f"NMI: {result['nmi']:.3f}")
        print(f"Noise ratio: {result['noise_ratio']:.2%}")

    return best_params

# Run optimization for each feature set, keeping every result keyed by the
# feature-set name so earlier ones are not lost when best_params is reassigned
best_params_by_feature_set = {}
for feature_set in feature_sets:
    print(f"\nOptimizing HDBSCAN for {feature_set['name']}...")
    best_params = optimize_hdbscan(
        feature_set['features'],
        feature_set['labels']
    )
    best_params_by_feature_set[feature_set['name']] = best_params


Optimizing HDBSCAN for MFCC Basic...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [00:09<00:00,  2.34s/it]



Best parameters:
min_cluster_size: 20
min_samples: 5

Top 5 configurations:

Rank 1:
min_cluster_size: 20
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.259
ARI: -0.000
NMI: 0.008
Noise ratio: 0.86%

Rank 2:
min_cluster_size: 30
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.143
ARI: 0.001
NMI: 0.015
Noise ratio: 7.25%

Rank 3:
min_cluster_size: 50
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.245
ARI: 0.003
NMI: 0.015
Noise ratio: 25.95%

Rank 4:
min_cluster_size: 30
min_samples: 10
Number of clusters: 2
Silhouette Score: 0.195
ARI: 0.003
NMI: 0.021
Noise ratio: 22.75%

Rank 5:
min_cluster_size: 20
min_samples: 10
Number of clusters: 2
Silhouette Score: 0.134
ARI: 0.001
NMI: 0.012
Noise ratio: 17.08%

Optimizing HDBSCAN for MFCC + Envelope...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [00:16<00:00,  4.19s/it]



Best parameters:
min_cluster_size: 30
min_samples: 15

Top 5 configurations:

Rank 1:
min_cluster_size: 30
min_samples: 15
Number of clusters: 3
Silhouette Score: 0.290
ARI: 0.215
NMI: 0.384
Noise ratio: 95.50%

Rank 2:
min_cluster_size: 20
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.115
ARI: 0.002
NMI: 0.013
Noise ratio: 15.31%

Rank 3:
min_cluster_size: 50
min_samples: 15
Number of clusters: 2
Silhouette Score: 0.236
ARI: 0.183
NMI: 0.318
Noise ratio: 95.24%

Rank 4:
min_cluster_size: 100
min_samples: 15
Number of clusters: 2
Silhouette Score: 0.236
ARI: 0.183
NMI: 0.318
Noise ratio: 95.24%

Rank 5:
min_cluster_size: 20
min_samples: 10
Number of clusters: 2
Silhouette Score: 0.139
ARI: 0.003
NMI: 0.014
Noise ratio: 26.62%

Optimizing HDBSCAN for MFCC Optimized...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [00:07<00:00,  1.83s/it]



Best parameters:
min_cluster_size: 20
min_samples: 10

Top 5 configurations:

Rank 1:
min_cluster_size: 20
min_samples: 10
Number of clusters: 2
Silhouette Score: 0.435
ARI: 0.000
NMI: 0.000
Noise ratio: 6.90%

Rank 2:
min_cluster_size: 30
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.416
ARI: -0.000
NMI: 0.000
Noise ratio: 6.04%

Rank 3:
min_cluster_size: 20
min_samples: 5
Number of clusters: 3
Silhouette Score: 0.405
ARI: 0.000
NMI: 0.008
Noise ratio: 5.65%

Rank 4:
min_cluster_size: 50
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.101
ARI: 0.009
NMI: 0.029
Noise ratio: 45.20%

Rank 5:
min_cluster_size: 50
min_samples: 15
Number of clusters: 2
Silhouette Score: 0.232
ARI: 0.003
NMI: 0.012
Noise ratio: 68.85%

Optimizing HDBSCAN for MFCC Basic Augmented...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [04:15<00:00, 63.92s/it]



Best parameters:
min_cluster_size: 20
min_samples: 5

Top 5 configurations:

Rank 1:
min_cluster_size: 20
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.210
ARI: 0.000
NMI: 0.008
Noise ratio: 0.65%

Rank 2:
min_cluster_size: 30
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.210
ARI: 0.000
NMI: 0.008
Noise ratio: 0.65%

Rank 3:
min_cluster_size: 50
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.210
ARI: 0.000
NMI: 0.008
Noise ratio: 0.65%

Rank 4:
min_cluster_size: 100
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.210
ARI: 0.000
NMI: 0.008
Noise ratio: 0.65%

Rank 5:
min_cluster_size: 20
min_samples: 10
Number of clusters: 2
Silhouette Score: 0.207
ARI: -0.000
NMI: 0.007
Noise ratio: 2.24%

Optimizing HDBSCAN for MFCC + Envelope Augmented...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [08:28<00:00, 127.07s/it]



Best parameters:
min_cluster_size: 20
min_samples: 5

Top 5 configurations:

Rank 1:
min_cluster_size: 20
min_samples: 5
Number of clusters: 245
Silhouette Score: 0.159
ARI: 0.040
NMI: 0.361
Noise ratio: 60.02%

Rank 2:
min_cluster_size: 50
min_samples: 5
Number of clusters: 3
Silhouette Score: 0.061
ARI: -0.001
NMI: 0.007
Noise ratio: 9.22%

Rank 3:
min_cluster_size: 30
min_samples: 5
Number of clusters: 4
Silhouette Score: 0.048
ARI: -0.001
NMI: 0.009
Noise ratio: 9.12%

Rank 4:
min_cluster_size: 50
min_samples: 10
Number of clusters: 41
Silhouette Score: 0.169
ARI: 0.103
NMI: 0.368
Noise ratio: 81.89%

Rank 5:
min_cluster_size: 20
min_samples: 10
Number of clusters: 3
Silhouette Score: 0.070
ARI: -0.001
NMI: 0.006
Noise ratio: 21.01%

Optimizing HDBSCAN for MFCC Optimized Augmented...
Optimizing HDBSCAN parameters...


Testing min_cluster_sizes: 100%|██████████| 4/4 [02:36<00:00, 39.02s/it]


Best parameters:
min_cluster_size: 100
min_samples: 10

Top 5 configurations:

Rank 1:
min_cluster_size: 100
min_samples: 10
Number of clusters: 14
Silhouette Score: 0.221
ARI: 0.332
NMI: 0.337
Noise ratio: 80.48%

Rank 2:
min_cluster_size: 50
min_samples: 5
Number of clusters: 50
Silhouette Score: 0.131
ARI: 0.287
NMI: 0.348
Noise ratio: 68.73%

Rank 3:
min_cluster_size: 20
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.073
ARI: -0.000
NMI: 0.001
Noise ratio: 4.55%

Rank 4:
min_cluster_size: 100
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.129
ARI: 0.000
NMI: 0.009
Noise ratio: 18.29%

Rank 5:
min_cluster_size: 30
min_samples: 5
Number of clusters: 2
Silhouette Score: 0.061
ARI: 0.000
NMI: 0.003
Noise ratio: 9.23%



