In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import AgglomerativeClustering
import os

In [None]:
def hierarchical_cluster(features_path, n_clusters=4, linkage='ward', output_dir=''):
    """
    Perform hierarchical (agglomerative) clustering on a saved feature matrix.

    The features are loaded with ``np.load``, standardized with
    ``StandardScaler`` and clustered with ``AgglomerativeClustering``. The
    resulting label array is written with ``np.save`` and the size of every
    cluster is printed.

    Parameters
    ----------
    features_path : str
        Path of the ``.npy`` file holding the feature array.
    n_clusters : int, default 4
        Number of clusters requested from ``AgglomerativeClustering``.
    linkage : str, default 'ward'
        Linkage criterion forwarded to ``AgglomerativeClustering``.
    output_dir : str, default ''
        Despite its name this is the *file* path handed to ``np.save`` for the
        cluster labels (``np.save`` appends ``.npy`` when the suffix is
        missing). Missing parent directories are created first.

    Returns
    -------
    tuple
        ``(cluster_labels, clustering)``: the label array returned by
        ``fit_predict`` and the fitted ``AgglomerativeClustering`` estimator.
    """
    X = np.load(features_path)

    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)

    print(f"Fitting Hierarchical Clustering with {linkage} linkage...")
    clustering = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage
    )
    cluster_labels = clustering.fit_predict(X_scaled)

    # np.save does not create directories; make sure the target folder exists
    # so a long clustering run is not lost to a FileNotFoundError at the end.
    parent_dir = os.path.dirname(output_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np.save(output_dir, cluster_labels)

    print("\nCluster sizes:")
    for i in range(n_clusters):
        print(f"Cluster {i}: {np.sum(cluster_labels == i)} sounds")

    return cluster_labels, clustering

def evaluate_hierarchical_clustering(features_path, labels_path, cluster_labels_path,
                                     viz_dir=''):
    """
    Evaluate hierarchical clustering results against ground-truth labels.

    Loads the features, the true labels and the cluster assignments with
    ``np.load``, standardizes the features, then:

    * prints internal metrics (silhouette, Davies-Bouldin, Calinski-Harabasz)
      computed on the standardized features,
    * prints external metrics (ARI, NMI) comparing clusters to true labels,
    * draws a row-normalized confusion matrix (true label x cluster) as a
      heatmap and saves it,
    * prints, for every cluster, how many samples of each true label it holds.

    Parameters
    ----------
    features_path : str
        Path of the ``.npy`` file holding the feature array.
    labels_path : str
        Path of the ``.npy`` file holding the true label of every sample.
    cluster_labels_path : str
        Path of the ``.npy`` file holding the cluster id of every sample.
    viz_dir : str, default ''
        Despite its name this is the *file* path of the heatmap image. Missing
        parent directories are created. When empty, the figure is not saved.

    Returns
    -------
    None
    """
    X = np.load(features_path)
    y_true = np.load(labels_path)
    cluster_labels = np.load(cluster_labels_path)

    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)

    silhouette = silhouette_score(X_scaled, cluster_labels)
    davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
    calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)

    print("Internal Metrics:")
    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f}")

    ari = adjusted_rand_score(y_true, cluster_labels)
    nmi = normalized_mutual_info_score(y_true, cluster_labels)

    print("\nExternal Metrics:")
    print(f"Adjusted Rand Index: {ari:.3f}")
    print(f"Normalized Mutual Information: {nmi:.3f}")

    # Create confusion matrix: rows are true labels, columns are clusters
    unique_labels = np.unique(y_true)
    unique_clusters = np.unique(cluster_labels)
    confusion_matrix = np.zeros((len(unique_labels), len(unique_clusters)))

    for i, label in enumerate(unique_labels):
        label_mask = y_true == label
        for j, cluster in enumerate(unique_clusters):
            confusion_matrix[i, j] = np.sum(label_mask & (cluster_labels == cluster))

    # Every row belongs to a label that occurs in y_true, so row sums are > 0.
    confusion_matrix_normalized = confusion_matrix / confusion_matrix.sum(axis=1, keepdims=True)

    plt.figure(figsize=(12, 8))
    sns.heatmap(confusion_matrix_normalized,
                annot=True,
                fmt='.2f',
                # Use the real cluster ids so the columns agree with the
                # "Cluster Composition" report below even when the ids are
                # not 0..k-1.
                xticklabels=[f'Cluster {cluster}' for cluster in unique_clusters],
                yticklabels=unique_labels,
                cmap='YlOrRd')
    plt.title('Normalized Confusion Matrix:\nTrue Labels vs Cluster Assignments')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('True Label')

    if viz_dir:
        # os.makedirs('') raises, which happens for a bare file name.
        parent_dir = os.path.dirname(viz_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        plt.savefig(viz_dir, dpi=300, bbox_inches='tight')
    plt.close()

    # Print cluster composition
    print("\nCluster Composition:")
    for cluster in unique_clusters:
        cluster_mask = cluster_labels == cluster
        cluster_size = np.sum(cluster_mask)
        print(f"\nCluster {cluster}:")
        for label in unique_labels:
            count = np.sum((y_true == label) & cluster_mask)
            percentage = (count / cluster_size) * 100
            print(f"{label}: {count} samples ({percentage:.1f}%)")





In [None]:
def optimize_hierarchical(features_path, labels_path, n_clusters=4,
                          linkage_methods=('ward', 'complete', 'average')):
    """
    Try different linkage methods and return the best-scoring one.

    The features are standardized, then clustered once per linkage method with
    ``AgglomerativeClustering``. Each run is scored with the silhouette score
    (on the standardized features) and with ARI and NMI against the true
    labels. The winner maximizes
    ``0.4 * silhouette + 0.3 * ARI + 0.3 * NMI``.

    Parameters
    ----------
    features_path : str
        Path of the ``.npy`` file holding the feature array.
    labels_path : str
        Path of the ``.npy`` file holding the true label of every sample.
    n_clusters : int, default 4
        Number of clusters used for every candidate run.
    linkage_methods : iterable of str, default ('ward', 'complete', 'average')
        Linkage criteria to compare.

    Returns
    -------
    str
        The linkage method with the highest combined score. On ties the
        earliest candidate wins.

    Raises
    ------
    ValueError
        If ``linkage_methods`` is empty.
    """
    linkage_methods = list(linkage_methods)
    if not linkage_methods:
        raise ValueError("linkage_methods must contain at least one linkage method")

    X = np.load(features_path)
    y = np.load(labels_path)
    X_scaled = StandardScaler().fit_transform(X)

    # Weights of the combined selection score.
    silhouette_weight, ari_weight, nmi_weight = 0.4, 0.3, 0.3

    results = []
    print("\nTrying different linkage methods...")

    for linkage in linkage_methods:
        print(f"\nTesting {linkage} linkage:")
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage=linkage
        )

        cluster_labels = clustering.fit_predict(X_scaled)

        silhouette = silhouette_score(X_scaled, cluster_labels)
        ari = adjusted_rand_score(y, cluster_labels)
        nmi = normalized_mutual_info_score(y, cluster_labels)

        results.append({
            'linkage': linkage,
            'silhouette': silhouette,
            'ari': ari,
            'nmi': nmi
        })

        print(f"Silhouette Score: {silhouette:.3f}")
        print(f"ARI: {ari:.3f}")
        print(f"NMI: {nmi:.3f}")

    best_result = max(
        results,
        key=lambda r: (silhouette_weight * r['silhouette']
                       + ari_weight * r['ari']
                       + nmi_weight * r['nmi'])
    )

    print(f"\nBest linkage method: {best_result['linkage']}")
    print(f"Silhouette Score: {best_result['silhouette']:.3f}")
    print(f"ARI: {best_result['ari']:.3f}")
    print(f"NMI: {best_result['nmi']:.3f}")

    return best_result['linkage']

In [None]:
# Input and output locations. Every artefact of this notebook goes under the
# "hierarchical" folders created below (the paths previously pointed at the
# HDBSCAN folders, which clobbered that notebook's results).
FEATURES_DIR = '../../extracted_features/features'
LABELS_DIR = '../../extracted_features/labels'
ASSIGNMENTS_DIR = '../../cluster_assignments/hierarchical'
VIZ_DIR = '../../visualization/clustering_eval/hierarchical'

os.makedirs(ASSIGNMENTS_DIR, exist_ok=True)
os.makedirs(VIZ_DIR, exist_ok=True)

# (display name, features file stem, labels file stem, tag used in output file names)
feature_set_specs = [
    ('MFCC Basic', 'mfcc_features', 'mfcc_labels', 'mfcc'),
    ('MFCC + Envelope', 'mfcc_env_features', 'mfcc_env_labels', 'mfcc_env'),
    ('MFCC Optimized', 'mfcc_extracted_features', 'mfcc_extracted_labels', 'mfcc_extracted'),
    ('MFCC Basic Augmented', 'mfcc_features_aug', 'mfcc_labels_aug', 'mfcc_aug'),
    ('MFCC + Envelope Augmented', 'mfcc_env_aug_features', 'mfcc_env_aug_labels', 'mfcc_env_aug'),
    ('MFCC Optimized Augmented', 'mfcc_extracted_aug_features', 'mfcc_extracted_aug_labels',
     'mfcc_extracted_aug'),
]

# Feature sets to process
feature_sets = [
    {
        'name': name,
        'features': f'{FEATURES_DIR}/{features_stem}.npy',
        'labels': f'{LABELS_DIR}/{labels_stem}.npy',
        'output': f'{ASSIGNMENTS_DIR}/hierarchical_{tag}_cluster.npy',
        'viz': f'{VIZ_DIR}/hierarchical_{tag}.png'
    }
    for name, features_stem, labels_stem, tag in feature_set_specs
]

for feature_set in feature_sets:
    print(f"\nProcessing {feature_set['name']}...")

    # First optimize to find best linkage method
    best_linkage = optimize_hierarchical(
        feature_set['features'],
        feature_set['labels']
    )

    # Run clustering with best linkage
    cluster_labels, clustering = hierarchical_cluster(
        feature_set['features'],
        linkage=best_linkage,
        output_dir=feature_set['output']
    )

    # Evaluate from the saved assignments and write the confusion-matrix figure
    evaluate_hierarchical_clustering(
        feature_set['features'],
        feature_set['labels'],
        feature_set['output'],
        viz_dir=feature_set['viz']
    )


Processing MFCC Basic...

Trying different linkage methods...

Testing ward linkage:
Silhouette Score: 0.079
ARI: 0.082
NMI: 0.138

Testing complete linkage:
Silhouette Score: 0.027
ARI: 0.005
NMI: 0.044

Testing average linkage:
Silhouette Score: 0.196
ARI: 0.002
NMI: 0.012

Best linkage method: ward
Silhouette Score: 0.079
ARI: 0.082
NMI: 0.138
Fitting Hierarchical Clustering with ward linkage...

Cluster sizes:
Cluster 0: 2434 sounds
Cluster 1: 1080 sounds
Cluster 2: 1423 sounds
Cluster 3: 777 sounds
Internal Metrics:
Silhouette Score: 0.079
Davies-Bouldin Index: 2.680
Calinski-Harabasz Index: 456.693

External Metrics:
Adjusted Rand Index: 0.082
Normalized Mutual Information: 0.138

Cluster Composition:

Cluster 0:
hhc: 553 samples (22.7%)
hho: 516 samples (21.2%)
kd: 518 samples (21.3%)
sd: 847 samples (34.8%)

Cluster 1:
hhc: 197 samples (18.2%)
hho: 67 samples (6.2%)
kd: 763 samples (70.6%)
sd: 53 samples (4.9%)

Cluster 2:
hhc: 444 samples (31.2%)
hho: 47 samples (3.3%)
kd: 48