# K-Means Unsupervised Learning

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA

def cluster_sounds(n_clusters=4, features_path='features', output_dir=''):
    """
    Perform K-means clustering on MFCC features
    """
    # Load the MFCC features
    # features_path = Path('../projectFiles/features')
    # X = np.load(features_path / 'mfcc_features.npy')
    
    X = np.load(features_path)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    print(f"Performing K-means clustering with {n_clusters} clusters...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    
    # Save cluster assignments
    # if features_path == 'features/mfcc_features_augmented.npy':
    #     np.save('features/cluster_labels_augmented.npy', cluster_labels)
    # if features_path == 'features/mfcc_features.npy':
    #     np.save('features/cluster_labels.npy', cluster_labels)
    # if features_path == 'features/mfcc_features_expanded.npy':
    #     np.save('features/cluster_labels_expanded.npy', cluster_labels)
    # else:
    #     np.save('features/cluster_labels_expanded_augmented.npy', cluster_labels)
        
    # if features_path == 'features/mfcc_features_optimized.npy':
    #     np.save('features/cluster_labels_optimized.npy', cluster_labels)
    # if features_path == 'features/mfcc_features_optimized_augmented.npy':
    #     np.save('features/cluster_labels_optimized_augmented.npy', cluster_labels)
        
    np.save(output_dir, cluster_labels)
    
    # Print cluster sizes
    print("\nCluster sizes:")
    for i in range(n_clusters):
        print(f"Cluster {i}: {np.sum(cluster_labels == i)} sounds")
    
    return cluster_labels, kmeans

In [2]:
import os
os.makedirs('../../cluster_assignments', exist_ok=True)

cluster_labels_mfcc, kmeans_model_mfcc = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_features.npy', output_dir='../../cluster_assignments/mfcc_cluster.npy')

cluster_labels_mfcc_aug, kmeans_model_mfcc_aug = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_features_aug.npy', output_dir='../../cluster_assignments/mfcc_aug_cluster.npy')

cluster_labels_env, kmeans_model_env = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_env_features.npy', output_dir='../../cluster_assignments/mfcc_env_cluster.npy')

cluster_labels_env_aug, kmeans_model_env_aug = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_env_aug_features.npy', output_dir='../../cluster_assignments/mfcc_env_aug_cluster.npy')

cluster_labels_extracted, kmeans_model_extracted = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_extracted_features.npy', output_dir='../../cluster_assignments/mfcc_extracted_cluster.npy')

cluster_labels_extracted_aug, kmeans_model_extracted_aug = cluster_sounds(n_clusters=4, features_path='../../extracted_features/features/mfcc_extracted_aug_features.npy', output_dir='../../cluster_assignments/mfcc_extracted_aug_cluster.npy')

Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 1026 sounds
Cluster 1: 1522 sounds
Cluster 2: 1992 sounds
Cluster 3: 1174 sounds
Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 8413 sounds
Cluster 1: 9337 sounds
Cluster 2: 10910 sounds
Cluster 3: 5624 sounds
Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 1582 sounds
Cluster 1: 965 sounds
Cluster 2: 1307 sounds
Cluster 3: 1860 sounds
Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 7885 sounds
Cluster 1: 5408 sounds
Cluster 2: 11225 sounds
Cluster 3: 9766 sounds
Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 1428 sounds
Cluster 1: 941 sounds
Cluster 2: 1311 sounds
Cluster 3: 2034 sounds
Performing K-means clustering with 4 clusters...

Cluster sizes:
Cluster 0: 8473 sounds
Cluster 1: 11840 sounds
Cluster 2: 3115 sounds
Cluster 3: 10856 sounds


# Evaluating K-Means Models

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import seaborn as sns

def evaluate_clustering(features_path='features', labels_path='labels.npy', cluster_labels_path='cluster_labels.npy', viz_dir=''):
    """
    Evaluate clustering results using both internal and external metrics
    """

    # features_path = Path('../projectFiles/features')
    X = np.load(features_path)
    y_true = np.load(labels_path)
    cluster_labels = np.load(cluster_labels_path)
    
    # Standardize features (same as in clustering)
    X_scaled = StandardScaler().fit_transform(X)
    
    silhouette = silhouette_score(X_scaled, cluster_labels)
    davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
    calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)
    
    print("Internal Metrics:")
    print(f"Silhouette Score: {silhouette:.3f} (ranges from -1 to 1, higher is better)")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f} (lower is better)")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f} (higher is better)")
    
    ari = adjusted_rand_score(y_true, cluster_labels)
    nmi = normalized_mutual_info_score(y_true, cluster_labels)
    
    print("\nExternal Metrics:")
    print(f"Adjusted Rand Index: {ari:.3f} (ranges from -1 to 1, higher is better)")
    print(f"Normalized Mutual Information: {nmi:.3f} (ranges from 0 to 1, higher is better)")
    
    unique_labels = np.unique(y_true)
    unique_clusters = np.unique(cluster_labels)
    confusion_matrix = np.zeros((len(unique_labels), len(unique_clusters)))
    
    for i, label in enumerate(unique_labels):
        for j, cluster in enumerate(unique_clusters):
            confusion_matrix[i, j] = np.sum((y_true == label) & (cluster_labels == cluster))
    
    # Normalize by row (true labels)
    confusion_matrix_normalized = confusion_matrix / confusion_matrix.sum(axis=1)[:, np.newaxis]
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(confusion_matrix_normalized, 
                annot=True, 
                fmt='.2f', 
                xticklabels=[f'Cluster {i}' for i in range(len(unique_clusters))],
                yticklabels=unique_labels,
                cmap='YlOrRd')
    plt.title('Normalized Confusion Matrix:\nTrue Labels vs Cluster Assignments')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('True Label')
    
    # Save confusion matrix
    # viz_dir = Path(features_path).parent.parent / 'visualization'
    # viz_dir.mkdir(exist_ok=True, parents=True)
    # if features_path == 'features/mfcc_features_augmented.npy':
    #     plt.savefig(viz_dir / 'confusion_matrix_augmented.png', dpi=300, bbox_inches='tight')
    # if features_path == 'features/mfcc_features_expanded.npy':
    #     plt.savefig(viz_dir / 'confusion_matrix_expanded.png', dpi=300, bbox_inches='tight')
    # if features_path == 'features/mfcc_features_expanded_augmented.npy':
    #     plt.savefig(viz_dir / 'confusion_matrix_expanded_augmented.png', dpi=300, bbox_inches='tight')
    # else:
    #     plt.savefig(viz_dir / 'confusion_matrix.png', dpi=300, bbox_inches='tight')

    # Ensure the directory exists before saving
    os.makedirs(os.path.dirname(viz_dir), exist_ok=True)
    plt.savefig(viz_dir, dpi=300, bbox_inches='tight')
    plt.close()
    
    print("\nCluster Composition:")
    for cluster in unique_clusters:
        cluster_mask = cluster_labels == cluster
        print(f"\nCluster {cluster}:")
        for label in unique_labels:
            count = np.sum((y_true == label) & cluster_mask)
            percentage = (count / np.sum(cluster_mask)) * 100
            print(f"{label}: {count} samples ({percentage:.1f}%)")

    return {
        'silhouette': silhouette,
        'davies_bouldin': davies_bouldin,
        'calinski_harabasz': calinski_harabasz,
        'ari': ari,
        'nmi': nmi,
        'confusion_matrix': confusion_matrix,
        'confusion_matrix_normalized': confusion_matrix_normalized
    }

In [4]:
evaluation_results_mfcc = evaluate_clustering(features_path='../../extracted_features/features/mfcc_features.npy', labels_path='../../extracted_features/labels/mfcc_labels.npy', cluster_labels_path='../../cluster_assignments/mfcc_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc.png')

Internal Metrics:
Silhouette Score: 0.120 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.136 (lower is better)
Calinski-Harabasz Index: 611.699 (higher is better)

External Metrics:
Adjusted Rand Index: 0.108 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.115 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 174 samples (17.0%)
hho: 312 samples (30.4%)
kd: 125 samples (12.2%)
sd: 415 samples (40.4%)

Cluster 1:
hhc: 288 samples (18.9%)
hho: 115 samples (7.6%)
kd: 951 samples (62.5%)
sd: 168 samples (11.0%)

Cluster 2:
hhc: 519 samples (26.1%)
hho: 184 samples (9.2%)
kd: 596 samples (29.9%)
sd: 693 samples (34.8%)

Cluster 3:
hhc: 517 samples (44.0%)
hho: 398 samples (33.9%)
kd: 104 samples (8.9%)
sd: 155 samples (13.2%)


In [5]:
evaluation_results_mfcc_aug = evaluate_clustering(features_path='../../extracted_features/features/mfcc_features_aug.npy', labels_path='../../extracted_features/labels/mfcc_labels_aug.npy', cluster_labels_path='../../cluster_assignments/mfcc_aug_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc_aug.png')

Internal Metrics:
Silhouette Score: 0.118 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.220 (lower is better)
Calinski-Harabasz Index: 3987.355 (higher is better)

External Metrics:
Adjusted Rand Index: 0.077 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.077 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 1432 samples (17.0%)
hho: 723 samples (8.6%)
kd: 5304 samples (63.0%)
sd: 954 samples (11.3%)

Cluster 1:
hhc: 3181 samples (34.1%)
hho: 2769 samples (29.7%)
kd: 1111 samples (11.9%)
sd: 2276 samples (24.4%)

Cluster 2:
hhc: 2652 samples (24.3%)
hho: 1229 samples (11.3%)
kd: 3114 samples (28.5%)
sd: 3915 samples (35.9%)

Cluster 3:
hhc: 1723 samples (30.6%)
hho: 1333 samples (23.7%)
kd: 1127 samples (20.0%)
sd: 1441 samples (25.6%)


In [6]:
evaluation_results_mfcc_env = evaluate_clustering(features_path='../../extracted_features/features/mfcc_env_features.npy', labels_path='../../extracted_features/labels/mfcc_env_labels.npy', cluster_labels_path='../../cluster_assignments/mfcc_env_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc_env.png')

Internal Metrics:
Silhouette Score: 0.075 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 3.063 (lower is better)
Calinski-Harabasz Index: 365.253 (higher is better)

External Metrics:
Adjusted Rand Index: 0.100 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.112 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 572 samples (36.2%)
hho: 349 samples (22.1%)
kd: 156 samples (9.9%)
sd: 505 samples (31.9%)

Cluster 1:
hhc: 199 samples (20.6%)
hho: 347 samples (36.0%)
kd: 69 samples (7.2%)
sd: 350 samples (36.3%)

Cluster 2:
hhc: 175 samples (13.4%)
hho: 94 samples (7.2%)
kd: 911 samples (69.7%)
sd: 127 samples (9.7%)

Cluster 3:
hhc: 552 samples (29.7%)
hho: 219 samples (11.8%)
kd: 640 samples (34.4%)
sd: 449 samples (24.1%)


In [7]:
evaluation_results_mfcc_env_aug = evaluate_clustering(features_path='../../extracted_features/features/mfcc_env_aug_features.npy', labels_path='../../extracted_features/labels/mfcc_env_aug_labels.npy', cluster_labels_path='../../cluster_assignments/mfcc_env_aug_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc_env_aug.png')

Internal Metrics:
Silhouette Score: 0.078 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.811 (lower is better)
Calinski-Harabasz Index: 2359.465 (higher is better)

External Metrics:
Adjusted Rand Index: 0.082 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.088 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 1061 samples (13.5%)
hho: 714 samples (9.1%)
kd: 5315 samples (67.4%)
sd: 795 samples (10.1%)

Cluster 1:
hhc: 1747 samples (32.3%)
hho: 1497 samples (27.7%)
kd: 604 samples (11.2%)
sd: 1560 samples (28.8%)

Cluster 2:
hhc: 3091 samples (27.5%)
hho: 1644 samples (14.6%)
kd: 3734 samples (33.3%)
sd: 2756 samples (24.6%)

Cluster 3:
hhc: 3089 samples (31.6%)
hho: 2199 samples (22.5%)
kd: 1003 samples (10.3%)
sd: 3475 samples (35.6%)


In [8]:
evaluation_results_mfcc_extracted = evaluate_clustering(features_path='../../extracted_features/features/mfcc_extracted_features.npy', labels_path='../../extracted_features/labels/mfcc_extracted_labels.npy', cluster_labels_path='../../cluster_assignments/mfcc_extracted_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc_extracted.png')

Internal Metrics:
Silhouette Score: 0.132 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.148 (lower is better)
Calinski-Harabasz Index: 796.187 (higher is better)

External Metrics:
Adjusted Rand Index: 0.107 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.112 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 203 samples (14.2%)
hho: 127 samples (8.9%)
kd: 474 samples (33.2%)
sd: 624 samples (43.7%)

Cluster 1:
hhc: 291 samples (30.9%)
hho: 187 samples (19.9%)
kd: 238 samples (25.3%)
sd: 225 samples (23.9%)

Cluster 2:
hhc: 205 samples (15.6%)
hho: 100 samples (7.6%)
kd: 873 samples (66.6%)
sd: 133 samples (10.1%)

Cluster 3:
hhc: 799 samples (39.3%)
hho: 595 samples (29.3%)
kd: 191 samples (9.4%)
sd: 449 samples (22.1%)


In [9]:
evaluation_results_mfcc_extracted_aug = evaluate_clustering(features_path='../../extracted_features/features/mfcc_extracted_aug_features.npy', labels_path='../../extracted_features/labels/mfcc_extracted_aug_labels.npy', cluster_labels_path='../../cluster_assignments/mfcc_extracted_aug_cluster.npy', viz_dir='../../visualization/clustering_eval/mfcc_extracted_aug.png')

Internal Metrics:
Silhouette Score: 0.133 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.182 (lower is better)
Calinski-Harabasz Index: 4725.724 (higher is better)

External Metrics:
Adjusted Rand Index: 0.088 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.090 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 1162 samples (13.7%)
hho: 703 samples (8.3%)
kd: 5541 samples (65.4%)
sd: 1067 samples (12.6%)

Cluster 1:
hhc: 4375 samples (37.0%)
hho: 3282 samples (27.7%)
kd: 1187 samples (10.0%)
sd: 2996 samples (25.3%)

Cluster 2:
hhc: 1007 samples (32.3%)
hho: 527 samples (16.9%)
kd: 560 samples (18.0%)
sd: 1021 samples (32.8%)

Cluster 3:
hhc: 2444 samples (22.5%)
hho: 1542 samples (14.2%)
kd: 3368 samples (31.0%)
sd: 3502 samples (32.3%)


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def optimize_kmeans(features_path, labels_path, n_splits=5, min_clusters=2, max_clusters=8):
    """
    Perform k-means optimization using cross-validation and hyperparameter tuning
    
    Args:
        features_path: Path to features .npy file
        labels_path: Path to labels .npy file
        n_splits: Number of cross-validation folds
        min_clusters: Minimum number of clusters to try
        max_clusters: Maximum number of clusters to try
    """

    X = np.load(features_path)
    y = np.load(labels_path)
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Store results
    results = {
        'n_clusters': [],
        'silhouette_scores': [],
        'ari_scores': [],
        'nmi_scores': [],
        'calinski_scores': []
    }
    
    print("Performing cross-validation and hyperparameter tuning...")
    # Try different numbers of clusters
    for n_clusters in tqdm(range(min_clusters, max_clusters + 1)):
        fold_silhouette = []
        fold_ari = []
        fold_nmi = []
        fold_calinski = []
        
        # Cross validation
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
            X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            # Train KMeans
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            train_clusters = kmeans.fit_predict(X_train)
            val_clusters = kmeans.predict(X_val)
            
            # Calculate metrics
            fold_silhouette.append(silhouette_score(X_val, val_clusters))
            fold_ari.append(adjusted_rand_score(y_val, val_clusters))
            fold_nmi.append(normalized_mutual_info_score(y_val, val_clusters))
            fold_calinski.append(calinski_harabasz_score(X_val, val_clusters))
        
        # Store average results
        results['n_clusters'].append(n_clusters)
        results['silhouette_scores'].append(np.mean(fold_silhouette))
        results['ari_scores'].append(np.mean(fold_ari))
        results['nmi_scores'].append(np.mean(fold_nmi))
        results['calinski_scores'].append(np.mean(fold_calinski))
    
    plt.figure(figsize=(15, 10))
    
    plt.subplot(2, 1, 1)
    plt.plot(results['n_clusters'], results['silhouette_scores'], 'o-', label='Silhouette Score')
    plt.plot(results['n_clusters'], np.array(results['calinski_scores'])/max(results['calinski_scores']), 
             'o-', label='Normalized Calinski-Harabasz')
    plt.title('Internal Clustering Metrics vs Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(2, 1, 2)
    plt.plot(results['n_clusters'], results['ari_scores'], 'o-', label='Adjusted Rand Index')
    plt.plot(results['n_clusters'], results['nmi_scores'], 'o-', label='Normalized Mutual Information')
    plt.title('External Clustering Metrics vs Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    
    # Save plot
    os.makedirs('../../visualization/optimization', exist_ok=True)
    plt.savefig('../../visualization/optimization/kmeans_optimization.png')
    plt.close()
    
    # Find optimal number of clusters
    # We'll use a weighted sum of normalized metrics
    normalized_metrics = {
        'silhouette': (np.array(results['silhouette_scores']) - min(results['silhouette_scores'])) / 
                     (max(results['silhouette_scores']) - min(results['silhouette_scores'])),
        'calinski': np.array(results['calinski_scores']) / max(results['calinski_scores']),
        'ari': (np.array(results['ari_scores']) - min(results['ari_scores'])) /
               (max(results['ari_scores']) - min(results['ari_scores'])),
        'nmi': (np.array(results['nmi_scores']) - min(results['nmi_scores'])) /
               (max(results['nmi_scores']) - min(results['nmi_scores']))
    }
    
    # Weighted sum (you can adjust weights based on importance)
    weights = {'silhouette': 0.3, 'calinski': 0.2, 'ari': 0.25, 'nmi': 0.25}
    combined_scores = sum(weights[metric] * normalized_metrics[metric] 
                        for metric in weights.keys())
    
    optimal_k = results['n_clusters'][np.argmax(combined_scores)]
    
    print("\nOptimization Results:")
    print(f"Optimal number of clusters: {optimal_k}")
    print("\nMetrics at optimal k:")
    print(f"Silhouette Score: {results['silhouette_scores'][optimal_k-min_clusters]:.3f}")
    print(f"Calinski-Harabasz Score: {results['calinski_scores'][optimal_k-min_clusters]:.3f}")
    print(f"Adjusted Rand Index: {results['ari_scores'][optimal_k-min_clusters]:.3f}")
    print(f"Normalized Mutual Information: {results['nmi_scores'][optimal_k-min_clusters]:.3f}")
    
    return optimal_k, results

# Use the function for different feature sets
feature_sets = [
    {
        'name': 'MFCC Basic',
        'features': '../../extracted_features/features/mfcc_features.npy',
        'labels': '../../extracted_features/labels/mfcc_labels.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc.png'
    },
    {
        'name': 'MFCC + Envelope',
        'features': '../../extracted_features/features/mfcc_env_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_labels.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_env_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc_env.png'
    },
    {
        'name': 'MFCC Optimized',
        'features': '../../extracted_features/features/mfcc_extracted_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_labels.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_extracted_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc_extracted.png'
    },
    {
        'name': 'MFCC Basic Augmented',
        'features': '../../extracted_features/features/mfcc_features_aug.npy',
        'labels': '../../extracted_features/labels/mfcc_labels_aug.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc_aug.png'
    },
    {
        'name': 'MFCC + Envelope Augmented',
        'features': '../../extracted_features/features/mfcc_env_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_aug_labels.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_env_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc_env_aug.png'
    },
    {
        'name': 'MFCC Optimized Augmented',
        'features': '../../extracted_features/features/mfcc_extracted_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_aug_labels.npy',
        'output': '../../cluster_assignments/hdbscan/hdbscan_mfcc_extracted_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/hdbscan/hdbscan_mfcc_extracted_aug.png'
    }
    
]

# Run optimization for each feature set
optimization_results = {}
for feature_set in feature_sets:
    print(f"\nOptimizing for {feature_set['name']}...")
    optimal_k, results = optimize_kmeans(
        feature_set['features'],
        feature_set['labels'],
        min_clusters=2,
        max_clusters=8
    )
    optimization_results[feature_set['name']] = {
        'optimal_k': optimal_k,
        'results': results
    }


Optimizing for MFCC Basic...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]



Optimization Results:
Optimal number of clusters: 8

Metrics at optimal k:
Silhouette Score: 0.118
Calinski-Harabasz Score: 93.825
Adjusted Rand Index: 0.082
Normalized Mutual Information: 0.133

Optimizing for MFCC + Envelope...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:03<00:00,  1.88it/s]



Optimization Results:
Optimal number of clusters: 3

Metrics at optimal k:
Silhouette Score: 0.080
Calinski-Harabasz Score: 88.913
Adjusted Rand Index: 0.090
Normalized Mutual Information: 0.106

Optimizing for MFCC Optimized...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:02<00:00,  2.53it/s]



Optimization Results:
Optimal number of clusters: 6

Metrics at optimal k:
Silhouette Score: 0.133
Calinski-Harabasz Score: 137.563
Adjusted Rand Index: 0.113
Normalized Mutual Information: 0.126

Optimizing for MFCC Basic Augmented...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:21<00:00,  3.02s/it]



Optimization Results:
Optimal number of clusters: 4

Metrics at optimal k:
Silhouette Score: 0.119
Calinski-Harabasz Score: 770.088
Adjusted Rand Index: 0.070
Normalized Mutual Information: 0.070

Optimizing for MFCC + Envelope Augmented...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:19<00:00,  2.76s/it]



Optimization Results:
Optimal number of clusters: 4

Metrics at optimal k:
Silhouette Score: 0.078
Calinski-Harabasz Score: 472.313
Adjusted Rand Index: 0.083
Normalized Mutual Information: 0.089

Optimizing for MFCC Optimized Augmented...
Performing cross-validation and hyperparameter tuning...


100%|██████████| 7/7 [00:20<00:00,  2.98s/it]


Optimization Results:
Optimal number of clusters: 7

Metrics at optimal k:
Silhouette Score: 0.128
Calinski-Harabasz Score: 736.366
Adjusted Rand Index: 0.120
Normalized Mutual Information: 0.136



