# Gaussian Mixture Model

In [1]:
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

In [2]:
def gmm_cluster(n_components=4, features_path='features', output_dir='', covariance_type='full'):
    """
    Perform GMM clustering on features
    
    Args:
        n_components: Number of Gaussian components (clusters)
        features_path: Path to feature file
        output_dir: Where to save cluster assignments
        covariance_type: Type of covariance parameter ('full', 'tied', 'diag', 'spherical')
    """
    # Load features
    X = np.load(features_path)
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Fit GMM
    print(f"Fitting GMM with {n_components} components...")
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=42
    )
    
    # Get cluster assignments and probabilities
    cluster_labels = gmm.fit_predict(X_scaled)
    probabilities = gmm.predict_proba(X_scaled)
    
    # Save cluster assignments
    np.save(output_dir, cluster_labels)
    
    # Print cluster sizes
    print("\nCluster sizes:")
    for i in range(n_components):
        print(f"Cluster {i}: {np.sum(cluster_labels == i)} sounds")
    
    # Calculate average probability for each cluster
    avg_probs = np.mean(probabilities, axis=0)
    print("\nAverage probability for each cluster:")
    for i, prob in enumerate(avg_probs):
        print(f"Cluster {i}: {prob:.3f}")
    
    return cluster_labels, probabilities, gmm

# Evaluating GMM Model

In [3]:
def evaluate_gmm_clustering(features_path='features', labels_path='labels.npy', 
                          cluster_labels_path='cluster_labels.npy', 
                          probabilities=None, viz_dir=''):
    """
    Evaluate GMM clustering results using both internal and external metrics
    """
    # Load data
    X = np.load(features_path)
    y_true = np.load(labels_path)
    cluster_labels = np.load(cluster_labels_path)
    
    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)
    
    # 1. Internal Metrics
    silhouette = silhouette_score(X_scaled, cluster_labels)
    davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
    calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)
    
    print("Internal Metrics:")
    print(f"Silhouette Score: {silhouette:.3f} (ranges from -1 to 1, higher is better)")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f} (lower is better)")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f} (higher is better)")
    
    # 2. External Metrics
    ari = adjusted_rand_score(y_true, cluster_labels)
    nmi = normalized_mutual_info_score(y_true, cluster_labels)
    
    print("\nExternal Metrics:")
    print(f"Adjusted Rand Index: {ari:.3f} (ranges from -1 to 1, higher is better)")
    print(f"Normalized Mutual Information: {nmi:.3f} (ranges from 0 to 1, higher is better)")
    
    # 3. Create confusion matrix with probabilities
    unique_labels = np.unique(y_true)
    unique_clusters = np.unique(cluster_labels)
    confusion_matrix = np.zeros((len(unique_labels), len(unique_clusters)))
    
    for i, label in enumerate(unique_labels):
        for j, cluster in enumerate(unique_clusters):
            confusion_matrix[i, j] = np.sum((y_true == label) & (cluster_labels == cluster))
    
    # Normalize by row (true labels)
    confusion_matrix_normalized = confusion_matrix / confusion_matrix.sum(axis=1)[:, np.newaxis]
    
    # Plot confusion matrix
    plt.figure(figsize=(12, 8))
    sns.heatmap(confusion_matrix_normalized, 
                annot=True, 
                fmt='.2f', 
                xticklabels=[f'Cluster {i}' for i in range(len(unique_clusters))],
                yticklabels=unique_labels,
                cmap='YlOrRd')
    plt.title('Normalized Confusion Matrix:\nTrue Labels vs Cluster Assignments')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('True Label')
    
    # Save confusion matrix
    os.makedirs(os.path.dirname(viz_dir), exist_ok=True)
    plt.savefig(viz_dir, dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Print cluster composition with probability information
    print("\nCluster Composition:")
    for cluster in unique_clusters:
        cluster_mask = cluster_labels == cluster
        print(f"\nCluster {cluster}:")
        for label in unique_labels:
            count = np.sum((y_true == label) & cluster_mask)
            percentage = (count / np.sum(cluster_mask)) * 100
            
            # Add average probability for this label in this cluster
            if probabilities is not None:
                avg_prob = np.mean(probabilities[cluster_mask & (y_true == label)][:, cluster])
                print(f"{label}: {count} samples ({percentage:.1f}%), Avg Probability: {avg_prob:.3f}")
            else:
                print(f"{label}: {count} samples ({percentage:.1f}%)")
    
    return {
        'silhouette': silhouette,
        'davies_bouldin': davies_bouldin,
        'calinski_harabasz': calinski_harabasz,
        'ari': ari,
        'nmi': nmi,
        'confusion_matrix': confusion_matrix,
        'confusion_matrix_normalized': confusion_matrix_normalized
    }

In [4]:
# Create necessary directories
os.makedirs('../../cluster_assignments/gmm', exist_ok=True)
os.makedirs('../../visualization/clustering_eval/gmm', exist_ok=True)

# Run GMM clustering for each feature set
feature_sets = [
    {
        'name': 'MFCC Basic',
        'features': '../../extracted_features/features/mfcc_features.npy',
        'labels': '../../extracted_features/labels/mfcc_labels.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc.png'
    },
    {
        'name': 'MFCC + Envelope',
        'features': '../../extracted_features/features/mfcc_env_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_labels.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_env_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc_env.png'
    },
    {
        'name': 'MFCC Optimized',
        'features': '../../extracted_features/features/mfcc_extracted_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_labels.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_extracted_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc_extracted.png'
    },
    {
        'name': 'MFCC Basic Augmented',
        'features': '../../extracted_features/features/mfcc_features_aug.npy',
        'labels': '../../extracted_features/labels/mfcc_labels_aug.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc_aug.png'
    },
    {
        'name': 'MFCC + Envelope Augmented',
        'features': '../../extracted_features/features/mfcc_env_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_aug_labels.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_env_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc_env_aug.png'
    },
    {
        'name': 'MFCC Optimized Augmented',
        'features': '../../extracted_features/features/mfcc_extracted_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_aug_labels.npy',
        'output': '../../cluster_assignments/gmm/gmm_mfcc_extracted_aug_cluster.npy',
        'viz': '../../visualization/clustering_eval/gmm/gmm_mfcc_extracted_aug.png'
    }
    
]

# Run clustering and evaluation for each feature set
for feature_set in feature_sets:
    print(f"\nProcessing {feature_set['name']}...")
    
    # Run GMM clustering
    cluster_labels, probabilities, gmm_model = gmm_cluster(
        n_components=4,
        features_path=feature_set['features'],
        output_dir=feature_set['output']
    )
    
    # Evaluate results
    results = evaluate_gmm_clustering(
        features_path=feature_set['features'],
        labels_path=feature_set['labels'],
        cluster_labels_path=feature_set['output'],
        probabilities=probabilities,
        viz_dir=feature_set['viz']
    )


Processing MFCC Basic...
Fitting GMM with 4 components...

Cluster sizes:
Cluster 0: 1072 sounds
Cluster 1: 1562 sounds
Cluster 2: 1714 sounds
Cluster 3: 1366 sounds

Average probability for each cluster:
Cluster 0: 0.187
Cluster 1: 0.269
Cluster 2: 0.299
Cluster 3: 0.245
Internal Metrics:
Silhouette Score: 0.091 (ranges from -1 to 1, higher is better)
Davies-Bouldin Index: 2.362 (lower is better)
Calinski-Harabasz Index: 501.644 (higher is better)

External Metrics:
Adjusted Rand Index: 0.085 (ranges from -1 to 1, higher is better)
Normalized Mutual Information: 0.090 (ranges from 0 to 1, higher is better)

Cluster Composition:

Cluster 0:
hhc: 164 samples (15.3%), Avg Probability: 0.902
hho: 310 samples (28.9%), Avg Probability: 0.961
kd: 165 samples (15.4%), Avg Probability: 0.856
sd: 433 samples (40.4%), Avg Probability: 0.926

Cluster 1:
hhc: 316 samples (20.2%), Avg Probability: 0.920
hho: 129 samples (8.3%), Avg Probability: 0.921
kd: 860 samples (55.1%), Avg Probability: 0.956

# Adding CV and Hyperparameter Tuning

In [5]:
from sklearn.model_selection import KFold
from tqdm import tqdm


def optimize_gmm_fixed_components(features_path, labels_path, n_splits=5, n_components=4):
    """
    Optimize GMM parameters using cross-validation, fixing number of components to 4
    """
    # Load data
    X = np.load(features_path)
    y = np.load(labels_path)
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Covariance types to try
    covariance_types = ['full', 'tied', 'diag', 'spherical']
    
    # Store results
    results = {
        'covariance_type': [],
        'silhouette_scores': [],
        'ari_scores': [],
        'nmi_scores': [],
        'calinski_scores': [],
        'bic_scores': []
    }
    
    print("Performing cross-validation for different covariance types...")
    
    # Try different covariance types
    for cov_type in tqdm(covariance_types):
        fold_silhouette = []
        fold_ari = []
        fold_nmi = []
        fold_calinski = []
        fold_bic = []
        
        # Cross validation
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
            X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            # Train GMM
            gmm = GaussianMixture(
                n_components=n_components,
                covariance_type=cov_type,
                random_state=42
            )
            
            # Fit and predict
            gmm.fit(X_train)
            train_clusters = gmm.predict(X_train)
            val_clusters = gmm.predict(X_val)
            
            # Calculate metrics
            fold_silhouette.append(silhouette_score(X_val, val_clusters))
            fold_ari.append(adjusted_rand_score(y_val, val_clusters))
            fold_nmi.append(normalized_mutual_info_score(y_val, val_clusters))
            fold_calinski.append(calinski_harabasz_score(X_val, val_clusters))
            fold_bic.append(gmm.bic(X_val))
        
        # Store average results
        results['covariance_type'].append(cov_type)
        results['silhouette_scores'].append(np.mean(fold_silhouette))
        results['ari_scores'].append(np.mean(fold_ari))
        results['nmi_scores'].append(np.mean(fold_nmi))
        results['calinski_scores'].append(np.mean(fold_calinski))
        results['bic_scores'].append(np.mean(fold_bic))
    
    # Find best configuration
    # Weighted combination of metrics
    metric_weights = {
        'silhouette': 0.3,
        'ari': 0.3,
        'nmi': 0.2,
        'calinski': 0.1,
        'bic': 0.1
    }
    
    combined_scores = (
        metric_weights['silhouette'] * np.array(results['silhouette_scores']) +
        metric_weights['ari'] * np.array(results['ari_scores']) +
        metric_weights['nmi'] * np.array(results['nmi_scores']) +
        metric_weights['calinski'] * (np.array(results['calinski_scores']) / np.max(results['calinski_scores'])) +
        metric_weights['bic'] * (-np.array(results['bic_scores']) / np.max(-np.array(results['bic_scores'])))
    )
    
    best_idx = np.argmax(combined_scores)
    optimal_covariance = results['covariance_type'][best_idx]
    
    print(f"\nOptimization Results (fixed 4 components):")
    print(f"Optimal covariance type: {optimal_covariance}")
    print("\nMetrics with optimal covariance:")
    print(f"Silhouette Score: {results['silhouette_scores'][best_idx]:.3f}")
    print(f"Calinski-Harabasz Score: {results['calinski_scores'][best_idx]:.3f}")
    print(f"Adjusted Rand Index: {results['ari_scores'][best_idx]:.3f}")
    print(f"Normalized Mutual Information: {results['nmi_scores'][best_idx]:.3f}")
    print(f"BIC Score: {results['bic_scores'][best_idx]:.3f}")
    
    return optimal_covariance, results

# Run optimization for each feature set
feature_sets = [
    {
        'name': 'MFCC Basic',
        'features': '../../extracted_features/features/mfcc_features.npy',
        'labels': '../../extracted_features/labels/mfcc_labels.npy',
    },
    {
        'name': 'MFCC + Envelope',
        'features': '../../extracted_features/features/mfcc_env_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_labels.npy',
    },
    {
        'name': 'MFCC Optimized',
        'features': '../../extracted_features/features/mfcc_extracted_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_labels.npy',
    },
    {
        'name': 'MFCC Basic Augmented',
        'features': '../../extracted_features/features/mfcc_features_aug.npy',
        'labels': '../../extracted_features/labels/mfcc_labels_aug.npy',
    },
    {
        'name': 'MFCC + Envelope Augmented',
        'features': '../../extracted_features/features/mfcc_env_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_env_aug_labels.npy',
    },
    {
        'name': 'MFCC Optimized Augmented',
        'features': '../../extracted_features/features/mfcc_extracted_aug_features.npy',
        'labels': '../../extracted_features/labels/mfcc_extracted_aug_labels.npy',
    }
    
]

# Run optimization for each feature set
optimization_results = {}
for feature_set in feature_sets:
    print(f"\nOptimizing GMM for {feature_set['name']}...")
    opt_covariance, results = optimize_gmm_fixed_components(
        feature_set['features'],
        feature_set['labels']
    )
    optimization_results[feature_set['name']] = {
        'optimal_covariance': opt_covariance,
        'results': results
    }


Optimizing GMM for MFCC Basic...
Performing cross-validation for different covariance types...


100%|██████████| 4/4 [00:02<00:00,  1.45it/s]



Optimization Results (fixed 4 components):
Optimal covariance type: tied

Metrics with optimal covariance:
Silhouette Score: 0.104
Calinski-Harabasz Score: 107.277
Adjusted Rand Index: 0.098
Normalized Mutual Information: 0.127
BIC Score: 40018.381

Optimizing GMM for MFCC + Envelope...
Performing cross-validation for different covariance types...


100%|██████████| 4/4 [00:04<00:00,  1.11s/it]



Optimization Results (fixed 4 components):
Optimal covariance type: tied

Metrics with optimal covariance:
Silhouette Score: 0.060
Calinski-Harabasz Score: 60.748
Adjusted Rand Index: 0.080
Normalized Mutual Information: 0.112
BIC Score: 96849.209

Optimizing GMM for MFCC Optimized...
Performing cross-validation for different covariance types...


100%|██████████| 4/4 [00:02<00:00,  1.70it/s]



Optimization Results (fixed 4 components):
Optimal covariance type: tied

Metrics with optimal covariance:
Silhouette Score: 0.121
Calinski-Harabasz Score: 133.039
Adjusted Rand Index: 0.080
Normalized Mutual Information: 0.113
BIC Score: 34789.077

Optimizing GMM for MFCC Basic Augmented...
Performing cross-validation for different covariance types...


100%|██████████| 4/4 [00:16<00:00,  4.11s/it]



Optimization Results (fixed 4 components):
Optimal covariance type: tied

Metrics with optimal covariance:
Silhouette Score: 0.116
Calinski-Harabasz Score: 724.051
Adjusted Rand Index: 0.073
Normalized Mutual Information: 0.084
BIC Score: 234604.425

Optimizing GMM for MFCC + Envelope Augmented...
Performing cross-validation for different covariance types...


 75%|███████▌  | 3/4 [00:23<00:07,  7.90s/it]


KeyboardInterrupt: 