In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

def get_participant_centroids(features, segment_info, participant_id='P1',
                              instruments=('hhc', 'hho', 'kd', 'sd')):
    """
    Calculate initial centroids from a participant's data

    For every instrument in ``instruments`` the mean feature vector of the
    seed participant's samples carrying that label is computed. Rows of
    ``features`` and ``segment_info`` are matched by position.

    Args:
        features: Feature matrix (n_samples, n_features)
        segment_info: DataFrame containing segment information; the columns
            'participant_id' and 'instrument_label' are read
        participant_id: ID of the participant to use for seeding
        instruments: Instrument labels to build centroids for; the order of
            this sequence is the order of the returned centroids

    Returns:
        initial_centroids: Array of shape (n_clusters, n_features), one row
            per entry of ``instruments``

    Raises:
        ValueError: If ``features`` and ``segment_info`` have different
            lengths, or if the participant has no samples for one of the
            requested instruments
    """
    features = np.asarray(features)
    if len(features) != len(segment_info):
        raise ValueError(
            f"features has {len(features)} rows but segment_info has "
            f"{len(segment_info)} rows"
        )

    # Create a mask for the participant's data. A plain boolean array is used
    # so that indexing the feature matrix is purely positional and does not
    # depend on the DataFrame's index.
    participant_mask = (segment_info['participant_id'] == participant_id).to_numpy()

    # Get the participant's features and labels
    participant_features = features[participant_mask]
    participant_labels = segment_info.loc[participant_mask, 'instrument_label']

    # Print some information about the seed participant's data
    print(f"\nSeed Participant ({participant_id}) Data:")
    print("Number of examples per instrument:")
    print(participant_labels.value_counts())

    # Calculate mean feature vector for each instrument type
    centroids = []
    for instrument in instruments:
        instrument_mask = (participant_labels == instrument).to_numpy()
        if not instrument_mask.any():
            raise ValueError(f"No samples found for instrument {instrument} from participant {participant_id}")

        centroid = participant_features[instrument_mask].mean(axis=0)
        centroids.append(centroid)
        print(f"\nCentroid shape for {instrument}: {centroid.shape}")

    return np.array(centroids)

def seeded_kmeans(features_path, segment_info_path, seed_participant='P1', output_dir=''):
    """
    Perform k-means clustering with seeds from a specific participant's data

    The features are standardized, the per-instrument mean vectors of the
    seed participant are used as the initial cluster centres, and the
    resulting assignment is reported via cluster sizes, internal clustering
    metrics and a row-normalized confusion matrix against 'instrument_label'.

    Args:
        features_path: Path to the features .npy file
        segment_info_path: Path to the segment info CSV file
        seed_participant: Participant ID to use for seeding
        output_dir: Despite the name, the *file* path the cluster assignments
            are saved to with np.save. The confusion-matrix image is written
            to '<parent of that file's directory>/visualization/clustering_eval'.
            If empty, nothing is written to disk.

    Returns:
        (cluster_labels, kmeans): the cluster index of every segment and the
        fitted KMeans estimator

    Raises:
        ValueError: If the feature matrix and the segment info differ in
            length, or if the seed participant lacks samples for an instrument
    """
    # Load data
    print(f"Loading features from: {features_path}")
    print(f"Loading segment info from: {segment_info_path}")

    X = np.load(features_path)
    segment_info = pd.read_csv(segment_info_path)

    print(f"\nFeature matrix shape: {X.shape}")
    print(f"Number of segments: {len(segment_info)}")

    # Rows are matched by position everywhere below, so a length mismatch
    # would silently misalign labels at best; fail before clustering.
    if len(X) != len(segment_info):
        raise ValueError(
            f"Feature matrix has {len(X)} rows but segment info has "
            f"{len(segment_info)} rows"
        )

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Get initial centroids from the seed participant
    print(f"\nGetting centroids from participant {seed_participant}")
    initial_centroids = get_participant_centroids(X_scaled, segment_info, seed_participant)
    print(f"Initial centroids shape: {initial_centroids.shape}")

    # The number of clusters is dictated by the seeds (one per instrument)
    n_clusters = initial_centroids.shape[0]

    # Perform k-means clustering with the seeded centroids
    print("\nPerforming seeded k-means clustering...")
    kmeans = KMeans(
        n_clusters=n_clusters,
        init=initial_centroids,
        n_init=1,  # Since we're providing our own centroids
        random_state=42
    )

    cluster_labels = kmeans.fit_predict(X_scaled)

    # Save cluster assignments
    if output_dir:
        # dirname is '' for a bare filename, and os.makedirs('') raises
        save_dir = os.path.dirname(output_dir)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        np.save(output_dir, cluster_labels)
        print(f"\nSaved cluster assignments to: {output_dir}")

    # Print cluster sizes
    print("\nCluster sizes:")
    for i in range(n_clusters):
        print(f"Cluster {i}: {np.sum(cluster_labels == i)} sounds")

    # Calculate clustering metrics
    silhouette = silhouette_score(X_scaled, cluster_labels)
    davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
    calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)

    print("\nClustering Metrics:")
    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f}")

    # Create confusion matrix between true labels and clusters
    # (normalize='index' makes every true-label row sum to 1)
    true_labels = segment_info['instrument_label']
    confusion_matrix = pd.crosstab(
        true_labels,
        cluster_labels,
        normalize='index'
    )

    # Plot confusion matrix; the figure is closed even if saving fails so
    # repeated calls do not accumulate open figures
    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(confusion_matrix, annot=True, fmt='.2f', cmap='YlOrRd')
        plt.title(f'Normalized Confusion Matrix\nSeeded with Participant {seed_participant}')
        plt.xlabel('Cluster')
        plt.ylabel('True Label')

        # Save visualization
        if output_dir:
            viz_dir = str(Path(output_dir).parent.parent / 'visualization' / 'clustering_eval')
            os.makedirs(viz_dir, exist_ok=True)
            viz_path = f"{viz_dir}/seeded_kmeans_{seed_participant}_confusion.png"
            plt.savefig(viz_path, dpi=300, bbox_inches='tight')
            print(f"\nSaved confusion matrix visualization to: {viz_path}")
    finally:
        plt.close(fig)

    return cluster_labels, kmeans

def evaluate_all_participants(features_path, segment_info_path, base_output_dir):
    """
    Run seeded k-means with each participant as the seed and compare results

    Args:
        features_path: Path to the features .npy file
        segment_info_path: Path to the segment info CSV file
        base_output_dir: Directory that receives one
            'seeded_kmeans_<participant>_cluster.npy' file per participant.
            The comparison table is written to
            '<parent of base_output_dir>/results/seeded_kmeans_comparison.csv'.

    Returns:
        results_df: DataFrame with the columns 'participant', 'silhouette',
            'davies_bouldin' and 'calinski_harabasz', sorted by silhouette
            score in descending order. Participants whose run raised an
            exception are reported on stdout and left out; the frame is
            empty if every run failed.
    """
    metric_columns = ['participant', 'silhouette', 'davies_bouldin', 'calinski_harabasz']

    # Load segment info to get unique participants
    segment_info = pd.read_csv(segment_info_path)
    participants = segment_info['participant_id'].unique()

    # The standardized feature matrix is the same for every seed, so it is
    # prepared once instead of once per participant.
    X_scaled = StandardScaler().fit_transform(np.load(features_path))

    # Store results
    results = []

    for participant in participants:
        print(f"\n{'='*50}")
        print(f"Evaluating with seed participant: {participant}")
        print('='*50)

        output_dir = f"{base_output_dir}/seeded_kmeans_{participant}_cluster.npy"

        # Best effort: a participant that cannot be used as a seed (e.g. one
        # with no samples for some instrument) is reported and skipped.
        try:
            cluster_labels, _ = seeded_kmeans(
                features_path=features_path,
                segment_info_path=segment_info_path,
                seed_participant=participant,
                output_dir=output_dir
            )

            # Calculate metrics
            results.append({
                'participant': participant,
                'silhouette': silhouette_score(X_scaled, cluster_labels),
                'davies_bouldin': davies_bouldin_score(X_scaled, cluster_labels),
                'calinski_harabasz': calinski_harabasz_score(X_scaled, cluster_labels)
            })

        except Exception as e:
            print(f"Error processing participant {participant}: {str(e)}")

    # Convert results to DataFrame and sort by silhouette score. Passing the
    # columns explicitly keeps the sort valid when no participant succeeded.
    results_df = pd.DataFrame(results, columns=metric_columns)
    results_df = results_df.sort_values('silhouette', ascending=False)

    if results_df.empty:
        print("\nNo participant could be evaluated; the comparison table is empty.")

    print("\nResults for all participants:")
    print(results_df)

    # Save results
    results_path = Path(base_output_dir).parent / 'results' / 'seeded_kmeans_comparison.csv'
    os.makedirs(results_path.parent, exist_ok=True)
    results_df.to_csv(results_path, index=False)
    print(f"\nSaved comparison results to: {results_path}")

    return results_df

if __name__ == "__main__":
    # Input and output locations, relative to the working directory
    features_path, segment_info_path, base_output_dir = (
        '../../../extracted_features/features/mfcc_extracted_features.npy',
        '../../../segment_info/segment_info_base_names.csv',
        '../../../cluster_assignments/seeded_kmeans',
    )

    # Try every participant as the seed and keep the comparison table
    results_df = evaluate_all_participants(features_path, segment_info_path, base_output_dir)


Evaluating with seed participant: P1
Loading features from: ../../../extracted_features/features/mfcc_extracted_features.npy
Loading segment info from: ../../../segment_info/segment_info_base_names.csv

Feature matrix shape: (5714, 12)
Number of segments: 5714

Getting centroids from participant P1

Seed Participant (P1) Data:
Number of examples per instrument:
instrument_label
kd     38
hhc    32
hho    31
sd     27
Name: count, dtype: int64

Centroid shape for hhc: (12,)

Centroid shape for hho: (12,)

Centroid shape for kd: (12,)

Centroid shape for sd: (12,)
Initial centroids shape: (4, 12)

Performing seeded k-means clustering...

Saved cluster assignments to: ../../../cluster_assignments/seeded_kmeans/seeded_kmeans_P1_cluster.npy

Cluster sizes:
Cluster 0: 1348 sounds
Cluster 1: 910 sounds
Cluster 2: 1357 sounds
Cluster 3: 2099 sounds

Clustering Metrics:
Silhouette Score: 0.124
Davies-Bouldin Index: 2.139
Calinski-Harabasz Index: 752.866

Saved confusion matrix visualization to