# Music Genre Discovery - Code Implementation

**Project:** Unsupervised Music Genre Discovery Using Audio Feature Learning  
**Author:** Anirudh Sharma  
**Date:** November 2025

---

## Overview

This notebook implements unsupervised clustering algorithms for music genre discovery, including:
- K-Means Clustering
- MiniBatch K-Means
- Spectral Clustering
- DBSCAN (implemented in Section 5.5, but not included in the Section 6 experiment runs)
- Gaussian Mixture Models (GMM)

**Evaluation Metrics:**
- Silhouette Score
- Davies-Bouldin Index
- Calinski-Harabasz Index
- Normalized Mutual Information (NMI)
- Adjusted Rand Index (ARI)
- V-Measure
- Cluster Accuracy (Hungarian Algorithm)

## 1. Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from time import time

# Scikit-learn imports
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    normalized_mutual_info_score,
    adjusted_rand_score,
    v_measure_score
)
from scipy.optimize import linear_sum_assignment

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet plus seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Figures and CSV files produced below are written under ./results
os.makedirs('results', exist_ok=True)

# Confirm the setup finished and report the key library versions
print(
    "✓ Libraries imported successfully!",
    f"  - pandas version: {pd.__version__}",
    f"  - numpy version: {np.__version__}",
    "  - scikit-learn imported",
    sep="\n",
)

## 2. Configuration

In [None]:
# Experiment-wide settings read by the cells below
CONFIG = dict(
    N_CLUSTERS=10,
    N_PCA_COMPONENTS=20,
    RANDOM_STATE=42,
    DBSCAN_EPS=2.5,
    DBSCAN_MIN_SAMPLES=5,
    SPLIT_RATIOS=[(50, 50), (60, 40), (70, 30), (80, 20)],
)

# Echo every setting between two rule lines
config_rule = "=" * 60
print("Configuration:")
print(config_rule)
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))
print(config_rule)

## 3. Load and Prepare Data

In [None]:
# Read the cleaned feature table from disk
df = pd.read_csv('gtzan/features_30_sec_cleaned.csv')

n_rows, n_cols = df.shape
load_rule = "=" * 80

print(load_rule)
print("DATASET LOADED")
print(load_rule)
print(f"\nShape: {df.shape}")
print(f"Samples: {n_rows}")
# Two columns ('filename' and 'label') are not model features
print(f"Features: {n_cols - 2} (excluding 'filename' and 'label')")
print(f"Genres: {df['label'].nunique()}")
print("\nGenre Distribution:")
print(df['label'].value_counts().sort_index())

In [None]:
# Every column except the identifier and the target is used as a feature
non_feature_columns = ('filename', 'label')
features = [name for name in df.columns if name not in non_feature_columns]
X = df[features].values
y = df['label'].values

# Turn the genre names into integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Code -> genre-name lookup, shown for reference
genre_mapping = dict(enumerate(label_encoder.classes_))

print("\n✓ Data prepared:")
print(f"  - Feature matrix shape: {X.shape}")
print(f"  - Label array shape: {y_encoded.shape}")
print(f"  - Genre mapping: {genre_mapping}")

## 4. Data Preprocessing

In [None]:
# Fit the scaler on the full feature matrix and transform it in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

scaling_rule = "=" * 80
print(scaling_rule, "FEATURE SCALING (StandardScaler)", scaling_rule, sep="\n")
print("\n✓ Features standardized")
# Global mean/std over all entries of the scaled matrix
print(f"  - Mean: {X_scaled.mean():.6f}")
print(f"  - Std: {X_scaled.std():.6f}")

In [None]:
# Project the standardized features onto the configured number of components
pca = PCA(
    n_components=CONFIG['N_PCA_COMPONENTS'],
    random_state=CONFIG['RANDOM_STATE'],
)
X_pca = pca.fit_transform(X_scaled)

# Total share of variance kept by the retained components
variance_explained = pca.explained_variance_ratio_.sum()

n_dims_before = X_scaled.shape[1]
n_dims_after = X_pca.shape[1]
pca_rule = "=" * 80

print(pca_rule, "DIMENSIONALITY REDUCTION (PCA)", pca_rule, sep="\n")
print("\n✓ PCA applied")
print(f"  - Original dimensions: {n_dims_before}")
print(f"  - Reduced dimensions: {n_dims_after}")
print(f"  - Variance explained: {variance_explained*100:.2f}%")

In [None]:
# Explained variance: per component (left panel) and cumulative (right panel)
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)

fig, (ax_each, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 5))

ax_each.plot(component_numbers, variance_ratios, 'bo-', linewidth=2)
ax_each.set_xlabel('Principal Component', fontweight='bold')
ax_each.set_ylabel('Variance Explained', fontweight='bold')
ax_each.set_title('PCA - Explained Variance by Component', fontweight='bold')
ax_each.grid(True, alpha=0.3)

ax_cumulative.plot(component_numbers, np.cumsum(variance_ratios), 'ro-', linewidth=2)
# Horizontal marker at the total variance retained by all kept components
ax_cumulative.axhline(y=variance_explained, color='g', linestyle='--',
                      label=f'{variance_explained*100:.2f}%')
ax_cumulative.set_xlabel('Number of Components', fontweight='bold')
ax_cumulative.set_ylabel('Cumulative Variance Explained', fontweight='bold')
ax_cumulative.set_title('PCA - Cumulative Variance Explained', fontweight='bold')
ax_cumulative.legend()
ax_cumulative.grid(True, alpha=0.3)

pca_plot_path = 'results/pca_variance.png'
plt.tight_layout()
plt.savefig(pca_plot_path, dpi=300, bbox_inches='tight')
print(f"\n✓ PCA variance plot saved: {pca_plot_path}")
plt.show()

## 5. Clustering Algorithms Implementation

### 5.1 Helper Functions

In [None]:
def calculate_cluster_accuracy(y_true, y_pred):
    """Calculate clustering accuracy using the Hungarian algorithm.

    Cluster ids are arbitrary, so clusters are matched one-to-one to true
    classes in the way that maximizes the number of agreeing samples; the
    accuracy is the share of samples covered by that matching.

    Args:
        y_true: 1-D sequence of reference labels. Any hashable/sortable
            values are accepted; they need not be contiguous integers.
        y_pred: 1-D sequence of cluster assignments, same length as y_true.

    Returns:
        Accuracy as a percentage in [0, 100]. Returns 0.0 for empty input.

    Raises:
        ValueError: If y_true and y_pred have different lengths.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {y_true.shape[0]} and {y_pred.shape[0]})"
        )
    if y_true.size == 0:
        return 0.0

    # Re-index both label sets to 0..k-1. Raw label values cannot be used as
    # matrix indices: a missing class (e.g. after noise filtering) would make
    # the largest label exceed the matrix size.
    _, true_idx = np.unique(y_true, return_inverse=True)
    _, pred_idx = np.unique(y_pred, return_inverse=True)

    # Contingency table: rows are true classes, columns are clusters
    cm = np.zeros((true_idx.max() + 1, pred_idx.max() + 1), dtype=int)
    np.add.at(cm, (true_idx, pred_idx), 1)

    # linear_sum_assignment minimizes cost, so negate to maximize agreement
    row_ind, col_ind = linear_sum_assignment(-cm)
    accuracy = cm[row_ind, col_ind].sum() / len(y_true)

    return accuracy * 100  # Return as percentage

def evaluate_clustering(X, y_true, labels, noise_points=0):
    """Score a clustering with internal and label-based metrics.

    Samples whose cluster label is -1 are dropped before any metric is
    computed.

    Args:
        X: Feature array; must support boolean-mask indexing.
        y_true: Reference labels aligned with X.
        labels: Cluster assignments aligned with X; -1 marks noise.
        noise_points: Value stored under 'noise_points' in the result. It is
            recorded as given, not derived from `labels`.

    Returns:
        A dict of metric name -> value, or None when fewer than two distinct
        clusters remain after the -1 entries are removed.
    """
    keep = labels != -1
    X_kept, y_kept, labels_kept = X[keep], y_true[keep], labels[keep]

    # The metrics below need at least two clusters to compare
    if len(np.unique(labels_kept)) < 2:
        return None

    return {
        # Internal metrics: use only the data and the assignments
        'silhouette': silhouette_score(X_kept, labels_kept),
        'davies_bouldin': davies_bouldin_score(X_kept, labels_kept),
        'calinski_harabasz': calinski_harabasz_score(X_kept, labels_kept),
        # External metrics: compare assignments with the reference labels
        'nmi': normalized_mutual_info_score(y_kept, labels_kept),
        'ari': adjusted_rand_score(y_kept, labels_kept),
        'v_measure': v_measure_score(y_kept, labels_kept),
        'cluster_accuracy': calculate_cluster_accuracy(y_kept, labels_kept),
        # Bookkeeping
        'valid_samples': len(labels_kept),
        'noise_points': noise_points,
    }

print("✓ Helper functions defined")

### 5.2 K-Means Clustering

In [None]:
def kmeans_clustering(X_train, y_train, n_clusters):
    """Fit K-Means on X_train and report the wall-clock time.

    Args:
        X_train: Samples to cluster.
        y_train: Accepted for a uniform call signature; not used here.
        n_clusters: Number of clusters to form.

    Returns:
        (labels, model): per-sample cluster assignments and the fitted
        KMeans estimator.
    """
    print(f"\n→ Running K-Means (n_clusters={n_clusters})...")
    started = time()

    settings = {
        'n_clusters': n_clusters,
        'random_state': CONFIG['RANDOM_STATE'],
        'n_init': 10,
        'max_iter': 300,
    }
    kmeans = KMeans(**settings)
    labels = kmeans.fit_predict(X_train)

    print(f"  ✓ Completed in {time() - started:.2f}s")

    return labels, kmeans

print("✓ K-Means function defined")

### 5.3 MiniBatch K-Means Clustering

In [None]:
def minibatch_kmeans_clustering(X_train, y_train, n_clusters):
    """Fit MiniBatch K-Means on X_train and report the wall-clock time.

    Args:
        X_train: Samples to cluster.
        y_train: Accepted for a uniform call signature; not used here.
        n_clusters: Number of clusters to form.

    Returns:
        (labels, model): per-sample cluster assignments and the fitted
        MiniBatchKMeans estimator.
    """
    print(f"\n→ Running MiniBatch K-Means (n_clusters={n_clusters})...")
    started = time()

    settings = {
        'n_clusters': n_clusters,
        'random_state': CONFIG['RANDOM_STATE'],
        'batch_size': 100,
        'n_init': 10,
    }
    mbkmeans = MiniBatchKMeans(**settings)
    labels = mbkmeans.fit_predict(X_train)

    print(f"  ✓ Completed in {time() - started:.2f}s")

    return labels, mbkmeans

print("✓ MiniBatch K-Means function defined")

### 5.4 Spectral Clustering

In [None]:
def spectral_clustering(X_train, y_train, n_clusters):
    """Fit Spectral Clustering on X_train and report the wall-clock time.

    The affinity matrix is built from a 10-nearest-neighbour graph.

    Args:
        X_train: Samples to cluster.
        y_train: Accepted for a uniform call signature; not used here.
        n_clusters: Number of clusters to form.

    Returns:
        (labels, model): per-sample cluster assignments and the fitted
        SpectralClustering estimator.
    """
    print(f"\n→ Running Spectral Clustering (n_clusters={n_clusters})...")
    started = time()

    settings = {
        'n_clusters': n_clusters,
        'random_state': CONFIG['RANDOM_STATE'],
        'affinity': 'nearest_neighbors',
        'n_neighbors': 10,
    }
    spectral = SpectralClustering(**settings)
    labels = spectral.fit_predict(X_train)

    print(f"  ✓ Completed in {time() - started:.2f}s")

    return labels, spectral

print("✓ Spectral Clustering function defined")

### 5.5 DBSCAN Clustering

In [None]:
def dbscan_clustering(X_train, y_train):
    """Fit DBSCAN on X_train using the eps/min_samples values from CONFIG.

    Prints the elapsed time, the number of clusters found and the number of
    samples labelled as noise (-1).

    Args:
        X_train: Samples to cluster.
        y_train: Accepted for a uniform call signature; not used here.

    Returns:
        (labels, model): per-sample cluster assignments (-1 for noise) and
        the fitted DBSCAN estimator.
    """
    eps = CONFIG['DBSCAN_EPS']
    min_samples = CONFIG['DBSCAN_MIN_SAMPLES']
    print(f"\n→ Running DBSCAN (eps={eps}, min_samples={min_samples})...")
    started = time()

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(X_train)

    # The noise label -1 is not a cluster, so leave it out of the count
    n_clusters = len(set(labels) - {-1})
    n_noise = int(np.count_nonzero(labels == -1))

    print(f"  ✓ Completed in {time() - started:.2f}s")
    print(f"  - Clusters found: {n_clusters}")
    print(f"  - Noise points: {n_noise}")

    return labels, dbscan

print("✓ DBSCAN function defined")

### 5.6 Gaussian Mixture Model (GMM)

In [None]:
def gmm_clustering(X_train, y_train, n_clusters):
    """Fit a full-covariance Gaussian Mixture on X_train and time it.

    Args:
        X_train: Samples to cluster.
        y_train: Accepted for a uniform call signature; not used here.
        n_clusters: Number of mixture components.

    Returns:
        (labels, model): per-sample component assignments and the fitted
        GaussianMixture estimator.
    """
    print(f"\n→ Running GMM (n_components={n_clusters})...")
    started = time()

    settings = {
        'n_components': n_clusters,
        'random_state': CONFIG['RANDOM_STATE'],
        'covariance_type': 'full',
        'max_iter': 100,
    }
    gmm = GaussianMixture(**settings)
    labels = gmm.fit_predict(X_train)

    print(f"  ✓ Completed in {time() - started:.2f}s")

    return labels, gmm

print("✓ GMM function defined")

## 6. Run Clustering Experiments

In [None]:
# Accumulators filled by the experiment loop:
#   all_results            - one metrics dict per (algorithm, split) run
#   cluster_visualizations - data kept for the 2D cluster plots
all_results = list()
cluster_visualizations = dict()

experiments_rule = "=" * 80
n_split_ratios = len(CONFIG['SPLIT_RATIOS'])

print(experiments_rule)
print("RUNNING CLUSTERING EXPERIMENTS")
print(experiments_rule)
print("\nAlgorithms: K-Means, MiniBatch K-Means, Spectral Clustering, GMM")
print(f"Split ratios: {CONFIG['SPLIT_RATIOS']}")
# Four algorithms are run per split ratio
print(f"Total experiments: {n_split_ratios * 4}")

In [None]:
# Run experiments for each split ratio.
#
# Each entry: (name stored in the results, clustering function, key prefix
# for cluster_visualizations). A prefix of None means the run is scored but
# its assignments are not kept for the 2D plots.
ALGORITHM_RUNS = [
    ('K-Means', kmeans_clustering, 'kmeans'),
    ('MiniBatch K-Means', minibatch_kmeans_clustering, None),
    ('Spectral Clustering', spectral_clustering, 'spectral'),
    ('GMM', gmm_clustering, None),
]

for split_idx, (train_pct, test_pct) in enumerate(CONFIG['SPLIT_RATIOS'], 1):
    print("\n" + "=" * 80)
    print(f"EXPERIMENT SET {split_idx}: {train_pct}-{test_pct} Train-Test Split")
    print("=" * 80)

    # Stratified split. Only the training portion is clustered and scored
    # below; X_test / y_test are reported by size but otherwise unused here.
    test_size = test_pct / 100
    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y_encoded,
        test_size=test_size,
        random_state=CONFIG['RANDOM_STATE'],
        stratify=y_encoded
    )

    print(f"\nTrain samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    split_name = f"{train_pct}-{test_pct}"

    for algo_name, run_algorithm, viz_prefix in ALGORITHM_RUNS:
        algo_labels, _fitted_model = run_algorithm(
            X_train, y_train, CONFIG['N_CLUSTERS']
        )
        algo_results = evaluate_clustering(X_train, y_train, algo_labels)

        # evaluate_clustering returns None when fewer than two clusters
        # were produced; such a run is left out of the results.
        if not algo_results:
            continue

        algo_results['algorithm'] = algo_name
        algo_results['split'] = split_name
        all_results.append(algo_results)

        if viz_prefix is not None:
            cluster_visualizations[f'{viz_prefix}_{split_name}'] = (
                X_train, algo_labels, y_train
            )

print("\n" + "=" * 80)
print("✓ ALL EXPERIMENTS COMPLETED")
print("=" * 80)

## 7. Results Analysis

In [None]:
# Build the results table with the identifying columns first
column_order = ['algorithm', 'split', 'silhouette', 'davies_bouldin',
                'calinski_harabasz', 'nmi', 'ari', 'v_measure',
                'cluster_accuracy', 'valid_samples', 'noise_points']
results_df = pd.DataFrame(all_results)[column_order]

results_rule = "=" * 80
print(results_rule, "CLUSTERING RESULTS", results_rule, sep="\n")
print("\nAll Results:")
print(results_df.to_string(index=False))

# Persist the per-run results
results_csv_path = 'results/clustering_results.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"\n✓ Results saved: {results_csv_path}")

In [None]:
# Mean of each metric per algorithm (averaged over the split ratios),
# rounded to four decimals
averaged_metrics = ['silhouette', 'davies_bouldin', 'calinski_harabasz',
                    'nmi', 'ari', 'v_measure', 'cluster_accuracy']
summary = results_df.groupby('algorithm')[averaged_metrics].mean().round(4)

summary_rule = "=" * 80
print("\n" + summary_rule)
print("AVERAGE PERFORMANCE BY ALGORITHM")
print(summary_rule)
print(summary)

# Persist the summary (the algorithm name is kept as the index column)
summary_csv_path = 'results/summary_table.csv'
summary.to_csv(summary_csv_path)
print(f"\n✓ Summary saved: {summary_csv_path}")

In [None]:
# Report the winning algorithm for each averaged metric
best_rule = "=" * 80
print("\n" + best_rule)
print("BEST ALGORITHM BY METRIC")
print(best_rule)

# Metrics where a larger value is better
metrics = ['silhouette', 'nmi', 'ari', 'v_measure', 'cluster_accuracy']
for metric in metrics:
    best_algo = summary[metric].idxmax()
    best_value = summary.loc[best_algo, metric]
    print(f"{metric:25s}: {best_algo:25s} ({best_value:.4f})")

# Davies-Bouldin is the exception: the smallest value wins
best_algo = summary['davies_bouldin'].idxmin()
best_value = summary.loc[best_algo, 'davies_bouldin']
print(f"{'davies_bouldin':25s}: {best_algo:25s} ({best_value:.4f}) [lower is better]")

## 8. Visualizations

### 8.1 Metrics Comparison Heatmap

In [None]:
# Create metrics comparison heatmap
plt.figure(figsize=(14, 8))

# Prepare data for heatmap
metrics_to_plot = ['silhouette', 'nmi', 'ari', 'v_measure', 'cluster_accuracy']
heatmap_data = summary[metrics_to_plot].copy()

# cluster_accuracy is stored as a percentage (0-100) while the other scores
# are at most 1. On a shared colour scale the percentage row would swamp all
# the others, so express it as a fraction for this plot only.
heatmap_data['cluster_accuracy'] = heatmap_data['cluster_accuracy'] / 100
heatmap_data = heatmap_data.rename(
    columns={'cluster_accuracy': 'cluster_accuracy (fraction)'}
).T

sns.heatmap(heatmap_data, annot=True, fmt='.4f', cmap='YlGnBu',
           linewidths=0.5, cbar_kws={'label': 'Score'})
plt.title('Clustering Performance Metrics Comparison (Average Across All Splits)',
         fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Algorithm', fontsize=12, fontweight='bold')
plt.ylabel('Metric', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('results/metrics_comparison.png', dpi=300, bbox_inches='tight')
print("✓ Metrics comparison saved: results/metrics_comparison.png")
plt.show()

### 8.2 Performance by Split Ratio

In [None]:
# One panel per metric, one line per algorithm, x-axis = split ratio
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

metrics_to_plot = ['silhouette', 'davies_bouldin', 'nmi', 'ari', 'v_measure', 'cluster_accuracy']
metric_labels = ['Silhouette Score', 'Davies-Bouldin Index', 'NMI', 'ARI', 'V-Measure', 'Cluster Accuracy (%)']

for ax, metric, label in zip(axes, metrics_to_plot, metric_labels):
    # sort=False keeps the algorithms in their order of first appearance
    for algo, algo_data in results_df.groupby('algorithm', sort=False):
        ax.plot(algo_data['split'], algo_data[metric], marker='o', label=algo, linewidth=2)

    ax.set_xlabel('Train-Test Split', fontweight='bold')
    ax.set_ylabel(label, fontweight='bold')
    ax.set_title(f'{label} by Split Ratio', fontweight='bold')
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

split_plot_path = 'results/performance_by_split.png'
plt.tight_layout()
plt.savefig(split_plot_path, dpi=300, bbox_inches='tight')
print(f"✓ Performance by split saved: {split_plot_path}")
plt.show()

### 8.3 Radar Chart - Algorithm Comparison

In [None]:
# Create radar chart for algorithm comparison
from math import pi

# Metrics are listed here explicitly so the chart does not depend on which
# other cell last assigned `metrics_to_plot`.
radar_metrics = ['silhouette', 'davies_bouldin', 'nmi', 'ari', 'v_measure', 'cluster_accuracy']
# Metrics where a smaller raw value is better; they are flipped after scaling
# so that "further from the centre" means "better" on every axis.
lower_is_better = {'davies_bouldin'}

# Normalize metrics to 0-1 scale (min-max across algorithms)
normalized_summary = summary[radar_metrics].copy()
for col in normalized_summary.columns:
    min_val = normalized_summary[col].min()
    max_val = normalized_summary[col].max()
    if max_val > min_val:
        scaled = (normalized_summary[col] - min_val) / (max_val - min_val)
        if col in lower_is_better:
            scaled = 1 - scaled
        normalized_summary[col] = scaled
    elif max_val == min_val:
        # All algorithms tie on this metric: plot them together at the rim
        # rather than leaving raw values that may fall outside the 0-1 axis.
        normalized_summary[col] = 1.0

# Setup radar chart
categories = [
    f'{name} (inverted)' if name in lower_is_better else name
    for name in radar_metrics
]
N = len(categories)
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]  # repeat the first angle to close the polygon

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

for algo in normalized_summary.index:
    values = normalized_summary.loc[algo].values.tolist()
    values += values[:1]  # close the polygon
    ax.plot(angles, values, 'o-', linewidth=2, label=algo)
    ax.fill(angles, values, alpha=0.15)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=11)
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=9)
ax.grid(True)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=11)
plt.title('Algorithm Performance Comparison (Normalized Metrics)',
         fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig('results/radar_chart.png', dpi=300, bbox_inches='tight')
print("✓ Radar chart saved: results/radar_chart.png")
plt.show()

### 8.4 Cluster Visualizations (2D PCA)

In [None]:
# Visualize clusters for selected algorithms
def visualize_clusters_2d(X, labels, y_true, title, filename):
    """Plot predicted clusters next to true labels in a 2D projection.

    A fresh two-component PCA is fitted on X to obtain the plotting
    coordinates. The figure is saved under results/ and then shown.

    Args:
        X: Samples to project and plot.
        labels: Cluster assignment per sample (colours the left panel).
        y_true: Reference label per sample (colours the right panel).
        title: First line of each panel title.
        filename: File name of the saved image inside results/.
    """
    projector = PCA(n_components=2, random_state=CONFIG['RANDOM_STATE'])
    coords = projector.fit_transform(X)
    first_pc, second_pc = coords[:, 0], coords[:, 1]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (axis, point colours, panel subtitle, colorbar label)
    panels = (
        (axes[0], labels, 'Predicted Clusters', 'Cluster'),
        (axes[1], y_true, 'True Labels', 'Genre'),
    )
    for panel_ax, point_colours, subtitle, bar_label in panels:
        points = panel_ax.scatter(first_pc, second_pc, c=point_colours,
                                  cmap='tab10', alpha=0.6, edgecolors='k', s=50)
        panel_ax.set_xlabel('First Principal Component', fontweight='bold', fontsize=11)
        panel_ax.set_ylabel('Second Principal Component', fontweight='bold', fontsize=11)
        panel_ax.set_title(f'{title}\n{subtitle}', fontweight='bold', fontsize=12)
        panel_ax.grid(True, alpha=0.3)
        plt.colorbar(points, ax=panel_ax, label=bar_label)

    output_path = f'results/{filename}'
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"✓ Cluster visualization saved: {output_path}")
    plt.show()

# Visualize best performing algorithms
print("\nGenerating cluster visualizations...\n")
for key, (X, labels, y_true) in list(cluster_visualizations.items())[:2]:
    algo_name = key.split('_')[0].title()
    split = key.split('_')[1]
    visualize_clusters_2d(X, labels, y_true, 
                         f'{algo_name} Clustering ({split} split)',
                         f'cluster_viz_{key}.png')

## 9. Summary and Conclusions

In [None]:
# Final report: what was implemented, how it was scored, and which algorithm
# achieved the highest average cluster accuracy.
print("=" * 80)
print("CLUSTERING IMPLEMENTATION SUMMARY")
print("=" * 80)

print("\n✓ Implementation Complete!")

print("\n1. Algorithms Implemented:")
print("   - K-Means Clustering")
print("   - MiniBatch K-Means")
print("   - Spectral Clustering")
print("   - Gaussian Mixture Model (GMM)")

print("\n2. Evaluation Metrics:")
print("   - Silhouette Score")
print("   - Davies-Bouldin Index")
print("   - Calinski-Harabasz Index")
print("   - Normalized Mutual Information (NMI)")
print("   - Adjusted Rand Index (ARI)")
print("   - V-Measure")
print("   - Cluster Accuracy (Hungarian Algorithm)")

print("\n3. Experiments:")
print(f"   - Total experiments run: {len(all_results)}")
print(f"   - Split ratios tested: {CONFIG['SPLIT_RATIOS']}")

print("\n4. Best Performing Algorithm:")
best_overall = summary['cluster_accuracy'].idxmax()
best_acc = summary.loc[best_overall, 'cluster_accuracy']
# Chance-level accuracy (in percent) for uniformly random guessing over the
# encoded genres, derived from the data instead of being hard-coded as 10.
chance_accuracy = 100 / len(label_encoder.classes_)
print(f"   - Algorithm: {best_overall}")
print(f"   - Average Accuracy: {best_acc:.2f}%")
print(f"   - Improvement over random: {(best_acc/chance_accuracy - 1)*100:.2f}% (baseline: {chance_accuracy:g}%)")

print("\n5. Generated Files:")
print("   - results/clustering_results.csv")
print("   - results/summary_table.csv")
print("   - results/pca_variance.png")
print("   - results/metrics_comparison.png")
print("   - results/performance_by_split.png")
print("   - results/radar_chart.png")
print("   - results/cluster_viz_*.png")

print("\n" + "=" * 80)
print("PROJECT COMPLETED SUCCESSFULLY!")
print("=" * 80)