# Unsupervised Music Genre Discovery Using Audio Feature Learning

**Author:** Music Genre Analysis Project  
**Date:** November 2025  
**Platform:** Kaggle Notebook

## Project Overview
This notebook implements comprehensive unsupervised learning algorithms for music genre discovery using Spotify audio features.

### Objectives:
- Perform exploratory data analysis on 170K+ music tracks
- Apply 5 clustering algorithms: K-Means, MiniBatch K-Means, Spectral Clustering, DBSCAN, GMM
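- Evaluate using 3 internal metrics (Silhouette, Davies-Bouldin, Calinski-Harabasz); the external metrics ARI, NMI, and V-Measure are imported but not computed below, since they require ground-truth genre labels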
- Compare performance across different train/test splits (50-50, 60-40, 70-30, 80-20)

## Step 1: Install and Import Required Libraries

In [None]:
# Install required packages (if needed)
!pip install plotly yellowbrick -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, MiniBatchKMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
    v_measure_score
)

# Statistical Analysis
from scipy import stats
import os
from datetime import datetime

# Plotting defaults: seaborn's white-grid theme plus a 12x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation banner so the notebook user knows the setup cell finished.
print("‚úÖ All libraries imported successfully!")

## Step 2: Upload and Load Dataset

**Instructions:**
1. Upload your Spotify dataset CSV files to Kaggle
2. Update the path below to match your dataset location
3. The main dataset should be `data.csv` with all audio features

In [None]:
# Location of the tracks CSV inside the Kaggle input directory.
# Point this at wherever the dataset is mounted in your own notebook.
DATA_PATH = '/kaggle/input/spotify-dataset/data.csv'  # Modify as needed

# Alternative: if you have the folder structure
# DATA_PATH = '/kaggle/input/your-dataset-name/Spotify/data/data.csv'

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Quick sanity report: size, column names, and a preview of the first rows.
column_names = list(df.columns)
print("‚úÖ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
df.head()  # last expression of the cell, so the notebook renders the preview

## Step 3: Data Preprocessing and Cleaning

In [None]:
# ---------------------------------------------------------------------------
# Data preprocessing
# Reads `df` (the raw tracks loaded earlier) and produces:
#   * `df`       - same frame with missing feature values filled and duplicates dropped
#   * `df_clean` - `df` with extreme outliers removed and derived columns added
#   * `data`     - independent copy of `df_clean` used by the later cells
# The counters `original_len`, `duplicates_removed`, `outliers_removed` and
# `outlier_percentage` stay in the global namespace because the final report reads them.
# ---------------------------------------------------------------------------

# Define audio features to analyze
audio_features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence'
]

additional_features = ['duration_ms', 'popularity', 'key', 'mode']
all_features = audio_features + additional_features

# A row is an outlier when a feature lies more than this many IQRs outside [Q1, Q3].
IQR_MULTIPLIER = 3

print("\nüìä Data Preprocessing Steps:")
print("="*60)

# 1. Check initial data
print(f"\n1. Initial dataset shape: {df.shape}")
print(f"   Total tracks: {len(df):,}")

# 2. Check for missing values
print(f"\n2. Missing values check:")
missing = df[all_features].isnull().sum()
columns_with_gaps = missing[missing > 0]
if not columns_with_gaps.empty:
    print(columns_with_gaps)
    print(f"   ‚Üí Filling missing values with column mean...")
    for col in columns_with_gaps.index:
        # Assign the filled column back rather than calling
        # `df[col].fillna(..., inplace=True)`: the in-place form is chained
        # assignment, which pandas deprecates and which leaves `df` untouched
        # when Copy-on-Write is enabled.
        df[col] = df[col].fillna(df[col].mean())
else:
    print("   ‚úÖ No missing values found!")

# 3. Remove duplicates (same title, artists and duration count as one track)
original_len = len(df)
df = df.drop_duplicates(subset=['name', 'artists', 'duration_ms'])
duplicates_removed = original_len - len(df)
print(f"\n3. Duplicate removal:")
print(f"   Removed {duplicates_removed:,} duplicate tracks")
print(f"   Remaining: {len(df):,} tracks")

# 4. Remove outliers using IQR method
# Features are filtered one after another, so each feature's quartiles are
# computed on the rows that survived the previous features.
print(f"\n4. Outlier detection and removal:")
df_clean = df.copy()

for feature in all_features:
    Q1 = df_clean[feature].quantile(0.25)
    Q3 = df_clean[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    df_clean = df_clean[
        (df_clean[feature] >= lower_bound) &
        (df_clean[feature] <= upper_bound)
    ]

outliers_removed = len(df) - len(df_clean)
# Guard the percentage so an empty input does not raise ZeroDivisionError.
outlier_percentage = (outliers_removed / len(df)) * 100 if len(df) else 0.0
print(f"   Removed {outliers_removed:,} outliers ({outlier_percentage:.2f}%)")
print(f"   Final clean dataset: {len(df_clean):,} tracks")

# 5. Add derived features
df_clean['duration_sec'] = df_clean['duration_ms'] / 1000
df_clean['decade'] = (df_clean['year'] // 10) * 10  # e.g. 1987 -> 1980

print(f"\n‚úÖ Data cleaning completed!")
print(f"   Final shape: {df_clean.shape}")

# Store clean data
data = df_clean.copy()

## Step 4: Exploratory Data Analysis (EDA)

In [None]:
# Text-only EDA: summary table, per-feature location/spread, and shape statistics.
print("\nüìà EXPLORATORY DATA ANALYSIS")
print("="*60)

# Section 1: pandas' built-in summary for every clustering feature.
print("\n1. Descriptive Statistics:")
stats_df = data[all_features].describe()
print(stats_df)

# Section 2: fixed-width table with one row per audio feature.
table_rule = "="*80
stat_headers = ('Mean', 'Median', 'Q1(25%)', 'Q3(75%)', 'Std')

print("\n2. Statistical Measures for Each Feature:")
print(table_rule)
print(f"{'Feature':<20} " + " ".join(f"{header:<10}" for header in stat_headers))
print(table_rule)

for feature in audio_features:
    column = data[feature]
    row_values = (
        column.mean(),
        column.median(),
        column.quantile(0.25),
        column.quantile(0.75),
        column.std(),
    )
    print(f"{feature:<20} " + " ".join(f"{value:<10.4f}" for value in row_values))

# Section 3: skewness and kurtosis as reported by pandas for each audio feature.
print("\n3. Distribution Pattern Analysis:")
for feature in audio_features:
    column = data[feature]
    print(f"   {feature}: Skewness={column.skew():.3f}, Kurtosis={column.kurtosis():.3f}")

### Visualization 1: Feature Distributions

In [None]:
# Histogram grid: one panel per audio feature, with mean and median marked.
fig, axes_grid = plt.subplots(3, 3, figsize=(18, 12))
axes = axes_grid.ravel()

for ax, feature in zip(axes, audio_features):
    values = data[feature]
    mean_value = values.mean()
    median_value = values.median()
    pretty_name = feature.capitalize()

    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    # Dashed vertical markers: red for the mean, green for the median.
    ax.axvline(mean_value, color='red', linestyle='--',
               linewidth=2, label=f'Mean: {mean_value:.3f}')
    ax.axvline(median_value, color='green', linestyle='--',
               linewidth=2, label=f'Median: {median_value:.3f}')

    ax.set_title(f'{pretty_name} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Feature distribution plots created!")

### Visualization 2: Box Plots for Outlier Detection

In [None]:
# Box-plot grid: one vertical box per audio feature to eyeball remaining outliers.
fig, axes_grid = plt.subplots(3, 3, figsize=(18, 12))
axes = axes_grid.ravel()

# Shared styling for every panel: light-blue box, bold red median line.
box_style = dict(facecolor='lightblue', alpha=0.7)
median_style = dict(color='red', linewidth=2)

for ax, feature in zip(axes, audio_features):
    pretty_name = feature.capitalize()
    # The return value is kept in `box_data`, although nothing below reads it.
    box_data = ax.boxplot(data[feature], vert=True, patch_artist=True,
                          boxprops=box_style,
                          medianprops=median_style)
    ax.set_title(f'{pretty_name} Box Plot', fontsize=12, fontweight='bold')
    ax.set_ylabel(pretty_name)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('box_plots.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Box plots created!")

### Visualization 3: Correlation Analysis

In [None]:
# Correlation heatmap over every clustering feature, followed by a text list of
# the strongly correlated feature pairs.
plt.figure(figsize=(14, 10))
correlation_matrix = data[all_features].corr()

sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Pairs whose absolute correlation exceeds this value are reported.
CORRELATION_THRESHOLD = 0.5

print("\nüìä Correlation Analysis:")
# The filter uses the absolute value, so strong negative correlations are listed
# too; the header says so instead of claiming "positive" only.
print(f"Strong correlations (|r| > {CORRELATION_THRESHOLD}):")

feature_names = list(correlation_matrix.columns)
strong_pairs_found = False
# Walk the upper triangle only, so each unordered pair is visited once and the
# diagonal (always 1.0) is skipped.
for i, first in enumerate(feature_names):
    for j in range(i + 1, len(feature_names)):
        r = correlation_matrix.iloc[i, j]
        if abs(r) > CORRELATION_THRESHOLD:
            strong_pairs_found = True
            print(f"   {first} ‚Üî {feature_names[j]}: {r:.3f}")

if not strong_pairs_found:
    print("   (none)")

print("\n‚úÖ Correlation heatmap created!")

## Step 5: Feature Preparation and Scaling

In [None]:
# Build the matrix handed to every clustering algorithm:
# select features -> standardize -> project with PCA.
print("\nüîß FEATURE PREPARATION")
print("="*60)

# Select features for clustering
features = data[all_features].copy()
n_samples, n_selected = features.shape
print(f"\n1. Selected features shape: {features.shape}")
print(f"   Features: {len(all_features)}")
print(f"   Samples: {n_samples:,}")

# Standardize with scikit-learn's StandardScaler.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
print("\n2. ‚úÖ Features scaled using StandardScaler")

# PCA projection; the component count is capped at 13 and never exceeds the
# number of selected columns.
n_components = min(n_selected, 13)
pca = PCA(n_components=n_components)
processed_data = pca.fit_transform(scaled_features)

variance_ratios = pca.explained_variance_ratio_
print("\n3. PCA Dimensionality Reduction:")
print(f"   Original dimensions: {scaled_features.shape[1]}")
print(f"   Reduced dimensions: {processed_data.shape[1]}")
print(f"   Explained variance: {variance_ratios.sum():.4f}")
print("\n   Variance per component:")
# Only the five leading components are listed.
for component_number, var in enumerate(variance_ratios[:5], start=1):
    print(f"   PC{component_number}: {var:.4f} ({var*100:.2f}%)")

print("\n‚úÖ Feature preparation completed!")
print(f"   Ready for clustering with shape: {processed_data.shape}")

## Step 6: Clustering - K-Means Algorithm

In [None]:
# K-Means on the PCA-projected data. `n_clusters` is defined here and reused by
# the later clustering cells.
print("\nüéØ CLUSTERING ALGORITHM 1: K-MEANS")
print("="*60)

n_clusters = 10
kmeans = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    max_iter=300,
    random_state=42,
)

print(f"\nFitting K-Means with {n_clusters} clusters...")
kmeans_labels = kmeans.fit_predict(processed_data)

print("\n‚úÖ K-Means clustering completed!")
print(f"   Clusters found: {len(set(kmeans_labels))}")
print(f"   Inertia: {kmeans.inertia_:.2f}")
print("\n   Cluster distribution:")

# Size of each cluster and its share of all samples.
total_points = len(kmeans_labels)
unique, counts = np.unique(kmeans_labels, return_counts=True)
for cluster, count in zip(unique, counts):
    share = (count / total_points) * 100
    print(f"   Cluster {cluster}: {count:,} samples ({share:.2f}%)")

## Step 7: Clustering - MiniBatch K-Means

In [None]:
# MiniBatch K-Means with the same cluster count as the K-Means cell, fitted in
# batches of 1000 samples.
print("\nüéØ CLUSTERING ALGORITHM 2: MINIBATCH K-MEANS")
print("="*60)

mbkmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=1000,
    random_state=42,
)

print(f"\nFitting MiniBatch K-Means with {n_clusters} clusters...")
mbkmeans_labels = mbkmeans.fit_predict(processed_data)

print("\n‚úÖ MiniBatch K-Means clustering completed!")
print(f"   Clusters found: {len(set(mbkmeans_labels))}")
print(f"   Inertia: {mbkmeans.inertia_:.2f}")
print("\n   Cluster distribution:")

# Size of each cluster and its share of all samples.
total_points = len(mbkmeans_labels)
unique, counts = np.unique(mbkmeans_labels, return_counts=True)
for cluster, count in zip(unique, counts):
    share = (count / total_points) * 100
    print(f"   Cluster {cluster}: {count:,} samples ({share:.2f}%)")

## Step 8: Clustering - Spectral Clustering (Sample-based for memory)

In [None]:
# Spectral Clustering. When the dataset exceeds SPECTRAL_SAMPLE_SIZE rows, only a
# random subsample is clustered; rows outside the subsample get the label -1 in
# `spectral_labels`.
print("\nüéØ CLUSTERING ALGORITHM 3: SPECTRAL CLUSTERING")
print("="*60)

# Note: Spectral Clustering is memory-intensive
# We'll use a sample if dataset is too large
SPECTRAL_SAMPLE_SIZE = 20000  # Adjust based on available RAM
SPECTRAL_SEED = 42            # Same seed the estimators use via random_state

if len(processed_data) > SPECTRAL_SAMPLE_SIZE:
    print(f"\n‚ö†Ô∏è  Dataset too large for full Spectral Clustering")
    print(f"   Using sample of {SPECTRAL_SAMPLE_SIZE:,} points...")

    # Draw the subsample from a seeded generator: the previous unseeded
    # np.random.choice produced a different sample (and different clusters) on
    # every run even though the estimator itself used random_state=42.
    rng = np.random.default_rng(SPECTRAL_SEED)
    sample_indices = rng.choice(len(processed_data), SPECTRAL_SAMPLE_SIZE, replace=False)
    spectral_data = processed_data[sample_indices]

    spectral = SpectralClustering(n_clusters=n_clusters, random_state=42, affinity='nearest_neighbors')
    spectral_labels_sample = spectral.fit_predict(spectral_data)

    # Create full labels array with -1 for non-sampled points
    spectral_labels = np.full(len(processed_data), -1)
    spectral_labels[sample_indices] = spectral_labels_sample

    print(f"\n‚úÖ Spectral Clustering completed on sample!")
    print(f"   Sampled points: {SPECTRAL_SAMPLE_SIZE:,}")
    print(f"   Clusters found: {len(set(spectral_labels_sample))}")
else:
    print(f"\nFitting Spectral Clustering with {n_clusters} clusters...")
    # NOTE(review): this branch uses the estimator's default affinity while the
    # sampled branch uses 'nearest_neighbors' - confirm the difference is intended.
    spectral = SpectralClustering(n_clusters=n_clusters, random_state=42)
    spectral_labels = spectral.fit_predict(processed_data)
    print(f"\n‚úÖ Spectral Clustering completed!")
    print(f"   Clusters found: {len(set(spectral_labels))}")

# Distribution over the rows that actually received a cluster (label != -1).
valid_labels = spectral_labels[spectral_labels != -1]
unique, counts = np.unique(valid_labels, return_counts=True)
print(f"\n   Cluster distribution:")
for cluster, count in zip(unique, counts):
    percentage = (count / len(valid_labels)) * 100
    print(f"   Cluster {cluster}: {count:,} samples ({percentage:.2f}%)")

## Step 9: Clustering - DBSCAN

In [None]:
# DBSCAN with a small eps sweep. The eps giving the most clusters without
# exceeding MAX_DBSCAN_CLUSTERS wins; if every eps exceeds the cap, the first
# tested eps is used as a fallback.
# Outputs: `dbscan_labels`, `best_eps`, `best_n_clusters`, `best_dbscan`.
print("\nüéØ CLUSTERING ALGORITHM 4: DBSCAN")
print("="*60)

# Try different eps values to find optimal clusters
eps_values = [0.3, 0.5, 0.8, 1.0]
MAX_DBSCAN_CLUSTERS = 20

best_dbscan = None
best_n_clusters = -1      # sentinel: no eps within the cap has been seen yet
best_eps = eps_values[0]
dbscan_labels = None
fallback = None           # (n_clusters, eps, model, labels) of the first eps tested

print("\nTesting different eps values...")
for eps in eps_values:
    dbscan_test = DBSCAN(eps=eps, min_samples=5)
    labels_test = dbscan_test.fit_predict(processed_data)

    # DBSCAN marks noise with the label -1; it is not counted as a cluster.
    n_noise = int(np.sum(labels_test == -1))
    n_clusters_test = len(np.unique(labels_test)) - (1 if n_noise else 0)

    print(f"   eps={eps}: {n_clusters_test} clusters, {n_noise:,} noise points")

    if fallback is None:
        fallback = (n_clusters_test, eps, dbscan_test, labels_test)

    # Keep the fallback separate from the running best. Previously the first
    # result overwrote `best_n_clusters` even when it exceeded the cap, which
    # made it impossible for any later, valid eps to be selected.
    if best_n_clusters < n_clusters_test <= MAX_DBSCAN_CLUSTERS:
        best_n_clusters = n_clusters_test
        best_eps = eps
        best_dbscan = dbscan_test
        dbscan_labels = labels_test

if dbscan_labels is None:
    # No eps stayed within the cap: fall back to the first one tested.
    best_n_clusters, best_eps, best_dbscan, dbscan_labels = fallback

print(f"\n‚úÖ Using DBSCAN with eps={best_eps}")
print(f"   Clusters found: {best_n_clusters}")
print(f"   Noise points: {int(np.sum(dbscan_labels == -1)):,}")

if best_n_clusters > 1:
    clustered_labels = dbscan_labels[dbscan_labels != -1]
    unique, counts = np.unique(clustered_labels, return_counts=True)
    print(f"\n   Cluster distribution (excluding noise):")
    for cluster, count in zip(unique, counts):
        percentage = (count / len(clustered_labels)) * 100
        print(f"   Cluster {cluster}: {count:,} samples ({percentage:.2f}%)")

## Step 10: Clustering - Gaussian Mixture Model (GMM)

In [None]:
# Gaussian Mixture Model with one component per target cluster; each sample is
# given the label returned by fit_predict.
print("\nüéØ CLUSTERING ALGORITHM 5: GAUSSIAN MIXTURE MODEL")
print("="*60)

gmm = GaussianMixture(
    n_components=n_clusters,
    max_iter=100,
    random_state=42,
)

print(f"\nFitting GMM with {n_clusters} components...")
gmm_labels = gmm.fit_predict(processed_data)

# Fit diagnostics: convergence flag plus the BIC and AIC values.
bic_score = gmm.bic(processed_data)
aic_score = gmm.aic(processed_data)

print("\n‚úÖ GMM clustering completed!")
print(f"   Components: {len(set(gmm_labels))}")
print(f"   Converged: {gmm.converged_}")
print(f"   BIC Score: {bic_score:.2f}")
print(f"   AIC Score: {aic_score:.2f}")
print("\n   Cluster distribution:")

# Size of each component and its share of all samples.
total_points = len(gmm_labels)
unique, counts = np.unique(gmm_labels, return_counts=True)
for cluster, count in zip(unique, counts):
    share = (count / total_points) * 100
    print(f"   Cluster {cluster}: {count:,} samples ({share:.2f}%)")

## Step 11: Evaluation Metrics - Internal Metrics

In [None]:
# Internal-metric evaluation of every clustering result.
# Inputs:  the label arrays produced by the clustering cells and `processed_data`.
# Outputs: `results` (list of row dicts) and `results_df` (one row per algorithm
#          with N_Clusters, Silhouette, Davies-Bouldin and Calinski-Harabasz).
print("\nüìä CLUSTERING EVALUATION - INTERNAL METRICS")
print("="*80)

# Silhouette is computed on at most this many points to keep the run time down.
EVAL_SAMPLE_SIZE = 10000
# Seed for the silhouette subsample so repeated runs report the same score.
EVAL_SEED = 42
METRIC_COLUMNS = ('Silhouette_Score', 'Davies_Bouldin_Index', 'Calinski_Harabasz_Index')


def _metrics_row(name, n_clusters_found, scores=None):
    """Build one results row; every metric is NaN when `scores` is omitted."""
    if scores is None:
        scores = (np.nan,) * len(METRIC_COLUMNS)
    row = {'Algorithm': name, 'N_Clusters': n_clusters_found}
    row.update(zip(METRIC_COLUMNS, scores))
    return row


results = []

algorithms = {
    'K-Means': kmeans_labels,
    'MiniBatch K-Means': mbkmeans_labels,
    'Spectral Clustering': spectral_labels,
    'DBSCAN': dbscan_labels,
    'GMM': gmm_labels
}

print("\nCalculating evaluation metrics (this may take a few minutes)...\n")

for name, labels in algorithms.items():
    print(f"Evaluating {name}...")

    labels = np.asarray(labels)
    # -1 is not a cluster: it marks DBSCAN noise and rows left out of the
    # Spectral Clustering subsample, so those rows are excluded everywhere below.
    assigned = labels != -1
    n_clusters_found = len(np.unique(labels[assigned]))

    # The metrics are undefined for fewer than two clusters.
    if n_clusters_found < 2:
        print(f"   ‚ö†Ô∏è  Skipping {name} - insufficient clusters ({n_clusters_found})")
        results.append(_metrics_row(name, n_clusters_found))
        continue

    if assigned.all():
        # Nothing to filter: reuse the arrays instead of copying them.
        eval_data, eval_labels = processed_data, labels
    else:
        eval_data, eval_labels = processed_data[assigned], labels[assigned]

    # Sample for faster computation if needed. A fresh generator with a fixed
    # seed is created per algorithm, so the score is reproducible and does not
    # depend on which other algorithms were evaluated before this one.
    if len(eval_data) > EVAL_SAMPLE_SIZE:
        rng = np.random.default_rng(EVAL_SEED)
        sample_idx = rng.choice(len(eval_data), EVAL_SAMPLE_SIZE, replace=False)
        eval_data_sample = eval_data[sample_idx]
        eval_labels_sample = eval_labels[sample_idx]
    else:
        eval_data_sample = eval_data
        eval_labels_sample = eval_labels

    try:
        # Silhouette uses the subsample; the other two metrics use every assigned row.
        silhouette = silhouette_score(eval_data_sample, eval_labels_sample)
        davies_bouldin = davies_bouldin_score(eval_data, eval_labels)
        calinski_harabasz = calinski_harabasz_score(eval_data, eval_labels)
    except Exception as e:
        # Best effort by design: one failing algorithm must not abort the
        # comparison, so the failure is reported and recorded as NaN.
        print(f"   ‚ùå Error: {e}")
        results.append(_metrics_row(name, n_clusters_found))
    else:
        results.append(_metrics_row(
            name, n_clusters_found, (silhouette, davies_bouldin, calinski_harabasz)
        ))

        print(f"   ‚úÖ Silhouette Score: {silhouette:.4f}")
        print(f"   ‚úÖ Davies-Bouldin Index: {davies_bouldin:.4f}")
        print(f"   ‚úÖ Calinski-Harabasz Index: {calinski_harabasz:.2f}")

# Create results DataFrame
results_df = pd.DataFrame(results)

print("\n" + "="*80)
print("EVALUATION RESULTS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))
print("\n‚úÖ Evaluation completed!")

## Step 12: Comparison Visualization

In [None]:
# Plot comparison of algorithms
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Silhouette Score (higher is better)
results_df_clean = results_df.dropna()
axes[0].bar(results_df_clean['Algorithm'], results_df_clean['Silhouette_Score'], color='skyblue')
axes[0].set_title('Silhouette Score Comparison\n(Higher is Better)', fontweight='bold')
axes[0].set_ylabel('Silhouette Score')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3, axis='y')

# Davies-Bouldin Index (lower is better)
axes[1].bar(results_df_clean['Algorithm'], results_df_clean['Davies_Bouldin_Index'], color='coral')
axes[1].set_title('Davies-Bouldin Index Comparison\n(Lower is Better)', fontweight='bold')
axes[1].set_ylabel('Davies-Bouldin Index')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

# Calinski-Harabasz Index (higher is better)
axes[2].bar(results_df_clean['Algorithm'], results_df_clean['Calinski_Harabasz_Index'], color='lightgreen')
axes[2].set_title('Calinski-Harabasz Index Comparison\n(Higher is Better)', fontweight='bold')
axes[2].set_ylabel('Calinski-Harabasz Index')
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('clustering_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Comparison visualization created!")

## Step 13: Train/Test Split Experiments

In [None]:
# Train/test stability experiment: fit K-Means on the training part of several
# splits and compare silhouette scores on the train and held-out parts.
# Outputs: `experiment_results` (list of row dicts) and `exp_df`.
print("\nüß™ TRAIN/TEST SPLIT EXPERIMENTS")
print("="*80)

split_ratios = [0.5, 0.6, 0.7, 0.8]  # Train sizes: 50%, 60%, 70%, 80%
SILHOUETTE_SAMPLE = 10000            # Rows scored per side, for speed
experiment_results = []

for train_size in split_ratios:
    # Derive the percentages with round(): truncating with int() turns
    # (1 - 0.8) * 100 == 19.999999999999996 into 19 and labels the split "80-19".
    train_pct = round(train_size * 100)
    split_label = f"{train_pct}-{100 - train_pct}"

    print(f"\n{'='*80}")
    print(f"Experiment: {split_label} Split")
    print(f"{'='*80}")

    # Split data
    X_train, X_test = train_test_split(processed_data, train_size=train_size, random_state=42)

    print(f"\nTrain size: {len(X_train):,} | Test size: {len(X_test):,}")

    # Test K-Means. n_init is pinned to the value used by the main K-Means fit
    # so the result does not depend on the installed scikit-learn's default.
    print(f"\nTesting K-Means...")
    kmeans_exp = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans_exp.fit(X_train)
    train_labels = kmeans_exp.predict(X_train)
    test_labels = kmeans_exp.predict(X_test)

    # Evaluate on the leading rows of each part; slicing past the end of an
    # array is safe, so no explicit length check is needed.
    train_silhouette = silhouette_score(X_train[:SILHOUETTE_SAMPLE], train_labels[:SILHOUETTE_SAMPLE])
    test_silhouette = silhouette_score(X_test[:SILHOUETTE_SAMPLE], test_labels[:SILHOUETTE_SAMPLE])
    difference = abs(train_silhouette - test_silhouette)

    experiment_results.append({
        'Split': split_label,
        'Algorithm': 'K-Means',
        'Train_Silhouette': train_silhouette,
        'Test_Silhouette': test_silhouette,
        'Difference': difference
    })

    print(f"   Train Silhouette: {train_silhouette:.4f}")
    print(f"   Test Silhouette: {test_silhouette:.4f}")
    print(f"   Difference: {difference:.4f}")

# Display experiment results
exp_df = pd.DataFrame(experiment_results)
print("\n" + "="*80)
print("EXPERIMENT RESULTS SUMMARY")
print("="*80)
print(exp_df.to_string(index=False))

# Visualize: grouped bars, train on the left and test on the right of each split
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(exp_df))
width = 0.35

ax.bar(x - width/2, exp_df['Train_Silhouette'], width, label='Train', color='skyblue')
ax.bar(x + width/2, exp_df['Test_Silhouette'], width, label='Test', color='coral')

ax.set_xlabel('Train-Test Split', fontweight='bold')
ax.set_ylabel('Silhouette Score', fontweight='bold')
ax.set_title('Train vs Test Performance Across Different Splits', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(exp_df['Split'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('train_test_experiments.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Train/Test experiments completed!")

## Step 14: Final Report Generation

In [None]:
# Final text report plus CSV export of the evaluation and experiment tables.
# Reads the counters from the preprocessing cell, `results_df` from the
# evaluation cell and `exp_df` from the train/test cell.
print("\n" + "="*80)
print("FINAL ANALYSIS REPORT")
print("="*80)

# Recomputed here so the report does not depend on the plotting cell having run.
results_df_clean = results_df.dropna()
n_algorithms = len(results_df)
n_splits = len(exp_df)

print(f"\nüìä DATASET SUMMARY:")
# `df` was reassigned after de-duplication, so its length is no longer the
# original size; `original_len` holds the row count captured before that step.
print(f"   Original tracks: {original_len:,}")
print(f"   After cleaning: {len(data):,}")
print(f"   Features analyzed: {len(all_features)}")
print(f"   Duplicates removed: {duplicates_removed:,}")
print(f"   Outliers removed: {outliers_removed:,} ({outlier_percentage:.2f}%)")

print(f"\nüéØ CLUSTERING SUMMARY:")
print(f"   Algorithms tested: {n_algorithms}")
print(f"   Target clusters (K): {n_clusters}")

print(f"\nüèÜ BEST PERFORMING ALGORITHM:")
if not results_df_clean.empty:
    # "Best" means the highest silhouette score among fully evaluated algorithms.
    best_algo = results_df_clean.loc[results_df_clean['Silhouette_Score'].idxmax()]
    print(f"   Algorithm: {best_algo['Algorithm']}")
    print(f"   Silhouette Score: {best_algo['Silhouette_Score']:.4f}")
    print(f"   Davies-Bouldin Index: {best_algo['Davies_Bouldin_Index']:.4f}")
    print(f"   Calinski-Harabasz Index: {best_algo['Calinski_Harabasz_Index']:.2f}")

print(f"\nüìà KEY FINDINGS:")
print(f"   1. Dataset contains {len(data):,} unique music tracks")
print(f"   2. {len(all_features)} audio features used for clustering")
print(f"   3. Successfully applied {n_algorithms} clustering algorithms")
print(f"   4. Evaluated using 3 internal metrics")
print(f"   5. Tested on {n_splits} different train/test splits")

print(f"\nüíæ OUTPUT FILES GENERATED:")
print(f"   ‚úÖ feature_distributions.png")
print(f"   ‚úÖ box_plots.png")
print(f"   ‚úÖ correlation_heatmap.png")
print(f"   ‚úÖ clustering_comparison.png")
print(f"   ‚úÖ train_test_experiments.png")

print(f"\n" + "="*80)
print("‚úÖ ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*80)

# Save results to CSV (the full table, including algorithms with NaN metrics)
results_df.to_csv('clustering_results.csv', index=False)
exp_df.to_csv('experiment_results.csv', index=False)

print(f"\nüìÅ Results saved to:")
print(f"   - clustering_results.csv")
print(f"   - experiment_results.csv")

## Step 15: Save Results for Download

In [None]:
# Add cluster labels to original data
data['KMeans_Cluster'] = kmeans_labels
data['GMM_Cluster'] = gmm_labels
data['DBSCAN_Cluster'] = dbscan_labels

# Save enhanced dataset
data.to_csv('music_data_with_clusters.csv', index=False)

print("‚úÖ Enhanced dataset saved: music_data_with_clusters.csv")
print(f"   Includes cluster assignments for each algorithm")
print(f"\nYou can now download all generated files from Kaggle!")