In [None]:
# Clustering tracks using mood-related audio features

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned dataset containing audio features for songs.
# The file is expected to be pre-cleaned, but rows with missing feature
# values are still tolerated below.
df = pd.read_csv("../data/audio_features_cleaned.csv")

# Audio features used as the input dimensions for clustering.
features = ["danceability", "energy", "valence", "tempo", "acousticness", "instrumentalness"]

# Keep only rows where every selected feature is present. X retains the
# original index of df, which is needed to map cluster labels back later.
X = df[features].dropna()

# Standardize the features so each one contributes equally to the Euclidean
# distances KMeans relies on, regardless of its original scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Group songs into 5 clusters (an arbitrary choice); random_state fixes the
# initialization so repeated runs give the same assignment.
kmeans = KMeans(n_clusters=5, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach labels by index rather than by position: dropna() may have removed
# rows, in which case a plain array assignment would raise a length-mismatch
# ValueError. Rows that were not clustered get <NA> (nullable Int64 dtype).
df["mood_cluster"] = pd.Series(cluster_labels, index=X.index, dtype="Int64")

# Save the clustered dataset to a new CSV file for future use
df.to_csv("../data/audio_features_clustered.csv", index=False)
print("Clustering done and saved to audio_features_clustered.csv")

# Visualize the resulting clusters using a pairplot, one panel per feature
# pair, colored by cluster. Only rows that actually received a label are
# plotted; the label column is cast to a plain int so it works as a hue.
df_clustered = df.copy()
plot_data = df_clustered.loc[X.index, features + ["mood_cluster"]].astype({"mood_cluster": int})
sns.pairplot(plot_data, hue="mood_cluster", palette="husl")
plt.suptitle("Mood Clusters based on Audio Features", y=1.02)
plt.show()