# Modeling and Evaluation

Train and evaluate a series of KMeans models, then select the number of clusters **k** that gives the best-performing model.

## Steps:

1. **Load the Clean, Combined Dataset**
   - Load the preprocessed and combined dataset containing the audio features.

2. **Select Audio Features Based on Description**
   - Choose the relevant audio features from the dataset for clustering.

3. **Scale the Dataset**
   - Apply scaling (e.g., StandardScaler) to normalize the features before training the model.

4. **Train a Range of Models with Different k Values**
   - Train multiple KMeans models using different values for **k** (e.g., k=2, 3, 4, ..., 10).

5. **Evaluate and Select the Top 2 Values for k**
   - Use the **Elbow Method** to visually inspect the optimal number of clusters.
   - Use the **Silhouette Score** to evaluate how well-defined the clusters are.
   
6. **Try a Live Test with the Selected Models**
   - Test the two top-performing models (based on the Elbow Method and Silhouette Score) in a live setting.
   - Select the best performing value of **k** based on the test results.

In [1]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/clean/spotify_data_encoded.csv')

In [3]:
def display_basic_info(df):
    """
    Print a structural overview of a DataFrame.

    The report has three parts, in this order: the frame's shape, the dtype
    of every column, and the number of null entries per column.
    """
    # Each entry pairs a heading with the pandas object printed beneath it.
    report_sections = (
        ('\nData Types:', df.dtypes),
        ('\nMissing Values:', df.isnull().sum()),
    )

    print('Dataset Shape:', df.shape)
    for heading, body in report_sections:
        print(heading)
        print(body)

def display_numerical_summary(df):
    """
    Print the ``DataFrame.describe()`` statistics table under a heading.
    """
    summary_table = df.describe()
    print('Numerical Columns Summary:', summary_table, sep='\n')

def check_duplicates(df):
    """
    Print how many rows of the DataFrame are flagged by ``df.duplicated()``.
    """
    duplicate_mask = df.duplicated()
    duplicate_total = duplicate_mask.sum()
    print(f'Number of duplicate entries: {duplicate_total}')
    
def display_unique_values(df, columns):
    """
    Display number of unique values for specified columns, with special handling for the genres column

    For every column other than 'genres' the count is ``df[col].nunique()``.
    Each non-null cell of the 'genres' column is treated as a collection of
    genre names; a cell stored as a string is parsed as a Python literal
    first. The number reported for 'genres' is the count of distinct genre
    names across all rows.

    Args:
        df: DataFrame to inspect.
        columns: iterable of column names to report on.

    Raises:
        ValueError, SyntaxError: if a string cell in 'genres' is not a valid
            Python literal.
    """
    print('Unique Values Count:')
    for col in columns:
        if col != 'genres':
            print(f'{col}: {df[col].nunique()} unique values')
            continue

        unique_genres = set()
        for genre_list in df[col].dropna():
            if isinstance(genre_list, str):
                # literal_eval only accepts Python literals, so a crafted
                # cell in the data file cannot execute code the way eval()
                # would.
                genre_list = ast.literal_eval(genre_list)
            unique_genres.update(genre_list)
        print(f'{col}: {len(unique_genres)} unique genres')

In [None]:
# Quick profile of the loaded data: structure, duplicate rows, and the
# cardinality of the descriptive columns.
profile_columns = ['artist', 'release_year', 'genres']

display_basic_info(df)
check_duplicates(df)
display_unique_values(df, profile_columns)

In [5]:
# Work on an independent copy so the frame loaded from disk stays untouched.
data = df.copy(deep=True)

#### Select Audio Features

In [6]:
# Columns used as clustering input: one column per music genre.
genre_columns = [
    'rock', 'pop', 'blues', 'metal', 'hip-hop', 'country',
    'punk', 'jazz', 'rap', 'reggae', 'folk', 'soul', 'latin',
    'dance', 'indie', 'classical',
]
features = data[genre_columns]

#### Scale the features 

In [7]:
# Learn the scaling parameters from the genre columns, then apply them to the
# same data to obtain the matrix used for clustering.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [None]:
# Save the fitted scaler so the same transformation can be reapplied outside
# this notebook. The target directory is created first because
# open(..., 'wb') raises FileNotFoundError when '../models' does not exist.
os.makedirs('../models', exist_ok=True)
with open('../models/genre_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Test loading the scaler.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickle files that this project produced.
with open('../models/genre_scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)

# Check that the reloaded scaler can transform a sample row.
test_scaled = loaded_scaler.transform(features.head(1))
print('Verification - scaled features shape:', test_scaled.shape)

#### Train Models with Different k Values

In [None]:
# Clustering input: the encoded genre columns of the loaded dataset.
genre_columns = [
    'rock', 'pop', 'blues', 'metal', 'hip-hop', 'country',
    'punk', 'jazz', 'rap', 'reggae', 'folk', 'soul', 'latin',
    'dance', 'indie', 'classical',
]
features = df[genre_columns]

# Standardize the features before clustering.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Candidate cluster counts: k = 2 .. 10.
k_values = list(range(2, 11))

# Fit one KMeans model per candidate k and record its inertia for the
# elbow method.
models = []
inertias = []
for n_clusters in k_values:
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(scaled_features)
    models.append(model)
    inertias.append(model.inertia_)
    print(f'Model trained with k={model.n_clusters}')

#### Train Models and Collect Metrics

In [None]:
# Sweep k from 2 through 146, recording inertia and silhouette score per fit.
k_values = range(2, 147)
results = []

for k in k_values:
    # fit() returns the estimator itself, so construction and training chain.
    model = KMeans(n_clusters=k, random_state=42).fit(scaled_features)

    inertia = model.inertia_
    silhouette = silhouette_score(scaled_features, model.labels_)

    results.append(dict(k=k, inertia=inertia, silhouette=silhouette))
    print(f'k={k}: inertia={inertia:.0f}, silhouette={silhouette:.3f}')

#### Evaluate using Elbow Method and Silhouette Score

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax1, ax2 = axes

# Pull each metric out of the sweep results as a series aligned with k_values.
inertias = [entry['inertia'] for entry in results]
silhouettes = [entry['silhouette'] for entry in results]

# One panel per metric: (axis, values, line format, y-label, title).
panels = (
    (ax1, inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (ax2, silhouettes, 'ro-', 'Silhouette Score', 'Silhouette Analysis'),
)
for axis, series, line_format, y_label, panel_title in panels:
    axis.plot(k_values, series, line_format)
    axis.set_xlabel('k')
    axis.set_ylabel(y_label)
    axis.set_title(panel_title)
    axis.grid(True)

plt.tight_layout()
plt.show()

# Rank every k by silhouette score (highest first) and keep the best two.
ranked_by_silhouette = sorted(
    results,
    key=lambda entry: entry['silhouette'],
    reverse=True,
)
top_k_values = ranked_by_silhouette[:2]

print('\nTop 2 k values based on silhouette score:')
for result in top_k_values:
    print(f"k={result['k']}: silhouette={result['silhouette']:.3f}")

#### Train Final Models with Top k Values

In [None]:
# Refit one model per selected k and describe the clusters it produces.
final_models = {}
for result in top_k_values:
    k = result['k']
    model = KMeans(n_clusters=k, random_state=42).fit(scaled_features)
    final_models[k] = model

    # Number of songs assigned to each cluster label.
    unique, counts = np.unique(model.labels_, return_counts=True)
    print(f'\nCluster sizes for k={k}:')
    for cluster, size in zip(unique, counts):
        print(f'Cluster {cluster}: {size} songs')

    # Cluster centers as a table: one row per cluster, one column per genre.
    cluster_centers = pd.DataFrame(
        model.cluster_centers_,
        columns=features.columns,
    )

    # The three largest center coordinates name each cluster's dominant genres.
    print(f'\nDominant genres in each cluster (k={k}):')
    for i, center in cluster_centers.iterrows():
        top_genres = center.nlargest(3)
        genre_summary = ', '.join(
            f'{g} ({v:.2f})' for g, v in top_genres.items()
        )
        print(f"Cluster {i}: {genre_summary}")

### Evaluate the models

In [None]:
def evaluate_clustering(scaled_features, model, feature_names):
    """
    Evaluate clustering results using multiple metrics and visualizations

    Prints the silhouette score and inertia of a fitted model, then draws two
    figures: the first holds a 2-D PCA scatter of the samples colored by
    cluster label and a bar chart of cluster sizes; the second holds a
    heatmap of the cluster centers.

    Args:
        scaled_features: feature matrix passed to ``silhouette_score`` and to
            the PCA projection; presumably the data the model was fitted on.
        model: fitted clustering model exposing ``labels_``,
            ``cluster_centers_`` and ``inertia_``.
        feature_names: column labels for the cluster-center table, one per
            feature.

    Returns:
        Tuple ``(silhouette, inertia)``.
    """
    labels = model.labels_
    centers = model.cluster_centers_

    # basic metrics
    silhouette = silhouette_score(scaled_features, labels)
    inertia = model.inertia_

    print('\nClustering Evaluation Metrics:')
    print(f'Silhouette Score: {silhouette:.3f}')
    print(f'Inertia: {inertia:.0f}')

    # Cluster sizes for the bar chart. These are computed from this model's
    # own labels; the function previously depended on module-level `unique`
    # and `counts`, which raised NameError or plotted another model's sizes.
    unique, counts = np.unique(labels, return_counts=True)

    # Cluster centers as a table: one row per cluster, one column per feature.
    cluster_centers = pd.DataFrame(
        centers,
        columns=feature_names
    )

    # visualizations
    plt.figure(figsize=(15, 10))

    # PCA visualization of clusters
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(scaled_features)

    plt.subplot(2, 2, 1)
    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1],
                          c=labels, cmap='viridis', alpha=0.6)
    plt.colorbar(scatter)
    plt.title('Cluster Visualization (PCA)')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')

    plt.subplot(2, 2, 2)
    plt.bar(unique, counts)
    plt.title('Cluster Size Distribution')
    plt.xlabel('Cluster')
    plt.ylabel('Number of Songs')

    # The heatmap is drawn on a second, taller figure; the tight_layout()
    # call below therefore applies to this figure only.
    plt.figure(figsize=(14, 20))
    plt.subplot(2, 2, (3, 4))
    sns.heatmap(cluster_centers, annot=True, cmap='Greens', fmt='.2f')
    plt.title('Cluster Centers Heatmap')
    plt.xlabel('Features')
    plt.ylabel('Cluster')

    plt.tight_layout()
    plt.show()

    return silhouette, inertia

# Cluster count chosen for the final model (hard-coded here).
best_k = 59
best_model = KMeans(n_clusters=best_k, random_state=42)
best_model.fit(scaled_features)

# evaluate the model
silhouette, inertia = evaluate_clustering(
    scaled_features,
    best_model,
    features.columns
)

# Save the best model. The target directory is created first because
# open(..., 'wb') raises FileNotFoundError when '../models' does not exist.
os.makedirs('../models', exist_ok=True)
with open('../models/kmeans_genre_classifier.pkl', 'wb') as f:
    pickle.dump(best_model, f)