In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

"Libraries to help with jupyter notebook usage"
# Increases jupyter notebook display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from IPython.display import Image # Helps display images in notebook

In [2]:
import sys
"Appends both parent and grandparent dir to current path, to allow importing"
sys.path.append('..')
sys.path.append('../..')

from jupyternotebook_utils import *
from utils import *
from data_processing import *
from spotify_data import *
from spotify_utils import *

## Data Import
Import the file containing tracks and their audio features, covering over 6,000 different genres with roughly 50 tracks per genre.

In [3]:
# Input: CSV of audio features, read into the frame every later cell starts from.
audio_features_by_genre_file_path = '../../data/processed_data/genres/audio_features_by_genre.csv'
audio_features_by_genre_df = pd.read_csv(audio_features_by_genre_file_path)
# Untouched snapshot of the data exactly as it was loaded.
original_audio_features_by_genre_df = audio_features_by_genre_df.copy()

# Output: destination of the cluster assignments. This must differ from the
# input path above, otherwise writing the clustering results would overwrite
# the source data; it now matches the path the clustering cells write to.
clustered_audio_features_by_genre_file_path = '../../data/processed_data/genres/clustered_audio_features_by_genre.csv'

## 2. Data Pre-processing
1. Remove irrelevant columns such as key and duration
2. Remove outliers for each audio feature in `audio_features_df`
3. Perform One-Hot-Encoding (OHE) for `track_popularity` and `release_date` in `track_details_df`

## Utility for Pre-processing
Declaration of constants and functions to be used for data pre-processing

In [None]:
# Two independent copies of the loaded data: one for the "before" plots and
# one that the pre-processing steps are applied to.
before_graph_audio_features_df = audio_features_by_genre_df.copy()
after_graph_audio_features_df = audio_features_by_genre_df.copy()

# Number of histogram bins used by plot_graph.
num_bins = 250

# Height and width of a single subplot (matplotlib figsize units).
graph_height = graph_width = 4

# Audio feature columns that plot_graph draws by default.
audio_features_list = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

def plot_graph(plot_title, graph_height, graph_width, graph_features_list = audio_features_list, graph_df = before_graph_audio_features_df, value_range = (0, 1)):
    """Draw one histogram per feature, side by side in a single row.

    Parameters
    ----------
    plot_title : str
        Title placed above the whole figure.
    graph_height, graph_width : float
        Size of ONE subplot; the figure is ``graph_width * n_features`` wide.
    graph_features_list : sequence of str
        Columns of ``graph_df`` to plot, one subplot each.
    graph_df : pandas.DataFrame
        Frame holding the feature columns. The default is bound when this
        function is defined, so pass the frame explicitly to plot newer data.
    value_range : tuple of (float, float) or None
        Lower/upper bound of the histogram bins. The default ``(0, 1)`` keeps
        the previous behaviour; values outside the range are not counted.
        Pass ``None`` to let matplotlib derive the range from the data.

    Raises
    ------
    ValueError
        If ``graph_features_list`` is empty.
    """
    graph_count = len(graph_features_list)
    if graph_count == 0:
        raise ValueError('graph_features_list must contain at least one feature')

    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # feature yields a bare Axes object and indexing it raises a TypeError.
    fig, axes = plt.subplots(1, graph_count, figsize=(graph_width * graph_count, graph_height), squeeze=False)
    fig.suptitle(plot_title, fontsize=16)

    for ax, audio_feature in zip(axes[0], graph_features_list):
        feature_label = str(audio_feature).capitalize()
        ax.hist(graph_df[audio_feature], bins=num_bins, range=value_range, edgecolor='none')
        ax.set_title(feature_label)
        ax.set_xlabel(feature_label)
        ax.set_ylabel('Frequency')

    # Reserve a strip at the top of the figure so the suptitle is not overlapped.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Display pandas' descriptive statistics for the "before" copy of the data.
before_graph_audio_features_df.describe()

In [None]:
# Start from a fresh copy of the loaded data so that re-running this cell does
# not rescale values that an earlier run had already rescaled.
after_graph_audio_features_df = audio_features_by_genre_df.copy()

# Step 1: Min-max normalization for selected features
# Each entry maps a feature to the fixed (min, max) range it is rescaled from.
input_data_ranges = {
    'loudness': (-60, 0),
    'tempo': (0, 250),
    'key': (-1, 11)
}
# The loop variables are deliberately NOT named feature_min / feature_max:
# those names hold the Step 6 lookup tables defined further down.
for feature, (range_min, range_max) in input_data_ranges.items():
    after_graph_audio_features_df[feature] = (after_graph_audio_features_df[feature] - range_min) / (range_max - range_min)

# Step 2: Keep selected audio features
selected_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo']
filtered_data = after_graph_audio_features_df[selected_features].copy()

# Step 3: Cube root transformation for selected features
cubed_features = ['speechiness', 'acousticness', 'liveness']
for feature in cubed_features:
    filtered_data[feature] = np.cbrt(filtered_data[feature])

# Step 4: Logarithmic transformation for selected features
log_features = ['energy', 'loudness']
for feature in log_features:
    filtered_data[feature] = np.log1p(filtered_data[feature])

# Step 5: IQR-filtering based on thresholds
# Values outside [lower, upper] (bounds inclusive) are replaced by NaN; rows are kept.
lower_thresholds =  {'danceability': 0.16850000000000004, 'energy': 0.14936944969853527, 'loudness': 0.5734535346157168, 
                     'speechiness': 0.11183139278235099, 'acousticness': -0.35285124201451823, 'liveness': 0.20276263879283324, 
                     'valence': -0.24249999999999988, 'tempo': 0.16009000000000007}
upper_thresholds = {'danceability': 1.0525, 'energy': 0.8605850164229211, 'loudness': 0.698995265917933, 
                    'speechiness': 0.6924946680943335, 'acousticness': 1.280417950271011, 'liveness': 0.8754111662234604, 
                    'valence': 1.2494999999999998, 'tempo': 0.7998179999999999}

for feature in selected_features:
    within_bounds = filtered_data[feature].between(lower_thresholds[feature], upper_thresholds[feature])
    filtered_data[feature] = filtered_data[feature].where(within_bounds)

# Step 6: Min-max Normalization
# These bounds describe the TRANSFORMED values (for example the 'energy'
# maximum 0.6931471805599453 is log1p(1.0)), so they have to be applied to
# filtered_data rather than to the untransformed frame.
feature_min = {'danceability': 0.169, 'energy': 0.1501426584297195, 'loudness': 0.5734623376139655, 
               'speechiness': 0.27977873676275317, 'acousticness': 0.0, 'liveness': 0.21074564860592623, 
               'valence': 0.0, 'tempo': 0.164644}

feature_max = {'danceability': 0.991, 'energy': 0.6931471805599453, 'loudness': 0.6983917371326527, 
               'speechiness': 0.692435557262704, 'acousticness': 0.9986648849277057, 'liveness': 0.8750340122833274, 
               'valence': 1.0, 'tempo': 0.799812}

for feature in selected_features:
    if feature in feature_min:
        filtered_data[feature] = (filtered_data[feature] - feature_min[feature]) / (feature_max[feature] - feature_min[feature])

# Write the fully processed features back onto the "after" frame so that it
# holds the end result of steps 1-6 (filtered_data shares its index).
after_graph_audio_features_df[selected_features] = filtered_data

## Genre Clustering Using KMeans Algorithm

Here's an overview of the whole clustering process:
1. Identifying the optimal value of K for the Partitioning Clustering using the Elbow Method.
2. Run the KMeans algorithm using the optimal K value.
3. For each cluster, re-run the Elbow Method to find the optimal K value for Hierarchical Subclustering.

### Constant Declarations for KMeans Algorithm

In [None]:
from IPython.display import clear_output # clears output for better logging
from sklearn.cluster import KMeans
from collections import Counter # used to count frequency of genres in each cluster

# Bounds of the K values tried by the elbow method. They are consumed as
# range(min_clusters, max_clusters), so the upper bound is exclusive.
min_clusters, max_clusters = 1, 11

# Feature columns used for clustering.
# NOTE: key, instrumentalness and duration_ms are not part of this list.
audio_feature_columns = [
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'liveness', 'valence', 'tempo',
]

In [None]:
# Elbow method on the whole data set: fit one KMeans model per candidate K and
# record its inertia, then plot inertia against K.
X = audio_features_by_genre_df[audio_feature_columns]
candidate_cluster_sizes = range(min_clusters, max_clusters)
inertia_values = []

for k in candidate_cluster_sizes:
    # The trailing '\r' returns the cursor to the start of the line, so each
    # status message overwrites the previous one.
    print(f'Attempting Cluster Size: {k}', end='\r')

    kmeans_main_model = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia_values.append(kmeans_main_model.inertia_)

clear_output(wait=True)

plt.figure(figsize=(10, 6))
plt.plot(candidate_cluster_sizes, inertia_values, marker='o')
plt.title('Determining Optimal Value for K')
plt.xlabel('Number of Clusters, K')
plt.ylabel('Inertia')
plt.xticks(candidate_cluster_sizes)
plt.show()

In [None]:
# Work on a copy so the loaded frame stays untouched by the clustering columns.
audio_features_kmeans_df = audio_features_by_genre_df.copy()
X = audio_features_kmeans_df[audio_feature_columns]

# Fit the top-level model; the fixed random_state keeps the result repeatable.
num_clusters = 6
kmeans_main_model = KMeans(n_clusters=num_clusters, random_state=42).fit(X)

# Persist the fitted model so later cells can reload it instead of refitting.
model_file_path = '../models/modlkmeans_model.joblib'
joblib.dump(kmeans_main_model, model_file_path)
print(f'KMeans model saved to {model_file_path}')

In [None]:
model_file_path = '../models/modlkmeans_model.joblib'
kmeans_main_model = joblib.load(model_file_path)

status = f'Kmeans Model succesfully loaded'
print(status, end='\r')

# Derive the labels with predict() on the rows themselves instead of copying
# kmeans_main_model.labels_. labels_ is tied to the row order the model was
# fitted on, while this frame is re-sorted below; assigning labels_ positionally
# on a second run (or after loading a model fitted elsewhere) would silently
# attach clusters to the wrong rows.
# NOTE(review): for a model that did not fully converge, predict() can differ
# from labels_ for a few rows; the subcluster and prediction cells also use
# predict(), so this keeps all of them consistent.
main_cluster_features = audio_features_kmeans_df[audio_feature_columns]
# +1 turns scikit-learn's 0-based labels into 1-based cluster ids.
audio_features_kmeans_df['cluster'] = kmeans_main_model.predict(main_cluster_features) + 1
audio_features_kmeans_df = audio_features_kmeans_df.sort_values(by='cluster')

# Export only the identifying columns together with the cluster assignment.
selected_columns = ['genre', 'cluster', 'track_name', 'track_uri']
selected_df = audio_features_kmeans_df[selected_columns]
clustered_audio_features_by_genre_file_path = '../../data/processed_data/genres/clustered_audio_features_by_genre.csv'
selected_df.to_csv(clustered_audio_features_by_genre_file_path, index=False)

print(f'Sorted and selected data with cluster assignments written to {clustered_audio_features_by_genre_file_path}')

In [None]:
# Repeat the elbow method inside every top-level cluster and draw all inertia
# curves on one shared chart.
unique_clusters = audio_features_kmeans_df['cluster'].unique()
candidate_cluster_sizes = range(min_clusters, max_clusters)

plt.figure(figsize=(10, 6))

for cluster in unique_clusters:
    print(f'Finding optimal value of K for Cluster {cluster}')

    # Restrict the data to the rows assigned to the current cluster.
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]
    X = cluster_data[audio_feature_columns]

    # One KMeans fit per candidate K; keep the inertia of each fit.
    inertia_values = []
    for k in candidate_cluster_sizes:
        print(f'Attempting Cluster Size: {k}', end='\r')

        kmeans_subcluster_model = KMeans(n_clusters=k, random_state=42).fit(X)
        inertia_values.append(kmeans_subcluster_model.inertia_)

    clear_output(wait=True)

    # Add this cluster's curve to the shared chart. The labelling calls are
    # repeated on every iteration and simply re-apply the same settings.
    plt.plot(candidate_cluster_sizes, inertia_values, marker='o', label=f'Cluster {cluster}')
    plt.title('Elbow Method for Optimal K')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Inertia')
    plt.xticks(candidate_cluster_sizes)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Number of subclusters to fit inside each top-level cluster, keyed by cluster id.
num_subclusters_per_cluster = {1: 6, 2: 5, 3: 6, 4: 6, 5: 5, 6: 5}

for cluster, num_subclusters in num_subclusters_per_cluster.items():
    print(f'Building Subcluster Model for Cluster {cluster}', end='\r')

    # Rows of the current cluster, reduced to the clustering features.
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]
    X_subcluster = cluster_data[audio_feature_columns]

    kmeans_subcluster_model = KMeans(n_clusters=num_subclusters, random_state=42).fit(X_subcluster)

    # Save the subcluster KMeans model, one file per top-level cluster.
    subcluster_model_file_path = f'../models/subcluster_kmeans_model_cluster_{cluster}.joblib'
    joblib.dump(kmeans_subcluster_model, subcluster_model_file_path)

print("All subclustering models successfully built and saved.")

In [None]:
subcluster_models = {}

# Loading subcluster models into subcluster_models
for cluster in num_subclusters_per_cluster.keys():
    subcluster_model_file_path = f'../models/subcluster_kmeans_model_cluster_{cluster}.joblib'
    loaded_subcluster_model = joblib.load(subcluster_model_file_path)
    subcluster_models[cluster] = loaded_subcluster_model
    
status = f'All Subcluster Models succesfully loaded'
print(status, end='\r')

# Assigning subcluster labels to data
for cluster, kmeans_subcluster_model in subcluster_models.items():
    status = f'Processing Subclustering for Cluster {cluster}'
    print(status, end='\r')
    
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    if not in_cluster.any():
        # No rows were assigned to this cluster, so there is nothing to predict.
        continue

    X_subcluster = audio_features_kmeans_df.loc[in_cluster, audio_feature_columns]
    subcluster_labels = kmeans_subcluster_model.predict(X_subcluster)
    # +1 turns scikit-learn's 0-based labels into 1-based subcluster ids.
    audio_features_kmeans_df.loc[in_cluster, 'subcluster'] = subcluster_labels.astype(int) + 1

# Filling the column one cluster at a time creates it as float (1.0, 2.0, ...),
# which would leak into the CSV and into printed labels. Cast to pandas'
# nullable integer type: ids become whole numbers and rows without an
# assignment stay missing instead of raising.
audio_features_kmeans_df['subcluster'] = audio_features_kmeans_df['subcluster'].astype('Int64')

audio_features_kmeans_df = audio_features_kmeans_df.sort_values(by=['cluster', 'subcluster'])

# Export only the identifying columns together with both assignments.
selected_columns = ['genre', 'cluster', 'subcluster', 'track_name', 'track_uri']
clustered_audio_features_by_genre_file_path = '../../data/processed_data/genres/clustered_audio_features_by_genre.csv'
audio_features_kmeans_df[selected_columns].to_csv(clustered_audio_features_by_genre_file_path, index=False)

print("All subclustering processes completed.")

In [None]:
# Reload the exported assignments. From here on audio_features_kmeans_df holds
# only the columns stored in that CSV.
audio_features_kmeans_df = pd.read_csv(clustered_audio_features_by_genre_file_path)

# Build a combined "<cluster>, <subcluster>" text label; both ids are cast to
# int first so they are rendered without a decimal part.
cluster_text = audio_features_kmeans_df['cluster'].astype(int).astype(str)
subcluster_text = audio_features_kmeans_df['subcluster'].astype(int).astype(str)
audio_features_kmeans_df['genre_cluster'] = cluster_text.str.cat(subcluster_text, sep=', ')

audio_features_kmeans_df.to_csv(clustered_audio_features_by_genre_file_path, index=False)

print("Added 'genre_cluster' column and saved to CSV.")

print("Added 'genre_cluster' column and saved to CSV.")

### Breakdown of Cluster Details

In [None]:
# Rows per cluster, ordered by cluster id; reused for the chart and the printout.
cluster_counts = audio_features_kmeans_df['cluster'].value_counts().sort_index()
sizes = cluster_counts.values
legend_labels = [f"Cluster {cluster}" for cluster in cluster_counts.index]

plt.figure(figsize=(5, 5))
plt.pie(sizes, autopct='', startangle=140)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.title("Distribution of Occurrences across Clusters")
plt.legend(labels=legend_labels, loc="best")
plt.show()

# Text breakdown: absolute count and share of all rows for every cluster.
total_occurrences = len(audio_features_kmeans_df)
for cluster, count in cluster_counts.items():
    percentage = (count / total_occurrences) * 100
    print(f"Cluster {cluster}: {count} occurrences ({percentage:.2f}%)")

### Breakdown of Sub-cluster Details

In [None]:
# Rows per combined "cluster, subcluster" label, expressed as a share of all rows.
total_subcluster_occurrences = len(audio_features_kmeans_df)
subcluster_counts = audio_features_kmeans_df['genre_cluster'].value_counts().sort_index()
subcluster_percentages = (subcluster_counts / total_subcluster_occurrences) * 100

# Create a pie chart for subcluster distribution
subcluster_sizes = subcluster_percentages.values
subcluster_labels = [f"({subcluster})" for subcluster in subcluster_counts.index]

plt.figure(figsize=(8, 8))  # Adjust the figure size if needed

# labeldistance places each label just outside its wedge.
plt.pie(
    subcluster_sizes,
    labels=subcluster_labels,
    autopct='',
    startangle=140,
    labeldistance=1.1,
)
plt.axis('equal')

plt.title("Distribution of Subclusters as a Percentage of Total")
plt.show()

# Subcluster breakdown within each cluster. Percentages are relative to ALL
# rows, not to the size of the enclosing cluster.
for cluster in cluster_counts.index:
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]
    subcluster_counts = cluster_data['subcluster'].value_counts().sort_index()
    print(f"Subclusters for Cluster {cluster}:")
    for subcluster, subcount in subcluster_counts.items():
        subpercentage = (subcount / total_subcluster_occurrences) * 100
        print(f"  Subcluster {subcluster}: {subcount} occurrences ({subpercentage:.2f}%)")

#### Most Common Genre in each Cluster/Subcluster

In [None]:
# For every cluster, the five most frequent genres as (genre, count) pairs.
genres_by_cluster = audio_features_kmeans_df.groupby('cluster')['genre']
cluster_top_genres = genres_by_cluster.apply(lambda genres: Counter(genres).most_common(5))

for cluster, top_genres in cluster_top_genres.items():
    # Only the genre names are printed; the counts are dropped.
    genre_names = ', '.join(genre for genre, _ in top_genres)
    print(f"Cluster {cluster}: Top 5 Genres - {genre_names}")

In [None]:
# For every (cluster, subcluster) pair, the names of its five most frequent genres.
genres_by_subcluster = audio_features_kmeans_df.groupby(['cluster', 'subcluster'])['genre']
cluster_subgroup_genre_counts = genres_by_subcluster.apply(
    lambda genres: [genre for genre, count in Counter(genres).most_common(5)]
)

# Each index entry is the (cluster, subcluster) pair and is printed as such.
for genre_cluster, top_genres in cluster_subgroup_genre_counts.items():
    top_genres_str = ", ".join(top_genres)
    print(f"Subcluster {genre_cluster}: Top 5 Genres - {top_genres_str}")

### Cluster/Subcluster Prediction Time

In [5]:
# Reload the saved top-level model.
model_file_path = '../models/modlkmeans_model.joblib'
kmeans_main_model = joblib.load(model_file_path)
print('Kmeans Model succesfully loaded', end='\r')

# Cluster ids for which a subcluster model file is loaded (the values are the
# number of subclusters per cluster).
num_subclusters_per_cluster = {1: 6, 2: 5, 3: 6, 4: 6, 5: 5, 6: 5}

# Loading subcluster models into subcluster_models, keyed by cluster id.
subcluster_models = {
    cluster: joblib.load(f'../models/subcluster_kmeans_model_cluster_{cluster}.joblib')
    for cluster in num_subclusters_per_cluster
}
print('All Subcluster Models succesfully loaded', end='\r')

Kmeans Model succesfully loadedAll Subcluster Models succesfully loaded

In [22]:
# One track's audio features, used as the example input for prediction.
sample_track = [{'id': '3Qaw8WaLG0iPXfwVS4cQ11', 'danceability': 0.311, 'energy': 0.311, 'key': 5, 'loudness': -11.516, 'speechiness': 0.0323, 'acousticness': 0.69, 'instrumentalness': 0, 'liveness': 0.195, 'valence': 0.369, 'tempo': 76.951, 'duration_ms': 239507}]

# Order in which the feature values are handed to the models.
model_feature_order = (
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'liveness', 'valence', 'tempo',
)

audio_features = sample_track[0]
# predict() expects a 2-D input, hence a list holding a single row.
sample_track_audio_features = [[audio_features[name] for name in model_feature_order]]
# Alternative hand-written input:
# sample_track_audio_features = [[0.5269, 0.4648, -8.3644, 0.039, 0.5454, 0.1439, 0.3232, 126.9117]]

# Predict the top-level cluster first (+1 for 1-based ids), then let that
# cluster's own model predict the subcluster.
predicted_cluster = kmeans_main_model.predict(sample_track_audio_features) + 1
kmeans_subcluster_model = subcluster_models[predicted_cluster[0]]
predicted_subcluster = kmeans_subcluster_model.predict(sample_track_audio_features) + 1

print(f"Sample track belongs to Cluster {predicted_cluster[0]}, Subcluster {predicted_subcluster[0]}")

Sample track belongs to Cluster 6, Subcluster 2


