In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# from wordcloud import WordCloud
# import re

"Libraries to help with jupyter notebook usage"
# Increases jupyter notebook display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from IPython.display import Image # Helps display images in notebook

In [3]:
"Import python util functions"
import sys
sys.path.append('..')  # Add the parent directory to the Python path

# Now you can import the util functions
from jupyternotebook_utils import *

## Data Import
Import files containing information regarding tracks from over 6000 different genres and their audio features, roughly 50 tracks per genre

In [None]:
# Input: per-genre track sample with audio features (treated as read-only source data).
audio_features_by_genre_file_path = '../../data/processed_data/genres/audio_features_by_genre.csv'
audio_features_by_genre_df = pd.read_csv(audio_features_by_genre_file_path)

# Untouched copy of the loaded data, kept so later cells can plot the original distributions.
original_audio_features_by_genre_df = audio_features_by_genre_df.copy()

# Output: destination for the cluster assignments written by the clustering cells.
# It must differ from the input path above, otherwise exporting would overwrite the source data.
clustered_audio_features_by_genre_file_path = '../../data/processed_data/genres/clustered_audio_features_by_genre.csv'

## 2. Data Pre-processing
1. Remove irrelevant columns such as key and duration
2. Remove outliers for each audio features in `audio_features_df`
3. Combining `audio_features_df` and `track_details_df` for easy data manipulation
4. Perform One-Hot-Encoding (OHE) for `track_popularity` and `release_date` in `track_details_df`

## Utility for Pre-processing
Declaration of constants and functions to be used for data pre-processing

In [None]:
# Histogram settings shared by every plot_graph call in this notebook.
num_bins = 250
graph_width = 4
graph_height = 4

# Audio features that are plotted and transformed during pre-processing.
audio_features_list = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'liveness',
    'valence',
]

# Two independent working copies of the loaded data: "before" holds the state
# prior to a transformation step, "after" receives the transformed columns.
after_graph_audio_features_df = audio_features_by_genre_df.copy()
before_graph_audio_features_df = audio_features_by_genre_df.copy()

def plot_graph(plot_title, graph_height, graph_width, graph_features_list = audio_features_list, graph_df = before_graph_audio_features_df, value_range = (0, 1), bins = None):
    """Draw one frequency histogram per feature, side by side in a single row.

    Parameters
    ----------
    plot_title : str
        Title placed above the whole row of histograms.
    graph_height, graph_width : float
        Size in inches of each individual histogram; the figure is
        ``graph_width * len(graph_features_list)`` wide.
    graph_features_list : list of str
        Column names of ``graph_df`` to plot, one subplot each.
    graph_df : pandas.DataFrame
        Frame holding the columns to plot. The default is bound once, when the
        function is defined, to the "before" working copy.
    value_range : tuple of (float, float) or None
        Lower/upper limit of the histogram bins. Defaults to ``(0, 1)``, so
        values outside that window are not counted. Pass ``None`` to let
        matplotlib use the data's own min/max (e.g. for raw loudness).
    bins : int or None
        Number of bins; ``None`` falls back to the notebook-level ``num_bins``.
    """
    if bins is None:
        bins = num_bins

    graph_count = len(graph_features_list)
    if graph_count == 0:
        # plt.subplots rejects a zero-column grid; there is nothing to draw.
        return

    # squeeze=False always returns a 2-D array of axes, so a single-feature
    # call works the same way as a multi-feature one.
    fig, axes = plt.subplots(1, graph_count, figsize=(graph_width * graph_count, graph_height), squeeze=False)
    fig.suptitle(plot_title, fontsize=16)

    for ax, audio_feature in zip(axes[0], graph_features_list):
        label = str(audio_feature).capitalize()
        ax.hist(graph_df[audio_feature], bins=bins, range=value_range, edgecolor='none')
        ax.set_title(label)
        ax.set_xlabel(label)
        ax.set_ylabel('Frequency')

    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

### 2.1 Removing Irrelevant Columns
Columns such as `key` and `duration` do not provide useful insights to the analysis, hence they are removed.

For now, `instrumentalness` seems to have a large number of null values, it will be dropped too to prevent messing up the recommendation system.

In [None]:
# Drop the columns that are not used in the analysis, starting from the
# untouched copy of the loaded data.
# NOTE(review): audio_features_df is reassigned at the end of the skew-transform
# cell before anything reads it, so this cell may be removable - confirm.
audio_features_df = original_audio_features_by_genre_df.drop(columns=['key', 'duration_ms', 'instrumentalness'])

### 2.2 Removing outliers from audio features
Despite the official Spotify API Documentation mentioning that most audio features are within the range of 0 to 1, a simple `track_details_df.describe()` shows it's definitely not the case. To better visualize each audio feature's distribution, let's plot them out:

In [None]:
# Baseline view of every audio feature, drawn from plot_graph's default frame.
graph_title = 'Frequency Histograms of Audio Features'
plot_graph(
    plot_title=graph_title,
    graph_height=graph_height,
    graph_width=graph_width,
    graph_features_list=audio_features_list,
)

It's immediately noticeable that only audio features `Danceability`, `Energy`, and `Valence` are "normal" (excuse the pun). The distribution of `Loudness` is also noticeably messy due to its range of [-60, 0], let's fix that using min-max normalization. 

In [None]:
# Min-max bounds used to rescale loudness onto [0, 1].
min_loudness, max_loudness = -60, 0

raw_loudness = audio_features_by_genre_df['loudness']
after_graph_audio_features_df['loudness'] = raw_loudness
after_graph_audio_features_df['normalized_loudness'] = (raw_loudness - min_loudness) / (max_loudness - min_loudness)

# Raw and rescaled loudness side by side. plot_graph only bins values that
# fall inside (0, 1), so the raw panel shows just that window of the data.
graph_title = 'Frequency Histograms of Loudness Before vs After Normalization'
features_list = ['loudness', 'normalized_loudness']
plot_graph(graph_title, graph_height, graph_width, features_list, after_graph_audio_features_df)

# Promote the rescaled values to be the new "before" loudness, then resync
# the "after" copy so both frames start the next step from the same state.
before_graph_audio_features_df['loudness'] = after_graph_audio_features_df['normalized_loudness']
after_graph_audio_features_df = before_graph_audio_features_df.copy()

Another quite obvious issue is with the `instrumentalness` column, which for some reason contains a large amount of null values.


NOTE: since instrumentalness seems to have large amounts of null values, it will be dropped for now.

In [None]:
# graph_count = 2

# fig, axes = plt.subplots(1, graph_count, figsize=(graph_width * graph_count, graph_height))
# fig.suptitle('Frequency Histograms of Instrumentalness Before vs After Normalization', fontsize=16)

# instrumentalness = audio_features_df['instrumentalness']
# filtered_instrumentalness = instrumentalness[instrumentalness > 0].apply(lambda x : x * 10000)

# columns_list = [instrumentalness, filtered_instrumentalness]

# for i, columns in enumerate(columns_list):
#     ax = axes[i]

#     ax.hist(columns, bins=num_bins, range=(0, 1), edgecolor='none')
#     ax.set_xlabel('Instrumentalness')
#     ax.set_ylabel('Frequency')
    
# plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# plt.show()

Looks so much better. Now that we know `Loudness` is heavily left-skewed, while `Speechiness`, `Acousticness` and `Liveness` are heavily right-skewed, we can apply a cube transformation to the former and a square-root transformation to the latter.

In [None]:
# Left-skewed feature: cube the (already rescaled) loudness values.
# Vectorised on the whole column instead of a per-element Python lambda.
after_graph_audio_features_df['loudness'] = before_graph_audio_features_df['loudness'] ** 3

# Right-skewed features: take the square root of each column.
non_zero_columns = ['speechiness', 'acousticness', 'liveness']
for column in non_zero_columns:
    after_graph_audio_features_df[column] = np.sqrt(before_graph_audio_features_df[column])

# Plot the affected features before and after the transformations.
features_list = ['loudness','speechiness', 'acousticness', 'liveness']

graph_title_before = 'Frequency Histogram of Skewed Audio Features Before Normalizing'
plot_graph(graph_title_before, graph_height, graph_width, features_list, before_graph_audio_features_df)

graph_title_after = 'Frequency Histogram of Skewed Audio Features After Normalizing'
plot_graph(graph_title_after, graph_height, graph_width, features_list, after_graph_audio_features_df)

# Snapshot the transformed frame under the names used by later cells.
graph_audio_features_df = after_graph_audio_features_df.copy()
audio_features_df = after_graph_audio_features_df.copy()

Although several audio features are yet to be close to normal, it is still way better than where we started from. Here's the final frequency histograms for every audio feature after various normalization techniques have been applied

In [None]:
# Same feature set drawn twice: first from the untouched copy of the loaded
# data, then from the transformed frame.
histogram_runs = (
    ('Original Frequency Histogram of Audio Features', original_audio_features_by_genre_df),
    ('Frequency Histograms of Audio Features', audio_features_df),
)

for graph_title, frame in histogram_runs:
    plot_graph(graph_title, graph_height, graph_width, audio_features_list, frame)

### 2.3 Combining audio_features_df and track_details_df

Due to some issues with Spotify's song database, the audio features for all songs still remain in the DB (presumably for song analysis purposes), but the track details (album, date of release, artist) have been removed. 

This caused quite a headache and took a long time to track down, but at last it has been fixed: rows with such missing data are labelled as NA during the data retrieval process, which keeps the pre-processing simple.

The `merged_df` now contains information from `audio_features_df` as well as `track_details_df`, joined on the track id

In [None]:
# Inner-join the audio features with the track details on the shared 'id' column.
# NOTE(review): track_details_df is not defined in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel - confirm where it
# should be loaded from.
merged_df = audio_features_df.merge(track_details_df, on='id', how='inner')

# Ids of rows whose track_uri is missing after the merge.
tracks_with_missing_details = merged_df.loc[merged_df['track_uri'].isna(), 'id']

# Drop every row whose id appears in that set (not only the rows with the missing uri).
merged_df = merged_df.drop(index=merged_df[merged_df['id'].isin(tracks_with_missing_details)].index)

### 2.4 One-Hot-Encoding (OHE) for `track_popularity` and `release_date` in `track_details_df`
One-Hot Encoding (OHE) is used to convert categorical variables into a binary format, where each category is represented by a separate binary feature, making the data suitable for machine learning algorithms that require numerical inputs. Here's how we will be performing OHE:
- OHE for `track_popularity` can be done by grouping them into buckets of their nearest 10s
- OHE for `release_date` can be done by grouping them into the year released 

In [None]:
def _release_year(date_str):
    """Return the part before the first '-' of a date string; pass other values through unchanged."""
    if isinstance(date_str, str) and '-' in date_str:
        return date_str.split('-')[0]
    return date_str


def _popularity_bucket(popularity):
    """Floor a popularity score to its multiple of 10; missing values are returned as-is."""
    if pd.isna(popularity):
        return popularity
    return int(popularity // 10) * 10


# Keep the un-bucketed values so the transformations always read from the same input.
original_merged_df = merged_df.copy()

merged_df['release_date'] = original_merged_df['release_date'].apply(_release_year)
merged_df['track_popularity'] = original_merged_df['track_popularity'].apply(_popularity_bucket)

# Data Visualization
## Scatter Plot 
Allows better visualization of how one audio feature tends to affect the rest

In [None]:
# One figure per audio feature: that feature on the x-axis against every
# feature (itself included) on the y-axis, laid out in a single row.
num_rows = 1
num_cols = len(audio_features_list)

subplot_size = 5
fig_row_size = subplot_size * num_cols
fig_col_size = subplot_size

for af in audio_features_list:
    x_series = audio_features_df[af]
    x_values = [x_series.min(), x_series.max()]

    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(fig_row_size, fig_col_size))

    for ax, feature in zip(axs, audio_features_list):
        y_series = audio_features_df[feature]

        ax.hexbin(x=x_series, y=y_series, alpha=1, gridsize=25, cmap='terrain')
        ax.set_xlabel(af)
        ax.set_ylabel(feature)

        # Overlay a degree-1 least-squares fit, drawn between the min and max x values.
        slope, intercept = np.polyfit(x_series, y_series, 1)
        y_values = [slope * x + intercept for x in x_values]
        ax.plot(x_values, y_values, color='red', linewidth=2)

    plt.tight_layout()
    plt.savefig(f"../../resources/audio_feature_plots/plot_{af}.png", format="png", dpi=30)
    plt.show()

## Genre Clustering Using KMeans Algorithm

Here's an overview of the whole clustering process:
1. Identifying the optimal value of K for the Partitioning Clustering using the Elbow Method.
2. Run Kmeans algorithm using the optimal K value.
3. For each cluster, re-run the Elbow Method to find the optimal K value for Hierarchical Subclustering.

### Constant Declarations for KMeans Algorithm

In [None]:
from IPython.display import clear_output # clears output for better logging
from sklearn.cluster import KMeans
from collections import Counter # used to count frequency of genres in each cluster

# Bounds of the K values tried by the elbow-method cells. They are passed to
# range(min_clusters, max_clusters), so max_clusters itself is excluded.
min_clusters = 1
max_clusters = 11

# Columns fed to KMeans as the feature matrix.
# NOTE: key and duration_ms are left out; instrumentalness is not listed either.
audio_feature_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo']

In [None]:
# Elbow method on the full data set: fit one KMeans model per candidate K
# and record its inertia.
X = audio_features_by_genre_df.loc[:, audio_feature_columns]
candidate_ks = range(min_clusters, max_clusters)
inertia_values = []

for k in candidate_ks:
    status = f'Attempting Cluster Size: {k}'
    # '\r' returns to the start of the line so each status overwrites the previous one.
    print(status, end='\r')

    kmeans_main_model = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia_values.append(kmeans_main_model.inertia_)

clear_output(wait=True)

# Inertia against K; the "elbow" of this curve suggests the K to use.
plt.figure(figsize=(10, 6))
plt.plot(candidate_ks, inertia_values, marker='o')
plt.title('Determining Optimal Value for K')
plt.xlabel('Number of Clusters, K')
plt.ylabel('Inertia')
plt.xticks(candidate_ks)
plt.show()

In [None]:
# Fit the top-level KMeans model with the chosen K and label every row.
num_clusters = 5

audio_features_kmeans_df = audio_features_by_genre_df.copy()
X = audio_features_kmeans_df.loc[:, audio_feature_columns]

kmeans_main_model = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_main_model.fit(X)

# Labels are attached before sorting, while the row order still matches X.
audio_features_kmeans_df['cluster'] = kmeans_main_model.labels_
audio_features_kmeans_df = audio_features_kmeans_df.sort_values(by='cluster')

# Export only the identifying columns together with the cluster label.
clustered_audio_features_by_genre_file_path = '../../data/processed_data/genres/clustered_audio_features_by_genre.csv'
selected_columns = ['genre', 'cluster', 'track_name', 'track_uri']
selected_df = audio_features_kmeans_df[selected_columns]
selected_df.to_csv(clustered_audio_features_by_genre_file_path, index=False)

print(f'Sorted and selected data with cluster assignments written to {clustered_audio_features_by_genre_file_path}')

In [None]:
# Removed for now, to idenfity top 20 genres in each cluster

# from collections import Counter

# # Group the DataFrame by 'cluster' and aggregate genres
# clustered_genre_groups = selected_df.groupby('cluster')['genre'].apply(lambda x: list(x))

# for cluster, genres in clustered_genre_groups.items():
#     genre_counter = Counter(genres)
#     most_common_genres = genre_counter.most_common(20)
    
#     print(f"Cluster {cluster}:")
#     for genre, count in most_common_genres:
#         print(f"  {genre}: {count}")
#     print()

In [None]:
# Repeat the elbow method inside each top-level cluster and overlay the
# resulting inertia curves on one chart.
unique_clusters = audio_features_kmeans_df['cluster'].unique()
subcluster_ks = range(min_clusters, max_clusters)

plt.figure(figsize=(10, 6))

for cluster in unique_clusters:
    status = f'Finding optimal value of K for Cluster {cluster}'
    print(status)

    # Feature matrix restricted to the rows of the current cluster.
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]
    X = cluster_data.loc[:, audio_feature_columns]

    inertia_values = []
    for k in subcluster_ks:
        status = f'Attempting Cluster Size: {k}'
        print(status, end='\r')

        kmeans_subcluster_model = KMeans(n_clusters=k, random_state=42).fit(X)
        inertia_values.append(kmeans_subcluster_model.inertia_)

    clear_output(wait=True)

    # Add this cluster's curve to the shared axes and refresh the decorations.
    plt.plot(subcluster_ks, inertia_values, marker='o', label=f'Cluster {cluster}')
    plt.title('Elbow Method for Optimal K')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Inertia')
    plt.xticks(subcluster_ks)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Chosen number of subclusters for each top-level cluster label.
num_subclusters_per_cluster = {
    0: 6,
    1: 6,
    2: 7,
    3: 7,
    4: 6,
}

# cluster label -> KMeans model fitted on that cluster's rows only
subcluster_models = {}

for cluster, num_subclusters in num_subclusters_per_cluster.items():
    status = f'Processing Subclustering for Cluster {cluster}'
    print(status, end='\r')

    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]

    # Subclustering uses the audio features only, never the cluster/subcluster
    # labels. Each cluster gets its own model, stored in subcluster_models.
    X_subcluster = cluster_data.loc[:, audio_feature_columns]

    kmeans_subcluster_model = KMeans(n_clusters=num_subclusters, random_state=42)
    kmeans_subcluster_model.fit(X_subcluster)
    subcluster_models[cluster] = kmeans_subcluster_model

    # Write the model's labels for these rows back onto the main frame.
    subcluster_labels = kmeans_subcluster_model.predict(X_subcluster)
    audio_features_kmeans_df.loc[in_cluster, 'subcluster'] = subcluster_labels.astype(int)

audio_features_kmeans_df.sort_values(by=['cluster', 'subcluster'], inplace=True)

# Re-export, this time including the subcluster label.
selected_columns = ['genre', 'cluster', 'subcluster', 'track_name', 'track_uri']
audio_features_kmeans_df[selected_columns].to_csv(clustered_audio_features_by_genre_file_path, index=False)

print("All subclustering processes completed.")


In [None]:
# Build the combined "cluster, subcluster" label used by the sub-cluster
# breakdown cell further down.
# The label is derived in memory rather than by re-reading the exported CSV,
# so the working frame keeps its audio-feature columns and the clustering
# cells above remain re-runnable.
audio_features_kmeans_df['genre_cluster'] = (
    audio_features_kmeans_df['cluster'].astype(int).astype(str) +
    ', ' +
    audio_features_kmeans_df['subcluster'].astype(int).astype(str)
)

# Same columns the previous export wrote, plus the new combined label.
export_columns = ['genre', 'cluster', 'subcluster', 'track_name', 'track_uri', 'genre_cluster']
audio_features_kmeans_df[export_columns].to_csv(clustered_audio_features_by_genre_file_path, index=False)

print("Added 'genre_cluster' column and saved to CSV.")

### Breakdown of Cluster Details

In [None]:
# Rows per top-level cluster, ordered by cluster label.
cluster_counts = audio_features_kmeans_df['cluster'].value_counts().sort_index()
sizes = cluster_counts.values
total_occurrences = len(audio_features_kmeans_df)

# Pie chart of cluster sizes; the cluster names go in the legend rather than on the wedges.
legend_labels = [f"Cluster {cluster}" for cluster in cluster_counts.index]

plt.figure(figsize=(5, 5))
plt.pie(sizes, autopct='', startangle=140)
plt.axis('equal')
plt.title("Distribution of Occurrences across Clusters")
plt.legend(labels=legend_labels, loc="best")
plt.show()

# Text summary: each cluster's row count and its share of all rows.
for cluster, count in cluster_counts.items():
    percentage = (count / total_occurrences) * 100
    print(f"Cluster {cluster}: {count} occurrences ({percentage:.2f}%)")

### Breakdown of Sub-cluster Details

In [None]:
# Share of all rows that falls into each (cluster, subcluster) pair.
total_subcluster_occurrences = len(audio_features_kmeans_df)
subcluster_counts = audio_features_kmeans_df['genre_cluster'].value_counts().sort_index()
subcluster_percentages = (subcluster_counts / total_subcluster_occurrences) * 100

subcluster_labels = [f"({subcluster})" for subcluster in subcluster_counts.index]
subcluster_sizes = subcluster_percentages.values

# Pie chart with one wedge per pair; labeldistance places the labels just outside the pie.
plt.figure(figsize=(8, 8))
plt.pie(subcluster_sizes, labels=subcluster_labels, autopct='', startangle=140, labeldistance=1.1)
plt.axis('equal')
plt.title("Distribution of Subclusters as a Percentage of Total")
plt.show()

# Text breakdown per cluster. Percentages are relative to the total row
# count, not to the size of the enclosing cluster.
for cluster in cluster_counts.index:
    in_cluster = audio_features_kmeans_df['cluster'] == cluster
    cluster_data = audio_features_kmeans_df[in_cluster]
    subcluster_counts = cluster_data['subcluster'].value_counts().sort_index()

    print(f"Subclusters for Cluster {cluster}:")
    for subcluster, subcount in subcluster_counts.items():
        subpercentage = (subcount / total_subcluster_occurrences) * 100
        print(f"  Subcluster {subcluster}: {subcount} occurrences ({subpercentage:.2f}%)")

#### Most Common Genre in each Cluster/Subcluster

In [None]:
def _five_most_common(genres):
    """Return the five most frequent genres as (genre, count) pairs."""
    return Counter(genres).most_common(5)


# Five most frequent genres within each top-level cluster.
cluster_top_genres = audio_features_kmeans_df.groupby('cluster')['genre'].apply(_five_most_common)

for cluster, top_genres in cluster_top_genres.items():
    genre_names = ', '.join(genre for genre, _ in top_genres)
    print(f"Cluster {cluster}: Top 5 Genres - {genre_names}")

In [None]:
def _five_most_common_names(genres):
    """Return the names of the five most frequent genres, most frequent first."""
    ranked = Counter(genres).most_common(5)
    return [genre for genre, count in ranked]


# Five most frequent genres within each (cluster, subcluster) pair.
genres_by_pair = audio_features_kmeans_df.groupby(['cluster', 'subcluster'])['genre']
cluster_subgroup_genre_counts = genres_by_pair.apply(_five_most_common_names)

for genre_cluster, top_genres in cluster_subgroup_genre_counts.items():
    top_genres_str = ", ".join(top_genres)
    print(f"Subcluster {genre_cluster}: Top 5 Genres - {top_genres_str}")

### Cluster/Subcluster Prediction Time

In [None]:
# Audio features of a single example track, used to try out the fitted models.
sample_track = [{'id': '4OAuvHryIVv4kMDNSLuPt6', 'danceability': 0.516, 'energy': 0.777, 'key': 1, 'loudness': -4.908, 'speechiness': 0.0375, 'acousticness': 0.00108, 'instrumentalness': 1.62e-06, 'liveness': 0.0761, 'valence': 0.408, 'tempo': 125.047, 'duration_ms': 223093}]

audio_features = sample_track[0]

# Build a one-row frame whose columns are selected and ordered by
# audio_feature_columns, the same list used to fit the models. This keeps the
# feature order in sync with training and gives the models named columns, as
# they saw during fit. A missing feature raises KeyError.
sample_track_audio_features = pd.DataFrame([audio_features])[audio_feature_columns]

# Predict the top-level cluster first, then use that cluster's own model for the subcluster.
predicted_cluster = kmeans_main_model.predict(sample_track_audio_features)
kmeans_subcluster_model = subcluster_models[predicted_cluster[0]]
predicted_subcluster = kmeans_subcluster_model.predict(sample_track_audio_features)

print(f"Sample track belongs to Cluster {predicted_cluster[0]}, Subcluster {predicted_subcluster[0]}")