In [None]:
# clustering for the SpotipY API 
# the general IDEA is to use multi level clustering for the different audio features available 
# below is the idea behind the model 
# the clustering is based on the idea that we should have 4 levels of clsutering based on the following audio features 

"""Level 1: Broad Emotional and Genre Characteristics

Start with features that capture the overall feel and genre of the music.

Valence: Musical positiveness (happy, cheerful vs. sad, serious).
Energy: Intensity and activity (high-energy tracks are fast, loud, and noisy).
Acousticness: Presence of acoustic sounds (high values indicate more acoustic)."""


""""Level 2: Rhythmic and Dance Characteristics

Next, refine clusters based on rhythmic and dance-related attributes.

Danceability: Suitability for dancing (based on tempo, rhythm stability, beat strength).
Tempo: Speed or pace of the music (measured in beats per minute, BPM).
Liveness: Presence of a live audience (high values indicate live performance)."""


"""Level 3: Structural and Instrumental Characteristics

Further refine based on the structural and instrumental aspects of the music.

Instrumentalness: Presence of instrumental content (higher values indicate less vocal content).
Speechiness: Presence of spoken words (high values indicate more speech-like content).
Loudness: Overall loudness (measured in decibels)."""

"""Level 4: Musical Key and Mode

Finally, use musical key and mode for fine-tuning recommendations.

Key: The key in which the track is written (e.g., C major, A minor).
Mode: The modality of the track (major or minor)."""

In [None]:
import pandas as pd
from time import sleep
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import pickle
import seaborn as sns
import matplotlib as plt

In [None]:
tracks_and_features_df = pd.read_csv('tracks_and_features.csv')
features_df = pd.read_csv('features.csv')

In [None]:
features_df

In [None]:
audio_features_model_on = ['is_explicit','popularity', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature']
features_df = features_df[audio_features_model_on]

In [11]:

audio_features_model_on = ['is_explicit','popularity', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature']
features_df = features_df[audio_features_model_on]

# Grouping the audio features into cluster levels
features_L1 = ['valence', 'energy', 'acousticness']
features_L2 = ['danceability', 'tempo', 'liveness']
features_L3 = ['instrumentalness', 'speechiness', 'loudness']
features_L4 = ['key', 'mode']

# Scaling the features using standard scaler first (could be changed to min max scaler if needed)
scaler = StandardScaler()

def perform_clustering(data, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(data)
    return clusters, kmeans

# level  1 clustering 
level_L1_scaled = scaler.fit_transform(features_df[features_L1])
clusters_L1, kmeans_L1 = perform_clustering(level_L1_scaled, n_clusters=5)  # Level 1 =5

features_df['cluster_L1'] = clusters_L1

# Initialize the final clusters list
final_clusters = np.zeros(features_df.shape[0])

# Secondary clustering (Level 2)
cluster_counter = 0
kmeans_L2_models ={}
for cluster_L1_label in np.unique(clusters_L1):
    level_2_data = features_df[features_df['cluster_L1'] == cluster_L1_label][features_L2]
    level_2_scaled = scaler.fit_transform(level_2_data)
    clusters_L2, kmeans_L2 = perform_clustering(level_2_scaled, n_clusters=5)  # level 2 = 5
    
    features_df.loc[features_df['cluster_L1'] == cluster_L1_label, 'cluster_L2'] = clusters_L2 + cluster_counter * 5
    kmeans_L2_models[cluster_L1_label]= kmeans_L2
    cluster_counter += 1

#copying the same loop for the cluster 3 number of clusters = 2
cluster_counter = 0
kmeans_L3_models = {}
for cluster_L2_label in np.unique(features_df['cluster_L2']):
    level_3_data = features_df[features_df['cluster_L2'] == cluster_L2_label][features_L3]
    level_3_scaled = scaler.fit_transform(level_3_data)
    clusters_L3, kmeans_L3 = perform_clustering(level_3_scaled, n_clusters=2)  # level 3 = 2
    
    features_df.loc[features_df['cluster_L2'] == cluster_L2_label, 'cluster_L3'] = clusters_L3 + cluster_counter * 2
    kmeans_L3_models[cluster_L2_label] = kmeans_L3
    cluster_counter += 1

#copying the same loop for the cluster 4 number of clusters = 2
cluster_counter = 0
kmeans_L4_models = {}
for cluster_L3_label in np.unique(features_df['cluster_L3']):
    level_4_data = features_df[features_df['cluster_L3'] == cluster_L3_label][features_L4]
    level_4_scaled = scaler.fit_transform(level_4_data)
    clusters_L4, kmeans_L4 = perform_clustering(level_4_scaled, n_clusters=2)  # level 4 = 2
    
    features_df.loc[features_df['cluster_L3'] == cluster_L3_label, 'cluster_L4'] = clusters_L4 + cluster_counter
    kmeans_L4_models[cluster_L3_label] = kmeans_L4
    cluster_counter += 1

with open('kmeans_L1_models.pkl','wb') as handle:
    pickle.dump(kmeans_L1,handle)
with open('kmeans_L2_models.pkl','wb') as handle:
    pickle.dump(kmeans_L2_models,handle)
with open('kmeans_L3_models.pkl', 'wb') as handle:
    pickle.dump(kmeans_L3_models,handle)
with open('kmeans_L4_models','wb') as handle:
    pickle.dump(kmeans_L4_models,handle)


features_df.to_pickle('features_df_with_clusters.pkl')

In [None]:
features_df

In [None]:
features_df['cluster_L4'].nunique()

In [12]:
x =features_df.groupby(['cluster_L1','cluster_L2','cluster_L3','cluster_L4'])['popularity'].size()
pd.set_option('display.max_columns',None)
pd.set_option('display.max.rows',None)
x

cluster_L1  cluster_L2  cluster_L3  cluster_L4
0           0.0         0.0         0.0            126
                                    1.0            198
                        1.0         1.0            143
                                    2.0            329
            1.0         2.0         2.0            839
                                    3.0            580
                        3.0         3.0            118
                                    4.0             90
            2.0         4.0         4.0            609
                                    5.0            701
                        5.0         5.0            128
                                    6.0            145
            3.0         6.0         6.0             68
                                    7.0             58
                        7.0         7.0             79
                                    8.0             34
            4.0         8.0         8.0            115
                  

In [None]:
tracks_df = pd.read_csv('tracks.csv')

In [None]:
tracks_df

In [None]:
df_check = pd.concat([tracks_and_features_df,features_df], axis=1,join='inner')

In [None]:
def plot_clusters(data, cluster_column, title, features):
    # Perform PCA to reduce to 2 dimensions for visualization
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(data[features])
    
    # Create a DataFrame with PCA results and cluster assignments
    plot_df = pd.DataFrame(reduced_data, columns=['PCA1', 'PCA2'])
    plot_df['Cluster'] = data[cluster_column].values
    
    # Plotting
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='viridis', data=plot_df, legend='full')
    plt.title(title)
    plt.show()


In [None]:
plot_clusters(features_df, 'cluster_L1', 'Level 1 Clusters', features_L1)
