Use this script to test a mockup K-means clustering model on preliminary Spotify track analysis data

In [19]:
# dependencies
import pandas as pd
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [22]:
# load the dataset
audio_df = pd.read_csv('../00_data/audio_features_test_data.csv')
print(audio_df.shape)
audio_df.head()

(1000, 67)


Unnamed: 0,artist_name,track_name,track_id,popularity,artist_uri,atl_hip_hop,baton_rouge_rap,cali_rap,canadian_contemporary_r&b,canadian_hip_hop,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Morgan Wallen,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,82,spotify:artist:4oUHIQIBe0LHzYfvXNW4QM,0,0,0,0,0,...,0.68,2e-06,6,0.115,-4.58,1,0.0289,149.959,4,0.707
1,Morgan Wallen,Chasin' You,5MwynWK9s4hlyKHqhkNn4A,80,spotify:artist:4oUHIQIBe0LHzYfvXNW4QM,0,0,0,0,0,...,0.591,0.0,7,0.11,-5.785,1,0.0277,97.074,4,0.489
2,Morgan Wallen,The Way I Talk,21LRaD9rB3v7p7DDCZsW4y,74,spotify:artist:4oUHIQIBe0LHzYfvXNW4QM,0,0,0,0,0,...,0.879,0.0,4,0.0909,-3.388,1,0.0322,137.983,4,0.757
3,Morgan Wallen,Up Down (feat. Florida Georgia Line),3EWMoDIm6lzuR0zQKtuCJX,74,spotify:artist:4oUHIQIBe0LHzYfvXNW4QM,0,0,0,0,0,...,0.882,0.0,6,0.146,-3.039,1,0.035,129.915,4,0.793
4,Morgan Wallen,Talkin' Tennessee,7szRiHu6r91o2Po9GTPN4c,72,spotify:artist:4oUHIQIBe0LHzYfvXNW4QM,0,0,0,0,0,...,0.711,0.0,6,0.104,-6.697,0,0.0248,99.988,4,0.498


### Data setup

In [23]:
# save columns that will be dropped
artists = audio_df[['artist_name', 'artist_uri']].copy()
tracks = audio_df[['track_name', 'track_id']].copy()

# drop columns
audio_df = audio_df.drop(columns=['track_name','track_id','artist_uri'])

# remove rows with NAs
audio_df = audio_df.dropna()
audio_df.shape

(1000, 64)

In [25]:
# create dummy variables for artist names
X = pd.get_dummies(audio_df.copy(), columns=['artist_name'], prefix='artist')
print(X.shape)
X.head()

(1000, 521)


Unnamed: 0,popularity,atl_hip_hop,baton_rouge_rap,cali_rap,canadian_contemporary_r&b,canadian_hip_hop,canadian_pop,chicago_rap,conscious_hip_hop,contemporary_country,...,artist_girl in red,artist_grandson,artist_guardin,artist_iKON,artist_iamjakehill,artist_ilyTOMMY,artist_keshi,artist_lofi.samurai,artist_lovelytheband,artist_mxmtoon
0,82,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,80,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,74,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,74,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,72,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# standardize the dataset
audio_scaled = StandardScaler().fit_transform(X)

In [28]:
audio_scaled

array([[ 2.03794185, -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ],
       [ 1.70186965, -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ],
       [ 0.69365303, -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ],
       ...,
       [-1.3227802 , -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ],
       [-1.1547441 , -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ],
       [-0.48259969, -0.20145155, -0.166581  , ..., -0.0316386 ,
        -0.0316386 , -0.0316386 ]])

### Principal component analysis

In [29]:
# PCA to reduce dimensionality
audio_pca = PCA(n_components=5).fit_transform(audio_scaled)

In [31]:
# save principal components to a dataframe
pca_df = pd.DataFrame(audio_pca, columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5'], index = audio_df.index)
print(pca_df.shape)
pca_df.head(10)

(1000, 5)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,1.189279,-1.40111,-0.175492,5.589756,-4.881027
1,1.249078,-1.605046,-0.52791,4.77719,-4.984074
2,1.184158,-1.263609,0.209001,6.363209,-4.78163
3,1.214732,-1.2617,0.247404,6.385191,-4.801758
4,1.152324,-1.578798,-0.246608,5.062658,-4.744541
5,1.129857,-1.53438,-0.324182,5.055818,-4.815049
6,1.433731,-0.410436,-1.517825,-2.332673,-2.968478
7,1.325442,-0.483541,-2.054914,-2.86504,-2.694491
8,1.327127,-0.636779,-1.653588,-2.917747,-2.713226
9,1.087452,-0.851587,-2.148919,-3.606592,-2.035706


### K-means: elbow curve

In [49]:
# look for the best K from 1-12

inertia = []
k_range = list(range(1, 13))

for i in k_range:
    km = KMeans(n_clusters = i, random_state = 0)
    km.fit(pca_df)
    inertia.append(km.inertia_)
    
# plot the curve
elbow_df = pd.DataFrame({"k": k_range, "inertia": inertia})
elbow_fig = px.line(elbow_df, x='k', y='inertia')
elbow_fig.show()

### K-means cluster analysis
- using k=7

In [38]:
# initialize the model
kmod = KMeans(n_clusters=7)

# fit the model
kmod.fit(pca_df)

# Predict clusters
kmod.predict(pca_df)

array([6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5,
       5, 5, 5, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 5, 5, 5, 2, 2, 1, 1, 1, 1, 6, 6, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 1, 0, 5, 5,
       0, 5, 5, 0, 5, 5, 0, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 0, 0, 0, 0, 0, 0, 0, 2, 3,
       1, 0, 0, 0, 5, 5, 5, 5, 5, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 1, 5, 5, 5, 1, 0, 0, 1, 1, 0, 1,
       0, 3, 0, 6, 0, 1, 0, 0, 5, 5, 5, 5, 1, 1, 1, 2, 2, 2, 2, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 5, 5, 1, 5, 5, 5, 5, 5, 5, 0,
       5, 5, 5, 0, 1, 5, 5, 5, 5, 1, 0, 1, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 5, 0, 0, 0, 5, 5, 5, 0, 2, 3,
       3, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 1, 0,

In [41]:
# save predictions to a dataframe

# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
cluster_df = pd.concat([audio_df, pca_df], axis=1)

cluster_df

# return track names to the dataset
cluster_df['track_name'] = tracks['track_name']

# add model predictions to the dataset
cluster_df['cluster'] = kmod.labels_

# Print the shape of the clustered_df
print(cluster_df.shape)
cluster_df.head(10)

(1000, 71)


Unnamed: 0,artist_name,popularity,atl_hip_hop,baton_rouge_rap,cali_rap,canadian_contemporary_r&b,canadian_hip_hop,canadian_pop,chicago_rap,conscious_hip_hop,...,tempo,time_signature,valence,PC1,PC2,PC3,PC4,PC5,track_name,cluster
0,Morgan Wallen,82,0,0,0,0,0,0,0,0,...,149.959,4,0.707,1.189279,-1.40111,-0.175492,5.589756,-4.881027,Whiskey Glasses,6
1,Morgan Wallen,80,0,0,0,0,0,0,0,0,...,97.074,4,0.489,1.249078,-1.605046,-0.52791,4.77719,-4.984074,Chasin' You,6
2,Morgan Wallen,74,0,0,0,0,0,0,0,0,...,137.983,4,0.757,1.184158,-1.263609,0.209001,6.363209,-4.78163,The Way I Talk,6
3,Morgan Wallen,74,0,0,0,0,0,0,0,0,...,129.915,4,0.793,1.214732,-1.2617,0.247404,6.385191,-4.801758,Up Down (feat. Florida Georgia Line),6
4,Morgan Wallen,72,0,0,0,0,0,0,0,0,...,99.988,4,0.498,1.152324,-1.578798,-0.246608,5.062658,-4.744541,Talkin' Tennessee,6
5,Morgan Wallen,70,0,0,0,0,0,0,0,0,...,99.934,4,0.672,1.129857,-1.53438,-0.324182,5.055818,-4.815049,If I Know Me,6
6,The Weeknd,91,0,0,0,1,0,1,0,0,...,113.26,4,0.155,1.433731,-0.410436,-1.517825,-2.332673,-2.968478,I Was Never There,1
7,The Weeknd,90,0,0,0,1,0,1,0,0,...,134.17,3,0.175,1.325442,-0.483541,-2.054914,-2.86504,-2.694491,Call Out My Name,1
8,The Weeknd,75,0,0,0,1,0,1,0,0,...,92.026,4,0.107,1.327127,-0.636779,-1.653588,-2.917747,-2.713226,Try Me,1
9,The Marías,70,0,0,0,1,0,1,0,0,...,113.982,4,0.66,1.087452,-0.851587,-2.148919,-3.606592,-2.035706,Cariño,1


In [50]:
cluster_df['cluster'].value_counts()

0    517
1    216
5    170
4     26
2     24
3     24
6     23
Name: cluster, dtype: int64