In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import KMeans

In [2]:
# Load the Spotify top-tracks table. The CSV's first column is an unnamed saved
# index (it otherwise shows up as a spurious "Unnamed: 0" data column), so it is
# read back as the frame's index instead of as a feature column.
df = pd.read_csv("top2018.csv", index_col=0)

In [3]:
# Select the audio features to cluster on, choose the number of clusters, and fit KMeans.
# NOTE(review): the features are not scaled. duration_ms is on the order of 1e5 while
# many of the other columns lie in [0, 1], so Euclidean distances (and therefore the
# clusters) are presumably dominated by duration_ms — consider standardizing first.
params = ["danceability","energy","key","loudness","speechiness","acousticness","instrumentalness","liveness","valence","tempo","duration_ms"]
n = 5
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to 'auto'
# (a single k-means++ initialisation), which would otherwise change the resulting
# labels/centroids depending on the installed version.
kmeans = KMeans(n_clusters=n, n_init=10, random_state=0).fit(df[params])

In [4]:
# Attach each song's fitted cluster id as a new 'label' column.
df = df.assign(label=kmeans.labels_)

In [12]:
# First 25 rows of the chart, ordered by the cluster each song was assigned to
top_25 = df.iloc[:25]
top_25.sort_values("label")

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre,genre2,label
0,6DCZcSspjsKoFjzjrWoCd,God's Plan,Drake,0.754,0.449,7,-9.211,1,0.109,0.0332,8.3e-05,0.552,0.357,77.169,198973,4,hip-hop,hip-hop/rap,0
1,3ee8Jmje8o58CHK66QrVC,SAD!,XXXTENTACION,0.74,0.613,8,-4.88,1,0.145,0.258,0.00372,0.123,0.473,75.023,166606,4,hip-hop,hip-hop/rap,0
19,4qKcDkK6siZ7Jp1Jb4m0a,Look Alive (feat. Drake),BlocBoy JB,0.922,0.581,10,-7.495,1,0.27,0.00104,5.9e-05,0.105,0.595,140.022,181263,4,rap,hip-hop/rap,0
17,39N9RPD9MRb5WmoLzNzPe,X,Nicky Jam,0.595,0.773,9,-4.736,0,0.0549,0.0364,0.00108,0.334,0.711,180.073,173628,4,latin,latin,0
14,09IStsImFySgyp0pIQdqA,The Middle,Zedd,0.753,0.657,7,-3.061,1,0.0449,0.171,0.0,0.112,0.437,107.01,184732,4,edm,electronic/dance,0
9,08bNPGLD8AhKpnnERrAc6,FRIENDS,Marshmello,0.626,0.88,9,-2.384,0,0.0504,0.205,0.0,0.128,0.534,95.079,202621,4,pop,pop,0
23,2iUXsYOEPhVqEBwsqP70r,Youngblood,5 Seconds of Summer,0.596,0.854,7,-5.114,0,0.463,0.0169,0.0,0.124,0.152,120.274,203418,4,rock,rock,0
6,58q2HKrzhC3ozto2nDdN4,I Like It,Cardi B,0.816,0.726,5,-3.998,0,0.129,0.099,0.0,0.372,0.65,136.048,253390,4,hip-hop,hip-hop/rap,1
20,5CLGzJsGqhCEECcpnFQA8,"These Days (feat. Jess Glynne, Macklemore & Da...",Rudimental,0.653,0.809,0,-4.057,1,0.0474,0.194,0.0,0.165,0.55,92.213,210773,4,dance,electronic/dance,2
16,2qT1uLXPVPzGgFOx4jtEu,no tears left to cry,Ariana Grande,0.699,0.713,9,-5.507,0,0.0594,0.04,3e-06,0.294,0.354,121.993,205920,4,pop,pop,2


In [6]:
# Print every cluster's centroid, one line per clustered parameter,
# with a blank gap between clusters.
for cluster_id in range(n):
    centroid = kmeans.cluster_centers_[cluster_id]
    print("Cluster ", cluster_id)
    for feature, value in zip(params, centroid):
        print(feature, ": ", value)
    print("\n")

Cluster  0
danceability :  0.7224634146341463
energy :  0.6736585365853659
key :  5.439024390243903
loudness :  -5.604390243902439
speechiness :  0.11845853658536586
acousticness :  0.19039419512195122
instrumentalness :  0.0034054065853658536
liveness :  0.1459219512195122
valence :  0.48646341463414633
tempo :  119.53573170731707
duration_ms :  186997.65853658534


Cluster  1
danceability :  0.6980000000000001
energy :  0.6174285714285714
key :  4.714285714285714
loudness :  -5.413428571428572
speechiness :  0.14718571428571428
acousticness :  0.17810428571428571
instrumentalness :  2.168404344971009e-19
liveness :  0.17535714285714288
valence :  0.40800000000000003
tempo :  121.51614285714285
duration_ms :  270596.2857142857


Cluster  2
danceability :  0.6971395348837209
energy :  0.6773023255813954
key :  5.27906976744186
loudness :  -5.427139534883721
speechiness :  0.09609302325581395
acousticness :  0.16269627906976744
instrumentalness :  0.0003370718604651158
liveness :  0.159

In [10]:
# Scatter of cluster label against genre2, coloured by cluster label.
# (The unused `i = df.index` assignment was dropped, and axis titles were added
# so the figure can be read on its own.)
fig1 = go.Figure(data=go.Scatter(x=df['genre2'],
                                y=df['label'],
                                mode='markers',
                                marker_color=df['label'],
                                text=df['name'])) # hover text goes here

fig1.update_layout(title='Top 100 Spotify Songs 2018',
                   xaxis_title='genre2',
                   yaxis_title='cluster label')
fig1.show()

In [8]:
# Two clusters - 0 and 2 - seem to encompass all contemporary genres.
# Songs in 0 are speechier and more instrumental than those in 2:
# more hard-hitting trap songs like HUMBLE. by Kendrick Lamar are in 0, while
# more pop songs like Congratulations by Post Malone and Happier by Marshmello are in 2.
# 0 tends to be a little dancier, more instrumental, and wordier, while 2 holds more catchy and heartfelt tunes.
# 3 is the XXXTENTACION cluster, and 4 is the Te Bote cluster.
# 1 has the two tracks with the least instrumentalness, and above-average liveness and duration compared to the other clusters.


In [11]:
# Scatter of danceability against cluster label, coloured by cluster label.
# (The unused `i = df.index` assignment was dropped, and axis titles were added
# so the figure can be read on its own.)
fig1 = go.Figure(data=go.Scatter(x=df['label'],
                                y=df['danceability'],
                                mode='markers',
                                marker_color=df['label'],
                                text=df['name'])) # hover text goes here

fig1.update_layout(title='Top 100 Spotify Songs 2018',
                   xaxis_title='cluster label',
                   yaxis_title='danceability')
fig1.show()

In [None]:
# This data is quite flawed.
# The mode (major or minor key) column is wildly inaccurate, so I had to remove it from the cluster fitting.
# The ratings for certain songs just don't make sense:
# heart-wrenching emo rap songs like Moonlight and Jocelyn Flores are rated more danceable than Drake's Nice For What
# (in my opinion, one of the most danceable songs to come out in 2018!).