In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import KMeans

In [None]:
# Data: Kaggle "Top Spotify Tracks of 2018"
#   https://www.kaggle.com/nadintamer/top-spotify-tracks-of-2018
# Two columns were added by hand on top of the Kaggle file:
#   - a genre per song, taken from the song's first tag on last.fm
#   - genre2, which merges similar genres together
csv_path = "top2018.csv"
df = pd.read_csv(csv_path)

In [None]:
# Choose the feature columns to cluster on and the number of clusters, then fit k-means.
# NOTE(review): the columns are passed to KMeans unscaled. If some of them (e.g. duration_ms,
# tempo) span much larger ranges than the others, they will dominate the distance metric --
# confirm whether standardising the features first is wanted.
params = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]
n = 5
# n_init is pinned explicitly: scikit-learn changed its default from 10 to "auto" in 1.4,
# which would otherwise make the cluster labels discussed below depend on the installed version.
kmeans = KMeans(n_clusters=n, n_init=10, random_state=0).fit(df[params])

In [None]:
# store each row's fitted cluster index in a 'label' column
cluster_ids = kmeans.labels_
df['label'] = cluster_ids

In [None]:
# First 25 rows of the table (the top 25 songs of the year), ordered by cluster label
df.iloc[:25].sort_values(by='label')

In [None]:
# Print every cluster's centroid, one "<parameter> :  <value>" line per clustered column

for cluster_id in range(n):
    centroid = kmeans.cluster_centers_[cluster_id]
    print("Cluster ", cluster_id)
    # centroid coordinates are in the same order as `params`
    for feature, center in zip(params, centroid):
        print(feature, ": ", center)

    print("\n")

In [None]:
# Scatter of cluster label against the combined genre column (genre2);
# markers are coloured by cluster label and show the track name on hover.
fig1 = go.Figure(data=go.Scatter(x=df['genre2'],
                                y=df['label'],
                                mode='markers',
                                marker_color=df['label'],
                                text=df['name'])) # hover text goes here

# label both axes so the figure can be read without the surrounding code
fig1.update_layout(title='Top 100 Spotify Songs 2018',
                   xaxis_title='Genre (genre2)',
                   yaxis_title='Cluster label')
fig1.show()

In [None]:
# FOR N=5

# two clusters - 0 and 2 - seem to encompass all contemporary genres.
# songs in 0 are speechier and more instrumental than 2 
# more hard-hitting trap songs, like HUMBLE. by Kendrick Lamar, are in 0
# more pop songs, like Congratulations by Post Malone and Happier by Marshmello, are in 2
# 0 tends to be a little dancier, instrumental, and wordy, while 2 are more catchy and heartfelt tunes.
# 3 is the XXXTentacion cluster (quiet, short, acoustic, high liveness), 4 is the Te Bote cluster
# 1 has two tracks with the least instrumentalness, and above-average liveness and duration compared with other clusters


In [None]:
# Scatter of danceability within each cluster;
# markers are coloured by cluster label and show the track name on hover.
fig2 = go.Figure(data=go.Scatter(x=df['label'],
                                y=df['danceability'],
                                mode='markers',
                                marker_color=df['label'],
                                text=df['name'])) # hover text goes here

# label both axes so the figure can be read without the surrounding code
fig2.update_layout(title='Top 100 Spotify Songs 2018',
                   xaxis_title='Cluster label',
                   yaxis_title='Danceability')
fig2.show()

In [None]:
# This data is quite flawed 
# the mode (major or minor key) column is wildly inaccurate, so I had to remove it from cluster fitting
# ratings for certain songs just don't make sense.
# Stir Fry by Migos is not 182 BPM, and Sicko Mode by Travis Scott is not 155 BPM;
# the actual tempos of these tracks are half those values
# sad, emo rap songs like Moonlight and Jocelyn Flores are rated more danceable than Drake's Nice For What
# (in my opinion, one of the most danceable songs of 2018!)

In [None]:
# trying to visualize the clusters by plotting (hypothetically) contradictory parameters

In [None]:
# Scatter of energy against acousticness;
# markers are coloured by cluster label and show the track name on hover.
fig3 = go.Figure(data=go.Scatter(x=df['acousticness'],
                                 y=df['energy'],
                                 mode='markers',
                                 marker_color=df['label'],
                                 text=df['name'])) # hover text goes here

# label both axes so the figure can be read without the surrounding code
fig3.update_layout(title='Top 100 Spotify Songs 2018',
                   xaxis_title='Acousticness',
                   yaxis_title='Energy')
fig3.show()