### Load relevant packages and sign into Spotify instance

In [2]:
import os, sys, json, requests, urllib.parse
import pandas as pd

import spotipy
import spotipy.util as util

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
# set API keys
# (read through a context manager so the file handle is closed once the JSON is parsed)
with open("data/api-keys.json") as apikeys_file:
    apikeys = json.load(apikeys_file)

# export the credentials as environment variables; presumably spotipy picks them up
# from here, since prompt_for_user_token is later called without explicit credentials
os.environ["SPOTIPY_CLIENT_ID"]     = apikeys["spotipy-client-id"]
os.environ["SPOTIPY_CLIENT_SECRET"] = apikeys["spotipy-client-secret"]
os.environ["SPOTIPY_REDIRECT_URI"]  = apikeys["redirect-url"]

# set my user_id
user_id = '129874447'

In [4]:
# authenticate as user_id with the library-read and playlist-modify scopes,
# then build the API client that the helper functions below use as `sp`
token = util.prompt_for_user_token(
    user_id,
    scope = 'user-library-read, playlist-modify-public, playlist-modify-private'
)
sp = spotipy.Spotify(auth = token)

### Define helper functions for interfacing with Spotify

In [5]:
### function to load user's saved tracks
def pull_saved_tracks(limit = 50, offset = 0):
    """Return every saved track of the current user as a list of dicts.

    Pages through `sp.current_user_saved_tracks` (the module-level Spotify
    client) `limit` tracks at a time, starting at `offset`.

    Parameters:
        limit:  number of tracks requested per page
        offset: position of the first track to fetch

    Returns:
        list of {'name', 'artists', 'track_id'} dicts, where 'artists' is a
        comma-separated string of artist names; empty if offset >= total
    """
    saved_tracks = [ ]

    # the first page both tells us the total and supplies the first batch,
    # so it is not requested a second time inside the loop
    saved_tracks_obj = sp.current_user_saved_tracks(limit = limit, offset = offset)
    num_saved_tracks = saved_tracks_obj['total']

    # loop through to pull in all saved tracks
    while (offset < num_saved_tracks):
        # add track information to running list
        for track_obj in saved_tracks_obj['items']:
            track = track_obj['track']
            saved_tracks.append({
                'name': track['name'],
                'artists': ', '.join([artist['name'] for artist in track['artists']]),
                'track_id': track['id']
            })

        offset += limit

        # only request another page if there is one
        if offset < num_saved_tracks:
            saved_tracks_obj = sp.current_user_saved_tracks(limit = limit, offset = offset)

    return saved_tracks

### function to load tracks from a specified playlist
def pull_playlist_tracks(user_id, playlist_id, limit = 100, offset = 0):
    """Return the tracks of one playlist as a list of dicts.

    Pages through `sp.user_playlist_tracks` (the module-level Spotify client)
    `limit` tracks at a time, starting at `offset`.

    Parameters:
        user_id:     owner of the playlist
        playlist_id: id of the playlist to read
        limit:       number of tracks requested per page
        offset:      position of the first track to fetch

    Returns:
        list of {'name', 'artists', 'track_id'} dicts, where 'artists' is a
        comma-separated string of artist names; playlist entries without a
        track object are skipped
    """
    playlist_tracks = [ ]

    # the first page both tells us the total and supplies the first batch,
    # so it is not requested a second time inside the loop
    playlist_obj = sp.user_playlist_tracks(user = user_id, playlist_id = playlist_id,
                                           limit = limit, offset = offset)
    num_playlist_tracks = playlist_obj['total']

    # loop through to pull in all playlist tracks
    while (offset < num_playlist_tracks):
        # add track information to running list
        for track_obj in playlist_obj['items']:
            track = track_obj['track']

            # NOTE(review): Spotify can apparently return entries whose 'track' is null
            # (e.g. removed/unavailable tracks); skip them rather than crash -- confirm
            if track is None:
                continue

            playlist_tracks.append({
                'name': track['name'],
                'artists': ', '.join([artist['name'] for artist in track['artists']]),
                'track_id': track['id']
            })

        offset += limit

        # only request another page if there is one
        if offset < num_playlist_tracks:
            playlist_obj = sp.user_playlist_tracks(user = user_id, playlist_id = playlist_id,
                                                   limit = limit, offset = offset)

    return playlist_tracks

### function to load spotify audio features when given a list of track ids
def pull_audio_features(track_ids):
    """Fetch audio features for `track_ids` from the module-level client `sp`.

    The ids are sent to `sp.audio_features` in consecutive slices of 50 and
    the per-slice results are concatenated, in order, into one list.
    """
    batch_size = 50
    batch_starts = range(0, len(track_ids), batch_size)

    # flatten the per-batch responses into a single list
    return [
        feat
        for start in batch_starts
        for feat in sp.audio_features(track_ids[start:start + batch_size])
    ]

### Try clustering on pre-made Spotify playlists

In [6]:
# pull tracks for "ambient chill" playlist
testA_tracks    = pull_playlist_tracks(user_id = 'spotify', playlist_id = '37i9dQZF1DX3Ogo9pFvBkY')
testA_tracks_df = pd.DataFrame(testA_tracks)
testA_tracks_df['playlist'] = "ambient chill"

# pull tracks for "hardstyle hits" playlist
testB_tracks    = pull_playlist_tracks(user_id = 'spotify', playlist_id = '37i9dQZF1DX0pH2SQMRXnC')
testB_tracks_df = pd.DataFrame(testB_tracks)
testB_tracks_df['playlist'] = "hardstyle hits"

# stack all tracks together
# (pd.concat instead of DataFrame.append, which was removed in pandas 2.0;
#  like append, it keeps each frame's own row labels)
testAB_tracks_df = pd.concat([testA_tracks_df, testB_tracks_df])
testAB_tracks_df.head()

Unnamed: 0,artists,name,track_id,playlist
0,August Wilhelmsson,Now Is The Time To Leave,6wxi6j0tpjGJGczGrLeDYD,ambient chill
1,Yuki Sakura,Stillness Speaks,3fzTpBMSQKjo9sW0T8Tw2O,ambient chill
2,Eleanor Arroway,Trancendent Sleep,1NY2tT1A7uZwchez1vBShQ,ambient chill
3,chillchild,Gratitude,2ek6LLrUwT6p2zQCCvFmAW,ambient chill
4,Primer Dia,Astral Therapy,17Wjmh3nZzAywevzFHqJnx,ambient chill


In [7]:
# fetch the raw Spotify audio features for every track in the stacked frame
_testAB_audiofeat = pull_audio_features(track_ids = testAB_tracks_df['track_id'].tolist())

# build a frame from the responses and discard the URL/type metadata columns
_testAB_audiofeat_df = pd.DataFrame(_testAB_audiofeat).drop(
    ['analysis_url', 'track_href', 'type', 'uri'], axis = 1
)

_testAB_audiofeat_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.988,0.2,128000,0.0449,6wxi6j0tpjGJGczGrLeDYD,0.915,0,0.102,-20.741,0,0.0428,72.278,5,0.048
1,0.993,0.419,167091,0.00181,3fzTpBMSQKjo9sW0T8Tw2O,0.927,3,0.138,-33.652,1,0.0408,69.862,4,0.146
2,0.981,0.353,257463,0.0993,1NY2tT1A7uZwchez1vBShQ,0.928,5,0.121,-25.317,1,0.0372,138.135,3,0.123
3,0.969,0.231,159706,0.217,2ek6LLrUwT6p2zQCCvFmAW,0.968,9,0.275,-21.127,0,0.0488,112.062,4,0.0405
4,0.949,0.239,137172,0.0472,17Wjmh3nZzAywevzFHqJnx,0.935,4,0.0802,-24.228,0,0.0562,69.666,3,0.0389


In [8]:
# standardize the audio features before merging/clustering
testAB_audiofeat_scaler = StandardScaler()

# fit and apply the scaler to every column except the track id
testAB_audiofeat = testAB_audiofeat_scaler.fit_transform(_testAB_audiofeat_df.drop(['id'], axis = 1))

# rebuild a labelled frame from the scaled array, then put the track id back
testAB_audiofeat_df = pd.DataFrame(
    testAB_audiofeat,
    columns = _testAB_audiofeat_df.drop('id', axis = 1).columns
)
testAB_audiofeat_df['id'] = _testAB_audiofeat_df['id']

testAB_audiofeat_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,id
0,0.965232,-0.739829,-0.994808,-0.96598,0.810399,-1.175325,-0.523828,-0.612784,-1.321022,-0.301368,-1.226364,1.67506,-0.759296,6wxi6j0tpjGJGczGrLeDYD
1,0.977137,0.533462,-0.419151,-1.082851,0.839429,-0.368729,-0.265242,-2.025394,0.75699,-0.338052,-1.29699,0.295599,0.040835,3fzTpBMSQKjo9sW0T8Tw2O
2,0.948564,0.14973,0.911675,-0.818434,0.841849,0.169001,-0.387352,-1.11345,0.75699,-0.404082,0.698803,-1.083862,-0.146951,1NY2tT1A7uZwchez1vBShQ
3,0.91999,-0.559592,-0.527903,-0.499201,0.938617,1.244461,0.718826,-0.655016,-1.321022,-0.191318,-0.063377,0.295599,-0.820531,2ek6LLrUwT6p2zQCCvFmAW
4,0.872368,-0.513079,-0.859741,-0.959742,0.858783,-0.099864,-0.680417,-0.994301,-1.321022,-0.055589,-1.30272,-1.083862,-0.833594,17Wjmh3nZzAywevzFHqJnx


In [9]:
# attach the scaled audio features to each track row (track_id on the left matches id on the right)
testAB_tracks_plus_df = testAB_tracks_df.merge(
    testAB_audiofeat_df,
    how = 'left',
    left_on = 'track_id',
    right_on = 'id'
)
testAB_tracks_plus_df.head()

Unnamed: 0,artists,name,track_id,playlist,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,id
0,August Wilhelmsson,Now Is The Time To Leave,6wxi6j0tpjGJGczGrLeDYD,ambient chill,0.965232,-0.739829,-0.994808,-0.96598,0.810399,-1.175325,-0.523828,-0.612784,-1.321022,-0.301368,-1.226364,1.67506,-0.759296,6wxi6j0tpjGJGczGrLeDYD
1,Yuki Sakura,Stillness Speaks,3fzTpBMSQKjo9sW0T8Tw2O,ambient chill,0.977137,0.533462,-0.419151,-1.082851,0.839429,-0.368729,-0.265242,-2.025394,0.75699,-0.338052,-1.29699,0.295599,0.040835,3fzTpBMSQKjo9sW0T8Tw2O
2,Eleanor Arroway,Trancendent Sleep,1NY2tT1A7uZwchez1vBShQ,ambient chill,0.948564,0.14973,0.911675,-0.818434,0.841849,0.169001,-0.387352,-1.11345,0.75699,-0.404082,0.698803,-1.083862,-0.146951,1NY2tT1A7uZwchez1vBShQ
3,chillchild,Gratitude,2ek6LLrUwT6p2zQCCvFmAW,ambient chill,0.91999,-0.559592,-0.527903,-0.499201,0.938617,1.244461,0.718826,-0.655016,-1.321022,-0.191318,-0.063377,0.295599,-0.820531,2ek6LLrUwT6p2zQCCvFmAW
4,Primer Dia,Astral Therapy,17Wjmh3nZzAywevzFHqJnx,ambient chill,0.872368,-0.513079,-0.859741,-0.959742,0.858783,-0.099864,-0.680417,-0.994301,-1.321022,-0.055589,-1.30272,-1.083862,-0.833594,17Wjmh3nZzAywevzFHqJnx


In [10]:
# try clustering full stack of songs into two playlists
num_clusters = 2

# cluster on the audio-feature columns only: drop the identifying/label columns, plus
# any 'cluster' column left over from a previous run of this cell (otherwise the old
# labels would be fed back in as a feature)
testAB_features = testAB_tracks_plus_df \
    .drop(['track_id', 'id', 'name', 'artists', 'playlist'], axis = 1) \
    .drop(['cluster'], axis = 1, errors = 'ignore')

# fixed seed so the cluster assignment is reproducible across re-runs
kmeans = KMeans(n_clusters = num_clusters, random_state = 0).fit(testAB_features)

# 1-based cluster numbers, assigned by position (no index alignment involved)
testAB_tracks_plus_df['cluster'] = kmeans.labels_ + 1

# see if successful
testAB_tracks_plus_df[['track_id', 'playlist', 'cluster']].groupby(['playlist', 'cluster']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,track_id
playlist,cluster,Unnamed: 2_level_1
ambient chill,2,90
hardstyle hits,1,50


Since each playlist landed entirely in its own cluster, we can see that this approach works, at least on sets of songs that sound quite different from each other.

As a secondary experiment, before trying it on my own library, I want to see how it performs on two more similar playlists.

But first, I'm going to wrap this code in a couple of functions for later use.

In [11]:
### function to create "tracks plus" df (including normalized audio features) when given a tracks df
def build_tracks_plus_df(tracks_df, normalize = True):
    """Return `tracks_df` with one set of audio-feature columns added per row.

    Parameters:
        tracks_df: DataFrame with a 'track_id' column
        normalize: if True, standardize every audio-feature column with
                   StandardScaler before merging

    Returns:
        a new DataFrame: `tracks_df` left-merged with the audio features on
        track_id == id, so it has exactly one row per row of `tracks_df`
    """
    # pull raw audio features
    _audiofeat    = pull_audio_features(track_ids = list(tracks_df['track_id']))
    _audiofeat_df = pd.DataFrame(_audiofeat).drop(['analysis_url', 'track_href', 'type', 'uri'], axis = 1)

    # scale audio features (if desired)
    if normalize:
        _feature_df = _audiofeat_df.drop(['id'], axis = 1)
        scaler = StandardScaler()
        audiofeat    = scaler.fit_transform(_feature_df)
        audiofeat_df = pd.DataFrame(audiofeat, columns = _feature_df.columns)
        audiofeat_df['id'] = _audiofeat_df['id']
    else:
        audiofeat_df = _audiofeat_df

    # a track id that occurs more than once in tracks_df comes back from the API once
    # per occurrence; keep a single feature row per id so that the left merge below
    # cannot multiply rows
    audiofeat_df = audiofeat_df.drop_duplicates(subset = 'id')

    # merge audio features with tracks_df
    tracks_plus_df = tracks_df.merge(audiofeat_df, how = 'left', left_on = 'track_id', right_on = 'id')
    return tracks_plus_df

### function to cluster tracks based on normalized audio features
def cluster_tracks_plus_df(tracks_plus_df, num_clusters, drop_vars = None, random_state = None):
    """Run k-means on the feature columns of `tracks_plus_df` and label each row.

    Parameters:
        tracks_plus_df: DataFrame containing 'track_id', 'id', 'name' and
                        'artists' columns plus the feature columns
        num_clusters:   number of k-means clusters
        drop_vars:      optional list of further non-feature columns to
                        exclude from clustering (e.g. ['playlist'])
        random_state:   forwarded to KMeans; pass an int for reproducible
                        clusters (None keeps sklearn's default behaviour)

    Returns:
        the same `tracks_plus_df` object, modified in place with a 1-based
        'cluster' column (later cells read the column from the input frame)
    """
    non_feature_cols = ['track_id', 'id', 'name', 'artists'] + list(drop_vars or [])

    # when re-clustering an already clustered frame, the old labels must not be a feature
    if 'cluster' in tracks_plus_df.columns and 'cluster' not in non_feature_cols:
        non_feature_cols.append('cluster')

    features = tracks_plus_df.drop(non_feature_cols, axis = 1)
    kmeans = KMeans(n_clusters = num_clusters, random_state = random_state).fit(features)

    # assign by position: wrapping the labels in a pd.Series would align on the index
    # and produce NaN for frames that do not have a default 0..n-1 index
    tracks_plus_df['cluster'] = kmeans.labels_ + 1
    return tracks_plus_df

In [12]:
# pull tracks for "lo-fi indie" playlist
testC_tracks    = pull_playlist_tracks(user_id = 'spotify', playlist_id = '37i9dQZF1DX0CIO5EOSHeD')
testC_tracks_df = pd.DataFrame(testC_tracks)
testC_tracks_df['playlist'] = "lo-fi indie"

# pull tracks for "dreampop" playlist
testD_tracks    = pull_playlist_tracks(user_id = 'spotify', playlist_id = '37i9dQZF1DX6uhsAfngvaD')
testD_tracks_df = pd.DataFrame(testD_tracks)
testD_tracks_df['playlist'] = "dreampop"

# stack all tracks together
# (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
testCD_tracks_df = pd.concat([testC_tracks_df, testD_tracks_df])

# build plus df and cluster
testCD_tracks_plus_df = cluster_tracks_plus_df(build_tracks_plus_df(testCD_tracks_df), 2, drop_vars = ['playlist'])
testCD_tracks_plus_df[['track_id', 'playlist', 'cluster']].groupby(['playlist', 'cluster']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,track_id
playlist,cluster,Unnamed: 2_level_1
dreampop,1,45
dreampop,2,16
lo-fi indie,1,28
lo-fi indie,2,23


The results aren't as clean this time, but that makes sense because these are similar-sounding playlists. When I listened to them myself, it wasn't obvious which playlist a given song belonged to, so it would be a lot to expect the clustering algorithm to work it out.

Nonetheless, the goal of this experiment is not to make perfectly partitioned playlists purely based on Spotify's own genres, but instead to create playlists that have similar vibes, hopefully grouping together songs that are not entirely obvious at first listen.

### Pull track and audiofeature information for all of my saved tracks

In [13]:
# load every saved track, then attach audio features (normalized, since
# build_tracks_plus_df defaults to normalize = True)
saved_tracks         = pull_saved_tracks()
saved_tracks_df      = pd.DataFrame(saved_tracks)
saved_tracks_plus_df = build_tracks_plus_df(saved_tracks_df)

In [15]:
# split the saved tracks into 100 clusters; cluster_tracks_plus_df returns the frame
# it was given, so saved_tracks_plus_df carries the 'cluster' column as well
saved_tracks_clustered1_df = cluster_tracks_plus_df(saved_tracks_plus_df, 100)

# show the tracks that landed in cluster 1
saved_tracks_clustered1_df.loc[saved_tracks_clustered1_df['cluster'] == 1]

Unnamed: 0,artists,name,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,id,cluster
138,The Rubens,All My Dollars,4mFOvVE2O70m818aEbcPZM,-0.334336,-0.566657,3.142202,0.480868,-0.409291,0.880152,-0.667188,0.14773,0.557102,1.260942,-0.478447,0.269713,-1.199133,4mFOvVE2O70m818aEbcPZM,1
312,NEEDTOBREATHE,Wanted Man - Live From The Woods,48964GSffAAsOPSc0jh1xn,-1.023861,-1.894914,3.300346,1.404396,-0.407758,1.158529,1.025328,1.030671,0.557102,-0.041208,0.529748,0.269713,-0.956161,48964GSffAAsOPSc0jh1xn,1
316,NEEDTOBREATHE,"Oh, Carolina - Live From The Woods",41aqxnmtTbKOlETPtOodUA,-0.840235,-1.846987,5.076596,1.111354,-0.292181,-1.346865,0.938405,0.444782,0.557102,-0.101165,1.391787,0.269713,-0.123114,41aqxnmtTbKOlETPtOodUA,1
459,Logic,Under Pressure,2JEqqPu4UYnzcNxDy9JVu3,-0.303676,1.158708,4.811167,1.213475,-0.409308,1.715283,-0.495825,0.70447,0.557102,2.475462,-1.207629,0.269713,0.575431,2JEqqPu4UYnzcNxDy9JVu3,1
889,Coldplay,Up&Up,31L9yLXSj6LpCFupyMV6CR,-0.887759,-0.35441,2.521873,0.600749,-0.407701,0.601775,-0.520661,0.70553,0.557102,0.03566,1.459015,0.269713,-0.773932,31L9yLXSj6LpCFupyMV6CR,1
991,"Lil Dicky, Brain",Pillow Talking (feat. Brain),4lh1PamTsomWbFpkOPyfrD,0.723451,2.630746,6.09293,-0.979905,-0.409308,0.880152,-0.321979,-0.054986,0.557102,4.843008,-0.507779,0.269713,-0.062371,4lh1PamTsomWbFpkOPyfrD,1
1335,Gary Clark Jr.,When My Train Pulls In,2mdxGlwrhtkuxgzbH7LOIh,-1.024044,-1.689513,3.422075,1.759599,-0.123518,1.715283,0.50379,1.553227,-1.795003,1.091832,1.525311,0.269713,-0.465878,2mdxGlwrhtkuxgzbH7LOIh,1
1344,Gary Clark Jr.,Third Stone From The Sun/If You Love Me Like Y...,6B7aI3crr9n5B2bXkrAfVN,-1.020549,-0.956918,5.08531,1.191274,0.719798,-0.233357,-0.290935,1.228882,0.557102,0.401554,-1.120368,0.269713,-0.799965,6B7aI3crr9n5B2bXkrAfVN,1
1378,The Districts,Young Blood,2wb2FptR9dbAw9TIQpMfZ1,-1.000589,-1.771674,4.236368,0.902672,0.110737,-0.233357,-0.927957,0.685921,0.557102,0.891974,0.972117,0.269713,0.197957,2wb2FptR9dbAw9TIQpMfZ1,1
1808,Coldplay,Yes,04zfFfRMXegKi4mMkGMeze,-1.009603,-2.538502,2.838562,1.000353,1.258584,0.323397,0.007086,0.4893,0.557102,-0.064269,1.386187,0.269713,-0.882402,04zfFfRMXegKi4mMkGMeze,1


In [None]:
### do pca analysis
num_components = 5

# identifiers, track length and any cluster labels are left out of the PCA input;
# errors = 'ignore' lets this run whether or not a 'cluster' column exists yet
pca_excluded_cols = ['track_id', 'id', 'name', 'artists', 'duration_ms', 'cluster']
pca_inputs = saved_tracks_plus_df.drop(pca_excluded_cols, axis = 1, errors = 'ignore')

# TODO: should normalize data (set mean = 0, etc.)
# NOTE(review): saved_tracks_plus_df was built with normalize left at its default (True),
# so this TODO may already be satisfied -- confirm
pca = PCA(n_components = num_components)
pca.fit(pca_inputs)
print(pca.components_)

In [None]:
### make playlist from one cluster
playlist_name = 'k-means music'

# pull in all playlists, one page at a time
playlist_limit  = 50
playlist_offset = 0

# the first page both tells us the total and supplies the first batch
playlists_obj = sp.user_playlists(user_id, limit = playlist_limit, offset = playlist_offset)
num_playlists = playlists_obj['total']
all_playlists = [ ]

while (playlist_offset < num_playlists):
    all_playlists += [{'name': pl['name'], 'id': pl['id']} for pl in playlists_obj['items']]
    playlist_offset += playlist_limit

    # only request another page if there is one
    if playlist_offset < num_playlists:
        playlists_obj = sp.user_playlists(user_id, limit = playlist_limit, offset = playlist_offset)

# reuse the playlist if it already exists, otherwise create it;
# either way playlist_id is defined for the steps below
matching_ids = [pl['id'] for pl in all_playlists if pl['name'] == playlist_name]
if matching_ids:
    playlist_id = matching_ids[0]
    playlist = sp.user_playlist(user = user_id, playlist_id = playlist_id)
else:
    playlist = sp.user_playlist_create(user = user_id, name = playlist_name, public = True)
    playlist_id = playlist['id']

# remove any existing tracks in playlist (one page of items per round trip)
while (playlist['tracks']['total'] > 0):
    sp.user_playlist_remove_all_occurrences_of_tracks(user_id, playlist_id, tracks = [track['track']['id'] for track in playlist['tracks']['items']])
    playlist = sp.user_playlist(user = user_id, playlist_id = playlist_id)

# add tracks from cluster
cluster_ix = 50

# .loc replaces .ix, which was removed in pandas 1.0
cluster_track_ids = list(saved_tracks_plus_df.loc[saved_tracks_plus_df['cluster'] == cluster_ix, 'id'])

# Spotify accepts at most 100 tracks per add request, so send the ids in batches;
# an empty cluster results in no request at all
for batch_start in range(0, len(cluster_track_ids), 100):
    sp.user_playlist_add_tracks(user_id, playlist_id = playlist_id, tracks = cluster_track_ids[batch_start:batch_start + 100])