#### Instructions
It's time to cluster the songs you collected. Remember that the ultimate goal of this little project is to improve the recommendations of artists. Clustering the songs will allow the recommendation system to limit its recommendations to songs that belong to the same cluster, i.e. songs with similar audio features.

The experiments you did with the Spotify API and the Billboard web scraping will allow you to create a pipeline such that when the user enters a song, you:

1- Check whether or not the song is in the Billboard Hot 100.

2- Collect the audio features from the Spotify API.

After that, you want to send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

We want to have as many songs as possible to create the clustering model, so we will add the songs you collected to a bigger dataset available on Kaggle containing 160 thousand songs.

In [5]:
!pip install spotipy



In [6]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [7]:
import os

# Credentials are read from the environment so that secrets never live in the
# notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError right here.
# NOTE(review): a key pair was previously hard-coded in this cell; treat it as
# exposed and rotate it in the Spotify developer dashboard.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    )
)

# Search tracks by title; this request asks for at most 10 matches.
lucretia_my_reflection = sp.search(q='track:Lucretia My Reflection', limit=10, type='track')
lucretia_my_reflection

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3ALucretia+My+Reflection&type=track&offset=0&limit=10',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4HxBVyHaUa60eCSsJWxwWR'},
       'href': 'https://api.spotify.com/v1/artists/4HxBVyHaUa60eCSsJWxwWR',
       'id': '4HxBVyHaUa60eCSsJWxwWR',
       'name': 'Sisters of Mercy',
       'type': 'artist',
       'uri': 'spotify:artist:4HxBVyHaUa60eCSsJWxwWR'}],
     'available_markets': ['AR',
      'AU',
      'AT',
      'BE',
      'BO',
      'BR',
      'BG',
      'CA',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DK',
      'DO',
      'DE',
      'EC',
      'EE',
      'SV',
      'FI',
      'FR',
      'GR',
      'GT',
      'HN',
      'HK',
      'HU',
      'IS',
      'IE',
      'IT',
      'LV',
      'LT',
      'LU',
      'MY',
      'MT',
      'MX',
      'NL',
      'NZ',
      'NI',
      'NO',
    

In [8]:
# Top-level keys of the search response.
lucretia_my_reflection.keys()

dict_keys(['tracks'])

In [9]:
# Keys available under the 'tracks' entry of the response.
lucretia_my_reflection['tracks'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [10]:
# Fields carried by the first returned track item.
lucretia_my_reflection["tracks"]["items"][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [11]:
# URI of the first returned track; URIs are what the later cells collect as track ids.
lucretia_my_reflection["tracks"]["items"][0]['uri']

'spotify:track:20goDx14UZviYtCPtLbqvs'

In [12]:
# Value stored under 'total'; the search above requested only limit=10 items,
# so this can be larger than the number of items actually returned.
lucretia_my_reflection['tracks']['total']

76

In [13]:
# Gather the Spotify URI of each track on the returned page of search results.
track_ids = [
    item['uri']
    for item in lucretia_my_reflection['tracks']['items']
]

In [14]:
# Ids are sent to the audio-features call in slices of this many.
# NOTE(review): 100 is presumably the per-request cap of the endpoint - confirm.
BATCH_SIZE = 100

# One frame per batch, appended in request order so the final rows line up
# with `track_ids` (prepending each new batch would reverse the batch order).
feature_frames = []
for start in range(0, len(track_ids), BATCH_SIZE):
    stop = start + BATCH_SIZE  # slicing clamps this to the end of the list
    print(start, stop)
    feature_frames.append(pd.json_normalize(sp.audio_features(track_ids[start:stop])))

# Concatenate once instead of re-copying the accumulated frame on every iteration.
df = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

# Keeps the previous per-batch row labels as an `index` column.
df.reset_index(inplace=True)

0 100


In [15]:
# Preview the first ten rows of the audio-features frame.
df.head(10)

Unnamed: 0,index,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0,0.601,0.768,9,-8.718,1,0.0329,0.0785,0.712,0.0771,0.887,130.342,audio_features,20goDx14UZviYtCPtLbqvs,spotify:track:20goDx14UZviYtCPtLbqvs,https://api.spotify.com/v1/tracks/20goDx14UZvi...,https://api.spotify.com/v1/audio-analysis/20go...,295250,4
1,1,0.602,0.776,9,-9.204,1,0.0311,0.129,0.77,0.193,0.923,129.458,audio_features,1FyfPd5AMMjrorTdu3wASG,spotify:track:1FyfPd5AMMjrorTdu3wASG,https://api.spotify.com/v1/tracks/1FyfPd5AMMjr...,https://api.spotify.com/v1/audio-analysis/1Fyf...,297160,4
2,2,0.623,0.787,0,-6.629,1,0.0365,0.000257,0.686,0.0408,0.768,129.99,audio_features,2XrIWcOPQMwXAxCYQam6jN,spotify:track:2XrIWcOPQMwXAxCYQam6jN,https://api.spotify.com/v1/tracks/2XrIWcOPQMwX...,https://api.spotify.com/v1/audio-analysis/2XrI...,314427,4
3,3,0.591,0.791,9,-7.421,0,0.033,0.00238,0.723,0.0612,0.815,130.142,audio_features,52PMeKpMnfn7w0aETAMD3a,spotify:track:52PMeKpMnfn7w0aETAMD3a,https://api.spotify.com/v1/tracks/52PMeKpMnfn7...,https://api.spotify.com/v1/audio-analysis/52PM...,523989,4
4,4,0.488,0.98,7,-2.842,1,0.0884,0.00516,0.765,0.0875,0.441,147.066,audio_features,6LaGLwZapKGZLGWvfu2k3Y,spotify:track:6LaGLwZapKGZLGWvfu2k3Y,https://api.spotify.com/v1/tracks/6LaGLwZapKGZ...,https://api.spotify.com/v1/audio-analysis/6LaG...,267787,4
5,5,0.579,0.827,5,-5.662,1,0.0298,6.8e-05,0.0345,0.122,0.353,130.045,audio_features,2yUkX5Ea2NwodDJPvvsTxR,spotify:track:2yUkX5Ea2NwodDJPvvsTxR,https://api.spotify.com/v1/tracks/2yUkX5Ea2Nwo...,https://api.spotify.com/v1/audio-analysis/2yUk...,278360,4
6,6,0.601,0.768,9,-8.718,1,0.0329,0.0785,0.712,0.0771,0.887,130.342,audio_features,7hUJwSsOySplSXvagzQwQZ,spotify:track:7hUJwSsOySplSXvagzQwQZ,https://api.spotify.com/v1/tracks/7hUJwSsOySpl...,https://api.spotify.com/v1/audio-analysis/7hUJ...,295250,4
7,7,0.532,0.98,7,-7.074,1,0.0363,0.000403,0.0532,0.0763,0.614,137.965,audio_features,1AhJWvjtWiVBZkQZWmqIML,spotify:track:1AhJWvjtWiVBZkQZWmqIML,https://api.spotify.com/v1/tracks/1AhJWvjtWiVB...,https://api.spotify.com/v1/audio-analysis/1AhJ...,277386,4
8,8,0.61,0.798,2,-9.248,0,0.038,0.00511,0.775,0.0533,0.783,130.351,audio_features,3kJDfttDmnaMEU4gKyZRJU,spotify:track:3kJDfttDmnaMEU4gKyZRJU,https://api.spotify.com/v1/tracks/3kJDfttDmnaM...,https://api.spotify.com/v1/audio-analysis/3kJD...,591288,4
9,9,0.636,0.661,9,-8.477,1,0.0308,0.00179,0.525,0.0839,0.921,130.191,audio_features,7meEHt7fPaPqFaHOgk0zcS,spotify:track:7meEHt7fPaPqFaHOgk0zcS,https://api.spotify.com/v1/tracks/7meEHt7fPaPq...,https://api.spotify.com/v1/audio-analysis/7meE...,333013,4


In [34]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [36]:
import os

# Credentials come from the environment so that secrets never live in the
# notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError right here.
# NOTE(review): a key pair was previously hard-coded in this cell; treat it as
# exposed and rotate it in the Spotify developer dashboard.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# `auth_manager=` matches how the client is built earlier in this notebook.
sp = spotipy.Spotify(auth_manager=client_credentials_manager)

def get_audio_features(track_ids, batch_size=100):
    """Fetch Spotify audio features for ``track_ids`` as a single DataFrame.

    The ids are passed to the module-level client ``sp`` (``sp.audio_features``)
    in consecutive slices of at most ``batch_size`` ids, and every response is
    flattened with ``pd.json_normalize``.

    Parameters
    ----------
    track_ids : list of str
        Spotify track ids or URIs to look up.
    batch_size : int, default 100
        Maximum number of ids sent per request.

    Returns
    -------
    pandas.DataFrame
        The flattened feature records in request order, with a fresh
        ``0..n-1`` index. Empty when ``track_ids`` is empty or when every
        batch failed.

    Notes
    -----
    A batch that raises ``ConnectionError`` is reported on stdout and skipped
    (best effort), so the result may contain fewer rows than ids requested.
    """
    frames = []
    for start in range(0, len(track_ids), batch_size):
        # Slicing clamps to the end of the list, so the last batch may be shorter.
        batch = track_ids[start:start + batch_size]
        try:
            frames.append(pd.json_normalize(sp.audio_features(batch)))
        except ConnectionError:
            print("Error de conexión.")

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # ignore_index avoids repeated 0..k labels from the individual batches.
    return pd.concat(frames, ignore_index=True)

# Number of clusters; the training set must hold at least this many songs.
N_CLUSTERS = 10

# NOTE(review): a single seed track can never satisfy the minimum below, so this
# cell raises as written. The training ids presumably should come from the
# larger song collection described in the instructions - confirm.
track_ids = ['spotify:track:20goDx14UZviYtCPtLbqvs']

# Keep only the numeric columns and drop rows with missing values. Chaining
# returns new frames instead of mutating a derived one in place.
audio_features_df = (
    get_audio_features(track_ids)
    .select_dtypes(include=['number'])
    .dropna()
)

# Validate before fitting anything, so too-small input fails with this message
# rather than inside the scaler or the clustering step.
if len(audio_features_df) < N_CLUSTERS:
    raise ValueError("Not enough.")

scaler = StandardScaler()
scaled_features = scaler.fit_transform(audio_features_df)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
kmeans.fit(scaled_features)

audio_features_df['cluster'] = kmeans.labels_

# The model was fitted on scaled features, so a new song must go through the
# same fitted scaler before kmeans.predict; persist both artifacts together.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(kmeans, 'kmeans_model.pkl')

ValueError: Not enough.