In [1]:
# Dependencies and Setup
import base64
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import base64
from requests.auth import HTTPBasicAuth
import scipy.stats as st
from pprint import pprint

# Import Spotify API key
from api_keys import spotify_client_id, spotify_client_secret

In [2]:
# Create playlist dataframe from csv file that contains a list of playlist IDs that we want to search tracks for
play_df = pd.read_csv("datasets/playlist_top_5_input.csv")
play_df.head()

# Prepare output csv file path to save the playlists and tracks data retrieved from the Spotify API
spotify_playlists_csv = "datasets/playlist_top_5_output.csv"

In [3]:
"""
Code to manage Spotify's Client Credentials Flow as described here:
https://developer.spotify.com/documentation/general/guides/authorization-guide/#client-credentials-flow
"""
# Build the Spotify Client ID and Secret combination
spotify_client_id_secret = spotify_client_id + ":" + spotify_client_secret
# Generate the base64 encoded string that contains the client ID and client secret key
base64_encoding = base64.b64encode(spotify_client_id_secret.encode()).decode()

# Build the Spotify API URL to get the Bearer token
spotify_token_url = 'https://accounts.spotify.com/api/token'

# Build the payload, i.e. the request body parameters that need to be passed to the token API URL
payload = 'grant_type=client_credentials'
# Build the header for Content-Type and Basic Auth containing the base64 encoded string so we can generate the Spotify API token
token_api_headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'Authorization': 'Basic %s' % base64_encoding }

# Call the token API URL and save as a JSON response object
spotify_token_response = requests.post(spotify_token_url, headers=token_api_headers, data=payload).json()
# Retrieve the Bearer token from the response object
spotify_access_token = spotify_token_response['access_token']

In [4]:
"""
Code to retrieve a Spotify Playlist's Tracks and related data:
https://developer.spotify.com/documentation/web-api/reference/playlists/get-playlists-tracks/
"""
# Sample Playlist ID
playlist_ids = play_df["List ID"]

# Initialize lists to save playlist data
track_ids = []
track_added_at = []
track_names = []
track_popularity = []
track_durations = []
artists = []
albums = []
album_types = []
release_dates = []
num_available_markets = []

# Loop through each Playlist to grab the individual tracks in it
for playlist_id in playlist_ids:

    # Handle any exceptions for Spotify's Get Playlist's Tracks API
    try:
        # Build Spotify's Get Playlist's Tracks URL
        spotify_playlists_url = f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks'
        # Build the header for Bearer token containing the base64 encoded string so we can retrieve the playlist's tracks 
        playlist_api_headers = { 'Authorization': 'Bearer %s' % spotify_access_token }
        # Call the Get Playlist's Tracks API URL and save as a JSON response object
        spotify_playlists_response = requests.get(spotify_playlists_url, headers=playlist_api_headers, data={}).json()
        
        # Get total tracks contained in playlist
        total_tracks = spotify_playlists_response['total']
        # Number of tracks that can be retrieved (Spotify limit = 100)
        track_limit = len(spotify_playlists_response['items'])

        # Playlist returns a track object which we will parse to retrieve all the track, album and artist data
        for item in range(track_limit):
            
            # Check if a track has a video
            if spotify_playlists_response['items'][item]['track'] is not None:
                # Get the date and time the track was added to the playlist 
                track_added_at.append(spotify_playlists_response['items'][item]['added_at'])
                # Get the ID of the track 
                track_ids.append(spotify_playlists_response['items'][item]['track']['id'])
                # Get the name of the track
                track_names.append(spotify_playlists_response['items'][item]['track']['name'])
                # Get the popularity score of the track
                track_popularity.append(spotify_playlists_response['items'][item]['track']['popularity'])
                # Get the track duration (song length) in milliseconds
                track_durations.append(spotify_playlists_response['items'][item]['track']['duration_ms'])
                # Get the number of markets that the track is available in
                num_available_markets.append(len(spotify_playlists_response['items'][item]['track']['available_markets']))
                # Get the album name
                albums.append(spotify_playlists_response['items'][item]['track']['album']['name'])
                # Get the album type: single, album or compilation
                album_types.append(spotify_playlists_response['items'][item]['track']['album']['album_type'])
                # Get the album release date
                release_dates.append(spotify_playlists_response['items'][item]['track']['album']['release_date'])

                # Each track could be associated with one or more artists
                # Parse through each artist object to retrieve their names
                artist_name = ''
                artists_count = len(spotify_playlists_response['items'][item]['track']['artists'])
                for artist in range(artists_count):
                    artist_name = artist_name + '|' + spotify_playlists_response['items'][item]['track']['artists'][artist]['name']

                artists.append(artist_name)
                
        print(f"Finished processing Playlist {playlist_id} with total tracks: {total_tracks} | Retrieval limited to {track_limit}")

    # Catch any exceptions thrown by the Spotify API
    except Exception as e:
        print(e)
        pass

Finished processing Playlist 37i9dQZF1DXcBWIGoYBM5M with total tracks: 50 | Retrieval limited to 50
Finished processing Playlist 37i9dQZF1DWUa8ZRTfalHk with total tracks: 75 | Retrieval limited to 75
Finished processing Playlist 37i9dQZF1DX4JAvHpjipBk with total tracks: 96 | Retrieval limited to 96
Finished processing Playlist 37i9dQZF1DXbYM3nMM0oPk with total tracks: 75 | Retrieval limited to 75
Finished processing Playlist 37i9dQZF1DX0b1hHYQtJjp with total tracks: 75 | Retrieval limited to 75
Finished processing Playlist 37i9dQZF1DX0XUsuxWHRQd with total tracks: 54 | Retrieval limited to 54
Finished processing Playlist 37i9dQZF1DWY4xHQp97fN6 with total tracks: 100 | Retrieval limited to 100
Finished processing Playlist 37i9dQZF1DX6GwdWRQMQpq with total tracks: 50 | Retrieval limited to 50
Finished processing Playlist 37i9dQZF1DX2RxBh64BHjQ with total tracks: 100 | Retrieval limited to 100
Finished processing Playlist 37i9dQZF1DX7Mq3mO5SSDc with total tracks: 48 | Retrieval limited to

In [5]:
"""
Code to retrieve a Track's Audio Features using the Spotify API as described here:
https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/
"""
# Initialize audio track feature lists and variables to save data to
track_key = []
track_mode = []
track_time_signature = []
track_acousticness = []
track_danceability = []
track_energy = []
track_instrumentalness = []
track_liveness = []
track_loudness = []
track_speechiness = []
track_valence = []
track_tempo = []
track_count = 0

# Loop through each track to grab the individual audio features associated with it
print (f"Started retrieving audio features data for all {len(track_ids)} tracks across {len(playlist_ids)} different playlists")
print ("This may take a while, so hang tight...")

for track_id in track_ids:
    
    # Increment track counter for using in the print log
    track_count = track_count + 1

    # Handle any exceptions for Spotify's Get Audio Features API
    try:
        # Build Spotify's Get Playlist's Tracks URL
        spotify_tracks_url = f'https://api.spotify.com/v1/audio-features/{track_id}'
        # Build the header for Bearer token containing the base64 encoded string so we can retrieve the playlist's tracks 
        audio_features_api_headers = { 'Authorization': 'Bearer %s' % spotify_access_token }
        # Call the Get Playlist's Tracks API URL and save into response objects
        audio_features_status = requests.get(spotify_tracks_url, headers=audio_features_api_headers, data={})
        audio_features_response = audio_features_status.json()

        # If API returns successful response go ahead and retrieve the data 
        if  audio_features_status.status_code == 200:
            # Grab all the audio features from the track and save to respective lists
            track_key.append(audio_features_response['key'])
            track_mode.append(audio_features_response['mode'])
            track_time_signature.append(audio_features_response['time_signature'])
            track_acousticness.append(audio_features_response['acousticness'])
            track_danceability.append(audio_features_response['danceability'])
            track_energy.append(audio_features_response['energy'])
            track_instrumentalness.append(audio_features_response['instrumentalness'])
            track_liveness.append(audio_features_response['liveness'])
            track_loudness.append(audio_features_response['loudness'])
            track_speechiness.append(audio_features_response['speechiness'])
            track_valence.append(audio_features_response['valence'])
            track_tempo.append(audio_features_response['tempo'])
        
        # If API response results in a HTTP status code 429 it means Too Many Requests - Rate limiting has been applied
        # We check the Retry-After header in the response to get the number of seconds to wait, before retrying the next request
        elif audio_features_status.status_code == 429:
            retry_after = int(audio_features_status.headers["Retry-After"])
            print (f"Waiting for {retry_after} seconds before attempting the next request")
            time.sleep(retry_after)
        # Catch any exceptions thrown by the Spotify API
    except Exception as e:
        print(e)
        pass
    
    if track_count % 100 == 0:
        print(f"Done retrieving data for {track_count} tracks; moving on to the next set, keep hanging tight...")

print (f"Finished gathering audio features data for a total of {len(track_ids)} tracks across {len(playlist_ids)} different playlists!")

Started retrieving audio features data for all 4407 tracks across 60 different playlists
This may take a while, so hang tight...
Done retrieving data for 100 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 200 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 300 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 400 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 500 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 600 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 700 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 800 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 900 tracks; moving on to the next set, keep hanging tight...
Done retrieving data for 1000 tracks; moving on to the next set, keep hanging tight...
Done retri

In [30]:
#playlist_data_df = pd.DataFrame(, 'Valence': track_valence, 'Tempo': track_tempo } )
track_ids_series = pd.Series(track_ids, name='Track ID')
track_names_series = pd.Series(track_names, name='Track Name')
track_added_at_series = pd.Series(track_added_at, name='Track Added At')
track_popularity_series = pd.Series(track_popularity, name='Track Popularity')
track_durations_series = pd.Series(track_durations, name='Track Duration')
num_available_markets_series = pd.Series(num_available_markets, name='Available Markets')
albums_series = pd.Series(albums, name='Album')
album_types_series = pd.Series(album_types, name='Album Type')
artists_series = pd.Series(artists, name='Artist')
release_dates_series = pd.Series(release_dates, name='Release Date')
track_key_series = pd.Series(track_key, name='Key')
track_mode_series = pd.Series(track_mode, name='Mode')
track_time_signature_series = pd.Series(track_time_signature, name='Time Signature')
track_acousticness_series = pd.Series(track_acousticness, name='Acousticness')
track_danceability_series = pd.Series(track_danceability, name='Danceability')
track_energy_series = pd.Series(track_energy, name='Energy')
track_instrumentalness_series = pd.Series(track_instrumentalness, name='Instrumentalness')
track_liveness_series = pd.Series(track_liveness, name='Liveness')
track_loudness_series = pd.Series(track_loudness, name='Loudness')
track_speechiness_series = pd.Series(track_speechiness, name='Speechiness')
track_valence_series = pd.Series(track_valence, name='Valence')
track_tempo_series = pd.Series(track_tempo, name='Tempo')

df = pd.concat([track_ids_series,track_names_series,track_added_at_series,track_popularity_series,track_durations_series,num_available_markets_series,albums_series,album_types_series,artists_series,release_dates_series,track_key_series,track_mode_series,track_time_signature_series,track_acousticness_series,track_danceability_series,track_energy_series,track_instrumentalness_series,track_liveness_series,track_loudness_series,track_speechiness_series,track_valence_series,track_tempo_series,track_speechiness_series,track_valence_series,track_tempo_series], axis=1)
df_clean=df.dropna()
df_clean                         


Unnamed: 0,Track ID,Track Name,Track Added At,Track Popularity,Track Duration,Available Markets,Album,Album Type,Artist,Release Date,...,Energy,Instrumentalness,Liveness,Loudness,Speechiness,Valence,Tempo,Speechiness.1,Valence.1,Tempo.1
0,2usxQITOSDqvkYiI0oIwao,You should be sad,2020-01-29T19:04:50Z,89,205473,79,You should be sad,single,|Halsey,2020-01-10,...,0.585,0.000000,0.1090,-6.350,0.0277,0.324,110.940,0.0277,0.324,110.940
1,0nbXyq5TXYPCO7pr3N8S4I,The Box,2020-01-29T19:04:50Z,100,196652,78,Please Excuse Me For Being Antisocial,album,|Roddy Ricch,2019-12-06,...,0.586,0.000000,0.7900,-6.687,0.0559,0.642,116.971,0.0559,0.642,116.971
2,2Fxmhks0bxGSBdJ92vM42m,bad guy,2020-01-29T19:04:50Z,95,194087,79,"WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?",album,|Billie Eilish,2019-03-29,...,0.425,0.130000,0.1000,-10.965,0.3750,0.562,135.128,0.3750,0.562,135.128
3,0sf12qNH5qcw8qpgymFOqD,Blinding Lights,2020-01-29T19:04:50Z,98,201573,79,Blinding Lights,single,|The Weeknd,2019-11-29,...,0.796,0.000209,0.0938,-4.075,0.0629,0.345,171.017,0.0629,0.345,171.017
4,4TnjEaWOeW0eKTKIEvJyCa,Falling,2020-01-29T19:04:50Z,98,159381,79,Falling,single,|Trevor Daniel,2018-10-05,...,0.430,0.000000,0.0887,-8.756,0.0364,0.236,127.087,0.0364,0.236,127.087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4401,4jVhSbMMHctghoOTFuLPB2,"Swing, Swing",2020-01-20T10:54:16Z,10,233266,0,The All-American Rejects,album,|The All-American Rejects,2002-01-01,...,0.843,0.000000,0.1050,-5.618,0.0401,0.446,128.991,0.0401,0.446,128.991
4402,0zFBxmGGFbVIBdg9OSpUxM,Last Young Renegade,2020-01-20T10:54:16Z,40,214093,77,Last Young Renegade,single,|All Time Low,2017-03-23,...,0.994,0.000000,0.3080,-2.702,0.0821,0.615,165.027,0.0821,0.615,165.027
4403,17u52ksnAFZOWhzEvuzdG6,Playing Fiction,2020-01-20T10:54:16Z,51,160831,79,Great Heights & Nosedives,album,|ROAM,2017-10-13,...,0.849,0.000004,0.1100,-4.953,0.0811,0.781,151.988,0.0811,0.781,151.988
4404,5vxX2POnp7NBbaCAIVQix7,Adrenaline,2020-01-20T10:54:16Z,55,201496,78,Strange Love,single,|Simple Creatures,2019-03-29,...,0.834,0.000102,0.0993,-5.531,0.0447,0.422,110.065,0.0447,0.422,110.065


In [6]:
"""
Code to save Playlist and Track data to a Data Frame + Export to CSV
"""
# Save the retrieved playlists' track data into a Data Frame
playlist_data_df = pd.DataFrame( {'Track ID': track_ids, 'Track Name': track_names, 'Track Added At': track_added_at, 'Track Popularity': track_popularity, 'Track Duration': track_durations, 'Available Markets': num_available_markets, 'Album': albums, 'Album Type': album_types, 'Artist': artists, 'Release Date': release_dates, 'Key': track_key, 'Mode': track_mode, 'Time Signature': track_time_signature, 'Acousticness': track_acousticness, 'Danceability': track_danceability, 'Energy': track_energy, 'Instrumentalness': track_instrumentalness, 'Liveness': track_liveness, 'Loudness': track_loudness, 'Speechiness': track_speechiness, 'Valence': track_valence, 'Tempo': track_tempo } )
l1 = track_ids

# Export the playlists' tracks data to a csv
playlist_data_df.to_csv(spotify_playlists_csv)

# Confirm the DataFrame counts look OK
print (playlist_data_df.count())

# Display the playlists' tracks data frame
playlist_data_df.head()

ValueError: arrays must all be same length