In [149]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import spotipy
import spotipy.util as util

# Get a user token to access the Spotify API. Tokens expire periodically, so re-run
# this cell to obtain a fresh token before making requests.
#
# Credentials are read from the environment instead of being hard-coded, so the
# client secret is never stored in the saved notebook. Set SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel (SPOTIPY_REDIRECT_URI and
# SPOTIFY_USERNAME are optional overrides). Any secret that was previously committed
# in this notebook should be rotated in the Spotify developer dashboard.

username = os.environ.get('SPOTIFY_USERNAME', '1242062883')
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:7777/callback')
scope = 'playlist-read-private playlist-modify-private user-read-private'

token = util.prompt_for_user_token(username=username,
                                   scope=scope,
                                   client_id=client_id,
                                   client_secret=client_secret,
                                   redirect_uri=redirect_uri)

### Generic Spotify API Calls
The following 3 functions help me do basic Spotify API calls that I will use in more complex functions downstream:
1. Get a track id given a track title and artist
2. Get a track title and artist given a track id
3. Run a keyword search for playlists and return the unique track ids found in the first n matching playlists

In [150]:
def get_song_id(song_title, artist, token):
    '''
    Parameters:
        - song_title: track title to search for
        - artist: artist name to search for
        - token: Spotify access token

    Function: Spotify API GET request using track title and artist as query parameters to get the
    corresponding Spotify track id. Only the first search hit is used.

    Return: track id of the first hit, or None when the request fails, the response is not the
    expected JSON shape (e.g. an error payload), or the search has no results.
    '''

    query = 'track:' + song_title + ' artist:' + artist

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + token,
    }
    params = [
        ('q', query),
        ('type', 'track'),
    ]
    try:
        response = requests.get('https://api.spotify.com/v1/search',
                                headers=headers, params=params, timeout=5)
        # Named `payload` (not `json`) so the imported json module is not shadowed
        payload = response.json()
        return payload['tracks']['items'][0]['id']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network errors, invalid JSON, error payloads and empty
        # result lists all mean "no id". Anything else (e.g. KeyboardInterrupt) propagates.
        return None

In [151]:
def get_track_artist(track_id, token):
    '''
    Parameters:
        - track_id: Spotify track id
        - token: Spotify access token

    Function: Spotify API GET request using track id to get the corresponding track name and artist.
    The first artist listed on the track is assumed to be the main artist.

    Return: dict of the form {'track': <track name>, 'artist': <artist name>}, or None when the
    request fails or the response does not have the expected JSON shape.
    '''

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + token,
    }

    try:
        response = requests.get('https://api.spotify.com/v1/tracks/' + track_id,
                                headers=headers, timeout=5)
        # Named `payload` (not `json`) so the imported json module is not shadowed
        payload = response.json()
        return {
            'track': payload['name'],
            # Assume first artist on the track is the main artist
            'artist': payload['artists'][0]['name'],
        }
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network errors, invalid JSON and error payloads mean "unknown".
        # Anything else (e.g. KeyboardInterrupt) propagates.
        return None

In [152]:
def get_playlist_track_ids(search_string, num_entries, token) -> 'list | None':
    '''
    Parameters:
        - search_string: search string (name of the playlist you want to create)
        - num_entries: maximum number of playlists to pull from the keyword search
        - token: Spotify access token

    Function: Spotify API GET request using search string to perform keyword search across all Spotify
    playlists. Once playlists are obtained, use one GET request per playlist to extract the track ids
    from that playlist (only the first page of tracks the endpoint returns is read).

    Return: unique track ids from all playlists (list format, in first-seen order), or None if the
    playlist keyword search itself fails. Playlists whose track request fails are skipped.
    '''

    # Spotify API can only query up to 50 entries at a time, so to get > 50 entries,
    # need to run a new query that is offset by the number of entries that have already been returned
    limit = 50
    total = math.ceil(num_entries)

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + token,
    }
    request_errors = (requests.RequestException, ValueError, KeyError, TypeError)

    playlist_ids = []

    # Iteratively perform keyword search for playlists using Spotify API, increasing offset by 50 each round
    for offset in range(0, total, limit):
        # Shrink the last page so no more than num_entries playlists are requested overall
        page_size = min(limit, total - offset)
        params_pl = [
            ('q', search_string),
            ('type', 'playlist'),
            ('limit', str(page_size)),
            ('offset', str(offset)),
        ]
        try:
            response = requests.get('https://api.spotify.com/v1/search',
                                    headers=headers, params=params_pl, timeout=5)
            playlists = response.json()['playlists']['items'] or []
        except request_errors:
            print('Playlist search failure')
            return None
        for playlist in playlists:
            # Search results can contain null entries; skip them instead of failing the whole search
            if isinstance(playlist, dict) and playlist.get('id'):
                playlist_ids.append(playlist['id'])
    print('Playlists found:', len(playlist_ids))

    # Run a new Spotify API query for each playlist ID to get list of song ID's in that playlist
    track_ids = []
    for playlist_id in playlist_ids:
        try:
            response = requests.get('https://api.spotify.com/v1/playlists/' + playlist_id + '/tracks',
                                    headers=headers, timeout=5)
            items = response.json()['items'] or []
        except request_errors:
            # Best effort: one unreadable playlist should not discard the others
            continue
        for item in items:
            track = item.get('track') if isinstance(item, dict) else None
            # Unavailable or local tracks come back as a null track or a null id; skip just that
            # entry rather than dropping the remainder of the playlist
            if isinstance(track, dict) and track.get('id'):
                track_ids.append(track['id'])
    print('Songs found:', len(track_ids))
    # dict.fromkeys de-duplicates while keeping a deterministic (first-seen) order
    return list(dict.fromkeys(track_ids))

### Getting the features
After running a Spotify search through the API to collect track IDs labeled by the playlist searches they appeared in, I need the audio features for each track ID. The `get_audio_features` function below pulls audio features for the full list of track IDs.

<b>Note:</b> To minimize the number of API calls, `get_audio_features` requests features in batches of 100 track IDs at a time.

In [153]:
def get_audio_features(track_ids, token):
    '''
    Parameters:
        - track_ids: list of track ids to get audio features for
        - token: Spotify access token

    Function: Call Spotify API to get engineered audio features for all track_ids passed in. Spotify API
    allows requests for up to 100 songs at a time, which lowers latency by requiring 100x fewer API
    calls for a given list of track_ids.

    Each row's trackid is the id that was requested for that row. Tracks for which the API returns
    no features are left out, and a batch whose request fails is reported and skipped, so the result
    can have fewer rows than track_ids.

    Return: Dataframe of track ID's with features, which can be joined with dataframe of labels later.
    An empty dataframe (with the feature columns) is returned when there is nothing to report.

    Dataframe format:
    trackid |feature1|feature2|feature3|feature4|...
    --------+--------+--------+--------+--------+...
    1efae1j |   0.4  |    2   |   1.4  |  0.23  |...
    '''
    # Features of interest ('trackid' first so it leads the output columns)
    feature_columns = [
        'trackid',
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo']
    batch_size = 100  # largest number of ids sent per audio-features request

    track_ids = list(track_ids) if track_ids is not None else []
    sp = spotipy.Spotify(auth=token)

    # Collect plain dict rows and build the dataframe once at the end: growing a dataframe
    # row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for start in range(0, len(track_ids), batch_size):
        batch_ids = track_ids[start:start + batch_size]
        try:
            batch_features = sp.audio_features(batch_ids)
        except Exception as error:
            # Best effort: report the failed batch and keep going with the remaining ones
            print('Audio feature request failed for', len(batch_ids),
                  'tracks starting at index', start, '-', error)
            continue

        # Pair every requested id with its own result so ids stay aligned across batches
        # and across tracks that have no features (returned as None).
        for track_id, features in zip(batch_ids, batch_features or []):
            if not features:
                continue
            record = {'trackid': track_id}
            record.update({key: features[key] for key in feature_columns[1:] if key in features})
            records.append(record)

    return pd.DataFrame(records, columns=feature_columns)

In [154]:
def search_and_label(pl_names, num_pl_search, token):
    '''
    Parameters:
        - pl_names: list of playlist names user wants to create
        - num_pl_search: max # of playlists to return per keyword search
        - token: Spotify access token

    Function: Run a Spotify API keyword search for playlists matching the playlist names, then
    store all of the unique songs from each playlist by track id, retaining which keyword search
    the song came from (input 1 in the corresponding label column if the label applies, else 0).
    A keyword whose search fails contributes no tracks (its column is all 0).

    Return: Dataframe of track ID's with labels formatted for multi-label classification, one row
    per track id, sorted by track id.
    Note that by definition, labels aren't mutually exclusive - the same track can have multiple labels.

    Dataframe format:
    trackid | label1 | label2 | label3 |...
    --------+--------+--------+--------+...
    1efae1j |    1   |    0   |    1   |...
    '''

    # De-duplicate names (keeping order) so every label maps to exactly one column
    label_names = list(dict.fromkeys(pl_names))
    columns = ['trackid'] + label_names

    # track id -> {label: 0/1}; accumulating in a dict avoids growing a dataframe in a loop
    # (DataFrame.append no longer exists in pandas 2.x) and re-grouping after every search.
    labels_by_track = {}
    for name in label_names:
        # The search helper returns None when the keyword search fails; treat that as "no tracks"
        found_ids = get_playlist_track_ids(name, num_pl_search, token) or []
        for track_id in found_ids:
            if track_id is None:
                continue
            # A new track starts with 0 for every label; only the current label is switched on
            row = labels_by_track.setdefault(track_id, dict.fromkeys(label_names, 0))
            row[name] = 1

    records = [
        {**labels_by_track[track_id], 'trackid': track_id}
        for track_id in sorted(labels_by_track)
    ]
    return pd.DataFrame(records, columns=columns)

In [155]:
# Playlist names supplied by the user (fixed here while developing / testing)
pl_names = [
    'hype workout',
    'edm',
    'vibey',
]

In [161]:
def get_data(pl_names, num_pl_search, token):
    '''
    Parameters:
        - pl_names: list of playlist names user wants to create
        - num_pl_search: max # of playlists to return per keyword search
        - token: Spotify access token

    Function: Call search_and_label() to get dataframe with track ids and labels, then call
    get_audio_features() on track ids to get dataframe with track ids and features. Combine the two
    dataframes to get complete dataset for train / test split.

    Return: Dataframe of track ID's with features and labels formatted for multi-label classification.
    Note that by definition, labels aren't mutually exclusive - the same track can have multiple labels.

    Raises: RuntimeError if the audio features could not be retrieved at all.

    Dataframe format:
    trackid |feature1|feature2|feature3|feature4| label1 | label2 | label3 |...
    --------+--------+--------+--------+--------+--------+--------+--------+...
    1e3ae1j |   0.4  |    2   |   1.4  |  0.23  |    1   |    0   |    1   |...
    '''

    track_labels = search_and_label(pl_names, num_pl_search, token)
    track_ids = track_labels['trackid'].to_list()
    track_features = get_audio_features(track_ids, token)
    if track_features is None:
        # pd.concat silently ignores None, which would yield a "dataset" without any features
        raise RuntimeError('Audio features could not be retrieved; cannot build the dataset')

    # If a label is named like a feature column, the feature column wins
    label_columns = [col for col in track_labels.columns if col not in track_features.columns]

    if len(track_features) == len(track_labels):
        # One feature row came back per requested id, in request order, so row position
        # identifies the track. The id itself is taken from the label frame, because that is
        # the list of ids that was actually requested.
        labels = track_labels.reset_index(drop=True)
        feature_values = track_features.drop(columns='trackid').reset_index(drop=True)
        track_data = pd.concat(
            [labels[['trackid']], feature_values, labels[label_columns]], axis=1)
    else:
        # Some tracks have no feature row, so positions no longer line up: join on the id
        # instead of pairing the wrong features with the wrong labels.
        track_data = track_features.merge(
            track_labels[['trackid'] + label_columns], on='trackid', how='inner')

    return track_data.reset_index(drop=True)

In [163]:
# Time a full dataset build for the names in pl_names, searching up to 50 playlists per name.
start = time.time()
# NOTE(review): .head() here keeps only the first five rows in `data`, so the rest of the
# fetched dataset is discarded after the (slow) API calls - confirm this is intended.
data = get_data(pl_names, 50, token).head()
end = time.time()
print("Runtime:", end-start, "seconds")
# `data` already holds at most five rows, so this prints all of it
print(data.head())

Playlists found: 50
Songs found: 3973
Playlists found: 50
Songs found: 3255
Playlists found: 50
Songs found: 3362
Runtime: 98.66457796096802 seconds
                  trackid  danceability  energy  loudness  speechiness  \
0  00580mfpVKVKn7olzo7NBF         0.775   0.723    -6.500       0.0517   
1  00ANnYctEGGhcmOJ5omaj8         0.384   0.909    -4.260       0.5240   
2  00GYNdeBpo0QtByS8MIi9q         0.517   0.660    -6.175       0.0315   
3  00JoqMc5KZFLpdrtymo6f0         0.564   0.897    -5.794       0.0814   
4  00QyLmjxaSEE8qIZQjBXBj         0.554   0.899    -4.573       0.4080   

   acousticness  instrumentalness  liveness  valence    tempo  hype workout  \
0      0.028500             0.944    0.0924   0.3870   95.044             0   
1      0.181000             0.000    0.2230   0.5420  166.877             1   
2      0.000034             0.252    0.5710   0.0701   90.007             0   
3      0.001060             0.674    0.0967   0.2260  139.999             1   
4      0.05