In [1]:
import requests
import json
import collections
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from bs4 import BeautifulSoup

# NOTE: tokens only last 1 hour. we need a new access token every time we want to run this
access_token = 'BQD4gU8hhO6dlFZZSephkDGpBwaXnuvINkpPeDm57C90TmIJbM3wK72D6A6Jhe8lbdfg_Rs95lebJv9aYM1mcbmFvx7WBo2-YYY_93e3gsUl-4n2XGJEGqGbx6CwJgaKgQh2jF4gj4c1A39Ua3Hamebt'

headers = {
    'Authorization': f'Bearer {access_token}',
    'Content-Type': 'application/json',
    'Accept': 'application/json'
}

In [2]:
def get_user_playlists_endpoint(user_id):
    """
    endpoint generator used to get list of playlist objects for given user. This function gets the URL,
    simplification to playlist_id's happens elsewhere

    :param user_id: base 62 identifier found at end of spotify URI, unique to each user
    :return: fstring url, used to get user's playlists
    """
    # suffix of URL: gets items list from paging obj, gets playlist_id's from each item
    # note: wrapped in a paging object i think
    return f"https://api.spotify.com/v1/users/{user_id}/playlists?limit=50&fields=items(id)"
  

def get_playlist_tracks_endpoint(playlist_id):
    """
    endpoint generator used to get list of track objects for given playlist

    :param playlist_id: base 62 identifier found at end of spotify URI, unique to each playlist
    :return: fstring url used to get a list of track objects for given playlist
    """
    # gets items list from paging obj
    return f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks?fields=items"


def get_tracks_audio_features_endpoint(tracks_str):
    """
    produces a url endpoint to retrieve audio features for a list of tracks,
    supplied as a comma separated list of Spotify IDs

    :param tracks_str: a comma separated list of Spotify IDs that point to tracks
    :return: fstring url used to get list of track objects
    """
    return f"https://api.spotify.com/v1/audio-features?ids={tracks_str}"

In [3]:
def get_user_playlist_data(username):
    """
    Given a username, pulls all public playlist data for that user
    Args:
        username (str): Spotify username for the user for which we want recommendations
    Returns:
        req_data_json (json list): json equivalent of request data from user playlists endpoint
    """
    # print string to show that it's working
    print(f'Collecting audio features for {username}:')

    # TODO catch exception if bad username
    # gets an endpoint for playlist objects corresponding to this user's playlists
    user_playlists_endpoint = get_user_playlists_endpoint(username)
    
    # gets json list of playlist objects (this user's playlists) from Spotify API
    user_request_data = requests.get(user_playlists_endpoint, headers=headers)
    
    # convert given json array of dicts wrapped in a paging object to json
    req_data_json = user_request_data.json() 
    
    return req_data_json

In [4]:
def get_track_ids(req_data_json):
    """
    Given user playlists request data, compiles a list of track IDs, pulling up to
    100 tracks from each of the given public playlists. A list allows for duplicates,
    could use a set if we want to exclude duplicates in a later version.
    Args:
        req_data_json (json list): json list of playlist objects (representing this user's public playlists)
    Returns:
        track_list (array): a list of track IDs representing a user's entire public library
    """
    
    # get items field from json, array of dicts
    playlist_dict = req_data_json.get('items') 

    # NOTE: if this is slow, use some numpy thing instead
    track_list = []
    if playlist_dict == None:
        print("there's nothing in the playlist dict. you probably need a new access token, they expire quickly")
    
    else:
        for playlist_id_dict in playlist_dict:
            
            # get playlist Spotify ID string
            playlist_id = playlist_id_dict.get('id') 
            
            # pass playlist_id to endpoint generator, get all songs in playlist
            # TODO catch exception if bad playlist id
            playlist_tracks_endpoint = get_playlist_tracks_endpoint(playlist_id)
            
            # request a playlist object, metadata and json list of track_id's
            playlist_req_data = requests.get(playlist_tracks_endpoint, headers=headers)
            playlist_paging_obj_dict = playlist_req_data.json()
            track_dict = playlist_paging_obj_dict.get('items')

            # get all tracks from this playlist
            for track in track_dict:
                track_id = track.get('track').get('id')
                track_list.append(track_id)

    # print("Num tracks for user: " + str(len(track_set)))
    return track_list

In [5]:
def get_audio_features(user_tracks):
    """Given a list of track IDs, generates a dataframe with audio feature data 
        for each track. Note that we get track audio features in batches. Adds
        the results to a dataframe.
    Args:
        user_tracks (array): a list of track IDs representing a user's entire public library        
    Returns:
        audio_features_df (pd.DataFrame): each row is a track, columns are audio features
    """
    
    # spotify api is quirky and can only handle batches of requests. 
    # split user_tracks list into chunks small enough for spotify api to accept
    split_user_tracks = [user_tracks[i:i + 50] for i in range(0, len(user_tracks), 50)]  
    
    # init dataframe, rows are tracks, columns are audio features
    audio_features_df = pd.DataFrame()
    
    # for each sublist of all the tracks collected from the user
    for track_sublist in split_user_tracks:
        
        # make list of spotify URIs separated by commas because that's what spotify wants for the endpoint
        sublist_str = ','.join(track_sublist)
        
        # get endpoint to retrieve audio features for our batch of 50 tracks
        tracks_endpoint = get_tracks_audio_features_endpoint(sublist_str)
        
        # execute the get request
        sublist_req = requests.get(tracks_endpoint, headers=headers)
        print(f'sublist_req: {sublist_req}')
        
        # for each track in the sublist response data
        for track_req in sublist_req.json()['audio_features']:
            
            # convert track_req into a pandas series
            row_series = pd.Series(track_req)
            # print(json.dumps(track_req, indent=2))
            
            """
            HEY this is a bad idea. make a list, add dicts to a list, convert
            to a dataframe at the end. should do it this way instead:
            https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it/56746204#56746204
            """
            # append row_series to our accumulator audio_features_df
            audio_features_df = audio_features_df.append(row_series, ignore_index=True)

    return audio_features_df

In [6]:
def get_user_audio_features(username):
    """A convenience function that wraps our pipeline for each user. 
    Given a username, pulls all public playlist data for that user. Then, 
    compiles a list of track IDs from all the user's public playlists. Last,
    gets track audio features and adds the results to a dataframe. 
    Args:
        username (str): Spotify username for the user for which we want recommendations
    Returns:
        audio_features_df (pd.DataFrame): each row is a track, columns are audio features
 """
    # Pulls all public playlist data for user
    user_req_data = get_user_playlist_data(username)
    
    # gets track IDs for all tracks in user's public playlists, as a list
    user_tracks = get_track_ids(user_req_data)
    
    # gets dataframe of audio features for each track
    audio_features_df = get_audio_features(user_tracks)
    
    return audio_features_df

### testing our get audio features function, and loading the dataframes that we got to CSVs 

In [7]:
# Pulls all public playlist data for user 'dudu.massud' (eddie)
eddie_df = get_user_audio_features('dudu.massud')
eddie_df.to_csv('eddie_song_data.csv')

# Pulls all public playlist data for user 'briannannaj' (bri)
bri_df = get_user_audio_features('briannannaj')
bri_df.to_csv('bri_song_data.csv')

# Pulls all public playlist data for user 'rafalapetina' (rafa)
rafa_df = get_user_audio_features('rafalapetina')
rafa_df.to_csv('rafa_song_data.csv')

# Pulls all public playlist data for user 'brenderman3' (brendan)
brendan_df = get_user_audio_features('brenderman3')
brendan_df.to_csv('brendan_song_data.csv')

# Pulls all public playlist data for user 'dvhgzdwi45q7g1bn7znolytqu' (aditya)
aditya_df = get_user_audio_features('dvhgzdwi45q7g1bn7znolytqu')
aditya_df.to_csv('aditya_song_data.csv')

Collecting audio features for dudu.massud:
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
Collecting audio features for briannannaj:
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
Collecting audio features for rafalapetina:
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>
sublist_req: <Response [200]>


In [8]:
# load our dataframes from the CSVs we saved

df_eddie = pd.read_csv('eddie_song_data.csv')
df_bri = pd.read_csv('bri_song_data.csv')
df_rafa = pd.read_csv('rafa_song_data.csv')
df_brendan = pd.read_csv('brendan_song_data.csv')
df_aditya = pd.read_csv('aditya_song_data.csv')

df_aditya

Unnamed: 0.1,Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0,0.57800,https://api.spotify.com/v1/audio-analysis/3Gnb...,0.674,440547.0,0.650,3GnbxcMXDR2hozHyHnmF7n,0.000,5.0,0.6580,-12.121,1.0,0.3670,91.823,4.0,https://api.spotify.com/v1/tracks/3GnbxcMXDR2h...,audio_features,spotify:track:3GnbxcMXDR2hozHyHnmF7n,0.538
1,1,0.86300,https://api.spotify.com/v1/audio-analysis/5pDP...,0.629,251760.0,0.495,5pDPGSHEm3lJCEM2VVX34G,0.000,9.0,0.1130,-7.690,1.0,0.0316,134.906,3.0,https://api.spotify.com/v1/tracks/5pDPGSHEm3lJ...,audio_features,spotify:track:5pDPGSHEm3lJCEM2VVX34G,0.608
2,2,0.51700,https://api.spotify.com/v1/audio-analysis/11Ba...,0.502,184981.0,0.608,11BaK9F73UdAYntBskn3vZ,0.000,8.0,0.0713,-5.045,0.0,0.0847,79.610,4.0,https://api.spotify.com/v1/tracks/11BaK9F73UdA...,audio_features,spotify:track:11BaK9F73UdAYntBskn3vZ,0.532
3,3,0.52800,https://api.spotify.com/v1/audio-analysis/3gtD...,0.639,177150.0,0.731,3gtDqUfvbpKcznHXXidLkb,0.000,8.0,0.1000,-4.151,1.0,0.1780,102.985,4.0,https://api.spotify.com/v1/tracks/3gtDqUfvbpKc...,audio_features,spotify:track:3gtDqUfvbpKcznHXXidLkb,0.801
4,4,0.00502,https://api.spotify.com/v1/audio-analysis/6sKy...,0.560,234943.0,0.737,6sKyTORhJY0F4Lz7R7wOIK,0.856,9.0,0.0946,-3.403,1.0,0.0389,122.021,4.0,https://api.spotify.com/v1/tracks/6sKyTORhJY0F...,audio_features,spotify:track:6sKyTORhJY0F4Lz7R7wOIK,0.332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,578,0.73100,https://api.spotify.com/v1/audio-analysis/7eJM...,0.726,173333.0,0.431,7eJMfftS33KTjuF7lTsMCx,0.000,8.0,0.6960,-8.765,0.0,0.1350,144.026,4.0,https://api.spotify.com/v1/tracks/7eJMfftS33KT...,audio_features,spotify:track:7eJMfftS33KTjuF7lTsMCx,0.348
579,579,0.05710,https://api.spotify.com/v1/audio-analysis/62z2...,0.519,173975.0,0.731,62z2Su8BBGvR50aOvdjPzW,0.000,11.0,0.3810,-3.993,1.0,0.0402,82.967,4.0,https://api.spotify.com/v1/tracks/62z2Su8BBGvR...,audio_features,spotify:track:62z2Su8BBGvR50aOvdjPzW,0.296
580,580,0.93600,https://api.spotify.com/v1/audio-analysis/42i8...,0.555,189419.0,0.202,42i8mO4cQdsUmn5In7jony,0.000,2.0,0.0895,-8.128,1.0,0.0402,135.300,4.0,https://api.spotify.com/v1/tracks/42i8mO4cQdsU...,audio_features,spotify:track:42i8mO4cQdsUmn5In7jony,0.522
581,581,0.06110,https://api.spotify.com/v1/audio-analysis/6eXF...,0.739,139810.0,0.717,6eXFl418w8hsxKcLgv1jwJ,0.000,1.0,0.2060,-5.123,1.0,0.0565,98.047,4.0,https://api.spotify.com/v1/tracks/6eXFl418w8hs...,audio_features,spotify:track:6eXFl418w8hsxKcLgv1jwJ,0.529
