Use this notebook to get Spotify audio data

Useful resources:
- https://stmorse.github.io/journal/spotify-api.html
- https://medium.com/@lorelablaka/extract-data-using-spotify-api-889222835bf4

In [186]:
# dependencies
import requests
import json
import pandas as pd
import re
import timeit
from IPython.display import clear_output

In [187]:
# Spotify API credentials
from config import spotify_client_ID as sp_client, spotify_client_secret as sp_secret

In [188]:
# read in top40 data
top40_df = pd.read_csv("../00_data/top40_1997_2022_raw.csv")

# store columns as lists
week = top40_df['week']
position = top40_df['position']
song = top40_df['song']
artist = top40_df['artist']

# practice list of 10 artists
artist = artist[0:7].tolist() + artist[8:11].tolist()
song = song[0:7].tolist() + song[8:11].tolist()

In [246]:
# get an authorization token
# https://developer.spotify.com/documentation/general/guides/authorization/client-credentials/

auth_token = 'https://accounts.spotify.com/api/token'

# POST token request

auth_response = requests.post(auth_token, {
    'grant_type': 'client_credentials',
    'client_id': sp_client,
    'client_secret': sp_secret,
})

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

In [247]:
# https://developer.spotify.com/documentation/general/guides/authorization/use-access-token/

# send a GET request to the API server with the access token in the header

headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# paste final function here:

In [140]:
# function to get audio features for all top40 tracks
# input: a dataframe of top40 charts with [week, position, song, artist] columns

def get_audio_features(weekly_charts):

    # start timer
    start = timeit.default_timer()
    
    # store dataframe columns as lists
    week = weekly_charts['week']
    position = weekly_charts['position']
    song = weekly_charts['song']
    artist = weekly_charts['artist']


    ################################
    # get artist and song URIs
    ################################

    # use an 'item' search for artist and track
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/search

    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'
    
    # empty lists for search results
    song_ids = []
    artist_ids = []
    
    for w in range(len(weekly_charts)):
        
        # format song/artist names to suit the query input format
        query_song = re.sub('\s', '%20', song[w])
        query_artist = re.sub('\s', '%20', artist[w])

        # make the API call
        query = f'search?q=track:{query_song}&artist:{query_artist}&type=track'
        query_data = requests.get(base_url + query, headers=headers).json()

        # json output format varies by result, so navigating jsons doesn't work
        # instead, convert the whole output to a string, use regex to find desired URIs
        data_str = str(query_data)
        song_regex = '(?:spotify.com/track/)(\w+)'
        artist_regex = '(?:spotify.com/artist/)(\w+)'
        
        # extract IDs using regex
        song_id = re.search(song_regex, data_str).group(1)
        artist_id = re.search(artist_regex, data_str).group(1)
        #song_uri = 'spotify:track:' +  re.search(song_regex, data_str).group(1)
        #artist_uri = 'spotify:artist:' +  re.search(artist_regex, data_str).group(1)

        # save URIs to their respective lists
        song_ids.append(song_id)
        artist_ids.append(artist_id)

        # log progress
        checkpoint = timeit.default_timer()
        clear_output()
        print('Fetching artist and song URIs')
        print(f'Saved {w+1}/{len(weekly_charts)} URIs ({(w+1)/len(weekly_charts) * 100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')


    ##################################
    # get audio features for all songs
    ##################################
    
    # empty lists to hold desired audio feature scores
    acousticness = []
    duration = []
    danceability = []
    energy = []
    instrumentalness = []
    key = []
    liveness = []
    loudness = []
    mode = []
    speechiness = []
    tempo = []
    time_signature = []
    valence = []

    # make the API call for track audio features
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

    for track in song_ids:
        
        # API call
        audio = requests.get(base_url + 'audio-features/' + track, headers=headers).json()

        # save audio features to their respective lists
        acousticness.append(audio['acousticness'])
        duration.append(audio['duration_ms'])
        danceability.append(audio['danceability'])
        energy.append(audio['energy'])
        instrumentalness.append(audio['instrumentalness'])
        key.append(audio['key'])
        liveness.append(audio['liveness'])
        loudness.append(audio['loudness'])
        mode.append(audio['mode'])
        speechiness.append(audio['speechiness'])
        tempo.append(audio['tempo'])
        time_signature.append(audio['time_signature'])
        valence.append(audio['valence'])
        
        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Saved audio features for {len(acousticness)}/{len(song_ids)} tracks. ({len(acousticness)/len(song_ids) *100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')


    ########################
    # get artist genre data
    ########################

    # empty list of genres
    all_genres = []

    # empty dictionary to hold all genres associated with each artist
    artist_genre_dict = {}

    # get genres associated with all artists        
    for a, artist in enumerate(artist_ids):

        # get the artist's genre(s)
        # Spotify 'artist' search
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
        
        try:
            # API call
            get_artist = requests.get(base_url + 'artists/' + artist, headers=headers).json()
            artist_genres = get_artist['genres']
            # format genre strings
            artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
        
        # for artists with 'NA' URIs
        except:
            artist_genres = ['no_genre_data']

 
        artist_genre_dict[artist] = artist_genres

        # save new genres to the list of all genres
        for genre in artist_genres:
            if genre not in all_genres:
                all_genres.append(genre)
        
        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Found genres for {(a+1)}/{len(artist_ids)} artists ({(a+1)/len(artist_ids)*100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')

    print('Saving data...')
    
    # fill the new genre dataframe
    all_genres_dict = {}
    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(weekly_charts)

    # create dataframe with genre data, sort df columns
    genre_df = pd.DataFrame(all_genres_dict)
    genre_df.columns = sorted(genre_df.columns.tolist())
    genre_df['artist_id'] = artist_ids

    # assign 1's to each artist's genre                
    for column in genre_df:

        for artist in artist_ids:

            for genre in artist_genre_dict[artist]:

                if genre == column:

                    genre_df.loc[artist, column] = 1
    
    
#    #######################################
#    # drop uncommon genres from the dataset
#    #######################################
#
#    # get the sum of each genre column (exclude the last column)
#    genre_counts = full_df.sum(axis=0)[:-2]
#    genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])
#
#    # get genres to drop from the dataset
#    # keeping only the top 50 most common genres
#    drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index
#
#    # remove low-count genres from the dataset
#    genres_df = full_df.drop(columns=drop_genres)


    ################################
    # output all data to a dataframe
    ################################

    # make a copy of the input dataframe
    output_df = weekly_charts.copy()
    
    output_df['artist_id'] = artist_ids
    output_df['song_id'] = song_ids
    
    output_df['acousticness'] = acousticness
    output_df['duration'] = duration
    output_df['danceability'] = danceability
    output_df['energy'] = energy
    output_df['instrumentalness'] = instrumentalness
    output_df['key'] = key
    output_df['liveness'] = liveness
    output_df['loudness'] = loudness
    output_df['mode'] = mode
    output_df['speechiness'] = speechiness
    output_df['tempo'] = tempo
    output_df['time_signature'] = time_signature
    output_df['valence'] = valence
    
    # merge all data into a final dataframe
    full_df = output_df.merge(genre_df, how="inner", on="artist_id")

    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')
    #print(type(output_df))
    #print(type(genre_df))
    #print(type(full_df))

    # return the dataframe
    return full_df, genre_df
    

In [111]:
top40_audio = get_audio_features(top40_df)

Fetching artist and song URIs
Saved 210/52768 URIs (0.40% complete)
1.23 minutes elapsed


KeyboardInterrupt: 

# TEST RUN

In [235]:

# function to get audio features for all top40 tracks
# input: a dataframe of top40 charts with [week, position, song, artist] columns

def get_audio_features(weekly_charts):

    # start timer
    start = timeit.default_timer()
    
    # store dataframe columns as lists
    week = weekly_charts['week']
    position = weekly_charts['position']
    song = weekly_charts['song']
    artist = weekly_charts['artist']


    ################################
    # get artist and song URIs
    ################################

    # use an 'item' search for artist and track
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/search

    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'
    
    # empty lists for search results
    song_ids = []
    artist_ids = []
    
    for w in range(len(weekly_charts)):
        
        # format song/artist names to suit the query input format
        query_song = re.sub('\s', '%20', song[w])
        query_artist = re.sub('\s', '%20', artist[w])

        # make the API call
        query = f'search?q=track:{query_song}&artist:{query_artist}&type=track'
        query_data = requests.get(base_url + query, headers=headers).json()

        # json output format varies by result, so navigating jsons doesn't work
        # instead, convert the whole output to a string, use regex to find desired URIs
        data_str = str(query_data)
        song_regex = '(?:spotify.com/track/)(\w+)'
        artist_regex = '(?:spotify.com/artist/)(\w+)'
        
        # extract IDs using regex
        song_id = re.search(song_regex, data_str).group(1)
        artist_id = re.search(artist_regex, data_str).group(1)
        #song_uri = 'spotify:track:' +  re.search(song_regex, data_str).group(1)
        #artist_uri = 'spotify:artist:' +  re.search(artist_regex, data_str).group(1)

        # save URIs to their respective lists
        song_ids.append(song_id)
        artist_ids.append(artist_id)

        # log progress
        checkpoint = timeit.default_timer()
        clear_output()
        print('Fetching artist and song URIs')
        print(f'Saved {w+1}/{len(weekly_charts)} URIs ({(w+1)/len(weekly_charts) * 100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')


    ##################################
    # get audio features for all songs
    ##################################
    
    # empty lists to hold desired audio feature scores
    acousticness = []
    duration = []
    danceability = []
    energy = []
    instrumentalness = []
    key = []
    liveness = []
    loudness = []
    mode = []
    speechiness = []
    tempo = []
    time_signature = []
    valence = []

    # make the API call for track audio features
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

    for track in song_ids:
        
        # API call
        audio = requests.get(base_url + 'audio-features/' + track, headers=headers).json()

        # save audio features to their respective lists
        acousticness.append(audio['acousticness'])
        duration.append(audio['duration_ms'])
        danceability.append(audio['danceability'])
        energy.append(audio['energy'])
        instrumentalness.append(audio['instrumentalness'])
        key.append(audio['key'])
        liveness.append(audio['liveness'])
        loudness.append(audio['loudness'])
        mode.append(audio['mode'])
        speechiness.append(audio['speechiness'])
        tempo.append(audio['tempo'])
        time_signature.append(audio['time_signature'])
        valence.append(audio['valence'])
        
        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Saved audio features for {len(acousticness)}/{len(song_ids)} tracks. ({len(acousticness)/len(song_ids) *100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')


    ########################
    # get artist genre data
    ########################

    # empty list of genres
    all_genres = []

    # empty dictionary to hold all genres associated with each artist
    artist_genre_dict = {}

    # get genres associated with all artists        
    for a, artist in enumerate(artist_ids):

        # get the artist's genre(s)
        # Spotify 'artist' search
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
        
        try:
            # API call
            get_artist = requests.get(base_url + 'artists/' + artist, headers=headers).json()
            artist_genres = get_artist['genres']
            # format genre strings
            artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
        
        # for artists with 'NA' URIs
        except:
            artist_genres = ['no_genre_data']

 
        artist_genre_dict[artist] = artist_genres

        # save new genres to the list of all genres
        for genre in artist_genres:
            if genre not in all_genres:
                all_genres.append(genre)
        
        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Found genres for {(a+1)}/{len(artist_ids)} artists ({(a+1)/len(artist_ids)*100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')

    print('Saving data...')
    
    # fill the new genre dataframe
    all_genres_dict = {}
    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(weekly_charts)

    # create dataframe with genre data, sort df columns
    genre_df = pd.DataFrame(all_genres_dict)
    genre_df.columns = sorted(genre_df.columns.tolist())
    #genre_df['artist_id'] = artist_ids
    genre_df.index = artist_ids

    # assign 1's to each artist's genre                
    for column in genre_df:

        for artist in genre_df.index:

            for genre in artist_genre_dict[artist]:

                if genre == column:

                    genre_df.loc[artist, column] = 1

    # new column to enable merging dataframes
    genre_df['artist_id'] = genre_df.index
                
#    #######################################
#    # drop uncommon genres from the dataset
#    #######################################
#
#    # get the sum of each genre column (exclude the last column)
#    genre_counts = genre_df.sum(axis=0)
#    genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])
#
#    # get genres to drop from the dataset
#    # keeping only the top 50 most common genres
#    drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index
#
#    # remove low-count genres from the dataset
#    genres_df = full_df.drop(columns=drop_genres)


    ################################
    # output all data to a dataframe
    ################################

    # make a copy of the input dataframe
    output_df = weekly_charts.copy()
    
    output_df['artist_id'] = artist_ids
    output_df['song_id'] = song_ids
    
    output_df['acousticness'] = acousticness
    output_df['duration'] = duration
    output_df['danceability'] = danceability
    output_df['energy'] = energy
    output_df['instrumentalness'] = instrumentalness
    output_df['key'] = key
    output_df['liveness'] = liveness
    output_df['loudness'] = loudness
    output_df['mode'] = mode
    output_df['speechiness'] = speechiness
    output_df['tempo'] = tempo
    output_df['time_signature'] = time_signature
    output_df['valence'] = valence
    
    # merge all data into a final dataframe
    full_df = output_df.merge(genre_df, how="inner", on="artist_id")

    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')
    #print(type(output_df))
    #print(type(genre_df))
    #print(type(full_df))

    # return the dataframe
    return full_df
    

In [236]:
top40_test = get_audio_features(top40_df.head(10))
top40_test

Found genres for 10/10 artists (100.00% complete)
0.13 minutes elapsed
Saving data...
Data saved
Total runtime: 0.13 minutes
Done!


Unnamed: 0,week,position,song,artist,artist_id,song_id,acousticness,duration,danceability,energy,...,lilith,mellow_gold,new_wave_pop,pop_rock,post-grunge,rap,singer-songwriter,soft_rock,west_coast_rap,yacht_rock
0,1997-07-07,1,Mmm Bop,Hanson,0SdiiPkr02EUdekHZJkt58,0lnxrQAd9ZxbhBBe7d8FO8,0.00481,268653,0.683,0.937,...,0,0,0,1,0,0,0,0,0,0
1,1997-07-07,2,Bitch,Meredith Brooks,7pEVNuJRlpDXlsrcq32yHp,06q3sUgkq2k7uhdsi1fczX,0.399,163156,0.79,0.66,...,0,0,0,0,0,0,0,0,0,0
2,1997-07-07,3,Say You'll Be There,Spice Girls,0uq5PttqEjj3IH1bzwcrXF,1yTQ39my3MoNROlFw3RDNy,0.0149,235973,0.726,0.679,...,0,0,0,0,0,0,0,0,0,0
3,1997-07-07,4,Return Of The Mack,Mark Morrison,6V3F8MZrOKdT9fU686ybE9,3jDdpx9PMlfMBS5tOBHFm9,0.00631,213093,0.715,0.833,...,0,0,0,0,0,0,0,0,0,0
4,1997-07-07,5,Sunny Came Home,Shawn Colvin,0K7VN4aHxHcEb7PqkfoIVA,4mOxpj82q6n3EO7HBZCelX,0.342,264200,0.558,0.579,...,1,1,1,0,0,0,1,0,0,0
5,1997-07-07,6,Do You Know,Robyn,3Mcii5XWf6E0lrY3Uky4cA,2Ih217RCGAmyQR68Nn7Cqo,0.0977,232773,0.747,0.52,...,0,0,0,0,0,1,0,0,1,0
6,1997-07-07,7,The Freshmen,Verve Pipe,242iqFnwNhlidVBMI9GYKp,21jEuMn2lf37715rwjow2M,0.0757,269467,0.571,0.505,...,0,0,0,1,1,0,0,0,0,0
7,1997-07-07,8,Hard To Say I'm Sorry,Az Yet &amp; Peter Cetera,3iDD7bnsjL9J4fO298r0L0,5nDSJO4909uNzMcZH3CggS,0.695,221827,0.537,0.417,...,0,1,0,0,0,0,1,1,0,1
8,1997-07-07,9,I Want You,Savage Garden,0LyfQWJT6nXafLPZqxe9Of,7mykoq6R3BArsSpNDjFQTm,0.00237,246653,0.49,0.738,...,0,0,0,0,0,0,0,0,0,0
9,1997-07-07,10,Quit Playing Games (with My Heart),Backstreet Boys,5rSXSAkZ67PYJSvpUpkOr7,1nRwyxNsqCLeA17qR8Nfxx,0.0615,233533,0.8,0.875,...,0,0,0,0,0,0,0,0,0,0


In [259]:
# spot check: Az Yet artist_id is messed up

azyet_spotify = '4UGMQyNcbGHYg5CDMKkSw3' # copied manually from spotify
azyet_df = '3iDD7bnsjL9J4fO298r0L0' # copied from top40_test df

try1 = requests.get(base_url + 'artists/' + azyet_spotify, headers=headers).json()
try2 = requests.get(base_url + 'artists/' + azyet_df, headers=headers).json()

print(try1['name'])
print(try1['genres'])
print('')
print(try2['name'])
print(try2['genres'])

Az Yet
['contemporary r&b', 'r&b', 'urban contemporary']

Chicago
['adult standards', 'album rock', 'classic rock', 'folk rock', 'mellow gold', 'singer-songwriter', 'soft rock', 'yacht rock']
