Use this notebook to get Spotify audio data

Useful resources:
- https://stmorse.github.io/journal/spotify-api.html
- https://medium.com/@lorelablaka/extract-data-using-spotify-api-889222835bf4

In [1]:
# dependencies
import requests
import json
import pandas as pd
import numpy as np
import re
import timeit
from IPython.display import clear_output

In [2]:
# Spotify API credentials
from config import track_client_ids, track_client_secrets, non_track_client_ids, non_track_client_secrets

In [3]:
# read in top40 data
top40_df = pd.read_csv("../02_Data Cleaning/Resources/top_40_cleaned_unique_songs.csv")

In [5]:
# get an authorization token
# https://developer.spotify.com/documentation/general/guides/authorization/client-credentials/

auth_token = 'https://accounts.spotify.com/api/token'

# POST token request

auth_response = requests.post(auth_token, {
    'grant_type': 'client_credentials',
    'client_id': sp_client,
    'client_secret': sp_secret,
})

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

NameError: name 'sp_client' is not defined

In [4]:
# functions to cycle between api clients, get new authorization tokens
# https://developer.spotify.com/documentation/general/guides/authorization/client-credentials/

def get_track_tokens(iteration):
    
    auth_token = 'https://accounts.spotify.com/api/token'
        
    # post a token request
    auth_response = requests.post(auth_token, {
    'grant_type': 'client_credentials',
    'client_id': track_client_ids[iteration],
    'client_secret': track_client_secrets[iteration]
    })

    # convert the response to JSON
    auth_response_data = auth_response.json()

    # save the access token
    access_token = auth_response_data['access_token']

    # send a GET request to the API server with the access token in the header
    # https://developer.spotify.com/documentation/general/guides/authorization/use-access-token/
    headers = {
        'Authorization': 'Bearer {token}'.format(token=access_token)
    }

    return headers


def get_non_track_tokens(iteration):
    
    auth_token = 'https://accounts.spotify.com/api/token'
        
    # post a token request
    auth_response = requests.post(auth_token, {
    'grant_type': 'client_credentials',
    'client_id': non_track_client_ids[iteration],
    'client_secret': non_track_client_secrets[iteration]
    })

    # convert the response to JSON
    auth_response_data = auth_response.json()

    # save the access token
    access_token = auth_response_data['access_token']

    # send a GET request to the API server with the access token in the header
    # https://developer.spotify.com/documentation/general/guides/authorization/use-access-token/
    headers = {
        'Authorization': 'Bearer {token}'.format(token=access_token)
    }

    return headers


In [5]:
# https://developer.spotify.com/documentation/general/guides/authorization/use-access-token/

# send a GET request to the API server with the access token in the header

headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# paste final function here:

In [391]:
# function to get genres and audio features for all top40 tracks
# input: a dataframe of top40 charts with [week, position, song, artist] columns

def top40_dict(weekly_charts):
    
    start = timeit.default_timer()
    
    # change input dataframe to a dictionary
    top40_dict = weekly_charts.to_dict(orient='index') 
    
    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'

    # empty lists to store all genres, artist IDs in the dataset
    all_genres = []
    all_artist_genres = []
    artist_ids = []

    for c, chart_entry in enumerate(top40_dict.items()):

        ################################
        # search artists
        ################################

        # use an 'item' search for artist and track
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/search
        query_artist = re.sub('\s', '%20', chart_entry[1]['artist'])
        query = f'search?q=artist:{query_artist}&type=artist&limit=5'
        artist_search_results = requests.get(base_url + query, headers=headers).json()
        
        
        ################################
        # search artists' top tracks
        ################################

        # save all IDs from artist search results
        id_list = []
        #for result in artist_search_results['artists']['items']:
        for result in artist_search_results['artists']['items']:
            artist_id = result['id']
            id_list.append(artist_id)
        chart_entry[1]['id_list'] = id_list
        
        # for each artist ID, search that artist's top tracks
        # https://api.spotify.com/v1/artists/{id}/top-tracks
        hit_list = []
        for artist in id_list:
            
            artist_hits = requests.get(base_url + 'artists/' + artist + '/top-tracks?market=US', 
                                       headers=headers).json()
            hit_list.append(artist_hits)
            chart_entry[1]['hit_list'] = hit_list


        ################################
        # save artist and track URIs
        ################################

        # search each artist's hits for the track of interest
        for track in hit_list[0]['tracks']:
            
            chart_entry[1]['artist_uri'] = 'NA'
            chart_entry[1]['song_uri'] = 'NA'

            # Spotify track and artist info
            track_artist = track['artists'][0]['name']
            track_artist_uri = track['artists'][0]['uri']
            track_title = track['name']
            track_title_uri = track['uri']
            
            # simplify artist/song strings to search chart instances against spotify instances
            compare_track_artist = track_artist.replace(' ', '').casefold()
            compare_track_title = track_title.replace(' ', '').casefold()
            
            compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
            compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
            
            # if the artist is found but not the track, save artist info
            # note: some songs may not be found because they are not in the artist's top tracks
            if (compare_chart_artist == compare_track_artist or \
            compare_chart_artist in compare_track_artist or \
            compare_track_artist in compare_chart_artist) and \
            (compare_chart_song != compare_track_title):
                
                chart_entry[1]['artist_uri'] = track_artist_uri

            # if the artist and song title are found, save the relevant info
            if (compare_chart_artist == compare_track_artist or \
            compare_chart_artist in compare_track_artist or \
            compare_track_artist in compare_chart_artist) and \
            (compare_chart_song == compare_track_title or \
            compare_chart_song in compare_track_title or \
            compare_track_title in compare_chart_song):
                
                chart_entry[1]['artist_uri'] = track_artist_uri
                chart_entry[1]['song_uri']  = track_title_uri

                # continue to the next iteration once a match is found and saved
                break
        
        
        ##################################
        # get audio features for all songs
        ##################################
        
        # empty lists to hold desired audio feature scores
        acousticness = []
        duration = []
        danceability = []
        energy = []
        instrumentalness = []
        key = []
        liveness = []
        loudness = []
        mode = []
        speechiness = []
        tempo = []
        time_signature = []
        valence = []
    
        # make the API call for track audio features
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
        
        # save features for all artists with a URI
        artist_id = re.search('(?:spotify:artist:)(\w+)', chart_entry[1]['artist_uri']).group(1)
        chart_entry[1]['artist_id'] = artist_id

        if chart_entry[1]['song_uri'] != 'NA':
            song_id = re.search('(?:spotify:track:)(\w+)', chart_entry[1]['song_uri']).group(1)
            chart_entry[1]['song_id'] = song_id
            # API call
            audio = requests.get(base_url + 'audio-features/' + song_id, headers=headers).json()
    
            # save audio features to their respective lists
            chart_entry[1]['acousticness'] = audio['acousticness']
            chart_entry[1]['duration'] = audio['duration_ms']
            chart_entry[1]['danceability'] = audio['danceability']
            chart_entry[1]['energy'] = audio['energy']
            chart_entry[1]['instrumentalness'] = audio['instrumentalness']
            chart_entry[1]['key'] = audio['key']
            chart_entry[1]['liveness'] = audio['liveness']
            chart_entry[1]['loudness'] = audio['loudness']
            chart_entry[1]['mode'] = audio['mode']
            chart_entry[1]['speechiness'] = audio['speechiness']
            chart_entry[1]['tempo'] = audio['tempo']
            chart_entry[1]['time_signature'] = audio['time_signature']
            chart_entry[1]['valence'] = audio['valence']
            
        else:
#            chart_entry[1]['song_id'] = 'NA'
#            chart_entry[1]['acousticness'] = np.nan
#            chart_entry[1]['duration'] = np.nan
#            chart_entry[1]['danceability'] = np.nan
#            chart_entry[1]['energy'] = np.nan
#            chart_entry[1]['instrumentalness'] = np.nan
#            chart_entry[1]['key'] = np.nan
#            chart_entry[1]['liveness'] = np.nan
#            chart_entry[1]['loudness'] = np.nan
#            chart_entry[1]['mode'] = np.nan
#            chart_entry[1]['speechiness'] = np.nan
#            chart_entry[1]['tempo'] = np.nan
#            chart_entry[1]['time_signature'] = np.nan
#            chart_entry[1]['valence'] = np.nan
            

        ########################
        # get artist genre data
        ########################
        
        # get artist IDs to use in search
        if chart_entry[1]['artist_uri'] != 'NA':
            
            artist_id = re.search('(?:spotify:artist:)(\w+)', chart_entry[1]['artist_uri']).group(1)
            chart_entry[1]['artist_id'] = artist_id
            artist_ids.append(artist_id)
        else:
            artist_id = 'NA'
            
        # save artist-song combinations to use in dataframe construction
        chart_entry[1]['combo'] = artist_id + song_id
        
        # get the artist's genre(s)
        # Spotify 'artist' search
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
        
        try:
            # API call
            get_artist = requests.get(base_url + 'artists/' + artist_id, headers=headers).json()
            artist_genres = get_artist['genres']
            # format genre strings
            artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
            chart_entry[1]['artist_genres'] = artist_genres

        # for artists with 'NA' URIs
        except:
            artist_genres = ['no_genre_data']        
        
    
        # empty dictionary to hold all genres associated with each artist
        artist_genre_dict = {}
        artist_genre_dict[chart_entry[1]['artist_id']] = artist_genres
        all_artist_genres.append(artist_genre_dict)
        
        # save new genres to the list of all genres
        for genre in artist_genres:
            if genre not in all_genres:
                all_genres.append(genre)
    
                
#    #######################################
#    # drop uncommon genres from the dataset
#    #######################################
#
#    # get the sum of each genre column (exclude the last column)
#    genre_counts = genre_df.sum(axis=0)
#    genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])
#
#    # get genres to drop from the dataset
#    # keeping only the top 50 most common genres
#    drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index
#
#    # remove low-count genres from the dataset
#    genres_df = full_df.drop(columns=drop_genres)


        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Saved data for song {c+1} of {len(weekly_charts)} ({(c+1)/len(weekly_charts)*100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')


    ################################
    # output all data to a dataframe
    ################################

    all_genres_dict = {}

    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(weekly_charts)
    
    # create a dataframe chart data
    chart_df = pd.DataFrame.from_dict(top40_dict, orient='index')
    
    # create a dataframe with genre data, sort df columns
    genre_df = pd.DataFrame(all_genres_dict)
    genre_df.columns = sorted(genre_df.columns.tolist())
    genre_df['combo'] = chart_df['combo']
    genre_df.set_index('combo', inplace=True)

    # assign 1's to each artist's genre                
    for column in genre_df:
        
        for row, combo in enumerate(genre_df.index):
            
            for genre in chart_df['artist_genres'][row]:
                                    
                if genre == column:
                    genre_df.loc[combo, column] = 1


    # make a copy of the input dataframe
    trimmed_df = chart_df.copy()
    
    # drop unwanted columns
    trimmed_df.drop(columns = ['id_list', 'hit_list', 'artist_genres', 'artist_uri', 'song_uri'], inplace=True)
    
    # reorder columns for aesthetics
    trimmed_df = trimmed_df[['week', 'position', 'artist', 'song', 'artist_id', 'song_id', 'combo',
                           'acousticness', 'danceability', 'duration', 'energy', 'instrumentalness',
                           'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                           'time_signature', 'valence']]
    
    # merge all data into a final dataframe
    output_df = trimmed_df.merge(genre_df, how="inner", on="combo")

    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')

    # return the dataframe
    return output_df


In [392]:
top40_audio  = get_audio_features(top40_df)

AttributeError: 'NoneType' object has no attribute 'group'

# TEST RUN

In [259]:
top40_df.head(8)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,week,position,song,artist
0,0,0,7/7/1997,1,Mmm Bop,Hanson
1,1,1,7/7/1997,2,Bitch,Meredith Brooks
2,2,2,7/7/1997,3,Say You'll Be There,Spice Girls
3,3,3,7/7/1997,4,Return Of The Mack,Mark Morrison
4,4,4,7/7/1997,5,Sunny Came Home,Shawn Colvin
5,5,5,7/7/1997,6,Do You Know,Robyn
6,6,6,7/7/1997,7,The Freshmen,Verve Pipe
7,7,7,7/7/1997,8,Hard To Say I'm Sorry,Az Yet


In [638]:
def top40_dict(dfin):
    
    start = timeit.default_timer()
    
    # change input dataframe to a dictionary
    top40_dict = dfin.to_dict(orient='index') 
    
    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'

    # empty lists to store all genres, artist IDs in the dataset
    all_genres = []
    all_artist_genres = []
    artist_ids = []

    for c, chart_entry in enumerate(top40_dict.items()):

        print(f'Now working on: {chart_entry[1]["artist"]} - {chart_entry[1]["song"]}')

        ################################
        # search artists
        ################################

        # use an 'item' search for artist and track
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/search
        query_artist = re.sub('\s', '%20', chart_entry[1]['artist'])
        query = f'search?q=artist:{query_artist}&type=artist&limit=5'
        artist_search_results = requests.get(base_url + query, headers=headers).json()
        
        
        ################################
        # search artists' top tracks
        ################################

        # save all IDs from artist search results
        id_list = []
        #for result in artist_search_results['artists']['items']:
        for result in artist_search_results['artists']['items']:
            artist_id = result['id']
            id_list.append(artist_id)
        chart_entry[1]['id_list'] = id_list
        
        # for each artist ID, search that artist's top tracks
        # https://api.spotify.com/v1/artists/{id}/top-tracks
        hit_list = []
        for artist in id_list:
            
            artist_hits = requests.get(base_url + 'artists/' + artist + '/top-tracks?market=US', 
                                       headers=headers).json()
            hit_list.append(artist_hits)
            chart_entry[1]['hit_list'] = hit_list


        ################################
        # save artist and track URIs
        ################################

        # search each artist's hits for the track of interest
        try:
            
            for track in hit_list[0]['tracks']:
                
                chart_entry[1]['artist_uri'] = 'NA'
                chart_entry[1]['song_uri'] = 'NA'
    
                # Spotify track and artist info
                track_artist = track['artists'][0]['name']
                track_artist_uri = track['artists'][0]['uri']
                track_title = track['name']
                track_title_uri = track['uri']
                
                # simplify artist/song strings to search chart instances against spotify instances
                compare_track_artist = track_artist.replace(' ', '').casefold()
                compare_track_title = track_title.replace(' ', '').casefold()
                
                compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
                compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
                
                # if the artist is found but not the track, save artist info
                # note: some songs may not be found because they are not in the artist's top tracks
                if (compare_chart_artist == compare_track_artist or \
                compare_chart_artist in compare_track_artist or \
                compare_track_artist in compare_chart_artist) and \
                (compare_chart_song != compare_track_title):
                    
                    chart_entry[1]['artist_uri'] = track_artist_uri
    
                # if the artist and song title are found, save the relevant info
                if (compare_chart_artist == compare_track_artist or \
                compare_chart_artist in compare_track_artist or \
                compare_track_artist in compare_chart_artist) and \
                (compare_chart_song == compare_track_title or \
                compare_chart_song in compare_track_title or \
                compare_track_title in compare_chart_song):
                    
                    chart_entry[1]['artist_uri'] = track_artist_uri
                    chart_entry[1]['song_uri']  = track_title_uri
    
                    # continue to the next iteration once a match is found and saved
                    break
        
        except IndexError:
            
            chart_entry[1]['artist_uri'] = 'NA'
            chart_entry[1]['song_uri'] = 'NA'

        ##################################
        # get audio features for all songs
        ##################################
        
        # empty lists to hold desired audio feature scores
        acousticness = []
        duration = []
        danceability = []
        energy = []
        instrumentalness = []
        key = []
        liveness = []
        loudness = []
        mode = []
        speechiness = []
        tempo = []
        time_signature = []
        valence = []
    
        # make the API call for track audio features
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
        
        # save features for all artists with a URI
        if chart_entry[1]['song_uri'] != 'NA':
            song_id = re.search('(?:spotify:track:)(\w+)', chart_entry[1]['song_uri']).group(1)
            chart_entry[1]['song_id'] = song_id
            # API call
            audio = requests.get(base_url + 'audio-features/' + song_id, headers=headers).json()
    
            # save audio features to their respective lists
            chart_entry[1]['acousticness'] = audio['acousticness']
            chart_entry[1]['duration'] = audio['duration_ms']
            chart_entry[1]['danceability'] = audio['danceability']
            chart_entry[1]['energy'] = audio['energy']
            chart_entry[1]['instrumentalness'] = audio['instrumentalness']
            chart_entry[1]['key'] = audio['key']
            chart_entry[1]['liveness'] = audio['liveness']
            chart_entry[1]['loudness'] = audio['loudness']
            chart_entry[1]['mode'] = audio['mode']
            chart_entry[1]['speechiness'] = audio['speechiness']
            chart_entry[1]['tempo'] = audio['tempo']
            chart_entry[1]['time_signature'] = audio['time_signature']
            chart_entry[1]['valence'] = audio['valence']
            
        else:
            chart_entry[1]['song_id'] = 'NA'
            chart_entry[1]['acousticness'] = np.nan
            chart_entry[1]['duration'] = np.nan
            chart_entry[1]['danceability'] = np.nan
            chart_entry[1]['energy'] = np.nan
            chart_entry[1]['instrumentalness'] = np.nan
            chart_entry[1]['key'] = np.nan
            chart_entry[1]['liveness'] = np.nan
            chart_entry[1]['loudness'] = np.nan
            chart_entry[1]['mode'] = np.nan
            chart_entry[1]['speechiness'] = np.nan
            chart_entry[1]['tempo'] = np.nan
            chart_entry[1]['time_signature'] = np.nan
            chart_entry[1]['valence'] = np.nan

        ########################
        # get artist genre data
        ########################
        
        # get artist IDs to use in search
        if chart_entry[1]['artist_uri'] != 'NA':
            
            artist_id = re.search('(?:spotify:artist:)(\w+)', chart_entry[1]['artist_uri']).group(1)
            chart_entry[1]['artist_id'] = artist_id
            artist_ids.append(artist_id)
        else:
            artist_id = 'NA'
            
        # save artist-song combinations to use in dataframe construction
        if artist_id != 'NA':
            chart_entry[1]['combo'] = artist_id + song_id
        else:
            chart_entry[1]['combo'] = compare_chart_artist + compare_chart_song
        print(chart_entry[1]['combo'])
        # get the artist's genre(s)
        # Spotify 'artist' search
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
        
#        try:
        if artist_id != 'NA':
            # API call
            get_artist = requests.get(base_url + 'artists/' + artist_id, headers=headers).json()
            artist_genres = get_artist['genres']
            # format genre strings
            artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
            chart_entry[1]['artist_genres'] = artist_genres

        # for artists with 'NA' URIs
        else:
            artist_genres = ['no_genre_info']        
        
    
#        # empty dictionary to hold all genres associated with each artist
#        artist_genre_dict = {}
#        artist_genre_dict[chart_entry[1]['artist_id']] = artist_genres
#        all_artist_genres.append(artist_genre_dict)
        
        # save new genres to the list of all genres
        for genre in artist_genres:
            if genre not in all_genres:
                all_genres.append(genre)
    
                
#    #######################################
#    # drop uncommon genres from the dataset
#    #######################################
#
#    # get the sum of each genre column (exclude the last column)
#    genre_counts = genre_df.sum(axis=0)
#    genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])
#
#    # get genres to drop from the dataset
#    # keeping only the top 50 most common genres
#    drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index
#
#    # remove low-count genres from the dataset
#    genres_df = full_df.drop(columns=drop_genres)


        # log progress
        clear_output()
        checkpoint = timeit.default_timer()
        print(f'Saved data for song {c+1} of {len(dfin)} ({(c+1)/len(dfin)*100:.2f}% complete)')
        print(f'{(checkpoint-start)/60:.2f} minutes elapsed')

    print('Formatting data...')    
    ################################
    # output all data to a dataframe
    ################################

    all_genres_dict = {}

    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(dfin)
    
    # create a dataframe chart data
    chart_df = pd.DataFrame.from_dict(top40_dict, orient='index')
    chart_df.set_index('combo', inplace=True, drop=False)
#    print(chart_df['combo'])
#    print(type(chart_df['combo']))
#    print(chart_df['combo'].dtypes)

    # create a dataframe with genre data, sort df columns
    genre_df = pd.DataFrame.from_dict(all_genres_dict, orient='columns')
    genre_df.columns = sorted(genre_df.columns.tolist())
    genre_df['combo'] = chart_df.index
    genre_df.set_index('combo', inplace=True, drop=False)
    
    # assign 1's to each artist's genre                
#    try:
    for column in genre_df:
        
        for row, combo in enumerate(genre_df.index):
            
            try:
                for genre in chart_df['artist_genres'][row]:
                
                    if genre == column:
                        genre_df.loc[combo, column] = 1
            
            except TypeError:
                genre_df.loc[combo, column] = np.nan

#    except KeyError:
#        pass
#    # assign 1's to each artist's genre                
#    if artist_id != 'NA':
#        
#        for column in genre_df:
#            print(column)
#            for row, combo in enumerate(genre_df.index):
#                print(combo)
#                for genre in chart_df['artist_genres'][row]:
#                    print(genre)                    
#                    if genre == column:
#                        genre_df.loc[combo, column] = 1
#    
#    # but if the artist has no saved genres, assign NA's
#    else:  
#        for cell in genre_df.loc[chart_entry[1]['combo']]:
#            cell = 'NA'
        
    # make a copy of the input dataframe
    trimmed_df = chart_df.copy()
    
    # drop unwanted columns
    trimmed_df.drop(columns = ['id_list', 'hit_list', 'artist_genres', 'artist_uri', 'song_uri'], inplace=True)
    
    # reorder columns for aesthetics
    trimmed_df = trimmed_df[['week', 'position', 'artist', 'song', 'artist_id', 'song_id', 'combo',
                           'acousticness', 'danceability', 'duration', 'energy', 'instrumentalness',
                           'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                           'time_signature', 'valence']]
    

    # merge all data into a final dataframe
    output_df = trimmed_df.merge(genre_df, how="inner", left_index=True, right_index=True)
    output_df.drop(columns=['combo_x', 'combo_y'], inplace=True)
    output_df.reset_index(inplace=True)
    
    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')

    # return the dataframe
    return output_df


In [639]:
try40 = top40_dict(top40_df.iloc[480:500])

Saved data for song 2 of 20 (10.00% complete)
0.06 minutes elapsed
Now working on: Torrey Carter - Take That


KeyError: 'song_uri'

In [548]:
try40

Unnamed: 0,combo,week,position,artist,song,artist_id,song_id,acousticness,danceability,duration,...,time_signature,valence,ectofolk,lilith,neo_mellow,new_wave_pop,no_genre_info,nz_pop,pop_rock,singer-songwriter
0,7m60UAnbgFFNuJbmS6OxTk2skmOCFU64Bg7Ytkgwliwe,7/7/97,16,Sister Hazel,All For You,7m60UAnbgFFNuJbmS6OxTk,2skmOCFU64Bg7Ytkgwliwe,0.0316,0.633,218707.0,...,4.0,0.641,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,08DLZqQd6XDAVteF9nSEJ12ck13qfgRZ1msyEJlDqzvk,7/7/97,17,Paula Cole,Where Have All The Cowboys Gone?,08DLZqQd6XDAVteF9nSEJ1,2ck13qfgRZ1msyEJlDqzvk,0.0351,0.588,263653.0,...,4.0,0.621,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
2,4vdt8TD56jjQfmxFCmhubX0wJyFu6JHwUoKQN5X3uzVe,7/7/97,18,Omc,How Bizarre,4vdt8TD56jjQfmxFCmhubX,0wJyFu6JHwUoKQN5X3uzVe,0.269,0.783,224200.0,...,4.0,0.824,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,3AO6RTDA16mg8czQI5Y9K075HZ6Wxnw8xbgz6HU9kKCS,7/7/97,20,Duncan Sheik,Barely Breathing,3AO6RTDA16mg8czQI5Y9K0,75HZ6Wxnw8xbgz6HU9kKCS,0.0281,0.482,254547.0,...,4.0,0.516,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [331]:
try40.loc[5]

week                                                            7/7/1997
position                                                               6
artist                                                             Robyn
song                                                         Do You Know
artist_id                                         6UE7nl9mha6s8z0wFQFIZ2
song_id                                           7g13jf3zqlP5S68Voo5v9m
combo                       6UE7nl9mha6s8z0wFQFIZ27g13jf3zqlP5S68Voo5v9m
acousticness                                                     0.00202
danceability                                                       0.573
duration                                                          278080
energy                                                             0.926
instrumentalness                                                  0.0117
key                                                                    6
liveness                                           

In [49]:
base_url = 'https://api.spotify.com/v1/'

query = f'search?q=artist:Jewel&type=artist&limit=5'
robyn_search_results = requests.get(base_url + query, headers=get_non_track_tokens(0)).json()
robyn_search_results

{'artists': {'href': 'https://api.spotify.com/v1/search?query=artist%3AJewel&type=artist&offset=0&limit=5',
  'items': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4RnBFZRiMLRyZy0AzzTg2C'},
    'followers': {'href': None, 'total': 998365},
    'genres': ['alternative hip hop',
     'escape room',
     'hip hop',
     'political hip hop'],
    'href': 'https://api.spotify.com/v1/artists/4RnBFZRiMLRyZy0AzzTg2C',
    'id': '4RnBFZRiMLRyZy0AzzTg2C',
    'images': [{'height': 640,
      'url': 'https://i.scdn.co/image/ab6761610000e5ebea5eda58996c7f08d423d522',
      'width': 640},
     {'height': 320,
      'url': 'https://i.scdn.co/image/ab67616100005174ea5eda58996c7f08d423d522',
      'width': 320},
     {'height': 160,
      'url': 'https://i.scdn.co/image/ab6761610000f178ea5eda58996c7f08d423d522',
      'width': 160}],
    'name': 'Run The Jewels',
    'popularity': 62,
    'type': 'artist',
    'uri': 'spotify:artist:4RnBFZRiMLRyZy0AzzTg2C'},
   {'external_urls': {'s

In [629]:
robyn_search_results['artists']['items']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/5tObUBbLZkzXjXXR8Dw9Je'},
  'followers': {'href': None, 'total': 0},
  'genres': [],
  'href': 'https://api.spotify.com/v1/artists/5tObUBbLZkzXjXXR8Dw9Je',
  'id': '5tObUBbLZkzXjXXR8Dw9Je',
  'images': [],
  'name': 'Torrey Carter',
  'popularity': 0,
  'type': 'artist',
  'uri': 'spotify:artist:5tObUBbLZkzXjXXR8Dw9Je'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/7JZlwHigpDkuz8XbuuK72j'},
  'followers': {'href': None, 'total': 75},
  'genres': [],
  'href': 'https://api.spotify.com/v1/artists/7JZlwHigpDkuz8XbuuK72j',
  'id': '7JZlwHigpDkuz8XbuuK72j',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/ab67616d0000b27373ec8c6a8758c4ed65a4071c',
    'width': 640},
   {'height': 300,
    'url': 'https://i.scdn.co/image/ab67616d00001e0273ec8c6a8758c4ed65a4071c',
    'width': 300},
   {'height': 64,
    'url': 'https://i.scdn.co/image/ab67616d0000485173ec8c6a8758c4ed65a4071c',
    'width': 64}]

In [51]:
# save all IDs from artist search results
id_list = []
chart_entry = {'artist':'Jewel', 'song':'You Were Meant For Me'}
#for result in artist_search_results['artists']['items']:
for result in robyn_search_results['artists']['items']:
    artist_id = result['id']
    id_list.append(artist_id)
chart_entry['id_list'] = id_list

# for each artist ID, search that artist's top tracks
# https://api.spotify.com/v1/artists/{id}/top-tracks
hit_list = []
for artist in id_list:
    
    artist_hits = requests.get(base_url + 'artists/' + artist + '/top-tracks?market=US', 
                               headers=get_non_track_tokens(1)).json()
    hit_list.append(artist_hits)
    chart_entry['hit_list'] = hit_list


In [52]:
#hit_list[1]['tracks']
for item in hit_list:
    print(item['tracks'])


[{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4RnBFZRiMLRyZy0AzzTg2C'}, 'href': 'https://api.spotify.com/v1/artists/4RnBFZRiMLRyZy0AzzTg2C', 'id': '4RnBFZRiMLRyZy0AzzTg2C', 'name': 'Run The Jewels', 'type': 'artist', 'uri': 'spotify:artist:4RnBFZRiMLRyZy0AzzTg2C'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/57UnSUpae3SbRekxNa5Kgl'}, 'href': 'https://api.spotify.com/v1/artists/57UnSUpae3SbRekxNa5Kgl', 'id': '57UnSUpae3SbRekxNa5Kgl', 'name': 'El-P', 'type': 'artist', 'uri': 'spotify:artist:57UnSUpae3SbRekxNa5Kgl'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/2N4EYkIlG1kv25g6Wv8LGI'}, 'href': 'https://api.spotify.com/v1/artists/2N4EYkIlG1kv25g6Wv8LGI', 'id': '2N4EYkIlG1kv25g6Wv8LGI', 'name': 'Killer Mike', 'type': 'artist', 'uri': 'spotify:artist:2N4EYkIlG1kv25g6Wv8LGI'}], 'external_urls': {'spotify': 'https://open.spotify.com/album/2vY03PfKPFUUM1FA2lgmC2'}, 'href': 'https://api.spotify.

In [54]:
chart_entry['artist_uri'] = 'NA'
chart_entry['song_uri'] = 'NA'

# search each artist's hits for the track of interest
for item in hit_list:
    if item['tracks'] == []:
        continue
    else:
#        for t, track in enumerate(hit_list[0]['tracks']):
        if item['tracks'] != []:
            for track in item['tracks']:
                
                # Spotify track and artist info
                track_artist = track['artists'][0]['name']
                track_artist_uri = track['artists'][0]['uri']
                track_title = track['name']
                track_title_uri = track['uri']
                
                # simplify artist/song strings to search chart instances against spotify instances
                compare_track_artist = track_artist.replace(' ', '').casefold()
                compare_track_title = track_title.replace(' ', '').casefold()
                
                compare_chart_artist = chart_entry['artist'].replace(' ', '').casefold()
                compare_chart_song = chart_entry['song'].replace(' ', '').casefold()
                
            #    # if the artist is found but not the track, save artist info
            #    # note: some songs may not be found because they are not in the artist's top tracks
            #    if compare_chart_artist == compare_track_artist or \
            #    compare_chart_artist in compare_track_artist or \
            #    compare_track_artist in compare_chart_artist and \
            #    compare_chart_song != compare_track_title:
            #        
            #        chart_entry['artist_uri'] = track_artist_uri
                    
                # if the artist and song title are found, save the relevant info
                if (compare_chart_artist == compare_track_artist or \
                compare_chart_artist in compare_track_artist or \
                compare_track_artist in compare_chart_artist) and \
                (compare_chart_song == compare_track_title or \
                compare_chart_song in compare_track_title or \
                compare_track_title in compare_chart_song):
                    
                    chart_entry['artist_uri'] = track_artist_uri
                    chart_entry['song_uri']  = track_title_uri
            
                    # continue to the next iteration once a match is found and saved
                    break
    

In [55]:
compare_chart_artist

'jewel'

In [56]:
chart_entry['song_uri']

'spotify:track:7y7aSSTiQfO0Ace4F7yxzE'

In [57]:
# empty lists to hold desired audio feature scores
acousticness = []
duration = []
danceability = []
energy = []
instrumentalness = []
key = []
liveness = []
loudness = []
mode = []
speechiness = []
tempo = []
time_signature = []
valence = []

# make the API call for track audio features
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

# save features for all artists with a URI
if chart_entry['artist_uri'] != 'NA':
    song_id = re.search('(?:spotify:track:)(\w+)', chart_entry['song_uri']).group(1)
    chart_entry['song_id'] = song_id
    # API call
    audio = requests.get(base_url + 'audio-features/' + song_id, headers=get_non_track_tokens(2)).json()

    # save audio features to their respective lists
    chart_entry['acousticness'] = audio['acousticness']
    chart_entry['duration'] = audio['duration_ms']
    chart_entry['danceability'] = audio['danceability']
    chart_entry['energy'] = audio['energy']
    chart_entry['instrumentalness'] = audio['instrumentalness']
    chart_entry['key'] = audio['key']
    chart_entry['liveness'] = audio['liveness']
    chart_entry['loudness'] = audio['loudness']
    chart_entry['mode'] = audio['mode']
    chart_entry['speechiness'] = audio['speechiness']
    chart_entry['tempo'] = audio['tempo']
    chart_entry['time_signature'] = audio['time_signature']
    chart_entry['valence'] = audio['valence']
    
else:
    chart_entry['acousticness'] = np.nan
    chart_entry['duration'] = np.nan
    chart_entry['danceability'] = np.nan
    chart_entry['energy'] = np.nan
    chart_entry['instrumentalness'] = np.nan
    chart_entry['key'] = np.nan
    chart_entry['liveness'] = np.nan
    chart_entry['loudness'] = np.nan
    chart_entry['mode'] = np.nan
    chart_entry['speechiness'] = np.nan
    chart_entry['tempo'] = np.nan
    chart_entry['time_signature'] = np.nan
    chart_entry['valence'] = np.nan


In [58]:
chart_entry

{'artist': 'Jewel',
 'song': 'You Were Meant For Me',
 'id_list': ['4RnBFZRiMLRyZy0AzzTg2C',
  '6FbDoZnMBTdhhhLuJBOOqP',
  '4TBsKdA7190eaHM5E2HQI9',
  '17Cit7wNuKM6q67MwLH6gD',
  '7caMdiwqwg1WefL7Jjm23M'],
 'hit_list': [{'tracks': [{'album': {'album_type': 'album',
      'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4RnBFZRiMLRyZy0AzzTg2C'},
        'href': 'https://api.spotify.com/v1/artists/4RnBFZRiMLRyZy0AzzTg2C',
        'id': '4RnBFZRiMLRyZy0AzzTg2C',
        'name': 'Run The Jewels',
        'type': 'artist',
        'uri': 'spotify:artist:4RnBFZRiMLRyZy0AzzTg2C'},
       {'external_urls': {'spotify': 'https://open.spotify.com/artist/57UnSUpae3SbRekxNa5Kgl'},
        'href': 'https://api.spotify.com/v1/artists/57UnSUpae3SbRekxNa5Kgl',
        'id': '57UnSUpae3SbRekxNa5Kgl',
        'name': 'El-P',
        'type': 'artist',
        'uri': 'spotify:artist:57UnSUpae3SbRekxNa5Kgl'},
       {'external_urls': {'spotify': 'https://open.spotify.com/artist/2N4

# WORKING VERSION

In [32]:
def top40_dict(dfin):
    
    start = timeit.default_timer()
    
    # change input dataframe to a dictionary
    top40_dict = dfin.to_dict(orient='index') 
    
    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'

    # empty lists to store all genres, artist IDs in the dataset
    all_genres = []
    all_artist_genres = []
    artist_ids = []

    failed_charts = []
    
    for c, chart_entry in enumerate(top40_dict.items()):
        
        try:
            
            print(f'Now working on: {chart_entry[1]["artist"]} - {chart_entry[1]["song"]}')
            ################################
            # search artists
            ################################
    
            # use an 'item' search for artist and track
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/search
            query_artist = re.sub('\s', '%20', chart_entry[1]['artist'])
            query = f'search?q=artist:{query_artist}&type=artist&limit=5'
            artist_search_results = requests.get(base_url + query, headers=get_non_track_tokens(0)).json()
            
            
            ################################
            # search artists' top tracks
            ################################
    
            # save all IDs from artist search results
            id_list = []
            #for result in artist_search_results['artists']['items']:
            for result in artist_search_results['artists']['items']:
                artist_id = result['id']
                id_list.append(artist_id)
            chart_entry[1]['id_list'] = id_list
            
            # for each artist ID, search that artist's top tracks
            # https://api.spotify.com/v1/artists/{id}/top-tracks
            hit_list = []
            for a, artist in enumerate(id_list):
                
                artist_hits = requests.get(base_url + 'artists/' + artist + '/top-tracks?market=US', 
                                           headers=get_track_tokens(a)).json()
                hit_list.append(artist_hits)
                chart_entry[1]['hit_list'] = hit_list
    
    
            ################################
            # save artist and track URIs
            ################################
    
            # search each artist's hits for the track of interest
    #        for item in hit_list:
    #            if item['tracks'] == []:
    #                continue
    #                
    #            else:
    #                if item['tracks'] != []:
    #        
    #                    try:
    #                        for track in item['tracks']:
    #            #            for track in hit_list[0]['tracks']:
    #                            
    #                            chart_entry[1]['artist_uri'] = 'NA'
    #                            chart_entry[1]['song_uri'] = 'NA'
    #                
    #                
    #                            # Spotify track and artist info
    #                            track_artist = track['artists'][0]['name']
    #                            track_artist_uri = track['artists'][0]['uri']
    #                            track_title = track['name']
    #                            track_title_uri = track['uri']
    #                            
    #                            # simplify artist/song strings to search chart instances against spotify instances
    #                            compare_track_artist = track_artist.replace(' ', '').casefold()
    #                            compare_track_title = track_title.replace(' ', '').casefold()
    #                            
    #                            compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
    #                            compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
    #                            
    #                            # if the artist is found but not the track, save artist info
    #                            # note: some songs may not be found because they are not in the artist's top tracks
    #                            if (compare_chart_artist == compare_track_artist or \
    #                            compare_chart_artist in compare_track_artist or \
    #                            compare_track_artist in compare_chart_artist) and \
    #                            (compare_chart_song != compare_track_title):
    #                                
    #                                chart_entry[1]['artist_uri'] = track_artist_uri
    #                
    #                            # if the artist and song title are found, save the relevant info
    #                            if (compare_chart_artist == compare_track_artist or \
    #                            compare_chart_artist in compare_track_artist or \
    #                            compare_track_artist in compare_chart_artist) and \
    #                            (compare_chart_song == compare_track_title or \
    #                            compare_chart_song in compare_track_title or \
    #                            compare_track_title in compare_chart_song):
    #                                
    #                                chart_entry[1]['artist_uri'] = track_artist_uri
    #                                chart_entry[1]['song_uri']  = track_title_uri
    #                
    #                                # continue to the next iteration once a match is found and saved
    #                                break
    #                    
    #                    except IndexError:
    #                        chart_entry[1]['artist_uri'] = 'NA'
    #                        chart_entry[1]['song_uri'] = 'NA'
    
            for track in hit_list[0]['tracks']:
                
                chart_entry[1]['artist_uri'] = 'NA'
                chart_entry[1]['song_uri'] = 'NA'
    
                # Spotify track and artist info
                track_artist = track['artists'][0]['name']
                track_artist_uri = track['artists'][0]['uri']
                track_title = track['name']
                track_title_uri = track['uri']
                
                # simplify artist/song strings to search chart instances against spotify instances
                compare_track_artist = track_artist.replace(' ', '').casefold()
                compare_track_title = track_title.replace(' ', '').casefold()
                
                compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
                compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
                
#                # if the artist is found but not the track, save artist info
#                # note: some songs may not be found because they are not in the artist's top tracks
#                if (compare_chart_artist == compare_track_artist or \
#                compare_chart_artist in compare_track_artist or \
#                compare_track_artist in compare_chart_artist) and \
#                (compare_chart_song != compare_track_title):
#                    
#                    chart_entry[1]['artist_uri'] = track_artist_uri
    
                # if the artist and song title are found, save the relevant info
                if (compare_chart_artist == compare_track_artist or \
                compare_chart_artist in compare_track_artist or \
                compare_track_artist in compare_chart_artist) and \
                (compare_chart_song == compare_track_title or \
                compare_chart_song in compare_track_title or \
                compare_track_title in compare_chart_song):
                    
                    chart_entry[1]['artist_uri'] = track_artist_uri
                    chart_entry[1]['song_uri']  = track_title_uri
    
                    # continue to the next iteration once a match is found and saved
                    break
            
            # if a song uri hasn't been found, send track info to the failures list, continue to the next iteration
            if chart_entry[1]['song_uri'] == 'NA':
                failed_charts.append([c, chart_entry[1]['artist'], chart_entry[1]['song']])
                continue
                
            ##################################
            # get audio features for all songs
            ##################################
            
            # empty lists to hold desired audio feature scores
            acousticness = []
            duration = []
            danceability = []
            energy = []
            instrumentalness = []
            key = []
            liveness = []
            loudness = []
            mode = []
            speechiness = []
            tempo = []
            time_signature = []
            valence = []
        
            # make the API call for track audio features
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
            
            # save features for all artists with a URI
            if chart_entry[1]['song_uri'] != 'NA':
                song_id = re.search('(?:spotify:track:)(\w+)', chart_entry[1]['song_uri']).group(1)
                chart_entry[1]['song_id'] = song_id
                # API call
                audio = requests.get(base_url + 'audio-features/' + song_id, headers=get_non_track_tokens(1)).json()
        
                # save audio features to their respective lists
                chart_entry[1]['acousticness'] = audio['acousticness']
                chart_entry[1]['duration'] = audio['duration_ms']
                chart_entry[1]['danceability'] = audio['danceability']
                chart_entry[1]['energy'] = audio['energy']
                chart_entry[1]['instrumentalness'] = audio['instrumentalness']
                chart_entry[1]['key'] = audio['key']
                chart_entry[1]['liveness'] = audio['liveness']
                chart_entry[1]['loudness'] = audio['loudness']
                chart_entry[1]['mode'] = audio['mode']
                chart_entry[1]['speechiness'] = audio['speechiness']
                chart_entry[1]['tempo'] = audio['tempo']
                chart_entry[1]['time_signature'] = audio['time_signature']
                chart_entry[1]['valence'] = audio['valence']
                
            else:
                song_id = 'NA'
                chart_entry[1]['song_id'] = 'NA'
                chart_entry[1]['acousticness'] = np.nan
                chart_entry[1]['duration'] = np.nan
                chart_entry[1]['danceability'] = np.nan
                chart_entry[1]['energy'] = np.nan
                chart_entry[1]['instrumentalness'] = np.nan
                chart_entry[1]['key'] = np.nan
                chart_entry[1]['liveness'] = np.nan
                chart_entry[1]['loudness'] = np.nan
                chart_entry[1]['mode'] = np.nan
                chart_entry[1]['speechiness'] = np.nan
                chart_entry[1]['tempo'] = np.nan
                chart_entry[1]['time_signature'] = np.nan
                chart_entry[1]['valence'] = np.nan
    
#            print(f'artist URI: {chart_entry[1]["artist_uri"]}')
#            print(f'artist ID: {artist_id}')
#            print(f'song URI: {chart_entry[1]["song_uri"]}')
#            print(f'song ID: {chart_entry[1]["song_id"]}')
    
            ########################
            # get artist genre data
            ########################
            
            # get artist IDs to use in search
#            if chart_entry[1]['artist_uri'] != 'NA':
                
            artist_id = re.search('(?:spotify:artist:)(\w+)', chart_entry[1]['artist_uri']).group(1)
            chart_entry[1]['artist_id'] = artist_id
            artist_ids.append(artist_id)
#            else:
#                artist_id = 'NA'
                
            # save artist-song combinations to use in dataframe construction
            if artist_id != 'NA':
                chart_entry[1]['combo'] = artist_id + song_id
            else:
                chart_entry[1]['combo'] = compare_chart_artist + compare_chart_song
            # get the artist's genre(s)
            # Spotify 'artist' search
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
            
    #        try:
            if artist_id != 'NA':
                # API call
                get_artist = requests.get(base_url + 'artists/' + artist_id, headers=get_non_track_tokens(2)).json()
                artist_genres = get_artist['genres']
                # format genre strings
                artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
                chart_entry[1]['artist_genres'] = artist_genres
    
            # for artists with 'NA' URIs
            else:
                artist_genres = ['no_genre_info']        
            
        
    #        # empty dictionary to hold all genres associated with each artist
    #        artist_genre_dict = {}
    #        artist_genre_dict[chart_entry[1]['artist_id']] = artist_genres
    #        all_artist_genres.append(artist_genre_dict)
            
            # save new genres to the list of all genres
            for genre in artist_genres:
                if genre not in all_genres:
                    all_genres.append(genre)
            
    
            # log progress
            clear_output()
            checkpoint = timeit.default_timer()
            print(f'Saved data for song {c+1} of {len(dfin)} ({(c+1)/len(dfin)*100:.2f}% complete)')
            print(f'{(checkpoint-start)/60:.2f} minutes elapsed')
    
        # for chart items that don't work, skip and save info to a list
        except:
            failed_charts.append([c, chart_entry[1]['artist'], chart_entry[1]['song']])
        
    
    print('Formatting data...')    

    ################################
    # output all data to a dataframe
    ################################

    all_genres_dict = {}

    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(dfin)
    
    # create a dataframe chart data
    chart_df = pd.DataFrame.from_dict(top40_dict, orient='index')
    chart_df.set_index('combo', inplace=True, drop=False)

    # create a dataframe with genre data, sort df columns
    genre_df = pd.DataFrame.from_dict(all_genres_dict, orient='columns')
    genre_df.columns = sorted(genre_df.columns.tolist())
    
    genre_df['combo'] = chart_df.index
    genre_df.set_index('combo', inplace=True, drop=False)
    
    # assign 1's to each artist's genre                
#    try:
    for column in genre_df:
        
        for row, combo in enumerate(genre_df.index):
            
            try:
                for genre in chart_df['artist_genres'][row]:
                
                    if genre == column:
                        genre_df.loc[combo, column] = 1
            
            except TypeError:
                genre_df.loc[combo, column] = np.nan

#    except KeyError:
#        pass
#    # assign 1's to each artist's genre                
#    if artist_id != 'NA':
#        
#        for column in genre_df:
#            print(column)
#            for row, combo in enumerate(genre_df.index):
#                print(combo)
#                for genre in chart_df['artist_genres'][row]:
#                    print(genre)                    
#                    if genre == column:
#                        genre_df.loc[combo, column] = 1
#    
#    # but if the artist has no saved genres, assign NA's
#    else:  
#        for cell in genre_df.loc[chart_entry[1]['combo']]:
#            cell = 'NA'
        
    # make a copy of the input dataframe
    trimmed_df = chart_df.copy()
    
    # drop unwanted columns
    trimmed_df.drop(columns = ['id_list', 'hit_list', 'artist_genres', 'artist_uri', 'song_uri'], inplace=True)
    
    # reorder columns for aesthetics
    trimmed_df = trimmed_df[['week', 'position', 'artist', 'song', 'artist_id', 'song_id', 'combo',
                           'acousticness', 'danceability', 'duration', 'energy', 'instrumentalness',
                           'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                           'time_signature', 'valence']]
    

    # merge all data into a final dataframe
    output_df = trimmed_df.merge(genre_df, how="inner", left_index=True, right_index=True)
    output_df.drop(columns=['combo_x', 'combo_y'], inplace=True)
    pd.to_datetime(output_df['week'])
    output_df.sort_values(by=['week','position'], inplace=True)
    output_df.reset_index(inplace=True)
    
    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')

    # return the dataframe
    return output_df, failed_charts;


In [33]:
try40, try40_failures = top40_dict(top40_df.iloc[10:15])

Saved data for song 5 of 5 (100.00% complete)
0.20 minutes elapsed
Formatting data...
Data saved
Total runtime: 0.20 minutes
Done!


In [34]:
try40

Unnamed: 0,combo,week,position,artist,song,artist_id,song_id,acousticness,danceability,duration,...,hip_pop,mellow_gold,neo_mellow,neo_soul,pop_r&b,pop_rock,post-grunge,r&b,rock,urban_contemporary
0,6TcnmlCSxihzWOQJ8k0rNS42et6fnHCw1HIPSrdPprMl,7/7/97,11,Third Eye Blind,Semi-charmed Life,6TcnmlCSxihzWOQJ8k0rNS,42et6fnHCw1HIPSrdPprMl,0.00832,0.64,268360.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,6nzxy2wXs6tLgzEtqOkEi22b29qJRy4asbQNFjPO96XC,7/7/97,12,Monica,For You I Will,6nzxy2wXs6tLgzEtqOkEi2,2b29qJRy4asbQNFjPO96XC,0.243,0.652,294867.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,1MPcILKoMCJym9KscdYxuM4ACxa9buEUnOdYEoPcnMpi,7/7/97,13,White Town,Your Woman,1MPcILKoMCJym9KscdYxuM,4ACxa9buEUnOdYEoPcnMpi,0.033,0.815,259893.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,7/7/97,14,Jewel,You Were Meant For Me,,,,,,...,,,,,,,,,,
4,0jJNGWrpjGIHUdTTJiIYeB0z1b34WikhOH9ZxU8QDWcv,7/7/97,15,Wallflowers,One Headlight,0jJNGWrpjGIHUdTTJiIYeB,0z1b34WikhOH9ZxU8QDWcv,0.000655,0.701,312627.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [35]:
try40_failures

[[3, 'Jewel', 'You Were Meant For Me']]

In [43]:
try40_sorted = try40.sort_values(['week', 'position'])
try40_sorted

Unnamed: 0,combo,week,position,artist,song,artist_id,song_id,acousticness,danceability,duration,...,trap_latino,trap_queen,tropical,uk_pop,urban_contemporary,viral_rap,vocal_house,warrington_indie,west_coast_rap,world
0,01WjpKiWVNurV5hjIadB8C6q4aoWgTQ8td2AvqQXuFqm,1/10/00,28,Filter,Take A Picture,01WjpKiWVNurV5hjIadB8C,6q4aoWgTQ8td2AvqQXuFqm,0.00147,0.541,363800.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1324,,1/10/00,39,Snoop Dogg Presents Tha Eastsidaz,G'd Up,,,,,,...,,,,,,,,,,
1325,,1/10/00,39,Snoop Dogg Presents Tha Eastsidaz,G'd Up,,,,,,...,,,,,,,,,,
1326,,1/10/00,39,Snoop Dogg Presents Tha Eastsidaz,G'd Up,,,,,,...,,,,,,,,,,
1327,,1/10/00,39,Snoop Dogg Presents Tha Eastsidaz,G'd Up,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,0SdiiPkr02EUdekHZJkt584auyLwpht5bFKtPpulMsCj,9/8/97,14,Hanson,Where's The Love?,0SdiiPkr02EUdekHZJkt58,4auyLwpht5bFKtPpulMsCj,0.00176,0.585,252360.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0SdiiPkr02EUdekHZJkt584auyLwpht5bFKtPpulMsCj,9/8/97,14,Hanson,Where's The Love?,0SdiiPkr02EUdekHZJkt58,4auyLwpht5bFKtPpulMsCj,0.00176,0.585,252360.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392,4uN3DsfENc7dp0OLO0FEIb3uPfVXcjnpOjyzI3jb3js4,9/8/97,33,Sugar Ray,Fly,4uN3DsfENc7dp0OLO0FEIb,3uPfVXcjnpOjyzI3jb3js4,0.02020,0.835,244640.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,1VPr8y4GGZJBtWyaoLdYUT1DViQw0p1vo0eAMRlUF4Lr,9/8/97,36,Allure,All Cried Out,1VPr8y4GGZJBtWyaoLdYUT,1DViQw0p1vo0eAMRlUF4Lr,0.17600,0.613,275773.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
try40_cleaned = try40.drop_duplicates()
try40_cleaned.dropna(inplace=True)
print(len(try40_cleaned))
try40_cleaned

294


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  try40_cleaned.dropna(inplace=True)


Unnamed: 0,combo,week,position,artist,song,artist_id,song_id,acousticness,danceability,duration,...,trap_latino,trap_queen,tropical,uk_pop,urban_contemporary,viral_rap,vocal_house,warrington_indie,west_coast_rap,world
0,01WjpKiWVNurV5hjIadB8C6q4aoWgTQ8td2AvqQXuFqm,1/10/00,28,Filter,Take A Picture,01WjpKiWVNurV5hjIadB8C,6q4aoWgTQ8td2AvqQXuFqm,0.00147,0.541,363800.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,05RZIdfz59ZW2FvFuwnmNK3fQtdmVB0cAe4Hs9e5bVQb,6/21/99,38,K-Ci,Tell Me It's Real,05RZIdfz59ZW2FvFuwnmNK,3fQtdmVB0cAe4Hs9e5bVQb,0.18900,0.696,278600.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,05RZIdfz59ZW2FvFuwnmNK5GorFaKkP2mLREQvhSblIg,2/16/98,25,K-Ci,All My Life,05RZIdfz59ZW2FvFuwnmNK,5GorFaKkP2mLREQvhSblIg,0.11400,0.629,331240.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,05oH07COxkXKIMt6mIPRee6sHsXIJoEN5JpdkGMQDJxt,6/1/98,15,Brandy,The Boy Is Mine,05oH07COxkXKIMt6mIPRee,6sHsXIJoEN5JpdkGMQDJxt,0.53900,0.704,294787.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,05oH07COxkXKIMt6mIPRee6tBD4yjOf9P8rWwUlXdJFm,11/23/98,28,Brandy,Have You Ever?,05oH07COxkXKIMt6mIPRee,6tBD4yjOf9P8rWwUlXdJFm,0.54200,0.698,273440.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,7urq0VfqxEYEEiZUkebXT43kVIFDE3G89I2RPVkiRaRj,7/7/97,34,112,Cupid,7urq0VfqxEYEEiZUkebXT4,3kVIFDE3G89I2RPVkiRaRj,0.37300,0.685,252267.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
629,7xojBGlLd0SGscMIL31gdV4G3HuXbtn7AwtW4aCIaMxx,1/11/99,39,Voices Of Theory,Wherever You Go,7xojBGlLd0SGscMIL31gdV,4G3HuXbtn7AwtW4aCIaMxx,0.68700,0.527,339760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
630,7xojBGlLd0SGscMIL31gdV5919i3Sp9U2IE5m7hhlgSw,6/15/98,39,Voices Of Theory,Say It,7xojBGlLd0SGscMIL31gdV,5919i3Sp9U2IE5m7hhlgSw,0.67500,0.381,269467.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
631,7y1GT7SdgGiFLWoktv2TSw6znv7i4Wif5fLwI6OjKHZ4,5/6/00,24,BBmak,Back Here,7y1GT7SdgGiFLWoktv2TSw,6znv7i4Wif5fLwI6OjKHZ4,0.06980,0.624,218200.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
try40_cleaned.to_csv('../top40_audio_data_sample.csv', index=False)

# EDITS

In [64]:
def top40_dict(dfin):
    
    start = timeit.default_timer()
    
    # change input dataframe to a dictionary
    top40_dict = dfin.to_dict(orient='index') 
    
    # base URL for API calls
    base_url = 'https://api.spotify.com/v1/'

    # empty lists to store all genres, artist IDs in the dataset
    all_genres = []
    all_artist_genres = []
    artist_ids = []

    failed_charts = []
    
    for c, chart_entry in enumerate(top40_dict.items()):
        
        try:
            
            print(f'Now working on: {chart_entry[1]["artist"]} - {chart_entry[1]["song"]}')
            ################################
            # search artists
            ################################
    
            # use an 'item' search for artist and track
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/search
            query_artist = re.sub('\s', '%20', chart_entry[1]['artist'])
            query = f'search?q=artist:{query_artist}&type=artist&limit=5'
            artist_search_results = requests.get(base_url + query, headers=get_non_track_tokens(0)).json()
            
            
            ################################
            # search artists' top tracks
            ################################
    
            # save all IDs from artist search results
            id_list = []
            #for result in artist_search_results['artists']['items']:
            for result in artist_search_results['artists']['items']:
                artist_id = result['id']
                id_list.append(artist_id)
            chart_entry[1]['id_list'] = id_list
            
            # for each artist ID, search that artist's top tracks
            # https://api.spotify.com/v1/artists/{id}/top-tracks
            hit_list = []
            for a, artist in enumerate(id_list):
                
                artist_hits = requests.get(base_url + 'artists/' + artist + '/top-tracks?market=US', 
                                           headers=get_track_tokens(a)).json()
                hit_list.append(artist_hits)
                chart_entry[1]['hit_list'] = hit_list
    
    
            ################################
            # save artist and track URIs
            ################################
    
            # search each artist's hits for the track of interest
    #        for item in hit_list:
    #            if item['tracks'] == []:
    #                continue
    #                
    #            else:
    #                if item['tracks'] != []:
    #        
    #                    try:
    #                        for track in item['tracks']:
    #            #            for track in hit_list[0]['tracks']:
    #                            
    #                            chart_entry[1]['artist_uri'] = 'NA'
    #                            chart_entry[1]['song_uri'] = 'NA'
    #                
    #                
    #                            # Spotify track and artist info
    #                            track_artist = track['artists'][0]['name']
    #                            track_artist_uri = track['artists'][0]['uri']
    #                            track_title = track['name']
    #                            track_title_uri = track['uri']
    #                            
    #                            # simplify artist/song strings to search chart instances against spotify instances
    #                            compare_track_artist = track_artist.replace(' ', '').casefold()
    #                            compare_track_title = track_title.replace(' ', '').casefold()
    #                            
    #                            compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
    #                            compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
    #                            
    #                            # if the artist is found but not the track, save artist info
    #                            # note: some songs may not be found because they are not in the artist's top tracks
    #                            if (compare_chart_artist == compare_track_artist or \
    #                            compare_chart_artist in compare_track_artist or \
    #                            compare_track_artist in compare_chart_artist) and \
    #                            (compare_chart_song != compare_track_title):
    #                                
    #                                chart_entry[1]['artist_uri'] = track_artist_uri
    #                
    #                            # if the artist and song title are found, save the relevant info
    #                            if (compare_chart_artist == compare_track_artist or \
    #                            compare_chart_artist in compare_track_artist or \
    #                            compare_track_artist in compare_chart_artist) and \
    #                            (compare_chart_song == compare_track_title or \
    #                            compare_chart_song in compare_track_title or \
    #                            compare_track_title in compare_chart_song):
    #                                
    #                                chart_entry[1]['artist_uri'] = track_artist_uri
    #                                chart_entry[1]['song_uri']  = track_title_uri
    #                
    #                                # continue to the next iteration once a match is found and saved
    #                                break
    #                    
    #                    except IndexError:
    #                        chart_entry[1]['artist_uri'] = 'NA'
    #                        chart_entry[1]['song_uri'] = 'NA'
    
            for track in hit_list[0]['tracks']:
                
                chart_entry[1]['artist_uri'] = 'NA'
                chart_entry[1]['song_uri'] = 'NA'
    
                # Spotify track and artist info
                track_artist = track['artists'][0]['name']
                track_artist_uri = track['artists'][0]['uri']
                track_title = track['name']
                track_title_uri = track['uri']
                
                # simplify artist/song strings to search chart instances against spotify instances
                compare_track_artist = track_artist.replace(' ', '').casefold()
                compare_track_title = track_title.replace(' ', '').casefold()
                
                compare_chart_artist = chart_entry[1]['artist'].replace(' ', '').casefold()
                compare_chart_song = chart_entry[1]['song'].replace(' ', '').casefold()
                
#                # if the artist is found but not the track, save artist info
#                # note: some songs may not be found because they are not in the artist's top tracks
#                if (compare_chart_artist == compare_track_artist or \
#                compare_chart_artist in compare_track_artist or \
#                compare_track_artist in compare_chart_artist) and \
#                (compare_chart_song != compare_track_title):
#                    
#                    chart_entry[1]['artist_uri'] = track_artist_uri
    
                # if the artist and song title are found, save the relevant info
                if (compare_chart_artist == compare_track_artist or \
                compare_chart_artist in compare_track_artist or \
                compare_track_artist in compare_chart_artist) and \
                (compare_chart_song == compare_track_title or \
                compare_chart_song in compare_track_title or \
                compare_track_title in compare_chart_song):
                    
                    chart_entry[1]['artist_uri'] = track_artist_uri
                    chart_entry[1]['song_uri']  = track_title_uri
    
                    # continue to the next iteration once a match is found and saved
                    break
            
            # if a song uri hasn't been found, send track info to the failures list, continue to the next iteration
            if chart_entry[1]['song_uri'] == 'NA':
                failed_charts.append([c, chart_entry[1]['artist'], chart_entry[1]['song']])
                continue
                
            ##################################
            # get audio features for all songs
            ##################################
            
            # empty lists to hold desired audio feature scores
            acousticness = []
            duration = []
            danceability = []
            energy = []
            instrumentalness = []
            key = []
            liveness = []
            loudness = []
            mode = []
            speechiness = []
            tempo = []
            time_signature = []
            valence = []
        
            # make the API call for track audio features
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
            
            # save features for all artists with a URI
            if chart_entry[1]['song_uri'] != 'NA':
                song_id = re.search('(?:spotify:track:)(\w+)', chart_entry[1]['song_uri']).group(1)
                chart_entry[1]['song_id'] = song_id
                # API call
                audio = requests.get(base_url + 'audio-features/' + song_id, headers=get_non_track_tokens(1)).json()
        
                # save audio features to their respective lists
                chart_entry[1]['acousticness'] = audio['acousticness']
                chart_entry[1]['duration'] = audio['duration_ms']
                chart_entry[1]['danceability'] = audio['danceability']
                chart_entry[1]['energy'] = audio['energy']
                chart_entry[1]['instrumentalness'] = audio['instrumentalness']
                chart_entry[1]['key'] = audio['key']
                chart_entry[1]['liveness'] = audio['liveness']
                chart_entry[1]['loudness'] = audio['loudness']
                chart_entry[1]['mode'] = audio['mode']
                chart_entry[1]['speechiness'] = audio['speechiness']
                chart_entry[1]['tempo'] = audio['tempo']
                chart_entry[1]['time_signature'] = audio['time_signature']
                chart_entry[1]['valence'] = audio['valence']
                
            else:
                song_id = 'NA'
                chart_entry[1]['song_id'] = 'NA'
                chart_entry[1]['acousticness'] = np.nan
                chart_entry[1]['duration'] = np.nan
                chart_entry[1]['danceability'] = np.nan
                chart_entry[1]['energy'] = np.nan
                chart_entry[1]['instrumentalness'] = np.nan
                chart_entry[1]['key'] = np.nan
                chart_entry[1]['liveness'] = np.nan
                chart_entry[1]['loudness'] = np.nan
                chart_entry[1]['mode'] = np.nan
                chart_entry[1]['speechiness'] = np.nan
                chart_entry[1]['tempo'] = np.nan
                chart_entry[1]['time_signature'] = np.nan
                chart_entry[1]['valence'] = np.nan
    
#            print(f'artist URI: {chart_entry[1]["artist_uri"]}')
#            print(f'artist ID: {artist_id}')
#            print(f'song URI: {chart_entry[1]["song_uri"]}')
#            print(f'song ID: {chart_entry[1]["song_id"]}')
    
            ########################
            # get artist genre data
            ########################
            
            # get artist IDs to use in search
#            if chart_entry[1]['artist_uri'] != 'NA':
                
            artist_id = re.search('(?:spotify:artist:)(\w+)', chart_entry[1]['artist_uri']).group(1)
            chart_entry[1]['artist_id'] = artist_id
            artist_ids.append(artist_id)
#            else:
#                artist_id = 'NA'
                
            # save artist-song combinations to use in dataframe construction
            if artist_id != 'NA':
                chart_entry[1]['combo'] = artist_id + song_id
            else:
                chart_entry[1]['combo'] = compare_chart_artist + compare_chart_song
            # get the artist's genre(s)
            # Spotify 'artist' search
            # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
            
    #        try:
            if artist_id != 'NA':
                # API call
                get_artist = requests.get(base_url + 'artists/' + artist_id, headers=get_non_track_tokens(2)).json()
                artist_genres = get_artist['genres']
                # format genre strings
                artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
                chart_entry[1]['artist_genres'] = artist_genres
    
            # for artists with 'NA' URIs
            else:
                artist_genres = ['no_genre_info']        
            
        
    #        # empty dictionary to hold all genres associated with each artist
    #        artist_genre_dict = {}
    #        artist_genre_dict[chart_entry[1]['artist_id']] = artist_genres
    #        all_artist_genres.append(artist_genre_dict)
            
            # save new genres to the list of all genres
            for genre in artist_genres:
                if genre not in all_genres:
                    all_genres.append(genre)
            
    
            # log progress
            clear_output()
            checkpoint = timeit.default_timer()
            print(f'Saved data for song {c+1} of {len(dfin)} ({(c+1)/len(dfin)*100:.2f}% complete)')
            print(f'{(checkpoint-start)/60:.2f} minutes elapsed')
    
        # for chart items that don't work, skip and save info to a list
        except:
            failed_charts.append([c, chart_entry[1]['artist'], chart_entry[1]['song']])
        
    
    print('Formatting data...')    

    ################################
    # output all data to a dataframe
    ################################

    all_genres_dict = {}

    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(dfin)
    
    # create a dataframe chart data
    chart_df = pd.DataFrame.from_dict(top40_dict, orient='index')
    chart_df.set_index('combo', inplace=True, drop=False)

    # create a dataframe with genre data, sort df columns
    genre_df = pd.DataFrame.from_dict(all_genres_dict, orient='columns')
    genre_df.columns = sorted(genre_df.columns.tolist())
    
    genre_df['combo'] = chart_df.index
    genre_df.set_index('combo', inplace=True, drop=False)
    
    # assign 1's to each artist's genre                
#    try:
    for column in genre_df:
        
        for row, combo in enumerate(genre_df.index):
            
            try:
                for genre in chart_df['artist_genres'][row]:
                
                    if genre == column:
                        genre_df.loc[combo, column] = 1
            
            except TypeError:
                genre_df.loc[combo, column] = np.nan

#    except KeyError:
#        pass
#    # assign 1's to each artist's genre                
#    if artist_id != 'NA':
#        
#        for column in genre_df:
#            print(column)
#            for row, combo in enumerate(genre_df.index):
#                print(combo)
#                for genre in chart_df['artist_genres'][row]:
#                    print(genre)                    
#                    if genre == column:
#                        genre_df.loc[combo, column] = 1
#    
#    # but if the artist has no saved genres, assign NA's
#    else:  
#        for cell in genre_df.loc[chart_entry[1]['combo']]:
#            cell = 'NA'
        
    # make a copy of the input dataframe
    trimmed_df = chart_df.copy()
    
    # drop unwanted columns
    trimmed_df.drop(columns = ['id_list', 'hit_list', 'artist_genres', 'artist_uri', 'song_uri'], inplace=True)
    
    # reorder columns for aesthetics
    trimmed_df = trimmed_df[['week', 'position', 'artist', 'song', 'artist_id', 'song_id', 'combo',
                           'acousticness', 'danceability', 'duration', 'energy', 'instrumentalness',
                           'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                           'time_signature', 'valence']]
    

    # merge all data into a final dataframe
    output_df = trimmed_df.merge(genre_df, how="inner", left_index=True, right_index=True)
    output_df.drop(columns=['combo_x', 'combo_y'], inplace=True)
    pd.to_datetime(output_df['week'])
    output_df.sort_values(by=['week','position'], inplace=True)
    output_df.dropna(inplace=True)
    output_df.reset_index(inplace=True)
    
    # log progress
    checkpoint = timeit.default_timer()
    print('Data saved')
    print(f'Total runtime: {(checkpoint-start)/60:.2f} minutes')
    print('Done!')

    # return the dataframe
    return output_df, failed_charts;


In [68]:
try40, try40_failed = top40_dict(top40_df)

Saved data for song 4400 of 4400 (100.00% complete)
173.35 minutes elapsed
Formatting data...
Data saved
Total runtime: 187.56 minutes
Done!


In [69]:
try40.to_csv

Unnamed: 0,combo,week,position,artist,song,artist_id,song_id,acousticness,danceability,duration,...,virginia_hip_hop,vocal_house,vocal_jazz,washington_indie,west_coast_rap,world,worship,yacht_rock,yakut_pop,yodeling
0,3BmGtnKgCSGYIUhmivXKWX3xrn9i8zhNZsTtcoWgQEAd,1/1/05,32,Kelly Clarkson,Since U Been Gone,3BmGtnKgCSGYIUhmivXKWX,3xrn9i8zhNZsTtcoWgQEAd,0.00165,0.662,188960.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55Aa2cqylxrFIXC767Z8655Qy6a5KzM4XlRxsNcGYhgH,1/1/11,8,Lil Wayne,6 Foot 7 Foot,55Aa2cqylxrFIXC767Z865,5Qy6a5KzM4XlRxsNcGYhgH,0.00070,0.364,248587.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,698hF4vcwHwPy8ltmXermq45sDIKapDyxPl307QpEAwl,1/1/11,37,Far East Movement,Rocketeer,698hF4vcwHwPy8ltmXermq,45sDIKapDyxPl307QpEAwl,0.18100,0.664,211253.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,63wjoROpeh5f11Qm93UiJ12Adn2LNgkHMH5TelQVAu4n,1/1/11,40,Keri Hilson,Pretty Girl Rock,63wjoROpeh5f11Qm93UiJ1,2Adn2LNgkHMH5TelQVAu4n,0.20300,0.666,243920.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3hcs9uc56yIGFCSy9leWe77qTaDOcld0VmBWXnkbUj45,1/1/22,10,Lil Durk,Broadway Girls,3hcs9uc56yIGFCSy9leWe7,7qTaDOcld0VmBWXnkbUj45,0.04170,0.659,185600.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,4WC2Edj7Ruq94JH2jkC5hB18jS43nfrzTvN68Q9tkjDh,9/9/00,37,Profyle,Liar,4WC2Edj7Ruq94JH2jkC5hB,18jS43nfrzTvN68Q9tkjDh,0.22600,0.694,297893.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2293,31TPClRtHm23RisEBtV3X70O45fw2L5vsWpdsOdXwNAR,9/9/06,1,Justin Timberlake,Sexy Back,31TPClRtHm23RisEBtV3X7,0O45fw2L5vsWpdsOdXwNAR,0.05840,0.967,242733.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2294,0zOcE3mg9nS6l3yxt1Y0bK5fVZC9GiM4e8vu99W0Xf6J,9/9/06,25,Fray,How To Save A Life,0zOcE3mg9nS6l3yxt1Y0bK,5fVZC9GiM4e8vu99W0Xf6J,0.26900,0.640,262533.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2295,13y7CgLHjMVRMDqxdx0Xdo1XRgIKC5TPwo7nWGyKqgG0,9/9/17,31,Gucci Mane,I Get The Bag,13y7CgLHjMVRMDqxdx0Xdo,1XRgIKC5TPwo7nWGyKqgG0,0.02320,0.890,233087.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
try40_failed

[[13, 'Jewel', 'You Were Meant For Me'],
 [23, 'Sheryl Crow', 'A Change Would Do You Good'],
 [25, 'Bone Thugs-N-Harmony', 'Look Into My Eyes'],
 [26, 'Rome', 'I Belong To You'],
 [28, 'Tim McGraw', "It's Your Love"],
 [30, 'Gina G', 'Gimme Some Love'],
 [32, 'R. Kelly', 'Gotham City'],
 [34, '3rd Party', 'Can U Feel It'],
 [36, 'Counting Crow', 'Daylight Fading'],
 [38, 'Babyface', 'How Come, How Long'],
 [42, 'Will Smith', 'Men In Black'],
 [43, 'Savage Garden', 'To The Moon And Back'],
 [46, 'Notorious B.I.G.', 'Mo Money,mo Problems'],
 [50, 'Dru Hill', 'Never Make A Promise'],
 [52, 'SWV', 'Someone'],
 [53, '98 Degrees', 'Invisible Man'],
 [54, 'Matchbox 20', 'Push'],
 [55, 'Mr.President', 'Coco Jamboo'],
 [56, 'Jewel', 'Foolish Games'],
 [57, 'Magoo', 'Up Jumps Da Boogie'],
 [59, 'Usher', 'You Make Me Wanna...'],
 [60, 'Mariah Carey', 'Honey'],
 [69, 'Brian McKnight', "You Should Be Mine (Don't Waste Your Time)"],
 [72, 'Boyz II Men', '4 Seasons Of Loneliness'],
 [73, 'Peach Union