Use this notebook to extract audio features and genres associated with individual Spotify tracks
- adapted from 'Pulling_Spotify_Data.ipynb' to incorporate most commands into a function

In [21]:
# dependencies
import requests
import json
import pandas as pd
import timeit
from IPython.display import clear_output
import os

# import spotipy
#!pip install spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [28]:
# Spotify API credentials
from config import spotify_client_ID as sp_client, spotify_client_secret as sp_secret

In [30]:
sp_client = 'e96e5bede18d4700864101c59f3b8e68'
sp_secret = '1d831c508b7f4f4888290a38343bcebe'

In [33]:
# set up client credentials 
# https://spotipy.readthedocs.io/en/master/?highlight=spotifyclientcredentials#client-credentials-flow
# API info: https://developer.spotify.com/documentation/web-api/reference/#/

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(sp_client, sp_secret),
                    request_timeout=5,
                    retries=5)

Exception ignored in: <function Spotify.__del__ at 0x7fca841b6b80>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/mlenv/lib/python3.9/site-packages/spotipy/client.py", line 188, in __del__
    if isinstance(self._session, requests.Session):
AttributeError: 'Spotify' object has no attribute '_session'


TypeError: __init__() got an unexpected keyword argument 'request_timeout'

In [None]:
# create a test dataset: 1000 tracks from 2018
# using Spotify 'track' search:
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-track

def get_audio_features():

    # start timer
    start = timeit.default_timer()

    ################################
    # get a sample of spotify tracks
    ################################

    # empty lists to hold API query results
    artist_name = [] # artist
    track_name = []  # song title
    track_id = []    # Spotify ID for the track
    popularity = []  # current popularity of the track (0-100)

    # 1000 API track queries
    for i in range(0,1000,50):
        track_results = sp.search(q='year:2018', type='track', limit=50,offset=i)
        for i, t in enumerate(track_results['tracks']['items']):
            artist_name.append(t['artists'][0]['name'])
            track_name.append(t['name'])
            track_id.append(t['id'])
            popularity.append(t['popularity'])
        
    # save API query results to a dataframe
    track_df = pd.DataFrame({
        'artist_name' : artist_name, 
        'track_name' : track_name, 
        'track_id' : track_id, 
        'popularity' : popularity
        })

    # log progress    
    checkpoint_1 = timeit.default_timer()
    print(f'Added {len(track_df)} tracks to the dataset')
    print(f'{(checkpoint_1-start):.2f} seconds elapsed')
    print(f'Now getting artist URIs...')

    
    ###################################################
    # get all artist URIs to search for genres by artist
    ###################################################
     
    # Spotify 'item' search
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/search
    # https://spotipy.readthedocs.io/en/master/

    search_all_artists = []
    printcounter = 0
    
    for a, artist in enumerate(track_df['artist_name']):
        
        attempts = 0
        while attempts < 20: # some tracks don't have relevant info, so try another track up to 20x
            try:
                artist_search = sp.search(q='artist:' + artist, type='track', limit=1, offset=a)
                artist_uri = artist_search['tracks']['items'][0]['artists'][0]['uri']  
            except:
                attempts += 1
        
        search_all_artists.append(artist_uri)
        
        printcounter += 1
        if (printcounter == 50):
            printcounter = 0
            # log progress
            clear_output()
            checkpoint_3 = timeit.default_timer()
            print(f'Added {len(search_all_artists)}/1000 artist URIs to the dataset')
            print(f'{(checkpoint_3-start)/60:.2f} minutes elapsed')

    # add artist URIs to the dataframe
    track_df['artist_uri'] = search_all_artists

    # log progress
    print('Now getting genre data...')

    
    ########################
    # get artist genre data
    ########################

    # empty list of genres
    all_genres = []

    # empty dictionary to hold all genres associated with each artist
    artist_genre_dict = {}

    # get genres associated with all artists    
    artist_URIs = search_all_artists
    
    printcounter = 0
    
    for artist in artist_URIs:

        # get the artist's genre(s)
        # Spotify 'artist' search
        # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
        
        try:
            artist_genres = sp.artist(artist)['genres']
            # format genre strings
            artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
        
        # for artists with 'NA' URIs
        except:
            artist_genres = []

 
        artist_genre_dict[artist] = artist_genres

        # save new genres to the list of all genres
        for genre in artist_genres:
            if genre not in all_genres:
                all_genres.append(genre)
        
        printcounter += 1
        if (printcounter == 50):
            # log progress
            clear_output()
            checkpoint_4 = timeit.default_timer()
            print(f'Found genres for {printcounter}/1000 artists')
            print(f'{(checkpoint_4-start)/60:.2f} minutes elapsed')
            printcounter = 0

    # fill the new genre dataframe
    all_genres_dict = {}
    for genre in all_genres:
        all_genres_dict[genre] = [0] * len(track_df)

    # create dataframe with genre data, sort df columns
    genre_df = pd.DataFrame(all_genres_dict)
    genre_df.columns = sorted(genre_df.columns.tolist())
    genre_df['artist_uri'] = artist_URIs
    genre_df.index = artist_URIs

    # assign 1's to each artist's genre                
    for column in genre_df:

        for artist in genre_df.index:

            for genre in artist_genre_dict[artist]:

                if genre == column:

                    genre_df.loc[artist, column] = 1

    # concatenate all data into one dataframe
    full_df = track_df.merge(genre_df, how="inner", on="artist_uri")
    full_df.drop_duplicates(inplace=True)

    # log progress
    checkpoint_5 = timeit.default_timer()
    print(f'Found {len(all_genres)} genres')
    print(f'{(checkpoint_5-start)/60:.2f} minutes elapsed')
    print(f'Dropping {len(all_genres)-50} least common genres from the dataset...')
    
    
    #######################################
    # drop uncommon genres from the dataset
    #######################################

    # get the sum of each genre column
    genre_counts = full_df.sum(axis=0)[5:]
    genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])

    # get genres to drop from the dataset
    # keeping only the top 50 most common genres
    drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index

    # remove low-count genres from the dataset
    genres_df = full_df.drop(columns=drop_genres)


    ####################################
    # get audio features for all tracks
    ####################################

    # empty lists to hold desired audio feature scores
    acousticness = []
    danceability = []
    energy = []
    instrumentalness = []
    key = []
    liveness = []
    loudness = []
    mode = []
    speechiness = []
    tempo = []
    time_signature = []
    valence = []

    track_features = []

    # make the API call for track audio features
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
    printcounter = 0
    for track in genres_df['track_id']:

        track_features.append(sp.audio_features(track))
        
        printcounter += 1
        if (printcounter == 50):
            printcounter = 0
            # log progress
            clear_output()
            checkpoint_6 = timeit.default_timer()
            print(f'Found audio features for {len(track_features)} tracks')
            print(f'{(checkpoint_6-start)/60:.2f} minutes elapsed')

    # log progress
    print(f'Saving data...')

    # save each feature score to the appropriate list
    for track in track_features:

        acousticness.append(track[0]['acousticness'])
        danceability.append(track[0]['danceability'])
        energy.append(track[0]['energy'])
        instrumentalness.append(track[0]['instrumentalness'])
        key.append(track[0]['key'])
        liveness.append(track[0]['liveness'])
        loudness.append(track[0]['loudness'])
        mode.append(track[0]['mode'])
        speechiness.append(track[0]['speechiness'])
        tempo.append(track[0]['tempo'])
        time_signature.append(track[0]['time_signature'])
        valence.append(track[0]['valence'])
        
    # save feature data to the final dataframe
    audio_df = genres_df
    audio_df['acousticness'] = acousticness
    audio_df['danceability'] = danceability
    audio_df['energy'] = energy
    audio_df['instrumentalness'] = instrumentalness
    audio_df['key'] = key
    audio_df['liveness'] = liveness
    audio_df['loudness'] = loudness
    audio_df['mode'] = mode
    audio_df['speechiness'] = speechiness
    audio_df['tempo'] = tempo
    audio_df['time_signature'] = time_signature
    audio_df['valence'] = valence

    # log progress
    checkpoint_7 = timeit.default_timer()
    print(f'\n\n')
    print(f'Found audio features for {len(genres_df)} tracks')
    print(f'Total runtime: {(checkpoint_7-start)/60:.2f} minutes')

    return audio_df

In [None]:
# run the function to get an audio features dataset
audio_df = get_audio_features()
audio_df.sample(10)

Added 1000 tracks to the dataset
14.79 seconds elapsed
Now getting artist URIs...


# code blocks

In [16]:
top40_df = pd.read_csv("../00_data/top40_1997_2022_raw.csv")
top40_df.columns

# store columns as lists
week = top40_df['week']
position = top40_df['position']
song = top40_df['song']
artist = top40_df['artist']

# practice list of 10 artists
a10 = artist[0:7].tolist() + artist[8:11].tolist()
a10

['Hanson',
 'Meredith Brooks',
 'Spice Girls',
 'Mark Morrison',
 'Shawn Colvin',
 'Robyn',
 'Verve Pipe',
 'Savage Garden',
 'Backstreet Boys',
 'Third Eye Blind']

In [47]:
search_all_artists = []
for artist in a10:
    artist_search = sp.search(q='artist:' + artist, type='track', limit=1)
    artist_uri = artist_search['tracks']['items'][0]['artists'][0]['uri']
    search_all_artists.append(artist_uri)

search_all_artists

['spotify:artist:0SdiiPkr02EUdekHZJkt58',
 'spotify:artist:2QmLFuIDtNDmmJY3OtvinN',
 'spotify:artist:0uq5PttqEjj3IH1bzwcrXF',
 'spotify:artist:6V3F8MZrOKdT9fU686ybE9',
 'spotify:artist:0K7VN4aHxHcEb7PqkfoIVA',
 'spotify:artist:6UE7nl9mha6s8z0wFQFIZ2',
 'spotify:artist:242iqFnwNhlidVBMI9GYKp',
 'spotify:artist:3NRFinRTEqUCfaTTZmk8ek',
 'spotify:artist:5rSXSAkZ67PYJSvpUpkOr7',
 'spotify:artist:6TcnmlCSxihzWOQJ8k0rNS']

In [37]:
sp.track('5MwynWK9s4hlyKHqhkNn4A')

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4oUHIQIBe0LHzYfvXNW4QM'},
    'href': 'https://api.spotify.com/v1/artists/4oUHIQIBe0LHzYfvXNW4QM',
    'id': '4oUHIQIBe0LHzYfvXNW4QM',
    'name': 'Morgan Wallen',
    'type': 'artist',
    'uri': 'spotify:artist:4oUHIQIBe0LHzYfvXNW4QM'}],
  'available_markets': ['AD',
   'AE',
   'AG',
   'AL',
   'AM',
   'AO',
   'AR',
   'AT',
   'AU',
   'AZ',
   'BA',
   'BB',
   'BD',
   'BE',
   'BF',
   'BG',
   'BH',
   'BI',
   'BJ',
   'BN',
   'BO',
   'BR',
   'BS',
   'BT',
   'BW',
   'BY',
   'BZ',
   'CA',
   'CD',
   'CG',
   'CH',
   'CI',
   'CL',
   'CM',
   'CO',
   'CR',
   'CV',
   'CW',
   'CY',
   'CZ',
   'DE',
   'DJ',
   'DK',
   'DM',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FJ',
   'FM',
   'FR',
   'GA',
   'GB',
   'GD',
   'GE',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GR',
   'GT',
   'GW',
   'GY',
   'HK',
   'HN',
   'HR',
   'HT'

In [35]:
items = results['artists']['items']
if len(items) > 0:
    artist = items[0]
    print(artist['name'])

NameError: name 'results' is not defined

In [4]:
# start timer
start = timeit.default_timer()

################################
# get a sample of spotify tracks
################################

# empty lists to hold API query results
artist_name = [] # artist
track_name = []  # song title
track_id = []    # Spotify ID for the track
popularity = []  # current popularity of the track (0-100)

# 1000 API track queries
for i in range(0,1000,50):
    track_results = sp.search(q='year:2018', type='track', limit=50,offset=i)
    for i, t in enumerate(track_results['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
    
# save API query results to a dataframe
track_df = pd.DataFrame({
    'artist_name' : artist_name, 
    'track_name' : track_name, 
    'track_id' : track_id, 
    'popularity' : popularity
    })

# log progress    
checkpoint_1 = timeit.default_timer()
print(f'Added {len(track_df)} tracks to the dataset')
print(f'{(checkpoint_1-start):.2f} seconds elapsed')
print(f'Now getting artist URIs...')


KeyboardInterrupt: 

In [None]:
###################################################
# get all artist URIs to search for genres by artist
###################################################
 
# Spotify 'item' search
# https://developer.spotify.com/documentation/web-api/reference/#/operations/search
# https://spotipy.readthedocs.io/en/master/

search_all_artists = []
printcounter = 0

for a, artist in enumerate(track_df['artist_name']):
    
    #attempts = 0
    #while attempts < 20: # some tracks don't have relevant info, so try another track up to 20x
    try:
        artist_search = sp.search(q='artist:' + artist, type='track', limit=1, offset=a)
        artist_uri = artist_search['tracks']['items'][0]['artists'][0]['uri']  
    except:
        #attempts += 1
        pass
    
    search_all_artists.append(artist_uri)
    
    printcounter += 1
    if (printcounter == 50):
        printcounter = 0
    # log progress
    clear_output()
    checkpoint_3 = timeit.default_timer()
    print(f'Added {len(search_all_artists)}/1000 artist URIs to the dataset')
    print(f'{(checkpoint_3-start)/60:.2f} minutes elapsed')

# add artist URIs to the dataframe
track_df['artist_uri'] = search_all_artists

# log progress
print('Now getting genre data...')


In [None]:
########################
# get artist genre data
########################

# empty list of genres
all_genres = []

# empty dictionary to hold all genres associated with each artist
artist_genre_dict = {}

# get genres associated with all artists    
artist_URIs = search_all_artists

printcounter = 0

for artist in artist_URIs:

    # get the artist's genre(s)
    # Spotify 'artist' search
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-an-artist
    
    try:
        artist_genres = sp.artist(artist)['genres']
        # format genre strings
        artist_genres = [x.replace(' ', '_').lower() for x in artist_genres]
    
    # for artists with 'NA' URIs
    except:
        artist_genres = []

    artist_genre_dict[artist] = artist_genres

    # save new genres to the list of all genres
    for genre in artist_genres:
        if genre not in all_genres:
            all_genres.append(genre)
    
    printcounter += 1
    if (printcounter == 50):
        # log progress
        clear_output()
        checkpoint_4 = timeit.default_timer()
        print(f'Found genres for {printcounter}/1000 artists')
        print(f'{(checkpoint_4-start)/60:.2f} minutes elapsed')
        printcounter = 0

# fill the new genre dataframe
all_genres_dict = {}
for genre in all_genres:
    all_genres_dict[genre] = [0] * len(track_df)

# create dataframe with genre data, sort df columns
genre_df = pd.DataFrame(all_genres_dict)
genre_df.columns = sorted(genre_df.columns.tolist())
genre_df['artist_uri'] = artist_URIs
genre_df.index = artist_URIs

# assign 1's to each artist's genre                
for column in genre_df:

    for artist in genre_df.index:

        for genre in artist_genre_dict[artist]:

            if genre == column:

                genre_df.loc[artist, column] = 1

# concatenate all data into one dataframe
full_df = track_df.merge(genre_df, how="inner", on="artist_uri")
full_df.drop_duplicates(inplace=True)

# log progress
checkpoint_5 = timeit.default_timer()
print(f'Found {len(all_genres)} genres')
print(f'{(checkpoint_5-start)/60:.2f} minutes elapsed')
print(f'Dropping {len(all_genres)-50} least common genres from the dataset...')


#######################################
# drop uncommon genres from the dataset
#######################################

# get the sum of each genre column
genre_counts = full_df.sum(axis=0)[5:]
genre_counts_df = pd.DataFrame(genre_counts, columns=['count'])

# get genres to drop from the dataset
# keeping only the top 50 most common genres
drop_genres = genre_counts_df.sort_values(['count'],ascending=False)[50:].index

# remove low-count genres from the dataset
genres_df = full_df.drop(columns=drop_genres)


####################################
# get audio features for all tracks
####################################

# empty lists to hold desired audio feature scores
acousticness = []
danceability = []
energy = []
instrumentalness = []
key = []
liveness = []
loudness = []
mode = []
speechiness = []
tempo = []
time_signature = []
valence = []

track_features = []

# make the API call for track audio features
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
printcounter = 0
for track in genres_df['track_id']:

    track_features.append(sp.audio_features(track))
    
    printcounter += 1
    if (printcounter == 50):
        printcounter = 0
        # log progress
        clear_output()
        checkpoint_6 = timeit.default_timer()
        print(f'Found audio features for {len(track_features)} tracks')
        print(f'{(checkpoint_6-start)/60:.2f} minutes elapsed')

# log progress
print(f'Saving data...')

# save each feature score to the appropriate list
for track in track_features:

    acousticness.append(track[0]['acousticness'])
    danceability.append(track[0]['danceability'])
    energy.append(track[0]['energy'])
    instrumentalness.append(track[0]['instrumentalness'])
    key.append(track[0]['key'])
    liveness.append(track[0]['liveness'])
    loudness.append(track[0]['loudness'])
    mode.append(track[0]['mode'])
    speechiness.append(track[0]['speechiness'])
    tempo.append(track[0]['tempo'])
    time_signature.append(track[0]['time_signature'])
    valence.append(track[0]['valence'])
    
# save feature data to the final dataframe
audio_df = genres_df
audio_df['acousticness'] = acousticness
audio_df['danceability'] = danceability
audio_df['energy'] = energy
audio_df['instrumentalness'] = instrumentalness
audio_df['key'] = key
audio_df['liveness'] = liveness
audio_df['loudness'] = loudness
audio_df['mode'] = mode
audio_df['speechiness'] = speechiness
audio_df['tempo'] = tempo
audio_df['time_signature'] = time_signature
audio_df['valence'] = valence

# log progress
checkpoint_7 = timeit.default_timer()
print(f'\n\n')
print(f'Found audio features for {len(genres_df)} tracks')
print(f'Total runtime: {(checkpoint_7-start)/60:.2f} minutes')

return audio_df 

In [51]:
import requests
"https://api.spotify.com/v1/search?q=track:I%20need%20a hero%20artist:bonnie%20tyler&type=track"

query_artist = 'Morgan%20Wallen'
query_track = 'The%Way%I%Talk'
curl=f'https://api.spotify.com/v1/search?q=track:{query_track}%20artist:{query_artist}&type=track'
requests.get(curl)

<Response [401]>