### 1. Spotify API setup  

The __Client Credentials flow__ is used in server-to-server authentication.  
Only endpoints that do not access user information can be called. Compared with requests made to the Web API without an access token, the advantage of this flow is that a higher rate limit is applied.

In [18]:
import os
import json

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Load the Spotify Web API credentials from the hidden settings.env file (JSON content)
settings_root = '../settings/'
settingsfile = os.path.join(settings_root, 'settings.env')

with open(settingsfile) as settings_fh:
    env_vars = json.load(settings_fh)

# Export both credentials as environment variables
for credential_key in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET'):
    os.environ[credential_key] = env_vars[credential_key]

# Read the credentials back and build the client-credentials Spotify client
cid = os.getenv('SPOTIPY_CLIENT_ID')
secret = os.getenv('SPOTIPY_CLIENT_SECRET')
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### 2. Get Artist ID and features  

Data collection:

ARTIST Name --> SPOTIFY API: Search for an Item - __Artists__

In [2]:
import re
import timeit 
import chardet
import unicodedata
import pandas as pd
import numpy as np

# Limit DataFrame display to 10 rows. The fully qualified option name is used
# because the abbreviated 'max_rows' pattern matches more than one option on
# recent pandas versions, which makes set_option raise an OptionError.
pd.set_option('display.max_rows', 10)
data_root = '../data/'

datafile = os.path.join(data_root, 'NDB_artist_2021_2016.csv')

# Detect the file encoding from its raw bytes: chardet.detect() returns a
# dictionary which provides the encoding as the value associated with the key 'encoding'
with open(datafile, 'rb') as f:
    result = chardet.detect(f.read())

# Parse the CSV using the detected encoding
df_concerts_ndb = pd.read_csv(datafile, encoding=result['encoding'])

In [3]:
df_concerts_ndb

Unnamed: 0,edition,date,artist
0,2021,11/06/2021,Wilco
1,2021,13/06/2021,Imelda May
2,2021,15/06/2021,Woodkid
3,2021,15/06/2021,Awir Leon
4,2021,16/06/2021,Estrella Morente
...,...,...,...
206,2016,22/07/2016,Seal
207,2016,23/07/2016,Mariza
208,2016,26/07/2016,Electric Light Orchestra
209,2016,27/07/2016,Rozalén


In [4]:
def search_artist(name, client=None):
    """Search Spotify for an artist and return the exact-name match.

    Parameters
    ----------
    name : str
        Artist name to search for.
    client : object, optional
        Object exposing a spotipy-style ``search(q=..., type=..., limit=...)``
        method. Defaults to the notebook-level ``sp`` client.

    Returns
    -------
    dict or None
        The first returned artist item whose ``'name'`` equals ``name``
        exactly (case-sensitive), or ``None`` when no item matches.
    """
    if client is None:
        client = sp

    result = client.search(q='artist:' + name, type='artist', limit=50)
    items = result['artists']['items']

    # Compare names literally. Treating the name as a regular-expression pattern
    # mis-matches or raises on names containing metacharacters ('+', '(', '.').
    for item in items:
        if item['name'] == name:
            return item

    return None

In [5]:
start = timeit.default_timer()

# Parallel lists, one per artist feature; each gains one entry per concert row
artist_spid, artist_uri, artist_name = [], [], []
artist_followers, artist_popularity, artist_genres = [], [], []
artist_edition = []

# Names for which the search gave no usable result
artist_not_found = []

# Same order as the value tuples built inside the loop
feature_lists = (artist_spid, artist_uri, artist_name, artist_followers,
                 artist_popularity, artist_genres, artist_edition)

for row_pos in range(df_concerts_ndb.shape[0]):
    concert = df_concerts_ndb.iloc[row_pos]
    found = search_artist(concert['artist'])

    if found:
        feature_values = (found['id'],
                          found['uri'],
                          found['name'],
                          found['followers']['total'],
                          found['popularity'],
                          found['genres'],
                          concert['edition'])
    else:
        # Keep the row anyway: searched name plus NaN for every Spotify field
        artist_not_found.append(concert['artist'])
        feature_values = (np.nan,
                          np.nan,
                          concert['artist'],
                          np.nan,
                          np.nan,
                          np.nan,
                          concert['edition'])

    for feature_list, feature_value in zip(feature_lists, feature_values):
        feature_list.append(feature_value)

stop = timeit.default_timer()
print ('2. Get Artists features - Time (in seconds):', stop - start)

2. Get Artists features - Time (in seconds): 21.33079509999999


In [17]:
artist_not_found

[]

In [6]:
# Assemble the collected feature lists into one DataFrame, one column per list
feature_columns = {
    'artist_spid': artist_spid,
    'artist_uri': artist_uri,
    'artist_name': artist_name,
    'artist_followers': artist_followers,
    'artist_popularity': artist_popularity,
    'artist_genres': artist_genres,
    'artist_edition': artist_edition,
}
df_artist_features = pd.DataFrame(feature_columns)

In [7]:
df_artist_features.shape

(211, 7)

In [8]:
df_artist_features.dtypes

artist_spid          object
artist_uri           object
artist_name          object
artist_followers      int64
artist_popularity     int64
artist_genres        object
artist_edition        int64
dtype: object

In [9]:
# Number of distinct artist names across the NDB editions

df_artist_features['artist_name'].nunique(dropna=False)

194

In [10]:
# Rows whose artist name occurs more than once (artists with several NDB appearances),
# ordered by artist name

(
    df_artist_features
    .groupby('artist_name')
    .filter(lambda rows: len(rows) > 1)
    .sort_values(by='artist_name')
)

Unnamed: 0,artist_spid,artist_uri,artist_name,artist_followers,artist_popularity,artist_genres,artist_edition
132,5n9bMYfz9qss2VOW89EVs2,spotify:artist:5n9bMYfz9qss2VOW89EVs2,Bomba Estéreo,868641,66,"[cumbia, latin alternative, latin rock, pop el...",2018
194,5n9bMYfz9qss2VOW89EVs2,spotify:artist:5n9bMYfz9qss2VOW89EVs2,Bomba Estéreo,868641,66,"[cumbia, latin alternative, latin rock, pop el...",2016
123,2vI9KFm0fwSfPrpEgOeIbq,spotify:artist:2vI9KFm0fwSfPrpEgOeIbq,Brad Mehldau,156151,57,"[contemporary jazz, contemporary post-bop, coo...",2018
191,2vI9KFm0fwSfPrpEgOeIbq,spotify:artist:2vI9KFm0fwSfPrpEgOeIbq,Brad Mehldau,156151,57,"[contemporary jazz, contemporary post-bop, coo...",2016
124,6PkSULcbxFKkxdgrmPGAvn,spotify:artist:6PkSULcbxFKkxdgrmPGAvn,Cécile McLorin Salvant,63909,52,"[contemporary vocal jazz, jazz pop, vocal jazz]",2018
...,...,...,...,...,...,...,...
23,3wxFXeuHQEq4NubZunCIMe,spotify:artist:3wxFXeuHQEq4NubZunCIMe,Tomatito,65990,58,"[flamenco, flamenco guitar, nuevo flamenco, ru...",2021
99,1qupsn74dUofcXCZCBEwMS,spotify:artist:1qupsn74dUofcXCZCBEwMS,Twanguero,3916,29,[spanish folk rock],2019
52,1qupsn74dUofcXCZCBEwMS,spotify:artist:1qupsn74dUofcXCZCBEwMS,Twanguero,3916,29,[spanish folk rock],2021
179,2QoU3awHVdcHS8LrZEKvSM,spotify:artist:2QoU3awHVdcHS8LrZEKvSM,Wilco,591045,65,"[alternative country, alternative rock, chicag...",2016


In [11]:
# One-hot encode the edition so each year becomes its own indicator column:
# this avoids duplicates while keeping the edition information for future filtering

df_artist_features = pd.get_dummies(
    data=df_artist_features,
    columns=['artist_edition'],
    prefix='',
    prefix_sep='',
)

In [12]:
# Collapse repeated artists into one row each: Spotify features are reduced
# with 'max', edition indicator columns are summed.
feature_aggregations = {'artist_spid': 'max',
                        'artist_uri': 'max',
                        'artist_followers': 'max',
                        'artist_popularity': 'max',
                        'artist_genres': 'max'}

# Derive the edition columns (named after the year digits) from the frame rather
# than hard-coding them: a year listed here but absent from the data raises a
# KeyError, and a year present in the data but not listed is dropped from the result.
edition_columns = [column for column in df_artist_features.columns
                   if str(column).isdigit()]
if not edition_columns:
    # Fail fast instead of silently producing a frame without edition information
    raise KeyError('no edition indicator columns found in df_artist_features')

df_artist_features = (
    df_artist_features
    .groupby('artist_name')
    .agg({**feature_aggregations,
          **{column: 'sum' for column in edition_columns}})
    .reset_index()
)

In [13]:
df_artist_features.loc[df_artist_features['artist_name'] == 'Cécile McLorin Salvant']

Unnamed: 0,artist_name,artist_spid,artist_uri,artist_followers,artist_popularity,artist_genres,2016,2017,2018,2019,2021
45,Cécile McLorin Salvant,6PkSULcbxFKkxdgrmPGAvn,spotify:artist:6PkSULcbxFKkxdgrmPGAvn,63909,52,"[contemporary vocal jazz, jazz pop, vocal jazz]",1,0,1,0,1


In [14]:
df_artist_features.shape

(194, 11)

In [15]:
df_artist_features

Unnamed: 0,artist_name,artist_spid,artist_uri,artist_followers,artist_popularity,artist_genres,2016,2017,2018,2019,2021
0,2CELLOS,6Fi8CHfO8WGtu3yO8c2Mc4,spotify:artist:6Fi8CHfO8WGtu3yO8c2Mc4,809361,64,"[bow pop, cello, classify]",1,0,0,0,0
1,Agnes Obel,1rKrEdI6GKirxWHxIUPYms,spotify:artist:1rKrEdI6GKirxWHxIUPYms,654339,65,[chamber pop],0,0,0,0,1
2,Alex Conde,34gsS5srfZEkRrwauF4Czj,spotify:artist:34gsS5srfZEkRrwauF4Czj,275,7,[spanish jazz],0,0,0,0,1
3,Alfa Mist,2i1CPudyCUjL50Wqjv8AMI,spotify:artist:2i1CPudyCUjL50Wqjv8AMI,146230,56,"[british jazz, indie jazz, indie soul, uk cont...",0,0,0,0,1
4,Alpha Blondy,41ekW4MXG59xJMXR8dX1OG,spotify:artist:41ekW4MXG59xJMXR8dX1OG,382426,61,"[african reggae, french reggae, reggae, roots ...",0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
189,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,spotify:artist:77zlytAFjPFjUKda8TNIDY,119077,57,"[afropop, mande pop, mbalax, world]",0,0,0,0,1
190,Zahara,7uLePkJ2f0MwEcphODfkuu,spotify:artist:7uLePkJ2f0MwEcphODfkuu,185161,61,"[cantautor, latin rock, spanish indie pop, spa...",0,0,0,1,0
191,Zenet,2s7td67DdtSXTx2TGzs01i,spotify:artist:2s7td67DdtSXTx2TGzs01i,62874,51,"[cantautor, flamenco]",0,0,0,0,1
192,Zoé,6IdtcAwaNVAggwd6sCKgTI,spotify:artist:6IdtcAwaNVAggwd6sCKgTI,3445302,75,"[latin alternative, latin rock, mexican rock, ...",0,0,1,0,0


In [16]:
# Write the per-artist feature table to the data folder as UTF-8 CSV, without the index column
artist_file = os.path.join(data_root, 'artists_ndb.csv')
df_artist_features.to_csv(
    path_or_buf=artist_file,
    index=False,
    encoding='utf-8',
)