### 1. Spotify API setup  

The __Client Credentials flow__ is used in server-to-server authentication. Only endpoints that do not access user information can be accessed. The advantage here in comparison with requests to the Web API made without an access token, is that a higher rate limit is applied.

In [1]:
import os
import json

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Reading Spotify web API credentials from settings.env hidden file
settings_root = "../settings/"
settingsfile = os.path.join(settings_root, "settings.env")

with open(settingsfile) as f:
    env_vars = json.loads(f.read())

# Set environment variables
os.environ['SPOTIPY_CLIENT_ID'] = env_vars['SPOTIPY_CLIENT_ID']
os.environ['SPOTIPY_CLIENT_SECRET'] = env_vars['SPOTIPY_CLIENT_SECRET']

cid = os.getenv('SPOTIPY_CLIENT_ID')
secret = os.getenv('SPOTIPY_CLIENT_SECRET')
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### 2. Get Artist ID and features  

Data collection strategy:

ARTIST --> DISCOGRAPHY --> TRACKS --> AUDIO FEATURES

In [2]:
import re
import timeit 
import chardet
import unicodedata
import pandas as pd
import numpy as np

pd.set_option("max_rows", 10)
data_root = "../data/"

datafile = os.path.join(data_root, "NDB_artist_2021_2016.csv")

# chardet.detect() returns a dictionary which provides the encoding as the value associated with the key 'encoding'
with open(datafile, 'rb') as f:
    result = chardet.detect(f.read())

df_concerts_ndb = pd.read_csv(datafile, encoding=result['encoding'])

In [3]:
df_concerts_ndb

Unnamed: 0,edition,date,artist
0,2021,11/06/2021,Wilco
1,2021,13/06/2021,Imelda May
2,2021,15/06/2021,Woodkid
3,2021,15/06/2021,Awir Leon
4,2021,16/06/2021,Estrella Morente
...,...,...,...
201,2016,22/07/2016,Seal
202,2016,23/07/2016,Mariza
203,2016,26/07/2016,Electric Light Orchestra
204,2016,27/07/2016,Rozalén


In [4]:
def search_artist(name):  
    
    result = sp.search(q='artist:' + name, type='artist', limit=50)
    items = result['artists']['items']
    
    if len(items) > 0:
        for i in range(0,len(items)):
            if bool(re.fullmatch(name, items[i]['name'])):
                search_type = 'artist'
                return items[i]
                break
                
    else:
        return None

In [5]:
start = timeit.default_timer()

# create empty lists with artist features
artist_spid = []
artist_name = []
artist_followers = []
artist_popularity = []  
artist_genres = []
artist_edition = []

# empty list with artist not found
artist_not_found = []

for i in range(0,df_concerts_ndb.shape[0]):
    
    artist_results = search_artist(df_concerts_ndb.iloc[i]['artist'])
    
    if artist_results:
        artist_spid.append(artist_results['id'])
        artist_name.append(artist_results['name'])
        artist_followers.append(artist_results['followers']['total'])
        artist_popularity.append(artist_results['popularity'])            
        artist_genres.append(artist_results['genres'])
        artist_edition.append(df_concerts_ndb.iloc[i]['edition'])
        
    else:
        artist_not_found.append(df_concerts_ndb.iloc[i]['artist'])  
        
        artist_spid.append(np.nan)
        artist_name.append(df_concerts_ndb.iloc[i]['artist'])
        artist_followers.append(np.nan)
        artist_popularity.append(np.nan)            
        artist_genres.append(np.nan)
        artist_edition.append(df_concerts_ndb.iloc[i]['edition'])        
      
stop = timeit.default_timer()
print ('2. Get Artists features - Time (in seconds):', stop - start)

2. Get Artists features - Time (in seconds): 16.969738000000007


In [6]:
# Artist not found searching by Artist Name, review it and search it by Album Name

artist_not_found

['La La Land']

In [7]:
df_artist_features = pd.DataFrame({'artist_spid': artist_spid, 
                                    'artist_name': artist_name, 
                                    'artist_followers': artist_followers, 
                                    'artist_popularity': artist_popularity,
                                    'artist_genres': artist_genres,
                                    'artist_edition':artist_edition})

In [8]:
df_artist_features.shape

(206, 6)

In [9]:
df_artist_features.dtypes

artist_spid           object
artist_name           object
artist_followers     float64
artist_popularity    float64
artist_genres         object
artist_edition         int64
dtype: object

In [10]:
# Total number of artists playing at NDB different editions

len(df_artist_features['artist_name'].unique())

189

In [11]:
# Find duplicates - Artist playing in more than one NDB edition

df_artist_features.groupby('artist_name').filter(lambda group: len(group) > 1).sort_values(by='artist_name')

Unnamed: 0,artist_spid,artist_name,artist_followers,artist_popularity,artist_genres,artist_edition
129,5n9bMYfz9qss2VOW89EVs2,Bomba Estéreo,845508.0,67.0,"[cumbia, latin alternative, latin rock, pop el...",2018
189,5n9bMYfz9qss2VOW89EVs2,Bomba Estéreo,845508.0,67.0,"[cumbia, latin alternative, latin rock, pop el...",2016
120,2vI9KFm0fwSfPrpEgOeIbq,Brad Mehldau,148811.0,59.0,"[bebop, contemporary jazz, contemporary post-b...",2018
186,2vI9KFm0fwSfPrpEgOeIbq,Brad Mehldau,148811.0,59.0,"[bebop, contemporary jazz, contemporary post-b...",2016
121,6PkSULcbxFKkxdgrmPGAvn,Cécile McLorin Salvant,61722.0,54.0,"[contemporary vocal jazz, jazz pop, vocal jazz]",2018
...,...,...,...,...,...,...
23,3wxFXeuHQEq4NubZunCIMe,Tomatito,62352.0,57.0,"[flamenco, flamenco guitar, nuevo flamenco, ru...",2021
98,1qupsn74dUofcXCZCBEwMS,Twanguero,3728.0,31.0,[spanish folk rock],2019
52,1qupsn74dUofcXCZCBEwMS,Twanguero,3728.0,31.0,[spanish folk rock],2021
174,2QoU3awHVdcHS8LrZEKvSM,Wilco,573751.0,65.0,"[alternative country, alternative rock, chicag...",2016


In [12]:
# Convert edition column to dummy/indicator variable to avoid duplicates and keep edition column for future filtering

df_artist_features = pd.get_dummies(df_artist_features, prefix='', prefix_sep='', columns=['artist_edition'])

In [13]:
df_artist_features = df_artist_features.groupby('artist_name').agg({'artist_spid':'max',
                                                                    'artist_followers':'max',
                                                                    'artist_popularity': 'max',
                                                                    'artist_genres':'max',
                                                                    '2016':'sum',
                                                                    '2017':'sum',
                                                                    '2018':'sum',
                                                                    '2019':'sum',
                                                                    '2021':'sum'}).reset_index()

In [14]:
df_artist_features.loc[df_artist_features['artist_name'] == 'Cécile McLorin Salvant']

Unnamed: 0,artist_name,artist_spid,artist_followers,artist_popularity,artist_genres,2016,2017,2018,2019,2021
46,Cécile McLorin Salvant,6PkSULcbxFKkxdgrmPGAvn,61722.0,54.0,"[contemporary vocal jazz, jazz pop, vocal jazz]",1,0,1,0,1


In [15]:
df_artist_features.shape

(189, 10)

In [16]:
df_artist_features

Unnamed: 0,artist_name,artist_spid,artist_followers,artist_popularity,artist_genres,2016,2017,2018,2019,2021
0,2CELLOS,6Fi8CHfO8WGtu3yO8c2Mc4,779693.0,65.0,"[bow pop, cello, classify]",1,0,0,0,0
1,Agnes Obel,1rKrEdI6GKirxWHxIUPYms,627117.0,66.0,[chamber pop],0,0,0,0,1
2,Alan Parsons,6HpZkC8GUktP9utE6OPWZG,180645.0,44.0,"[album rock, art rock, progressive rock, symph...",1,0,0,0,0
3,Alex Conde,34gsS5srfZEkRrwauF4Czj,227.0,1.0,[spanish jazz],0,0,0,0,1
4,Alfa Mist,2i1CPudyCUjL50Wqjv8AMI,130385.0,55.0,"[british jazz, indie jazz, indie soul, neo r&b...",0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
184,Youssou N'Dour,77zlytAFjPFjUKda8TNIDY,110380.0,58.0,"[afropop, mande pop, mbalax, world]",0,0,0,0,1
185,Zahara,7uLePkJ2f0MwEcphODfkuu,176039.0,61.0,"[cantautor, latin rock, spanish indie pop, spa...",0,0,0,1,0
186,Zenet,2s7td67DdtSXTx2TGzs01i,60448.0,49.0,"[cantautor, flamenco, rumba]",0,0,0,0,1
187,Zoé,6IdtcAwaNVAggwd6sCKgTI,3301298.0,76.0,"[latin alternative, latin rock, mexican rock, ...",0,0,1,0,0


In [17]:
artist_file = os.path.join(data_root, 'artists_ndb.csv')
df_artist_features.to_csv(artist_file, encoding='utf-8', index=False)

### 3. Get Artist Discography  

Extract artist discography by artist Spotify ID. Limit search by album_type: __album__ or __single__ as some of the artist playing are new and have only published EPs.  

In order to avoid album duplicates, strategy applied:  

1. Get artist full discography (album_type: album), if no albums found search by album type: single.  

2. Sort search results by release date descending and keep first ocurrence when album duplicates are found (usually, latest album release remastered and with live tracks also).

2. Drop album Spotify ID duplicates as some of the artist playing at different editions has done collaborations together.  

3. Normalize album title string to avoid duplicates and releases of same album (Deluxe), [live], - remastered ...

In [18]:
def get_artists_discography(artist_spid):   
    
    artist_discography= []
    
    albums = []
    results = sp.artist_albums(artist_spid, album_type='album')
    albums.extend(results['items'])
        
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])
        
    if albums:
        
        sorted(albums, key=lambda album: (album['release_date']))
        unique = set() # skip duplicate albums
        
        for album in albums:
            name = album['name'].lower()      
            if name not in unique:
                unique.add(name)
                artist_discography.append(album)
                
    else:
        singles = []
        results = sp.artist_albums(artist_spid, album_type='single')
        singles.extend(results['items'])

        while results['next']:
            results = sp.next(results)
            singles.extend(results['items'])

        if singles:

            sorted(singles, key=lambda album: (album['release_date']))
            unique = set() # skip duplicate albums

            for album in singles:
                name = album['name'].lower()      
                if name not in unique:
                    unique.add(name)
                    artist_discography.append(album)

    return artist_discography        

In [19]:
def search_album(name):  
    
    result = sp.search(q='album:' + name, type='album', limit=50)
    items = result['albums']['items']

    if len(items) > 0:
        for i in range(0,len(items)):
            if re.match(name, items[i]['name']):
                return items[i]
                break
                
    else:
        return None

In [20]:
start = timeit.default_timer()

# create empty lists with album features

artist_spid = []
artist_name = []
album_spid = []
album_name = []
album_release_date = []

# empty list for albumn not found
discography_not_found = []

for i in range(0,df_artist_features.shape[0]):
    
    if pd.notna(df_artist_features.iloc[i]['artist_spid']):
    
        album_results = get_artists_discography(df_artist_features.iloc[i]['artist_spid'])

        if not album_results:
            discography_not_found.append([df_artist_features.iloc[i]['artist_spid'],df_artist_features.iloc[i]['artist_name']])  

        else:
            for album in album_results:
                artist_spid.append(df_artist_features.iloc[i]['artist_spid']) 
                artist_name.append(df_artist_features.iloc[i]['artist_name'])            
                album_spid.append(album['id'])
                album_name.append(album['name'].lower())    
                album_release_date.append(album['release_date'])
    else:
            
        for a in range(0,len(artist_not_found)):
            
            album_results = search_album(artist_not_found[a])

            if not album_results:
                discography_not_found.append([np.nan,artist])

            else:
                artist_spid.append(np.nan) 
                artist_name.append(artist_not_found[a])
                album_spid.append(album_results['id'])
                album_name.append(album_results['name'].lower())    
                album_release_date.append(album_results['release_date'])
    
      
stop = timeit.default_timer()
print ('3. Get Artists Discography - Time (in seconds):', stop - start)

3. Get Artists Discography - Time (in seconds): 28.498889899999995


In [21]:
# Discography not found for Artists

discography_not_found

[]

In [22]:
df_artists_albumns = pd.DataFrame({'artist_spid': artist_spid,
                                    'artist_name': artist_name,
                                    'album_spid': album_spid, 
                                    'album_name': album_name, 
                                    'album_release_date': album_release_date})

In [112]:
df_artists_albumns

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,0sR0wZubrE2h3h4WxviRCX,score,2017-03-17
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,4OwQQC8C7jAunOuuySXzp1,score (japan version),2017-02-22
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1e629DDr56Yrmr7vNzQJBf,celloverse (japan version),2015-01-21
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,5YBY91ePHZWzgsLRrDK8DI,celloverse,2015-01-09
...,...,...,...,...,...
3354,2KftmGt9sk1yLjsAoloC3M,Zucchero,2I8UrYjiscfyn4jcw9WNUN,oro incenso & birra - zucchero sugar fornaciari,1989-01-01
3355,2KftmGt9sk1yLjsAoloC3M,Zucchero,0P3CGbrZ8zBzfSsP1UHc0n,blue's,1987-06-15
3356,2KftmGt9sk1yLjsAoloC3M,Zucchero,77q4z6bUC9nUGoiAvP9Sjt,rispetto,1986-01-01
3357,2KftmGt9sk1yLjsAoloC3M,Zucchero,510WtjFm0mIOmehUQvwWxz,zucchero & the randy jackson band,1985-01-01


In [23]:
df_artists_albumns.shape

(3366, 5)

In [24]:
# Total number of albumns from artist playing at NDB different editions

len(df_artists_albumns['album_spid'].unique())

3344

In [25]:
# Find album duplicates - collaborations from artists playing at different editions

df_artists_albumns.groupby('album_spid').filter(lambda group: len(group) > 1).sort_values(by='album_name')

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
601,5olDKSsFhhmwh8UCWwKtpq,Chick Corea,74dcvlPv6JbYQd7O2LScsW,an evening with chick corea & herbie hancock,1979-01-01
1228,2ZvrvbQNrHKwjT7qfGFFUW,Herbie Hancock,74dcvlPv6JbYQd7O2LScsW,an evening with chick corea & herbie hancock,1979-01-01
1232,2ZvrvbQNrHKwjT7qfGFFUW,Herbie Hancock,2xGIvFXBvdU4HHI5wIzWzx,an evening with herbie hancock & chick corea i...,1978-02-01
604,5olDKSsFhhmwh8UCWwKtpq,Chick Corea,2xGIvFXBvdU4HHI5wIzWzx,an evening with herbie hancock & chick corea i...,1978-02-01
603,5olDKSsFhhmwh8UCWwKtpq,Chick Corea,1EPOdCXN9FoifxFq5t9uwh,an evening with...,1978-11
...,...,...,...,...,...
299,2vI9KFm0fwSfPrpEgOeIbq,Brad Mehldau,0lts77z4ZmCEah0YZ01AKp,quartet,2007-03-05
887,4qKIiUdFND09cgGOc5kfoR,Eliane Elias,4qNvX08NvdYVh05jvzZWsy,solos and duets,1995-01-01
1209,2ZvrvbQNrHKwjT7qfGFFUW,Herbie Hancock,4qNvX08NvdYVh05jvzZWsy,solos and duets,1995-01-01
93,4NOZW7dBOmBMMAG9EGQd4t,Ana Belén,28qcLkJL11NwnAVinfgHAO,victor y ana en vivo,1983-09-01


In [26]:
duplicate_album = df_artists_albumns[df_artists_albumns.duplicated(['album_spid'])]

In [27]:
df_artists_albumns = df_artists_albumns.drop_duplicates(subset=['album_spid'], keep='first').reset_index(drop=True)

In [28]:
df_artists_albumns.shape

(3344, 5)

In [29]:
df_artists_albumns.groupby(['artist_name'])['album_name'].count()

artist_name
2CELLOS            7
Agnes Obel         6
Alan Parsons      11
Alex Conde         1
Alfa Mist          4
                  ..
Youssou N'Dour    22
Zahara             8
Zenet             11
Zoé               19
Zucchero          36
Name: album_name, Length: 189, dtype: int64

In [30]:
df_artists_albumns.groupby(['artist_name','album_name']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,artist_spid,album_spid,album_release_date
artist_name,album_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2CELLOS,2cellos,1,1,1
2CELLOS,celloverse,1,1,1
2CELLOS,celloverse (japan version),1,1,1
2CELLOS,in2ition,1,1,1
2CELLOS,let there be cello,1,1,1
...,...,...,...,...
Zucchero,una rosa blanca,1,1,1
Zucchero,zu & co.,1,1,1
Zucchero,zucchero,1,1,1
Zucchero,zucchero & the randy jackson band,1,1,1


In [31]:
df_artists_albumns[df_artists_albumns['artist_spid'].isna()]

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
1706,,La La Land,3GU8BzFEAdFSRjc8jZkL3S,la la land (original motion picture soundtrack),2016-12-09


In [32]:
def text_normalization(list_str):
    
    for i in range(len(list_str)):
        text = list_str[i]
        text = text.lower()
        text = text.replace('\'','')
        text = unicodedata.normalize("NFKD", text).encode("ascii","ignore").decode("ascii")
        if text.find('(') > 0:
            aux=text[text.find('(')-1:text.find(')')+1]
            text=text.replace(aux,'')
        if text.find(' - ') > 0:
            aux=text[text.find(' - '):len(text)]
            text=text.replace(aux,'')
        if text.find('[') > 0:
            aux=text[text.find('[')-1:text.find(']')+1]
            text=text.replace(aux,'')            
            
        list_str[i]=text
         
    return list_str

In [33]:
artists_albumns = text_normalization(df_artists_albumns['album_name'].tolist())

In [34]:
df_artists_albumns['album_name'] = artists_albumns

In [35]:
df_artists_albumns

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,0sR0wZubrE2h3h4WxviRCX,score,2017-03-17
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,4OwQQC8C7jAunOuuySXzp1,score,2017-02-22
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1e629DDr56Yrmr7vNzQJBf,celloverse,2015-01-21
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,5YBY91ePHZWzgsLRrDK8DI,celloverse,2015-01-09
...,...,...,...,...,...
3339,2KftmGt9sk1yLjsAoloC3M,Zucchero,2I8UrYjiscfyn4jcw9WNUN,oro incenso & birra,1989-01-01
3340,2KftmGt9sk1yLjsAoloC3M,Zucchero,0P3CGbrZ8zBzfSsP1UHc0n,blues,1987-06-15
3341,2KftmGt9sk1yLjsAoloC3M,Zucchero,77q4z6bUC9nUGoiAvP9Sjt,rispetto,1986-01-01
3342,2KftmGt9sk1yLjsAoloC3M,Zucchero,510WtjFm0mIOmehUQvwWxz,zucchero & the randy jackson band,1985-01-01


In [36]:
# Find album duplicates - by artist name and album name - different editions of same album - keep latest release

df_artists_albumns.groupby(['artist_name','album_name']).\
                   filter(lambda group: len(group) > 1).\
                   sort_values(by=['artist_name','album_name'])

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1e629DDr56Yrmr7vNzQJBf,celloverse,2015-01-21
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,5YBY91ePHZWzgsLRrDK8DI,celloverse,2015-01-09
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,0sR0wZubrE2h3h4WxviRCX,score,2017-03-17
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,4OwQQC8C7jAunOuuySXzp1,score,2017-02-22
10,1rKrEdI6GKirxWHxIUPYms,Agnes Obel,7dT5E8IB9rm7w1oo4Ma13X,aventine,2014-10-06
...,...,...,...,...,...
3325,2KftmGt9sk1yLjsAoloC3M,Zucchero,37QJSwpwlpoFFCg229hg9H,live in italy,2009-01-01
3326,2KftmGt9sk1yLjsAoloC3M,Zucchero,4uJA4rmNL2hUdcgNJE3FvA,live in italy,2008-01-01
3327,2KftmGt9sk1yLjsAoloC3M,Zucchero,7iP0tqUp7Bh7TOx6KlUHDm,live in italy,2008-01-01
3336,2KftmGt9sk1yLjsAoloC3M,Zucchero,1GUCbxpDJlkbgNHWUWmZ3s,miserere,1992-01-01


In [37]:
duplicate_album = df_artists_albumns[df_artists_albumns.duplicated(['artist_name','album_name'])]

In [38]:
df_artists_albumns = df_artists_albumns.drop_duplicates(subset=['artist_name','album_name'], keep='first').\
                                                        reset_index(drop=True)

In [39]:
df_artists_albumns.shape

(3001, 5)

In [40]:
df_artists_albumns['artist_name'].nunique()

189

In [41]:
df_artists_albumns.groupby(['artist_name','album_name']).ngroups

3001

In [42]:
df_artists_albumns

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,0sR0wZubrE2h3h4WxviRCX,score,2017-03-17
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1e629DDr56Yrmr7vNzQJBf,celloverse,2015-01-21
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,4OYCUKg4plshmpkGNEEDBz,in2ition,2013-08-21
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,7iIBnMC3Ds8x033w8QKrgF,2cellos,2012-05-02
...,...,...,...,...,...
2996,2KftmGt9sk1yLjsAoloC3M,Zucchero,2I8UrYjiscfyn4jcw9WNUN,oro incenso & birra,1989-01-01
2997,2KftmGt9sk1yLjsAoloC3M,Zucchero,0P3CGbrZ8zBzfSsP1UHc0n,blues,1987-06-15
2998,2KftmGt9sk1yLjsAoloC3M,Zucchero,77q4z6bUC9nUGoiAvP9Sjt,rispetto,1986-01-01
2999,2KftmGt9sk1yLjsAoloC3M,Zucchero,510WtjFm0mIOmehUQvwWxz,zucchero & the randy jackson band,1985-01-01


In [43]:
artist_discography_file = os.path.join(data_root, 'artist_discography_ndb.csv')
df_artists_albumns.to_csv(artist_discography_file, encoding='utf-8', index=False)

### 4.1. Get Album Tracks  

Get tracks by album using a nested loop increased with an offset of 20 Spotify album´s ID, maximum allowed.

In [44]:
start = timeit.default_timer()

# A comma-separated list of the Spotify IDs for the albums. Maximum: 20 IDs.
limit = 20
list_album_ids = []

# create empty lists with track features
artist_spid = []
artist_name = []
album_spid = []
album_name = [] 
album_release_date = []
track_spid = []
track_name = []

# empty list for tracks not found
track_not_found = []

# count = 0

for i in range(0,df_artists_albumns.shape[0],limit):
    
    list_album_ids = list(df_artists_albumns['album_spid'][i:i+limit])
    
    # get album tracks
    album_results = sp.albums(list_album_ids)
    
    for n, a in enumerate(album_results['albums']):
        
        if a == None:
            track_not_found.append([df_artists_albumns.iloc[i]['album_spid'],df_artists_albumns.iloc[i]['artist_name']])
            
            print('Discography not found for artist: ' + df_artist_features.iloc[i+n]['artist_name'] + 
                  ' * ' + df_artists_albumns.iloc[i+n]['album_spid']) 
        
        else:
            for x, t in enumerate(album_results['albums'][n]['tracks']['items']):
                artist_spid.append(df_artists_albumns.iloc[i+n]['artist_spid']) 
                artist_name.append(df_artists_albumns.iloc[i+n]['artist_name']) 
                album_spid.append(df_artists_albumns.iloc[i+n]['album_spid'])
                album_name.append(df_artists_albumns.iloc[i+n]['album_name']) 
                album_release_date.append(df_artists_albumns.iloc[i+n]['album_release_date']) 
                track_spid.append(t['id'])
                track_name.append(t['name'].lower())
                
#            print('Added tracks for album: ' + str(count+n) + ' * Album: ' + df_artists_albumns.iloc[i+n]['album_name'] + 
#                  ' * Total tracks: ' + str(len(album_results['albums'][n]['tracks']['items'])))
#    count +=limit 

stop = timeit.default_timer()
print ('4.1. Get Album Tracks (in seconds):', stop - start)

4.1. Get Album Tracks (in seconds): 36.948896200000036


In [45]:
df_albumns_tracks = pd.DataFrame({'artist_spid': artist_spid,
                                  'artist_name': artist_name,
                                  'album_spid': album_spid,
                                  'album_name': album_name, 
                                  'album_release_date': album_release_date,
                                  'track_spid': track_spid, 
                                  'track_name': track_name})

In [46]:
df_albumns_tracks

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date,track_spid,track_name
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5pFW81GAL2wafUGGWg2gil,concept2
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,4dbH856gB6dJkZGKemEsGb,eye of the tiger
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5POfcpwLoeMuwXSjA5uQIq,pirates of the caribbean
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,7F5zNt9vFrOkI2fYVd8GOA,cadenza
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,116stRytHB8url7FIioIRl,hallelujah
...,...,...,...,...,...,...,...
40385,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,3UNzTbgzzeWAMxMUFLppLJ,nuvola
40386,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,4iX8cZtVMdGRTsaPJhE7r9,come l'aria
40387,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,0nJqoUxL1wmHUB2W8ikjv5,perche' sei bella
40388,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,09tQY4UdG2AfVxXE850JbU,sandra


In [47]:
df_albumns_tracks.groupby(['artist_name','album_name'])['track_spid'].count()

artist_name  album_name                       
2CELLOS      2cellos                              17
             celloverse                           14
             in2ition                             21
             let there be cello                   14
             score                                14
                                                  ..
Zucchero     una rosa blanca                      26
             zu & co.                             18
             zucchero                             12
             zucchero & the randy jackson band     9
             zucchero who?                        22
Name: track_spid, Length: 3001, dtype: int64

In [48]:
# Trancks not found by Album ID

track_not_found

[]

In [49]:
df_albumns_tracks.groupby(['artist_name'])['artist_name'].count()

artist_name
2CELLOS            80
Agnes Obel         67
Alan Parsons      144
Alex Conde         10
Alfa Mist          35
                 ... 
Youssou N'Dour    306
Zahara             86
Zenet             127
Zoé               197
Zucchero          347
Name: artist_name, Length: 189, dtype: int64

### 4.2. Get Track info and audio features  

Get track details and audio features using a nested loop increased with an offset of 50 Spotify track´s ID, maximum allowed.

In [50]:
start = timeit.default_timer()

# A comma-separated list of the Spotify IDs for the tracks. Maximum: 50 IDs.
limit = 50
list_track_ids = []

# create empty lists with track info and audio features
track_popularity = []
track_duration_ms = []
track_danceability = []
track_energy = []
track_key = []
track_loudness = []
track_mode = []
track_speechiness = []
track_acousticness = []
track_instrumentalness = []
track_liveness = []
track_valence = []
track_tempo = []

for i in range(0,df_albumns_tracks.shape[0],limit):

    list_track_ids = list(df_albumns_tracks['track_spid'][i:i+limit])
    
    # get track info
    track_info = sp.tracks(list_track_ids)
    
    # get track audio features
    features = sp.audio_features(list_track_ids)
            
    for n, t in enumerate(track_info['tracks']):
        
        if t == None:
            track_popularity.append(np.nan)
            track_duration_ms.append(np.nan)   
            
            print('Not Found info for track: ' +  str(i+n) + ' * ' + df_albumns_tracks.iloc[i]['track_name'])
        
        else:
            track_popularity.append(t['popularity'])
            track_duration_ms.append(t['duration_ms'])
            
    
    for n, f in enumerate(features):
        
        if f == None:
            track_danceability.append(np.nan)
            track_energy.append(np.nan)
            track_key.append(np.nan)
            track_loudness.append(np.nan)
            track_mode.append(np.nan)
            track_speechiness.append(np.nan)
            track_acousticness.append(np.nan)
            track_instrumentalness.append(np.nan)
            track_liveness.append(np.nan)
            track_valence.append(np.nan)
            track_tempo.append(np.nan)           

            print('Not Found Audio Features for track: ' + str(i+n) + ' * ' + df_albumns_tracks.iloc[i+n]['track_name'])
            
        else:
            track_danceability.append(f['danceability'])
            track_energy.append(f['energy'])
            track_key.append(f['key'])
            track_loudness.append(f['loudness'])
            track_mode.append(f['mode'])
            track_speechiness.append(f['speechiness'])
            track_acousticness.append(f['acousticness'])
            track_instrumentalness.append(f['instrumentalness'])
            track_liveness.append(f['liveness'])
            track_valence.append(f['valence'])
            track_tempo.append(f['tempo'])
            
#    print('List - From : ' + str(i) + ' To: ' + str(i+len(features)) + 
#          ' * First ID: ' + list_track_ids[0] + ' * Last ID: ' + list_track_ids[len(list_track_ids)-1])       
      
stop = timeit.default_timer()
print ('4.2 Get Track Features (in seconds):', stop - start)

Not Found Audio Features for track: 20338 * saint valentine's day intro - live, 2017
4.2 Get Track Features (in seconds): 213.31354639999995


In [51]:
# Add new columns to tracks dataframe

df_albumns_tracks['track_popularity'] = track_popularity
df_albumns_tracks['track_duration_ms'] = track_duration_ms
df_albumns_tracks['track_danceability'] = track_danceability
df_albumns_tracks['track_energy'] = track_energy
df_albumns_tracks['track_key'] = track_key
df_albumns_tracks['track_loudness'] = track_loudness
df_albumns_tracks['track_mode'] = track_mode
df_albumns_tracks['track_speechiness'] = track_speechiness
df_albumns_tracks['track_acousticness'] = track_acousticness
df_albumns_tracks['track_instrumentalness'] = track_instrumentalness
df_albumns_tracks['track_liveness'] = track_liveness
df_albumns_tracks['track_valence'] = track_valence
df_albumns_tracks['track_tempo'] = track_tempo

In [52]:
# Find tracks rows with NaN audio features and drop it

df_albumns_tracks[df_albumns_tracks['track_danceability'].isnull()]

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date,track_spid,track_name,track_popularity,track_duration_ms,track_danceability,track_energy,track_key,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo
20338,4Pmlf0hZYXcoDSuaNrw23E,Little Steven,5VZyFOwIrkOmH90YGjNAKu,soulfire live!,2021-01-29,7EYDZYvXFGS4xqGoqY6Vjm,"saint valentine's day intro - live, 2017",0,12400,,,,,,,,,,,


In [53]:
df_albumns_tracks.dropna(subset=['track_danceability','track_energy','track_key','track_loudness','track_mode','track_speechiness','track_acousticness','track_instrumentalness','track_liveness','track_valence','track_tempo'], inplace=True)

In [54]:
df_albumns_tracks.reset_index(drop=True)

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date,track_spid,track_name,track_popularity,track_duration_ms,track_danceability,track_energy,track_key,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5pFW81GAL2wafUGGWg2gil,concept2,37,128746,0.762,0.954,6.0,-11.105,0.0,0.0315,0.8350,0.793000,0.1020,0.2840,125.006
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,4dbH856gB6dJkZGKemEsGb,eye of the tiger,47,244520,0.611,0.686,0.0,-5.410,0.0,0.0527,0.5780,0.552000,0.1220,0.5640,130.991
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5POfcpwLoeMuwXSjA5uQIq,pirates of the caribbean,53,180320,0.824,0.722,2.0,-6.455,0.0,0.0474,0.7010,0.872000,0.0993,0.5750,115.435
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,7F5zNt9vFrOkI2fYVd8GOA,cadenza,34,77573,0.119,0.195,7.0,-19.247,1.0,0.0449,0.5100,0.926000,0.0779,0.0328,79.641
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,116stRytHB8url7FIioIRl,hallelujah,48,217440,0.366,0.185,4.0,-15.479,0.0,0.0403,0.8750,0.882000,0.1110,0.3410,150.019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40384,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,3UNzTbgzzeWAMxMUFLppLJ,nuvola,11,183173,0.548,0.552,5.0,-13.569,1.0,0.0471,0.0384,0.000000,0.1200,0.7690,115.845
40385,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,4iX8cZtVMdGRTsaPJhE7r9,come l'aria,9,190333,0.496,0.494,1.0,-13.678,1.0,0.0351,0.1180,0.000027,0.0608,0.6240,87.299
40386,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,0nJqoUxL1wmHUB2W8ikjv5,perche' sei bella,8,242000,0.519,0.487,4.0,-15.633,1.0,0.0549,0.1970,0.000002,0.0602,0.3680,113.421
40387,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,09tQY4UdG2AfVxXE850JbU,sandra,7,161000,0.690,0.508,4.0,-14.348,1.0,0.0496,0.1100,0.000000,0.0457,0.8360,122.924


In [55]:
df_albumns_tracks['track_spid'].nunique()

40389

In [56]:
albumns_tracks = text_normalization(df_albumns_tracks['track_name'].tolist())

In [57]:
df_albumns_tracks['track_name'] = albumns_tracks

In [58]:
# Find duplicates - Tracks by artist - order by popularity desc and keep First

df_albumns_tracks.groupby(['artist_name','track_name']).\
                  filter(lambda group: len(group) > 1).\
                  sort_values(by=['artist_name','track_name','track_popularity'],ascending=[False,False,False])

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date,track_spid,track_name,track_popularity,track_duration_ms,track_danceability,track_energy,track_key,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo
40078,2KftmGt9sk1yLjsAoloC3M,Zucchero,1EIfxgibJn5TQhwN1AUFKX,oro incenso & birra 30th anniversary edition,2019-06-14,72ZVpFAcycaAlUBZGQ31bO,youve chosen me,15,150773,0.346,0.0855,10.0,-12.807,1.0,0.0339,0.9380,0.003230,0.0936,0.1950,138.370
40335,2KftmGt9sk1yLjsAoloC3M,Zucchero,16wYKOLYPUELab2m0gE6u4,zucchero,1991-01-01,4ZORMEu1WzFK7QHW89VN65,youve chosen me,4,150133,0.415,0.0175,10.0,-21.614,1.0,0.0361,0.8980,0.000177,0.0784,0.2020,139.756
40077,2KftmGt9sk1yLjsAoloC3M,Zucchero,1EIfxgibJn5TQhwN1AUFKX,oro incenso & birra 30th anniversary edition,2019-06-14,2aTSgeB97l5UPCm0TObIxn,youre losing me,16,346253,0.416,0.3950,7.0,-8.521,1.0,0.0303,0.4000,0.003710,0.2750,0.0780,139.733
40333,2KftmGt9sk1yLjsAoloC3M,Zucchero,16wYKOLYPUELab2m0gE6u4,zucchero,1991-01-01,33gn0iwa2EQTtpbpnMHC7V,youre losing me,4,347906,0.468,0.2450,7.0,-13.714,1.0,0.0315,0.3740,0.002710,0.3180,0.0997,139.901
40300,2KftmGt9sk1yLjsAoloC3M,Zucchero,0MwkCufcI9AIciJQ3jAXCo,spiritodivino,1995-01-01,76Y2Mygoo8ZH9wJBtODYdd,x colpa di chi?,41,238533,0.667,0.8410,9.0,-9.504,1.0,0.0626,0.0181,0.000998,0.5220,0.4500,127.030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,7iIBnMC3Ds8x033w8QKrgF,2cellos,2012-05-02,7Eht658X79OyvRFXULXZHW,smooth criminal,3,204440,0.591,0.6720,9.0,-6.682,0.0,0.0600,0.9750,0.850000,0.6910,0.7610,139.098
71,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,7iIBnMC3Ds8x033w8QKrgF,2cellos,2012-05-02,7dNXFcI6YNveI5OiPoSRL2,human nature,2,167826,0.585,0.2160,2.0,-13.421,1.0,0.0850,0.9510,0.850000,0.1420,0.5660,100.031
77,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,7iIBnMC3Ds8x033w8QKrgF,2cellos,2012-05-02,44g6sW2KJOep078GkPlFyK,human nature,2,191613,0.727,0.2520,11.0,-16.099,0.0,0.0494,0.9710,0.925000,0.6970,0.4770,101.405
45,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,4OYCUKg4plshmpkGNEEDBz,in2ition,2013-08-21,05TtDdlVEhrbMbVC8ZKulc,every breath you take,3,229586,0.455,0.1810,11.0,-18.515,1.0,0.0414,0.7590,0.805000,0.1110,0.4410,125.831


In [59]:
duplicate_track = df_albumns_tracks[df_albumns_tracks.duplicated(['artist_name','track_name'])]

In [60]:
df_albumns_tracks = df_albumns_tracks.drop_duplicates(subset=['artist_name','track_name'], keep='first')\
                                      .reset_index(drop=True)

In [61]:
df_albumns_tracks

Unnamed: 0,artist_spid,artist_name,album_spid,album_name,album_release_date,track_spid,track_name,track_popularity,track_duration_ms,track_danceability,track_energy,track_key,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo
0,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5pFW81GAL2wafUGGWg2gil,concept2,37,128746,0.762,0.954,6.0,-11.105,0.0,0.0315,0.8350,0.793000,0.1020,0.2840,125.006
1,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,4dbH856gB6dJkZGKemEsGb,eye of the tiger,47,244520,0.611,0.686,0.0,-5.410,0.0,0.0527,0.5780,0.552000,0.1220,0.5640,130.991
2,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,5POfcpwLoeMuwXSjA5uQIq,pirates of the caribbean,53,180320,0.824,0.722,2.0,-6.455,0.0,0.0474,0.7010,0.872000,0.0993,0.5750,115.435
3,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,7F5zNt9vFrOkI2fYVd8GOA,cadenza,34,77573,0.119,0.195,7.0,-19.247,1.0,0.0449,0.5100,0.926000,0.0779,0.0328,79.641
4,6Fi8CHfO8WGtu3yO8c2Mc4,2CELLOS,1tqitQVdu4ygSsO5AABG3F,let there be cello,2018-10-19,116stRytHB8url7FIioIRl,hallelujah,48,217440,0.366,0.185,4.0,-15.479,0.0,0.0403,0.8750,0.882000,0.1110,0.3410,150.019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26503,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,3UNzTbgzzeWAMxMUFLppLJ,nuvola,11,183173,0.548,0.552,5.0,-13.569,1.0,0.0471,0.0384,0.000000,0.1200,0.7690,115.845
26504,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,4iX8cZtVMdGRTsaPJhE7r9,come laria,9,190333,0.496,0.494,1.0,-13.678,1.0,0.0351,0.1180,0.000027,0.0608,0.6240,87.299
26505,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,0nJqoUxL1wmHUB2W8ikjv5,perche sei bella,8,242000,0.519,0.487,4.0,-15.633,1.0,0.0549,0.1970,0.000002,0.0602,0.3680,113.421
26506,2KftmGt9sk1yLjsAoloC3M,Zucchero,7KX3hdT8Kf9zBtEEVevKH0,un po di zucchero,1983-01-01,09tQY4UdG2AfVxXE850JbU,sandra,7,161000,0.690,0.508,4.0,-14.348,1.0,0.0496,0.1100,0.000000,0.0457,0.8360,122.924


In [62]:
artist_tracks_file = os.path.join(data_root, 'artist_tracks.csv')
df_albumns_tracks.to_csv(artist_tracks_file, encoding='utf-8', index=False)