In [11]:
# Importing libraries
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
import sys
from pprint import pprint
import pandas as pd
import os
import json
import numpy as np
import pandas

from pandas.errors import ParserError

In [12]:
# Authentication to Spotify API
config_file = 'config/spotify_api_credentials.json' # JSON file with your personal credentials
with open(config_file, 'r') as f:
    config = json.load(f)
    
cid = config['client_id']
secret = config['client_id_secret']

credentials = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=credentials)

In [13]:
market = 'br'
years = [2019, 2020, 2021]
hits = pd.DataFrame(columns=['song_id',
                            'song_name',
                            'artist_id',
                            'artist_name',
                            'popularity',
                            'explicit',
                            'song_type',
                            'track_number',
                            'num_artists',
                            'num_available_markets',
                            'release_date',
                            'duration_ms',
                            'key',
                            'mode',
                            'time_signature',
                            'acousticness',
                            'danceability',
                            'energy',
                            'instrumentalness',
                            'liveness',
                            'loudness',
                            'speechiness',
                            'valence',
                            'tempo'
])

In [22]:
chart.head()

Unnamed: 0,Position,TrackName,Artist,Streams,URL
0,1,Atrasadinha - Ao Vivo,Felipe Araújo,5517626,https://open.spotify.com/track/5EKHbW7mXcAR4hC...
1,2,Parado no Bailão,MC L da Vinte,4439458,https://open.spotify.com/track/3Hb9kUdm4yf839F...
2,3,Notificação Preferida - Ao Vivo,Zé Neto & Cristiano,4345439,https://open.spotify.com/track/2qkNnJvQvhLiZ63...
3,4,Ciumeira - Ao Vivo,Marília Mendonça,4161551,https://open.spotify.com/track/2H1liABu20HXBOm...
4,5,Quem Me Dera,Márcia Fellipe,3989150,https://open.spotify.com/track/2lAyctg5FocS67e...


In [None]:
chart.columns = chart[]

In [14]:
song_ids = set()

In [23]:
for year in years:
    print('%s | %d | Retrieving charts...' % (market, year))

    directory = 'dataset/charts/%s/%d' % (market, year)
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            path = directory + '/' + filename

            try:
                chart = pd.read_csv(path, names=['Position', 'TrackName', 'Artist', 'Streams', 'URL'], encoding='utf-8', delimiter=',', header=1)
                ids = [x.split('https://open.spotify.com/track/')[1] for x in chart['URL']]

                for sid in ids:
                    song_ids.add(sid)
            except ParserError:
                print('ERROR | Check file %s' % path)
print('Finished.')

br | 2019 | Retrieving charts...
br | 2020 | Retrieving charts...
br | 2021 | Retrieving charts...
Finished.


In [24]:
total_songs = len(song_ids)

In [25]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [26]:
# Split song_ids into equal-sized (n=50) lists
CHUNK_SIZE = 50

sid_list = list(song_ids)
sid_chks = list(chunks(sid_list, CHUNK_SIZE))

In [27]:
# Get track info and audio features
count = 0
for chnk in sid_chks:
    track_info_list = sp.tracks(chnk) # Get basic info fom tracks
    audio_features_list = sp.audio_features(chnk) # Get acoustic features

    for i in range(len(chnk)):    
        song = {}
        track_info = track_info_list['tracks'][i]
        audio_features = audio_features_list[i]
        
        # Sanity check
        if audio_features:
            if track_info['id'] != audio_features['id']:
                print(f"ERROR | {track_info['id']} != {audio_features['id']}")

        song['song_id'] = track_info['id']
        song['song_name'] = track_info['name']
        song['popularity'] = track_info['popularity']
        song['explicit'] = track_info['explicit']
        song['track_number'] = track_info['track_number']                         
        song['num_available_markets'] = max(len(track_info['available_markets']), len(track_info['album']['available_markets']))
        song['duration_ms'] = track_info['duration_ms']

        # Artist-based features
        artist_id = []
        artist_name = []
        for artist in track_info['artists']:
            artist_id.append(artist['id'])
            artist_name.append(artist['name'])
        song['artist_id'] = str(artist_id)
        song['artist_name'] = str(artist_name)
        song['num_artists'] = len(artist_id)
        if song['num_artists'] > 1:
            song['song_type'] = 'Collaboration'
        else:
            song['song_type'] = 'Solo'

        # Album-based features
        song['release_date'] = track_info['album']['release_date']

        # Acoustic features
        song['key'] = audio_features['key'] if audio_features else np.nan
        song['mode'] = audio_features['mode'] if audio_features else np.nan
        song['time_signature'] = audio_features['time_signature'] if audio_features else np.nan
        song['acousticness'] = audio_features['acousticness'] if audio_features else np.nan
        song['danceability'] = audio_features['danceability'] if audio_features else np.nan
        song['energy'] = audio_features['energy'] if audio_features else np.nan
        song['instrumentalness'] = audio_features['instrumentalness'] if audio_features else np.nan
        song['liveness'] = audio_features['liveness'] if audio_features else np.nan
        song['loudness'] = audio_features['loudness'] if audio_features else np.nan
        song['speechiness'] = audio_features['speechiness'] if audio_features else np.nan
        song['valence'] = audio_features['valence'] if audio_features else np.nan
        song['tempo'] = audio_features['tempo'] if audio_features else np.nan

        # Appending in dataframe
        hits = hits.append(song, ignore_index=True)
    
    count += CHUNK_SIZE
    if count % 100 == 0:
        print('%.2f%% | Collected %d from %d songs' % ((100 * count / total_songs), count, total_songs))

print('Finished')

4.65% | Collected 100 from 2150 songs
9.30% | Collected 200 from 2150 songs
13.95% | Collected 300 from 2150 songs
18.60% | Collected 400 from 2150 songs
23.26% | Collected 500 from 2150 songs
27.91% | Collected 600 from 2150 songs
32.56% | Collected 700 from 2150 songs
37.21% | Collected 800 from 2150 songs
41.86% | Collected 900 from 2150 songs
46.51% | Collected 1000 from 2150 songs
51.16% | Collected 1100 from 2150 songs
55.81% | Collected 1200 from 2150 songs
60.47% | Collected 1300 from 2150 songs
65.12% | Collected 1400 from 2150 songs
69.77% | Collected 1500 from 2150 songs
74.42% | Collected 1600 from 2150 songs
79.07% | Collected 1700 from 2150 songs
83.72% | Collected 1800 from 2150 songs
88.37% | Collected 1900 from 2150 songs
93.02% | Collected 2000 from 2150 songs
97.67% | Collected 2100 from 2150 songs
Finished


In [28]:
# Saving data
hits.drop_duplicates(subset = 'song_id', keep = 'first', inplace = True) 
hits.to_csv('dataset/spotify_hits_dataset_complete.csv', index=False, sep='\t', encoding='utf-8', header=True)

In [40]:
hits.head(30)

Unnamed: 0,song_id,song_name,artist_id,artist_name,popularity,explicit,song_type,track_number,num_artists,num_available_markets,...,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo
0,7offICpl9kla5kC61uUAs9,Bate Palma,"['5GqnSMX8p2hxqsM6LqDty3', '0jTDeBJQr3unrK29Lk...","['MC JottaPê', 'Lexa']",50,False,Collaboration,1,2,181,...,4,0.626,0.857,0.685,0.0,0.206,-4.84,0.38,0.837,150.052
1,1dLXLETYiZohIZWnse4ypG,Better,['5ZsFI1h6hIdQRw2ti0hz81'],['ZAYN'],68,False,Solo,1,1,183,...,4,0.353,0.637,0.418,0.0,0.131,-7.137,0.301,0.0807,84.295
2,79Uf0vVa7HwWKSgdpTwtRm,Século 21,"['7KVJCU4z5L4EUHILL8aMxR', '4PzYKhC14sTJNEr0dz...","['Leo Santana', 'Luísa Sonza']",50,True,Collaboration,5,2,184,...,4,0.456,0.832,0.769,0.0,0.407,-7.023,0.0578,0.724,132.972
3,3U4isOIWM3VvDubwSI3y7a,All of Me,['5y2Xq6xcjJb2jVM54GHK3t'],['John Legend'],88,False,Solo,6,1,184,...,4,0.922,0.422,0.264,0.0,0.132,-7.064,0.0322,0.331,119.93
4,78kar2tZk7655xZMibzXO3,Oi,['5D56dZmhE9DgT01XixdHiD'],['Lagum'],63,False,Solo,4,1,184,...,4,0.296,0.807,0.56,0.0,0.101,-9.356,0.0459,0.725,134.093
5,1qSikFO1BLTMyZCi2SbksV,Pray (feat. Conor Maynard),"['0NGAZxHanS9e0iNHpR8f2W', '6mU8ucezzms5I2kNH6...","['Alok', 'Conor Maynard']",45,False,Collaboration,1,2,1,...,4,0.113,0.355,0.737,0.0,0.218,-4.169,0.05,0.269,169.489
6,7cMztRE7oWfQSRN569IY7K,Namora Eu Aí,['1elUiq4X7pxej6FRlrEzjM'],['Jorge & Mateus'],69,False,Solo,14,1,181,...,4,0.411,0.689,0.805,0.0,0.0608,-3.884,0.104,0.961,156.129
7,4191RXFPa7Ge9XkA4cWlna,Lost Cause,['6qqNVTkY8uBg9cP3Jd7DAH'],['Billie Eilish'],80,True,Solo,7,1,184,...,4,0.702,0.526,0.334,0.00782,0.0586,-8.491,0.251,0.522,74.966
8,7uU04pPMnKELpv8mIJ3yLa,Zé da Recaída - Ao Vivo,['7MiDcPa6UiV3In7lIM71IN'],['Gusttavo Lima'],59,False,Solo,1,1,180,...,4,0.0675,0.837,0.881,0.0,0.728,-2.98,0.0369,0.756,134.974
9,1IWNylpZ477gIVUDpJL66u,Sour Candy (with BLACKPINK),"['1HY2Jd0NmPuamShAr6KMms', '41MozSoPIsD1dJM0CL...","['Lady Gaga', 'BLACKPINK']",69,False,Collaboration,10,2,184,...,4,0.0604,0.752,0.87,1.1e-05,0.489,-3.812,0.0623,0.784,120.006


In [39]:
for year in years:
    print('%s | %d | Updating charts...' % (market, year))

    directory = 'dataset/charts/%s/%d' % (market, year)
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            path = directory + '/' + filename

            try:
                chart = pd.read_csv(path, names=['Position', 'TrackName', 'Artist', 'Streams', 'URL'], encoding='utf-8', delimiter=',', header=1)
                chart.to_csv('dataset/charts/%s/%d/updated-%s' % (market, year, filename), index=False, encoding='utf-8')
            except ParserError:
                print('ERROR | Check file %s' % path)
print('Finished.')

br | 2019 | Updating charts...
br | 2020 | Updating charts...
br | 2021 | Updating charts...
Finished.


In [37]:
path = 'dataset/charts/br/2019/updated-regional-br-weekly-2018-12-28--2019-01-04.csv'
chart = pd.read_csv(path, delimiter=',')
chart.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 11, saw 2
