# CREATING THE DATASET FROM THE SPOTIFY API (SPOTIPY)

To create the dataset through the Spotify API, we first import the following libraries:

In [1]:
import spotipy
import spotipy.util as util
import pandas as pd

First of all, we must register on the Spotify for Developers page (https://developer.spotify.com/) to get the client credentials (client ID and client secret) needed to make requests to the API.

In [2]:
# Placeholder credentials: replace them with the client ID and client secret
# obtained from the Spotify developer page. Do not share the notebook with
# real credentials filled in.
cid = 'your_client_id'
secret= 'your_secret_id'

# Hand the credentials manager itself to the client instead of a token fetched
# up front: the client then obtains its access token from the manager, and the
# notebook does not depend on the return type of `get_access_token()`, which
# is a dict (not a plain token string) by default in recent Spotipy releases.
token = spotipy.oauth2.SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=token)

Each request returns metadata (if you want to see it, inspect the `metadata` variable inside the function that follows). We only need some parts of that metadata, so let's create a function that, on the one hand, retrieves the list of songs in a playlist and, on the other, extracts the information we are interested in for each song in the playlist.

In [3]:
def playlist_analyze(playlist_id):
    """Build a DataFrame with one row per track of a Spotify playlist.

    Each row holds identifying information taken from the playlist metadata
    (first album artist, track and album ids/names) followed by the audio
    features that the API reports for the track.

    Parameters
    ----------
    playlist_id : str
        Identifier of the playlist to analyze.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: artist_id, artist_name, track_id, track_name,
        album_id, album_name, danceability, energy, key, loudness, mode,
        speechiness, acousticness, instrumentalness, liveness, valence,
        tempo, uri, track_href, analysis_url, duration_ms, time_signature.
        An empty playlist yields an empty frame with these columns.

    Notes
    -----
    * Uses the module-level Spotipy client ``sp``.
    * Every page of the playlist is read, not only the first one.
    * Audio features are requested in batches instead of one request per
      track, which keeps the number of API calls low.
    * Playlist entries without a track object or without a track id are
      skipped, since audio features cannot be requested for them. Tracks for
      which the API returns no audio features are kept, with missing values
      in the feature columns.
    """
    track_columns = [
        'artist_id', 'artist_name', 'track_id', 'track_name', 'album_id', 'album_name',
    ]
    # The audio-features payload also carries 'type' and 'id'; they are left
    # out here because they duplicate information already in the frame.
    feature_columns = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature',
    ]
    # Maximum number of track ids accepted by one audio-features request.
    max_ids_per_request = 100

    # Collect the items of every page of the playlist.
    metadata = []
    page = sp.playlist_tracks(playlist_id=playlist_id)
    while page:
        metadata.extend(page['items'])
        page = sp.next(page) if page.get('next') else None

    # One record (dict) per usable track; a fresh dict per track so rows never
    # share state.
    records = []
    for item in metadata:
        track = item.get('track')
        if not track or not track.get('id'):
            continue
        album = track['album']
        first_artist = album['artists'][0]
        records.append({
            'artist_id': first_artist['id'],
            'artist_name': first_artist['name'],
            'track_id': track['id'],
            'track_name': track['name'],
            'album_id': album['id'],
            'album_name': album['name'],
        })

    # Fetch the audio features batch by batch and merge them into the records.
    for start in range(0, len(records), max_ids_per_request):
        batch = records[start:start + max_ids_per_request]
        batch_features = sp.audio_features(tracks=[record['track_id'] for record in batch])
        for record, audio_features in zip(batch, batch_features or []):
            audio_features = audio_features or {}
            for name in feature_columns:
                record[name] = audio_features.get(name)

    # Build the frame once; `columns` fixes the order and fills absent values.
    return pd.DataFrame(records, columns=track_columns + feature_columns)

The API allows a limited number of requests, so when I tried to run the function in a loop (to call it only once, or at least fewer times) and concatenate all the resulting DataFrames into a single one, an error was raised. In the end I executed the function once per playlist and gathered all the DataFrames in a list. It's boring and inefficient, I know, but it is the way that I found...

As I create each DataFrame, I also add a label column with the genre of the analyzed playlist.

In [5]:
# One DataFrame per playlist; `.assign(label=...)` attaches the genre of the
# playlist as an extra `label` column on the analyzed frame.

# Techno
techno1 = playlist_analyze(playlist_id='3P1cXwm04X20xZ6V31lsJy').assign(label='techno')  # Pole group
techno2 = playlist_analyze(playlist_id='4hdl30OKdcZT0QWFBeXCmN').assign(label='techno')  # Dynamic reflection
techno3 = playlist_analyze(playlist_id='1Ww4mV2yx1D5025YybX5Xs').assign(label='techno')  # Float records Discography
techno4 = playlist_analyze(playlist_id='16soM7EPPujoYoLfygQByw').assign(label='techno')  # Illegal Alien Records
techno5 = playlist_analyze(playlist_id='5jSEj702WC4Ag9Ky3rvxRn').assign(label='techno')  # SNTS VXS

# Reggaeton
reggaeton1 = playlist_analyze(playlist_id='03sDEv7FN58Mb9CJOs1Tgn').assign(label='reggaeton')  # Reggaeton 2020
reggaeton2 = playlist_analyze(playlist_id='37i9dQZF1DX8SfyqmSFDwe').assign(label='reggaeton')  # Reggaeton classics
reggaeton3 = playlist_analyze(playlist_id='3w74oguynJ5C2xBrQwZoha').assign(label='reggaeton')  # Reggaeton 2016/2017/2018
reggaeton4 = playlist_analyze(playlist_id='1e5t2YFFF2AoQDjvlwwb46').assign(label='reggaeton')  # Reggaeton 2019
reggaeton5 = playlist_analyze(playlist_id='7eTjBdjyTxMp5JTh3ClAzc').assign(label='reggaeton')  # Reggaeton 2014/2015 Verano 2015 Party Summer

# Hip-hop
hiphop1 = playlist_analyze(playlist_id='6GyVHGnKUkemc1EJKKd4nt').assign(label='hip-hop')  # Hip Hop 90's
hiphop2 = playlist_analyze(playlist_id='0VmmZTMTHafp1zdyHcBmsX').assign(label='hip-hop')  # Best Hip Hop 2000's Hits
hiphop3 = playlist_analyze(playlist_id='2tmp6zVeKUeo7HqRHVI9UE').assign(label='hip-hop')  # hip hop 2010's~2020's
hiphop4 = playlist_analyze(playlist_id='4MPXQ118FvaciMTiAweNK9').assign(label='hip-hop')  # french hip hop
hiphop5 = playlist_analyze(playlist_id='45gaAgCMLxB15QLW4AuhQ1').assign(label='hip-hop')  # Hip hop España

# Rock
rock1 = playlist_analyze(playlist_id='30KEBso4y7A8gZaZzccfes').assign(label='rock')  # The Rolling Stones
rock2 = playlist_analyze(playlist_id='40lF0q156Mia45i9SYl9uz').assign(label='rock')  # Complete Led Zeppelin
rock3 = playlist_analyze(playlist_id='6mNsuJk3czAxfBtNKy5mB5').assign(label='rock')  # Pearl Jam Complete
rock4 = playlist_analyze(playlist_id='6yvEgd1N7ZJrPcX4OVsCRo').assign(label='rock')  # Complete The Black Keys
rock5 = playlist_analyze(playlist_id='1PZGLQ4XWoAzrZ2Nfjbqvr').assign(label='rock')  # Blur Complete

# Jazz
jazz1 = playlist_analyze(playlist_id='4uXIzYKr0b3WUOWbrfMtii').assign(label='jazz')  # The Complete Dreyfus Jazz Recordings (L'Intégrale) (Michel Petrucciani)
jazz2 = playlist_analyze(playlist_id='7dO4VJmUhpC8Zcl3k62IDI').assign(label='jazz')  # Keith Jarrett: The Complete ECM Recordings
jazz3 = playlist_analyze(playlist_id='37i9dQZF1EFI0Kzb17qEYK').assign(label='jazz')  # Written By Miles Davis
jazz4 = playlist_analyze(playlist_id='1MMimAMBN8um4dpuRqNc6q').assign(label='jazz')  # Slow Jazz - Chilling w/ Chet Baker
jazz5 = playlist_analyze(playlist_id='34qcFoFmM0tUyS4eFsOC4G').assign(label='jazz')  # Marcin Wasilewski Trio

# Every labelled frame, grouped by genre, ready to be concatenated.
megalist_df = [
    techno1, techno2, techno3, techno4, techno5,
    reggaeton1, reggaeton2, reggaeton3, reggaeton4, reggaeton5,
    hiphop1, hiphop2, hiphop3, hiphop4, hiphop5,
    rock1, rock2, rock3, rock4, rock5,
    jazz1, jazz2, jazz3, jazz4, jazz5,
]

The next step is to join all the DataFrames with a function, `concatenation`, and shuffle all the rows of the resulting DataFrame.

In [6]:
def concatenation(list_df, random_state=None):
    """Concatenate several DataFrames and shuffle the rows of the result.

    Parameters
    ----------
    list_df : iterable of pandas.DataFrame
        Frames to stack on top of each other. The input frames are not
        modified.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible. With the default ``None`` every call produces a
        different order.

    Returns
    -------
    pandas.DataFrame
        All rows of the input frames in random order, with a fresh
        0..n-1 index. An empty input yields an empty DataFrame.
    """
    frames = list(list_df)
    if not frames:
        # pd.concat raises on an empty list; an empty frame is the natural
        # result of concatenating nothing.
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append calls:
    # DataFrame.append no longer exists in pandas 2.x, and appending in a
    # loop copies the accumulated data on every iteration.
    df = pd.concat(frames)

    # frac=1 samples every row, i.e. shuffles; the old index is discarded.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Merge every labelled playlist frame into one shuffled dataset.
atracks = concatenation(megalist_df)

# Preview the first five rows of the combined dataset.
atracks.head(5)

Unnamed: 0,artist_id,artist_name,track_id,track_name,album_id,album_name,danceability,energy,key,loudness,...,instrumentalness,liveness,valence,tempo,uri,track_href,analysis_url,duration_ms,time_signature,label
0,56n1NeXsTOOxjX3Z4lVMTJ,SFDK,4HSUcBiPq6e9fsYv5q1yvg,El Niño Güei,1Nux8B69WySS5XMMprdgL3,2005,0.873,0.508,6,-5.564,...,0.0,0.0609,0.694,89.956,spotify:track:4HSUcBiPq6e9fsYv5q1yvg,https://api.spotify.com/v1/tracks/4HSUcBiPq6e9...,https://api.spotify.com/v1/audio-analysis/4HSU...,289200,4,hip-hop
1,1jLIRfWqZf402sVJzCVVNc,Go Hiyama,2q5Z0l8sJkvwTGIwVTq0nS,C - Reeko Remix,5aHgA6VigO9vXotfvR9EdL,Crispy Bites EP,0.743,0.949,1,-6.188,...,0.888,0.0309,0.326,138.028,spotify:track:2q5Z0l8sJkvwTGIwVTq0nS,https://api.spotify.com/v1/tracks/2q5Z0l8sJkvw...,https://api.spotify.com/v1/audio-analysis/2q5Z...,396522,4,techno
2,56n1NeXsTOOxjX3Z4lVMTJ,SFDK,4aWrVBft7jEHE10aADrg4p,Sin Miedo a Vivir,0Tp6yhqsb5UekzoUSBtxZK,Sin Miedo a Vivir (Extended Version),0.66,0.873,0,-7.349,...,0.0,0.317,0.759,97.038,spotify:track:4aWrVBft7jEHE10aADrg4p,https://api.spotify.com/v1/tracks/4aWrVBft7jEH...,https://api.spotify.com/v1/audio-analysis/4aWr...,193013,4,hip-hop
3,7kaC28XGoTp1ViSZAA0iuX,Jossef,28TkGcDuxQlbjo6rJEjh7e,Una Noche,3ixuEcCz3Cha9FTolbdmFM,Una Noche,0.648,0.738,0,-4.903,...,0.000511,0.0975,0.29,179.997,spotify:track:28TkGcDuxQlbjo6rJEjh7e,https://api.spotify.com/v1/tracks/28TkGcDuxQlb...,https://api.spotify.com/v1/audio-analysis/28Tk...,192667,4,reggaeton
4,6Unmr1mmDxRqZY7jkSQOcg,Snts,2NIbNluHtEOlpYBuo0T7XV,NO JVSTIFIED RESENTMENTS,3AlgTOV86TxYwEcrDT4ozE,EVOKED RVPTVRE,0.388,0.998,6,-8.848,...,0.655,0.51,0.174,137.007,spotify:track:2NIbNluHtEOlpYBuo0T7XV,https://api.spotify.com/v1/tracks/2NIbNluHtEOl...,https://api.spotify.com/v1/audio-analysis/2NIb...,341733,4,techno


Finally, we are going to write the data to a CSV file.

In [7]:
# Write the dataset to a file named 'spotify_tracks' (no extension) in the
# working directory; index=True stores the row index as the first column.
atracks.to_csv(path_or_buf='spotify_tracks', index=True)