In [2]:
# import libraries
import base64
import itertools
import json
import os

import numpy as np
import pandas as pd
import requests

# 1. Extracting data using Spotify API

## Authorization

In [3]:
# ID
# Read the Spotify app credentials from the environment so the real values never
# have to be typed into (or saved with) the notebook. The masked placeholders are
# kept as fallbacks, so both names are still defined when the variables are unset.
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '**********')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '*********')

In [4]:
# Request an access token from the Spotify accounts service.
# The Authorization header carries base64("<client_id>:<client_secret>").
client_creds = f'{client_id}:{client_secret}'
client_creds_b64 = base64.b64encode(client_creds.encode()) # base64 can only encode bytes, so encode str to bytes first

# get token access following the format of Spotify API
token_url = 'https://accounts.spotify.com/api/token'
method = 'POST'  # informational only; requests.post() below already issues a POST
auth_data = {
    'grant_type': "client_credentials"
}
auth_header = {
    'Content-Type': 'application/x-www-form-urlencoded',
    "Authorization": f'Basic {client_creds_b64.decode()}' #Basic <base64 encoded client_id:client_secret>
}
r = requests.post(token_url, data = auth_data, headers = auth_header, timeout = 30)

# Fail loudly instead of swallowing the error: every later cell reads
# `access_token`, so carrying on without it would only postpone the failure
# to a confusing NameError further down.
if not 200 <= r.status_code <= 299:
    raise RuntimeError(f'Spotify token request failed with HTTP {r.status_code}: {r.text}')

print(r.status_code)
token_response_data = r.json()
access_token = token_response_data['access_token'] # get token access

200


## Get the tracks and the artists of my playlist

In [5]:
# create a function to request data
def get_data(token, id, endpoint):
    """Send an authenticated GET request and return the decoded JSON body.

    Parameters
    ----------
    token : str
        Access token, sent as a ``Bearer`` Authorization header.
    id : str
        Not used by the request (the caller already embeds the ID in
        ``endpoint``); kept so existing calls keep working.
    endpoint : str
        Full URL to request.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error payload is
        not silently handed to the caller as if it were data.
    """
    get_header = {
        'Authorization': f'Bearer {token}',
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(endpoint, headers = get_header, timeout = 30)
    res.raise_for_status()
    return res.json()

In [6]:
# get the data of the tracks of my playlist
# Use the bare playlist ID. With the share-link suffix ('?si=...') embedded in the
# ID, the trailing '/tracks' ended up inside the query string, so the request was
# effectively sent to the playlist endpoint: the response carries playlist-level
# keys with the track page nested under 'tracks'.
playlist_id = '1mRlx7vfI14CKkavuMRczM'
# Request the playlist object explicitly; the cells below read
# tracklist['tracks']['items'], which is the shape this endpoint returns.
tracks_endpoint = f'https://api.spotify.com/v1/playlists/{playlist_id}'
token = access_token
tracklist = get_data(token, playlist_id, tracks_endpoint)

In [7]:
tracklist.keys() # check the top-level keys of the response dict

dict_keys(['collaborative', 'description', 'external_urls', 'followers', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])

In [8]:
tracklist['tracks'].keys() # keys of the nested 'tracks' object; the track entries are under 'items'

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [9]:
tracklist['tracks']['items'][0].keys() # keys of the first playlist item; the track itself is under 'track'

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

In [10]:
tracklist['tracks']['items'][0]['track'].keys() # keys of the first track object (name, id, popularity, artists, ...)

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [11]:
tracklist['tracks']['items'][0]['track']['artists'] # artists credited on the first track (a list of dicts)

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2wY79sveU1sp5g7SokKOiI'},
  'href': 'https://api.spotify.com/v1/artists/2wY79sveU1sp5g7SokKOiI',
  'id': '2wY79sveU1sp5g7SokKOiI',
  'name': 'Sam Smith',
  'type': 'artist',
  'uri': 'spotify:artist:2wY79sveU1sp5g7SokKOiI'}]

In [12]:
# keep a copy of the raw playlist response as a JSON file on the local drive
with open('tracklist.json', mode='w') as json_file:
    json.dump(tracklist, fp=json_file)

In [13]:
# start every column of the tracklist table as its own, independent empty list
(artist_names, artist_id, artist_url,
 track_names, track_url, track_id, popularity) = ([] for _ in range(7))

In [14]:
# walk through the playlist items and copy the wanted fields into the column lists
for item in tracklist['tracks']['items']:
    track = item['track']
    lead_artist = track['artists'][0]  # only the first credited artist is recorded

    # artist columns
    artist_names.append(lead_artist['name'])
    # NOTE(review): 'external_urls' is a dict ({'spotify': <url>}), so the two
    # *_url columns hold dicts rather than plain URL strings
    artist_url.append(lead_artist['external_urls'])
    artist_id.append(lead_artist['id'])

    # track columns
    track_names.append(track['name'])
    track_url.append(track['external_urls'])
    track_id.append(track['id'])
    popularity.append(track['popularity'])


In [15]:
# bundle the column lists into one dictionary (column name -> list of values)
track_dic = dict(
    artist_names=artist_names,
    artist_url=artist_url,
    artist_id=artist_id,
    track_names=track_names,
    track_url=track_url,
    track_id=track_id,
    popularity=popularity,
)

In [16]:
# turn the dictionary of columns into a dataframe
df_tracklist = pd.DataFrame(data=track_dic)

In [17]:
df_tracklist.head() # preview the first five rows of the tracklist dataframe

Unnamed: 0,artist_names,artist_url,artist_id,track_names,track_url,track_id,popularity
0,Sam Smith,{'spotify': 'https://open.spotify.com/artist/2...,2wY79sveU1sp5g7SokKOiI,Lay Me Down,{'spotify': 'https://open.spotify.com/track/74...,74sb4Gib0cL3TQeCjYF8vh,67
1,Doja Cat,{'spotify': 'https://open.spotify.com/artist/5...,5cj0lLjcoR7YOSnhnX0Po5,Kiss Me More (feat. Naomi Watanabe),{'spotify': 'https://open.spotify.com/track/6X...,6XgBxK1v4li6dOI5to10Sz,59
2,Joji,{'spotify': 'https://open.spotify.com/artist/3...,3MZsBdqDrRTJihTHQrO6Dq,Glimpse of Us,{'spotify': 'https://open.spotify.com/track/6x...,6xGruZOHLs39ZbVccQTuPZ,96
3,Kendrick Lamar,{'spotify': 'https://open.spotify.com/artist/2...,2YZyLoL8N0Wb9xBt1NhZWg,The Heart Part 5,{'spotify': 'https://open.spotify.com/track/5q...,5qbhVL3vB7HwWvb0042B7y,65
4,Soft Lipa,{'spotify': 'https://open.spotify.com/artist/3...,3Xp3DA50zRP4TYOtNR7k1T,關於小熊,{'spotify': 'https://open.spotify.com/track/13...,13tJ9Mkj7cjXxrTjOh0LAN,47


In [18]:
# write the tracklist dataframe to a CSV file on the local drive (row index left out)
df_tracklist.to_csv(path_or_buf='tracklist.csv', index=False)

## Get artists_features data

In [19]:
# one empty list per artist attribute that will be collected
# NOTE: `id` shadows the Python builtin of the same name; the name is kept because
# a later cell appends to it directly.
followers, genres, id, artist_name, artist_popularity, artist_url = ([] for _ in range(6))

# map each column name to its list; the dictionary shares the list objects above,
# so values appended to the lists later show up in the dictionary as well
artists_features_dic = dict(
    followers=followers,
    genres=genres,
    id=id,
    artist_name=artist_name,
    artist_popularity=artist_popularity,
    artist_url=artist_url,
)

In [20]:
# created a function to get the data for artists features
def get_artists_features(token,artist_id):
    """Fetch the Spotify artist object for one artist.

    Parameters
    ----------
    token : str
        Access token used for the Bearer Authorization header.
    artist_id : str
        Spotify ID of the artist to look up.

    Returns
    -------
    The JSON-decoded response of ``GET /v1/artists/{artist_id}``.
    """
    artists_features_endpoint = f'https://api.spotify.com/v1/artists/{artist_id}'
    # get_data (defined earlier in this notebook) builds the same headers and
    # performs the GET, so the request code lives in one place.
    return get_data(token, artist_id, artists_features_endpoint)

In [21]:
# collect the artist object of every row in the tracklist
artists_features = []
token = access_token
# The loop variable is deliberately NOT called `artist_id`: that name already refers
# to the list of artist ids built for the tracklist, and looping with it would rebind
# the name to a string, so re-running the earlier append cell would then fail.
for playlist_artist_id in df_tracklist['artist_id']:
    artists_features_list = get_artists_features(token, playlist_artist_id)
    artists_features.append(artists_features_list) # one response per tracklist row, in row order

In [22]:
artists_features[:2] # preview the first two entries only; dumping the whole list floods the output

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2wY79sveU1sp5g7SokKOiI'},
  'followers': {'href': None, 'total': 20044493},
  'genres': ['dance pop', 'pop', 'uk pop'],
  'href': 'https://api.spotify.com/v1/artists/2wY79sveU1sp5g7SokKOiI',
  'id': '2wY79sveU1sp5g7SokKOiI',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/ab6761610000e5eb7f9cb6cb3ac6f8a055153ced',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/ab676161000051747f9cb6cb3ac6f8a055153ced',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/ab6761610000f1787f9cb6cb3ac6f8a055153ced',
    'width': 160}],
  'name': 'Sam Smith',
  'popularity': 82,
  'type': 'artist',
  'uri': 'spotify:artist:2wY79sveU1sp5g7SokKOiI'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/5cj0lLjcoR7YOSnhnX0Po5'},
  'followers': {'href': None, 'total': 21552164},
  'genres': ['dance pop', 'pop'],
  'href': 'https://api.spotify.com/v1/artists/5cj0lLjcoR7YOSn

In [23]:
artists_features[0].keys() # keys available on one artist entry (checked on the first one)

dict_keys(['external_urls', 'followers', 'genres', 'href', 'id', 'images', 'name', 'popularity', 'type', 'uri'])

In [24]:
# Copy the wanted fields of every fetched artist into the column lists created earlier.
# Iterating over `artists_features` itself (instead of a fixed range(100)) keeps the
# cell working when fewer than 100 artists were fetched, e.g. for a shorter playlist.
for artist in artists_features:
    followers.append(artist['followers']['total']) # follower count
    genres.append(artist['genres']) # list of genre names
    id.append(artist['id']) # artist id
    artist_name.append(artist['name']) # artist name
    artist_popularity.append(artist['popularity']) # popularity score
    artist_url.append(artist['href']) # API href of the artist

In [25]:
# build the artist-features dataframe from the dictionary of columns
df_artists_features_dic = pd.DataFrame(data=artists_features_dic)

In [26]:
# write the artist-features dataframe to a CSV file on the local drive (row index left out)
df_artists_features_dic.to_csv(path_or_buf='artists_features.csv', index=False)

In [30]:
df_artists_features_dic.head() # preview the first five rows of the artist-features dataframe

Unnamed: 0,followers,genres,id,artist_name,artist_popularity,artist_url
0,20044493,"[dance pop, pop, uk pop]",2wY79sveU1sp5g7SokKOiI,Sam Smith,82,https://api.spotify.com/v1/artists/2wY79sveU1s...
1,21552164,"[dance pop, pop]",5cj0lLjcoR7YOSnhnX0Po5,Doja Cat,88,https://api.spotify.com/v1/artists/5cj0lLjcoR7...
2,6828617,[viral pop],3MZsBdqDrRTJihTHQrO6Dq,Joji,85,https://api.spotify.com/v1/artists/3MZsBdqDrRT...
3,21436080,"[conscious hip hop, hip hop, rap, west coast rap]",2YZyLoL8N0Wb9xBt1NhZWg,Kendrick Lamar,88,https://api.spotify.com/v1/artists/2YZyLoL8N0W...
4,110514,"[chinese indie, mandopop, taiwan hip hop, taiw...",3Xp3DA50zRP4TYOtNR7k1T,Soft Lipa,45,https://api.spotify.com/v1/artists/3Xp3DA50zRP...


## Get Toptracks of the artists

In [31]:
# create a function to request the toptracks of the artists in my playlist
def get_toptracks(token,artist_id, market='GB'):
    """Fetch an artist's top tracks for one market.

    Parameters
    ----------
    token : str
        Access token used for the Bearer Authorization header.
    artist_id : str
        Spotify ID of the artist.
    market : str, optional
        Market code sent as the ``market`` query parameter. Defaults to
        ``'GB'``, the value that used to be hard-coded in the URL.

    Returns
    -------
    The JSON-decoded response; the tracks are listed under the ``'tracks'`` key.
    """
    # The URL previously started with a stray TAB character inside the string;
    # it is removed here so the endpoint is a clean URL.
    toptracks_endpoint = f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market={market}'
    # get_data (defined earlier in this notebook) builds the same headers and
    # performs the GET, so the request code lives in one place.
    return get_data(token, artist_id, toptracks_endpoint)

In [32]:
toptracks = [] # one top-tracks response per tracklist row, in row order
token = access_token
# The loop variable is deliberately NOT called `artist_id`, so the module-level name
# `artist_id` (a list of ids used as a dataframe column) is not rebound to a string.
for playlist_artist_id in df_tracklist['artist_id']:
    toptracks_list = get_toptracks(token, playlist_artist_id) # response for this artist
    toptracks.append(toptracks_list)

In [33]:
toptracks[1]['tracks'][9].keys() # keys of one top-track entry (second response, tenth track)

dict_keys(['album', 'artists', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'is_playable', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [34]:
toptracks[99] # toptracks[i] holds the top-tracks response for the i-th row of df_tracklist; this shows the entry at index 99

{'tracks': [{'album': {'album_type': 'single',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1AhjOkOLkbHUfcHDSErXQs'},
      'href': 'https://api.spotify.com/v1/artists/1AhjOkOLkbHUfcHDSErXQs',
      'id': '1AhjOkOLkbHUfcHDSErXQs',
      'name': '88rising',
      'type': 'artist',
      'uri': 'spotify:artist:1AhjOkOLkbHUfcHDSErXQs'},
     {'external_urls': {'spotify': 'https://open.spotify.com/artist/6UbmqUEgjLA6jAcXwbM1Z9'},
      'href': 'https://api.spotify.com/v1/artists/6UbmqUEgjLA6jAcXwbM1Z9',
      'id': '6UbmqUEgjLA6jAcXwbM1Z9',
      'name': 'BIBI',
      'type': 'artist',
      'uri': 'spotify:artist:6UbmqUEgjLA6jAcXwbM1Z9'}],
    'external_urls': {'spotify': 'https://open.spotify.com/album/6zQD9g698P2LjEtKpoBXWq'},
    'href': 'https://api.spotify.com/v1/albums/6zQD9g698P2LjEtKpoBXWq',
    'id': '6zQD9g698P2LjEtKpoBXWq',
    'images': [{'height': 640,
      'url': 'https://i.scdn.co/image/ab67616d0000b273d373d58ad02ede9bfba2dbb5',
      'wid

In [84]:
# start every column of the top-tracks table as its own, independent empty list
(artist_name, artist_id, track_name,
 track_id, track_popularity, track_url) = ([] for _ in range(6))


In [86]:
# Visit every top track of every fetched artist.
# The previous itertools.product([0,99],[0,9]) only produced the four index pairs
# (0,0), (0,9), (99,0) and (99,9), because the lists were taken as the literal values
# to combine rather than as ranges. Iterating over the data directly covers all
# entries and also copes with artists that have fewer than ten top tracks.
for artist_toptracks in toptracks:
    for top_track in artist_toptracks['tracks']:
        lead_artist = top_track['artists'][0] # first credited artist of the track
        artist_name.append(lead_artist['name'])
        artist_id.append(lead_artist['id'])
        track_name.append(top_track['name'])
        track_id.append(top_track['id'])
        track_popularity.append(top_track['popularity'])
        track_url.append(top_track['href']) # API href of the track

In [90]:
# bundle the top-track column lists into one dictionary (column name -> list of values)
toptracks_dic = dict(
    artist_name=artist_name,
    artist_id=artist_id,
    track_name=track_name,
    track_id=track_id,
    track_popularity=track_popularity,
    track_url=track_url,
)

In [91]:
# build the top-tracks dataframe from the dictionary of columns
df_toptracks_dic = pd.DataFrame(data=toptracks_dic)

In [92]:
# write the top-tracks dataframe to a CSV file on the local drive (row index left out)
df_toptracks_dic.to_csv(path_or_buf='toptracks.csv', index=False)

## Get the features of the tracks in the tracklist

In [41]:
# create a function to request the data of tracks' audio features
def get_features (token,track_id):
    """Fetch the audio-features object for one track.

    Parameters
    ----------
    token : str
        Access token used for the Bearer Authorization header.
    track_id : str
        Spotify ID of the track.

    Returns
    -------
    The JSON-decoded response of ``GET /v1/audio-features/{track_id}``.
    """
    features_endpoint = f'https://api.spotify.com/v1/audio-features/{track_id}'
    # get_data (defined earlier in this notebook) builds the same headers and
    # performs the GET, so the request code lives in one place.
    return get_data(token, track_id, features_endpoint)

In [42]:
# one independent empty list per audio-feature column of the future dataframe
# NOTE: `type` and `id` shadow Python builtins; the names are kept because later
# cells append to them directly.
(danceability, energy, key, loudness,
 mode, speechiness, acousticness, instrumentalness,
 liveness, valence, tempo, type,
 id, track_href, analysis_url, duration_ms) = ([] for _ in range(16))

In [43]:
# bundle the audio-feature lists into one dictionary (column name -> list of values)
features_dic = dict(
    danceability=danceability,
    energy=energy,
    key=key,
    loudness=loudness,
    mode=mode,
    speechiness=speechiness,
    acousticness=acousticness,
    instrumentalness=instrumentalness,
    liveness=liveness,
    valence=valence,
    tempo=tempo,
    type=type,
    id=id,
    track_href=track_href,
    analysis_url=analysis_url,
    duration_ms=duration_ms,
)

In [44]:
track_features = [] # one audio-features response per tracklist row, in row order
token = access_token
# The loop variable is deliberately NOT called `track_id`: that name refers to a list
# used as a column of the top-tracks table, and looping with it would rebind the name
# to a string, breaking any re-run of the cell that appends to that list.
for playlist_track_id in df_tracklist['track_id']:
    features_list = get_features(token, playlist_track_id) # response for this track
    track_features.append(features_list)


In [46]:
# Copy every wanted feature of every fetched track into its column list.
# The keys of `features_dic` are exactly the field names read from each response, and
# its values are the column lists, so one nested loop replaces sixteen copy-pasted
# append lines. Iterating over `track_features` itself (instead of a fixed range(100))
# keeps the cell working when fewer than 100 tracks were fetched.
for features in track_features:
    for column, values in features_dic.items():
        values.append(features[column])

In [47]:
# build the audio-features dataframe from the dictionary of columns
df_features_dic = pd.DataFrame(data=features_dic)

In [48]:
df_features_dic.info() # check the information (columns, non-null counts, dtypes) of the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      100 non-null    float64
 1   energy            100 non-null    float64
 2   key               100 non-null    int64  
 3   loudness          100 non-null    float64
 4   mode              100 non-null    int64  
 5   speechiness       100 non-null    float64
 6   acousticness      100 non-null    float64
 7   instrumentalness  100 non-null    float64
 8   liveness          100 non-null    float64
 9   valence           100 non-null    float64
 10  tempo             100 non-null    float64
 11  type              100 non-null    object 
 12  id                100 non-null    object 
 13  track_href        100 non-null    object 
 14  analysis_url      100 non-null    object 
 15  duration_ms       100 non-null    int64  
dtypes: float64(9), int64(3), object(4)
memory usa

In [35]:
# write the audio-features dataframe to a CSV file on the local drive (row index left out)
df_features_dic.to_csv(path_or_buf='features.csv', index=False)