This is the template for how we collected all the feature data we used. 
The process below was repeated for various slices of 10000 playlists, and a slice was used every time the requests broke down (from an authentification token timeout, being blocked for too many hits, an internet error, etc.)

In [133]:
%%bash
head 'mpd.v1/data/mpd.slice.20000-20999.json'

# preview first few lines

{
    "info": {
        "generated_on": "2017-12-03 08:41:42.057563", 
        "slice": "20000-20999", 
        "version": "v1"
    }, 
    "playlists": [
        {
            "name": "Prelude", 
            "collaborative": "false", 


In [135]:
import ijson

filename = "mpd.v1/data/mpd.slice.20000-20999.json"

# open file with current slice of 1000 playlists
with open(filename, 'r') as f:
    objects = ijson.items(f, 'playlists.item')
    columns = list(objects)

In [136]:
column_names = [col["tracks"] for col in columns]

In [137]:
# get track URI and names from playlists; we will use these as reference points when we hit the Spotify API
playlists = column_names
track_features_uri = []
track_features_name = []
for playlist in playlists:
    for tracks in playlist:
        track_features_uri.append(tracks["track_uri"])
        track_features_name.append(tracks["track_name"])

In [138]:
import pandas as pd
track_features = pd.DataFrame(track_features_name, columns=['track_name'])
track_features["ids"] = track_features_uri
track_features = track_features.drop_duplicates()
track_features.ids = track_features.ids.str.slice(14)
track_features = track_features.reset_index()
track_features.head()

Unnamed: 0,index,track_name,ids
0,0,A Thousand Years,4eYaDRhiL5iesFp2EuoODr
1,1,Can't Help Falling In Love - Recorded Live at ...,5GBWQszdw6PtAN0Negzut6
2,2,Ho Hey,1jdNcAD8Ir58RlsdGjJJdx
3,3,Everything,4T6HLdP6OcAtqC6tGnQelG
4,4,She Is Love,1q74TesvRCIo5RvJM5B84F


In [139]:
# how many tracks in the current slice?
track_features.shape

(35247, 3)

In [140]:
# various arrays to store later dataframe's column data; we manually chose potentially relevant attributes

# from Spotify track object
album_id = [] 
album_name = [] 
album_release_date = [] 

track_artist_ids = [] 

track_id = [] 
track_duration_ms = [] 
track_explicit = [] 
track_name = [] 
track_popularity = [] 

# from Spotify audio features object
danceability = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
audio_features_id = []

# from Spotify artist object
artist1_id = []
followers = []
artist_popularity = []
artist_genre1 = []
artist_genre2 = []
artist_genre3 = []

# from Spotify album object
album_label = []
album_popularity = []

In [None]:
import requests
i=0

for ids in track_features.ids:
    
    # FROM TRACK OBJECT
    track_features_endpoint = "https://api.spotify.com/v1/tracks/{}".format(ids)
    # the below authorization token must be periodically changed and manually requested from the Spotify API
    headers = {"Authorization":"Bearer BQCqe5naoZO73MxotOI1nxw9QgiYYkpwv148di3XSYyc8zEF2-lc_2DDWKxi383ZuNZVTSMECpa5Wp5OWyyiaXj_WUH_ZrlQyLLvwNiS-NYOtzr_u2wzkJMA3fZLY46KInkd8H9iRFWuAfAG2nl9rG02PyLtxRhpRIEIjNd1T-J_biHFN5gTfZZpKmXsS83tqoT0LYIQF9-LFVTuOTwbgvWcvVKnV7J0VmfdCd75Mwupgiq1ag--TKTeVkyvPD_NubDH46m_kC0XDe8c"}
    rep = requests.get(url=track_features_endpoint, headers=headers).json()
    
    album_id.append(rep['album']['id'])
    cur_album_id = album_id[i]
    
    album_name.append(rep["album"]["name"])
    album_release_date.append(rep["album"]["release_date"])
    
    track_artist_ids.append([artist['id'] for artist in rep['artists']]) #this is multiple but we're only doing 1st
    track_artist_id1 = track_artist_ids[i][0]
    
    track_id.append(rep['id'])
    track_duration_ms.append(rep["duration_ms"])
    track_explicit.append(rep["explicit"])
    track_name.append(rep["name"])
    track_popularity.append(rep["popularity"])
    
    # FROM ARTIST OBJECT
    artists_endpoint = "https://api.spotify.com/v1/artists/{}".format(track_artist_id1)
    rep = requests.get(url=artists_endpoint,headers=headers).json()
    artist1_id.append(rep['id'])
    followers.append(rep["followers"]["total"])
    artist_popularity.append(rep["popularity"])
    if len(rep['genres']) >= 1:
        artist_genre1.append(rep['genres'][0])
    else:
        artist_genre1.append(None)
    if len(rep['genres']) >= 2:
        artist_genre2.append(rep['genres'][1])
    else:
        artist_genre2.append(None)
    if len(rep['genres']) >= 3:
        artist_genre3.append(rep['genres'][2])
    else:
        artist_genre3.append(None)
    
    
    # FROM AUDIO FEATURES OBJECT
    audio_features_endpoint = "https://api.spotify.com/v1/audio-features/{}".format(ids)
    rep = requests.get(url=audio_features_endpoint, headers=headers).json()
    danceability.append(rep["danceability"])
    energy.append(rep["energy"])
    key.append(rep["key"])
    loudness.append(rep["loudness"])
    mode.append(rep["mode"])
    speechiness.append(rep["speechiness"])
    acousticness.append(rep["acousticness"])
    instrumentalness.append(rep["instrumentalness"])
    liveness.append(rep["liveness"])
    valence.append(rep["valence"])
    tempo.append(rep["tempo"])
    audio_features_id.append(rep['id'])
    
    # FROM ALBUM OBJECT
    albums_endpoint = "https://api.spotify.com/v1/albums/{}".format(cur_album_id)
    rep = requests.get(url=albums_endpoint, headers=headers).json()
    album_label.append(rep["label"])
    album_popularity.append(rep["popularity"])
    
    if i==10000:
        break
    # print all i's so we can figure out when the scraping broke down
    i = i + 1

In [113]:
# SPECIAL CELL: this is for when the scraping breaks down IN THE MIDDLE of the previous cell's running. You can see 
# on which object request the scraping broke, so append None to all lists after that breakdown

# this ensures that when we combine all the lists into one dataframe, it doesn't automatically drop columns with 
# no data (from the breakdown)


# tracks
album_id.append(None)
album_name.append(None)
album_release_date.append(None)

track_artist_ids.append(None)

track_id.append(None)
track_duration_ms.append(None)
track_explicit.append(None)
track_name.append(None)
track_popularity.append(None)

#artists 
artist1_id.append(None)
followers.append(None)
artist_popularity.append(None)
artist_genre1.append(None)
artist_genre2.append(None)
artist_genre3.append(None)

#audio features 
danceability.append(None)
energy.append(None)
key.append(None)
loudness.append(None)
mode.append(None)
speechiness.append(None)
acousticness.append(None)
instrumentalness.append(None)
liveness.append(None)
valence.append(None)
tempo.append(None)
audio_features_id.append(None)

#albums
album_label.append(None)
album_popularity.append(None)

In [142]:
# combine all lists into one df
track_records = pd.DataFrame(album_id, columns=['album_id'])

track_records["album_name"] = album_name
track_records["album_release_date"] = album_release_date

track_records["track_artist_ids"] = track_artist_ids

track_records['track_id'] = track_id
track_records["track_duration_ms"] = track_duration_ms
track_records["track_explicit"] = track_explicit
track_records["track_name"] = track_name
track_records["track_popularity"] = track_popularity


track_records["danceability"] = danceability
track_records["energy"] = energy
track_records["key"] = key
track_records["loudness"] = loudness
track_records["mode"] = mode
track_records["speechiness"] = speechiness
track_records["acousticness"] = acousticness
track_records["instrumentalness"] = instrumentalness
track_records["liveness"] = liveness
track_records["valence"] = valence
track_records["tempo"] = tempo
track_records["audio_features_id"] = audio_features_id

track_records['artist1_id'] = artist1_id
track_records['followers'] = followers
track_records['artist_popularity'] = artist_popularity
track_records['artist_genre1'] = artist_genre1
track_records['artist_genre2'] = artist_genre2
track_records['artist_genre3'] = artist_genre3

track_records['album_label'] = album_label
track_records['album_popularity'] = album_popularity


In [143]:
track_records.shape

(1989, 29)

In [144]:
track_records

Unnamed: 0,album_id,album_name,album_release_date,track_artist_ids,track_id,track_duration_ms,track_explicit,track_name,track_popularity,danceability,...,tempo,audio_features_id,artist1_id,followers,artist_popularity,artist_genre1,artist_genre2,artist_genre3,album_label,album_popularity
0,1gIuyEXICtt24CEKrYSOw8,The Piano Guys,2012,[0jW6R8CVyVohuUJVcuweDI],4eYaDRhiL5iesFp2EuoODr,275973,False,A Thousand Years,43,0.230,...,157.084,4eYaDRhiL5iesFp2EuoODr,0jW6R8CVyVohuUJVcuweDI,784409,65,bow pop,classify,,Portrait/Sony Masterworks,43
1,7o2VLivg95UduHjTMTIEIf,Be OK,2008,[2vm8GdHyrJh2O2MfbQFYG0],5GBWQszdw6PtAN0Negzut6,193946,False,Can't Help Falling In Love - Recorded Live at ...,40,0.244,...,72.868,5GBWQszdw6PtAN0Negzut6,2vm8GdHyrJh2O2MfbQFYG0,505291,61,acoustic pop,ectofolk,folk-pop,Cabin 24 Records,45
2,5h7fx8ILwOZ3I5yQ4eGBl7,The Lumineers,2012-01-01,[16oZKvXb6WkQlVAjwo2Wbg],1jdNcAD8Ir58RlsdGjJJdx,161226,False,Ho Hey,70,0.685,...,79.991,1jdNcAD8Ir58RlsdGjJJdx,16oZKvXb6WkQlVAjwo2Wbg,2619293,74,folk-pop,indie folk,indie pop,Universal Music Group,66
3,3h4pyWRJIB9ZyRKXChbX22,Call Me Irresponsible (Standard Edition),2007-04-27,[1GxkXlMwML1oSg5eLPiAz3],4T6HLdP6OcAtqC6tGnQelG,212373,False,Everything,70,0.686,...,123.125,4T6HLdP6OcAtqC6tGnQelG,1GxkXlMwML1oSg5eLPiAz3,2627273,72,adult standards,canadian pop,lounge,143/Reprise,66
4,6QJw1UxLAFYQfy8XirsXiW,Losing Sleep,2009-01-01,[2PCUhxD40qlMqsKHjTZD2e],1q74TesvRCIo5RvJM5B84F,146066,False,She Is Love,53,0.629,...,134.297,1q74TesvRCIo5RvJM5B84F,2PCUhxD40qlMqsKHjTZD2e,339532,58,acoustic pop,indiecoustica,neo mellow,Universal Music Group,49
5,6Pb3K1oPXdhsqFXtzKe3Z1,19,2008-01-28,[4dpARuHxo51G3z768sgnrY],7rPLZ8Krm6CZIbraFUlnWZ,212040,False,Make You Feel My Love,55,0.325,...,72.416,7rPLZ8Krm6CZIbraFUlnWZ,4dpARuHxo51G3z768sgnrY,10592482,81,pop,,,XL Recordings/Columbia,50
6,10HuTnFqkhym6gXmIDxBsm,"For Emma, Forever Ago",2008-02-19,[4LEiUm1SRbFMgfqnQTwUbQ],3ZMv9EzGoteNi5Qnx0KpEO,232426,False,Skinny Love,49,0.591,...,76.396,3ZMv9EzGoteNi5Qnx0KpEO,4LEiUm1SRbFMgfqnQTwUbQ,2051334,71,chamber pop,folk-pop,indie folk,Jagjaguwar,45
7,39xrkt8RILtwa9DMyLkv32,Up from Below,2009-07-14,[7giUHu5pv6YTZgSkxxCcgh],7w5cxTEzp1rfV3KCy0Bd5N,303200,False,Home,73,0.542,...,111.665,7w5cxTEzp1rfV3KCy0Bd5N,7giUHu5pv6YTZgSkxxCcgh,720594,65,chamber pop,folk-pop,indie folk,Vagrant Records,67
8,5HHP1ctEzugQvbPCKJiWkc,Ok Go,2002,[3hozsZ9hqNq7CoBGYNlFTz],27VCODQk90XyqJB1FdwqyT,213906,False,1000 Miles Per Hour,16,0.524,...,81.898,27VCODQk90XyqJB1FdwqyT,3hozsZ9hqNq7CoBGYNlFTz,288245,55,alternative rock,chicago indie,comic,Capitol Records,32
9,1hyorM32T35U5HAxdh6glI,Stop Drop And Roll!!!,2008-02-05,[5tI0Vj2FhioY8AKtEqu9d3],5S1jZIl6Hviwdr7KFTG4kz,135586,False,The Pedestrian,31,0.344,...,135.116,5S1jZIl6Hviwdr7KFTG4kz,5tI0Vj2FhioY8AKtEqu9d3,16135,31,,,,Jingle Town Records,39


In [145]:
# export that data to a csv, which we will merge with all other csvs in Excel
track_records.to_csv("20000-20999,1988.csv", sep=',')