## Imports

In [1]:
# Read json file
import json
import numpy as np
import requests
import os
import pandas as pd

## Get playlist from `spotify_million_playlist_dataset`

Get only the playlist name and tracks

In [2]:
playlists = []

for i in range(30000, 60000, 1000):
    with open(f'spotify_million_playlist_dataset/data/mpd.slice.{i}-{i+999}.json') as f:
        data = json.load(f)
    playlists.extend([[playlist['name'], playlist['tracks']] for playlist in data['playlists']])

len(playlists)

30000

## Get all tracks' features

Using the Spotify API

In [3]:
from dotenv import load_dotenv
load_dotenv()
token = os.getenv('SPOTIFY_TOKEN')

In [4]:
def get_audio_features(track_uris):
    return requests.get(
        "https://api.spotify.com/v1/audio-features?ids=" + \
            ",".join([uri.split(':')[-1] for uri in  track_uris[:100]]),
        headers={"Authorization": f"Bearer {token}"}
    ).json()['audio_features']

In [5]:
track_features = json.load(open('track_features.json'))

In [9]:
for playlist_title, playlist_tracks in playlists:
    for track in playlist_tracks:
        if track['track_uri'] not in track_features:
            track_features[track['track_uri']] = False

print(len(track_features))

513068


In [10]:
track_uris = [uri for uri, value in track_features.items() if value == False]
print(len(track_uris))

175384


In [None]:
try:
    for i in range(62500, len(track_uris), 100):

        batch = track_uris[i:i+100]
        track_features.update(
            {uri: features for uri, features in zip(batch, get_audio_features(batch))})
        
        print(f"{i} / {len(track_uris)}")
        
except Exception as e:
    print(e)
    print(f"{i} failed")

In [137]:
assert all(len(track)==18 for track in track_features.values())

In [138]:
json.dump(track_features, open('track_features.json', 'w'))

## Get all tracks' popularity

In [26]:
def get_popularity(track_uris):
    res = requests.get(
        "https://api.spotify.com/v1/tracks?ids=" + \
            ",".join([uri.split(':')[-1] for uri in  track_uris[:100]]),
        headers={"Authorization": f"Bearer {token}"}
    ).json()['tracks']

    return [track['popularity'] if track else -1 for track in res]

In [None]:
track_popularity = {}

In [48]:
try:
    for i in range(161650, 249400, 50):

        batch = track_uris[i:i+50]
        track_popularity.update(
            {uri: popularity for uri, popularity in zip(batch, get_popularity(batch))})
        
        print(f"{i} / {len(track_uris)}")
        
except Exception as e:
    print(e)
    print(f"{i} failed")

161650 / 337684
161700 / 337684
161750 / 337684
161800 / 337684
161850 / 337684
161900 / 337684
161950 / 337684
162000 / 337684
162050 / 337684
162100 / 337684
162150 / 337684
162200 / 337684
162250 / 337684
162300 / 337684
162350 / 337684
162400 / 337684
162450 / 337684
162500 / 337684
162550 / 337684
162600 / 337684
162650 / 337684
162700 / 337684
162750 / 337684
162800 / 337684
162850 / 337684
162900 / 337684
162950 / 337684
163000 / 337684
163050 / 337684
163100 / 337684
163150 / 337684
163200 / 337684
163250 / 337684
163300 / 337684
163350 / 337684
163400 / 337684
163450 / 337684
163500 / 337684
163550 / 337684
163600 / 337684
163650 / 337684
163700 / 337684
163750 / 337684
163800 / 337684
163850 / 337684
163900 / 337684
163950 / 337684
164000 / 337684
164050 / 337684
164100 / 337684
164150 / 337684
164200 / 337684
164250 / 337684
164300 / 337684
164350 / 337684
164400 / 337684
164450 / 337684
164500 / 337684
164550 / 337684
164600 / 337684
164650 / 337684
164700 / 337684
164750 /

In [49]:
assert len(track_popularity.values()) == len(track_uris)

In [50]:
json.dump(track_popularity, open('track_popularity.json', 'w'))

## Clean the data

### Remove unnecessary columns of track features

In [52]:
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature']

In [140]:
track_features_clean = {uri: {col: track[col] for col in cols} 
                        for uri, track in track_features.items()}

In [141]:
json.dump(track_features_clean, open('track_features_clean.json', 'w'))

### Reformat playlists

In [53]:
track_features_clean = json.load(open('track_features_clean.json'))

In [12]:
for playlist_title, playlist_tracks in playlists:
    for track in playlist_tracks:
        track.update({col: track_features_clean[track['track_uri']][col] for col in cols})

In [16]:
json.dump(playlists, open('playlists.json', 'w'))

## Get playlist dataframe

In [54]:
playlists = json.load(open('playlists.json'))

In [58]:
def get_playlist_features(playlist_tracks):
    features = [np.mean([track[col] for track in playlist_tracks]) for col in cols]
    features += [np.mean([track_popularity[track['track_uri']] for track in playlist_tracks])]
    return features

In [65]:
playlists_df = []
for i, (playlist_title, playlist_tracks) in enumerate(playlists):
    playlists_df.append([i, playlist_title, *get_playlist_features(playlist_tracks)])

In [66]:
headers = ['index'] + ['title'] + cols + ['popularity']

print('\n'.join(str(tuple((headers[i], playlists_df[100][i])))
      for i in range(len(headers))))

('index', 100)
('title', 'Disneyland')
('danceability', 0.4568181818181818)
('energy', 0.46818181818181814)
('key', 6.045454545454546)
('loudness', -11.28118181818182)
('mode', 0.9545454545454546)
('speechiness', 0.11273636363636363)
('acousticness', 0.7036354545454544)
('instrumentalness', 0.16244817272727274)
('liveness', 0.23448636363636366)
('valence', 0.42904545454545445)
('tempo', 104.25490909090908)
('duration_ms', 217383.81818181818)
('time_signature', 3.8181818181818183)
('popularity', 14.272727272727273)


In [61]:
playlists_df = pd.DataFrame(playlists_df, columns=headers)

In [62]:
playlists_df.to_csv('playlists_popularity.csv', index=False)