## Imports

In [1]:
# Read json file
import json
import numpy as np
import requests
import os
import pandas as pd

## Get playlist from `spotify_million_playlist_dataset`

Get only the playlist name and tracks

In [2]:
playlists = []

for i in range(30000, 60000, 1000):
    with open(f'spotify_million_playlist_dataset/data/mpd.slice.{i}-{i+999}.json') as f:
        data = json.load(f)
    playlists.extend([[playlist['name'], playlist['tracks']] for playlist in data['playlists']])

len(playlists)

30000

## Get all tracks' features

Using the Spotify API

In [3]:
from dotenv import load_dotenv
load_dotenv()
token = os.getenv('SPOTIFY_TOKEN')

In [4]:
def get_audio_features(track_uris):
    return requests.get(
        "https://api.spotify.com/v1/audio-features?ids=" + \
            ",".join([uri.split(':')[-1] for uri in  track_uris[:100]]),
        headers={"Authorization": f"Bearer {token}"}
    ).json()['audio_features']

In [7]:
track_features = json.load(open('track_features.json'))

In [9]:
for playlist_title, playlist_tracks in playlists:
    for track in playlist_tracks:
        if track['track_uri'] not in track_features:
            track_features[track['track_uri']] = False

print(len(track_features))

513068


In [10]:
track_uris = [uri for uri, value in track_features.items() if value == False]
print(len(track_uris))

175384


In [28]:
try:
    for i in range(62500, len(track_uris), 100):

        batch = track_uris[i:i+100]
        track_features.update(
            {uri: features for uri, features in zip(batch, get_audio_features(batch))})
        
        print(f"{i} / {len(track_uris)}")
        
except Exception as e:
    print(e)
    print(f"{i} failed")

'audio_features'
62500 failed


In [137]:
assert all(len(track)==18 for track in track_features.values())

In [138]:
json.dump(track_features, open('track_features.json', 'w'))

## Clean the data

### Remove unnecessary columns of track features

In [22]:
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature']

In [140]:
track_features_clean = {uri: {col: track[col] for col in cols} 
                        for uri, track in track_features.items()}

In [141]:
json.dump(track_features_clean, open('track_features_clean.json', 'w'))

### Reformat playlists

In [9]:
track_features_clean = json.load(open('track_features_clean.json'))

In [12]:
for playlist_title, playlist_tracks in playlists:
    for track in playlist_tracks:
        track.update({col: track_features_clean[track['track_uri']][col] for col in cols})

In [16]:
json.dump(playlists, open('playlists.json', 'w'))

## Get playlist dataframe

In [20]:
playlists = json.load(open('playlists.json'))

In [None]:
def get_playlist_features(playlist_tracks):
    features = [np.mean([track[col] for track in playlist_tracks]) for col in cols]
    return features

In [23]:
playlists_df = []
for i, (playlist_title, playlist_tracks) in enumerate(playlists):
    playlists_df.append([i, playlist_title, *get_playlist_features(playlist_tracks)])

In [24]:
headers = ['index'] + ['title'] + cols

print('\n'.join(str(tuple((headers[i], playlists_df[2][i])))
      for i in range(len(headers))))

('index', 2)
('title', 'korean ')
('danceability', 0.6710624999999999)
('energy', 0.6929531250000001)
('key', 5.0)
('loudness', -4.87559375)
('mode', 0.515625)
('speechiness', 0.09642500000000001)
('acousticness', 0.26910015624999994)
('instrumentalness', 0.0006378121874999999)
('liveness', 0.16889375)
('valence', 0.5650781250000001)
('tempo', 114.595984375)
('duration_ms', 219373.953125)
('time_signature', 4.0)


In [25]:
playlists_df = pd.DataFrame(playlists_df, columns=headers)

In [27]:
playlists_df.to_csv('playlists.csv', index=False)