In [1]:
import os
import json
from pprint import pprint

import numpy as np
import pandas as pd

import datetime

In [2]:
def get_track_data(item):
    """Flatten one recently-played entry into a single-level dict.

    Reads ``item['played_at']`` and, from ``item['track']``, the track's
    name, duration, id, popularity, artists and album (name and id).

    Returns a dict with the keys ``played_at``, ``name``, ``duration_ms``,
    ``id``, ``popularity``, ``artist``, ``album`` and ``album_id``, in that
    order. Any key missing from ``item`` raises ``KeyError``.
    """
    played_at = item['played_at']
    info = item['track']

    return {
        'played_at': played_at,
        'name': info['name'],
        'duration_ms': info['duration_ms'],
        'id': info['id'],
        'popularity': info['popularity'],
        # All artist names, sorted alphabetically and joined by single spaces.
        'artist': ' '.join(sorted(performer['name'] for performer in info['artists'])),
        'album': info['album']['name'],
        'album_id': info['album']['id'],
    }

In [3]:
def get_track_feature_data(item):
    """Pick the track id and the audio-feature fields out of ``item``.

    ``item`` is indexed by field name; every other key it holds is ignored.
    Returns a new dict containing, in this order: ``id``, ``acousticness``,
    ``danceability``, ``energy``, ``instrumentalness``, ``liveness``,
    ``loudness``, ``speechiness``, ``tempo``, ``valence``, ``key``, ``mode``
    and ``time_signature``. A missing field raises ``KeyError``.
    """
    wanted_fields = (
        'id',
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
        'key',
        'mode',
        'time_signature',
    )

    selected = {}
    for field in wanted_fields:
        selected[field] = item[field]
    return selected

In [4]:
# Folder that the loading loop lists and scans for '.json' files
# (relative path, so it is resolved against the current working directory).
data_dir = '../data/'

In [5]:
# Walk every '.json' file in data_dir and collect flat records:
#   * files named 'recently*' contribute one record per entry of their 'items' list
#   * files named 'audio*' contribute one audio-feature record per value
tracks = []
features = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# (and hence the index later written to CSV) reproducible between runs.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith('.json'):
        continue

    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII text such as artist names is not corrupted on systems whose
    # default is something else (assumes the dumps are UTF-8, the standard
    # JSON encoding).
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
        data_json = json.load(f)

    # The file is already closed here; only the parsed payload is needed.
    if filename.startswith('recently'):
        tracks.extend(get_track_data(item) for item in data_json['items'])
    elif filename.startswith('audio'):
        # Each value is a sequence; only its first element is used.
        features.extend(get_track_feature_data(item[0])
                        for item in data_json.values())

In [6]:
# Turn the collected play records into a table and discard rows that are exact
# duplicates of an earlier row; the last expression displays (rows, columns).
tracks = pd.DataFrame(tracks).drop_duplicates()
tracks.shape

(3106, 8)

In [7]:
# Preview the first rows of the de-duplicated tracks table.
tracks.head()

Unnamed: 0,played_at,name,duration_ms,id,popularity,artist,album,album_id
0,2019-12-12T10:57:09.063Z,Trans Siberian,472769,2mSCnlqQ0mvHrPyVLuo5kE,18,Juno Reactor,The Golden Sun of the Great East,4cgvNUnZM2QHxhhY12lWk7
1,2019-12-12T10:49:16.309Z,Guillotine,417761,0m6n7veetHyEr3NuUreI4h,20,Juno Reactor,The Golden Sun of the Great East,4cgvNUnZM2QHxhhY12lWk7
2,2019-12-12T10:42:18.604Z,Invisible,525548,3Jw9OuPa9AXjaJXKbic26y,21,Juno Reactor,The Golden Sun of the Great East,4cgvNUnZM2QHxhhY12lWk7
3,2019-12-12T10:33:32.932Z,Final Frontier,602813,78WeMIirolQgyaUqFnqOCi,26,Juno Reactor,The Golden Sun of the Great East,4cgvNUnZM2QHxhhY12lWk7
4,2019-12-11T20:55:29.018Z,O pato,122866,6u6E9DZgzyyo1oFofn7zIs,45,João Gilberto,Bossa Nova Essentials,2hVlKPHcnN1xpdnIQOfTfi


In [8]:
# Turn the collected audio-feature records into a table and discard rows that
# are exact duplicates of an earlier row; the last expression displays
# (rows, columns).
features = pd.DataFrame(features).drop_duplicates()
features.shape

(1717, 13)

In [9]:
# Preview the first rows of the de-duplicated audio-features table.
features.head()

Unnamed: 0,id,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode,time_signature
0,03d5V0uvYp3hU7wE9mJzJa,0.0225,0.454,0.447,0.889,0.103,-13.387,0.0273,74.474,0.429,9,0,4
1,0CufyIMTfy8YFJXjNfDXvl,0.528,0.326,0.279,0.696,0.222,-15.306,0.0336,136.48,0.148,7,1,4
2,121IyQ9weu5MXAfXAFVDcH,0.000334,0.454,0.579,0.00625,0.163,-9.987,0.0286,84.146,0.221,0,1,4
3,1Nc5cKez6FmEbsFwYwUbqg,0.795,0.527,0.825,0.0,0.0973,-5.025,0.0939,95.958,0.772,3,1,4
4,1TbgAIgWDO1qzwem0g0Qyy,0.877,0.487,0.659,0.311,0.134,-10.989,0.054,107.135,0.854,10,0,4


In [10]:
# Persist both tables as CSV under <data_dir>/processed (the same
# '../data/processed/...' files as before, now derived from data_dir).
processed_dir = os.path.join(data_dir, 'processed')

# to_csv() does not create missing folders and fails with OSError on a fresh
# checkout, so make sure the target directory exists first.
os.makedirs(processed_dir, exist_ok=True)

# The DataFrame index is still written as the first, unnamed column.
tracks.to_csv(os.path.join(processed_dir, 'tracks.csv'))
features.to_csv(os.path.join(processed_dir, 'track_features.csv'))