In [2]:
import os
import json
import pandas as pd
import time
from tqdm import tqdm
import gc
from bs4 import BeautifulSoup
import requests

In [3]:
# Root directory of the project; create_df_data reads the dataset slices
# from `<project_path>/mpd/data`.
project_path = "."

In [4]:

def create_df_data(path):
    """Load the playlist slices under ``<path>/mpd/data`` into three DataFrames.

    Every ``*.json`` file in the data directory is parsed as one slice whose
    top-level ``'playlists'`` list holds the playlists. Other directory
    entries (e.g. ``.DS_Store``, README files) are ignored. Files are read in
    sorted name order so the generated track ids are reproducible.

    Parameters
    ----------
    path : str
        Project root; slices are read from ``<path>/mpd/data``.

    Returns
    -------
    dict
        ``"playlist_info"``: one row per playlist with the playlist-level
        columns listed in ``playlist_col``; ``collaborative`` is converted
        from the strings ``'true'``/``'false'`` to booleans (any other value
        becomes NaN).
        ``"tracks"``: one row per distinct ``track_uri`` (first occurrence
        wins) with the columns in ``tracks_col`` plus an integer ``tid``
        equal to the row index.
        ``"playlist"``: one row per track occurrence in a playlist with
        columns ``pid``, ``tid`` and ``pos``.

    Raises
    ------
    FileNotFoundError
        If ``<path>/mpd/data`` does not exist.
    json.JSONDecodeError
        If a ``*.json`` file is not valid JSON.
    KeyError
        If a playlist or track lacks one of the expected keys.
    """
    playlist_col = ['collaborative', 'duration_ms', 'modified_at',
                    'name', 'num_albums', 'num_artists', 'num_edits',
                    'num_followers', 'num_tracks', 'pid']
    tracks_col = ['album_name', 'album_uri', 'artist_name', 'artist_uri',
                  'duration_ms', 'track_name', 'track_uri']

    data_dir = os.path.join(path, 'mpd', 'data')
    # Keep only JSON slices (stray files would break json parsing) and sort
    # them: os.listdir order is arbitrary, and tids depend on read order.
    filenames = sorted(name for name in os.listdir(data_dir)
                       if name.lower().endswith('.json'))

    data_playlists = []   # one row per playlist (playlist_col order)
    data_tracks = []      # one row per distinct track (tracks_col order)
    playlists = []        # one [pid, tid, pos] row per track occurrence
    track_uri2tid = {}    # track_uri -> tid, assigned in first-seen order

    print("Reading the dataset")
    start_time = time.time()
    for filename in tqdm(filenames):
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            mpd_slice = json.load(f)

        for playlist in mpd_slice['playlists']:
            data_playlists.append([playlist[col] for col in playlist_col])
            pid = playlist['pid']
            for track in playlist['tracks']:
                uri = track['track_uri']
                tid = track_uri2tid.get(uri)
                if tid is None:
                    # First time this track is seen: its tid is its row
                    # position in data_tracks (and thus df_tracks.index).
                    tid = len(track_uri2tid)
                    track_uri2tid[uri] = tid
                    data_tracks.append([track[col] for col in tracks_col])
                playlists.append([pid, tid, track['pos']])
    total_time = time.time() - start_time

    print("Total time elapsed: ", total_time)
    # Release the parsed JSON slices before building the DataFrames.
    gc.collect()

    df_playlists_info = pd.DataFrame(data_playlists, columns=playlist_col)
    df_playlists_info['collaborative'] = df_playlists_info['collaborative'].map(
        {'false': False, 'true': True})

    df_tracks = pd.DataFrame(data_tracks, columns=tracks_col)
    df_tracks['tid'] = df_tracks.index

    df_playlists = pd.DataFrame.from_records(playlists, columns=['pid', 'tid', 'pos'])

    return {"playlist_info": df_playlists_info,
            "tracks": df_tracks,
            "playlist": df_playlists}


In [5]:
# Parse the dataset under `project_path`; `dfs` is a dict holding the
# "playlist_info", "tracks" and "playlist" DataFrames.
dfs = create_df_data(project_path)

Reading the dataset


100%|██████████| 10/10 [00:02<00:00,  3.50it/s]


Total time elapsed:  2.848677396774292


In [7]:
# Save the dataframes as CSV files under ./data (relative to the working
# directory). Create the directory first: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('data', exist_ok=True)
dfs["playlist_info"].to_csv('data/playlist_info.csv', index=False)
dfs["tracks"].to_csv('data/tracks.csv', index=False)
dfs["playlist"].to_csv('data/playlist.csv', index=False)
