# Cache Million Song Dataset

This notebook loads, parses, and caches the Million Song Dataset (MSD) as a list of dicts containing useful metadata and features, for use by other scripts and notebooks. The result is saved to `msd.pickle`.

## Imports

In [29]:
import sys
sys.path.append('../../MSongsDB/PythonSrc') 

import dill as pickle
import time, glob, os
from multiprocessing import Pool as ThreadPool
from hdf5_getters import *

## Utils

In [30]:
def msd_id_to_dirs(msd_id):
    """Build the nested path prefix for an MSD ID.

    The characters at indices 2, 3 and 4 of the ID each become one
    directory level, and the full ID is the last path component.
    E.g. TRABCD12345678 -> A/B/C/TRABCD12345678
    """
    path_parts = [msd_id[index] for index in (2, 3, 4)]
    path_parts.append(msd_id)
    return os.path.join(*path_parts)

def process_h5(h5):
    """Open one MSD HDF5 file, parse it, and return the parsed dict.

    Args:
        h5: Path of the HDF5 file; it is handed to `open_h5_file_read`.

    Returns:
        The dict built by `parse_h5` for the opened file.
    """
    h5_file = open_h5_file_read(h5)
    try:
        return parse_h5(h5_file)
    finally:
        # Always release the handle, even when one of the getters raises,
        # so a bad file does not leave an open HDF5 file behind.
        h5_file.close()

def parse_h5(h5):
    """Collect the song- and artist-level fields of one MSD entry into a dict.

    Args:
        h5: An open HDF5 file handle accepted by the `hdf5_getters`
            functions (e.g. the object returned by `open_h5_file_read`).

    Returns:
        A dict with one key per extracted field. Each value is whatever the
        corresponding `get_*` getter returns, except 'path', which is the
        directory prefix derived from the song ID via `msd_id_to_dirs`.
    """
    # Read the song ID once; it is used both for 'path' and for 'song_id'.
    song_id = get_song_id(h5)
    return {
        # NOTE(review): the example in msd_id_to_dirs' docstring uses a
        # 'TR...' ID, while this passes the value of get_song_id -- confirm
        # which identifier is intended here.
        'path': msd_id_to_dirs(song_id),
        'audio_md5': get_audio_md5(h5),
        'song_year': get_year(h5),
        'song_title': get_title(h5),
        'song_id': song_id,
        'song_time_signature': get_time_signature(h5),
        'song_time_signature_confidence': get_time_signature_confidence(h5),
        'song_tempo': get_tempo(h5),
        'song_key': get_key(h5),
        'song_key_confidence': get_key_confidence(h5),
        'song_mode': get_mode(h5),
        'song_mode_confidence': get_mode_confidence(h5),
        'song_loudness': get_loudness(h5),
        'song_energy': get_energy(h5),
        'song_duration': get_duration(h5),
        'song_danceability': get_danceability(h5),
        'song_hotttnesss': get_song_hotttnesss(h5),
        'song_segments_start': get_segments_start(h5),
        'song_segments_confidence': get_segments_confidence(h5),
        'song_sections_start': get_sections_start(h5),
        'song_sections_confidence': get_sections_confidence(h5),
        'artist_id': get_artist_id(h5),
        'artist_mbid': get_artist_mbid(h5),
        'artist_name': get_artist_name(h5),
        'artist_terms': get_artist_terms(h5),
        'artist_terms_frequency': get_artist_terms_freq(h5),
        'artist_terms_weight': get_artist_terms_weight(h5),
        'artist_mbtags': get_artist_mbtags(h5),
        'artist_hotttnesss': get_artist_hotttnesss(h5),
        'artist_familiarity': get_artist_familiarity(h5),
        'artist_location': get_artist_location(h5)
    }

## Load, parse, and cache MSD

In [31]:
# Root directory of the LMD-matched MSD HDF5 files. Set the
# LMD_MATCHED_H5_DIR environment variable to point at another copy of the
# data; the original hard-coded location is kept as the fallback.
h5_dir = os.environ.get(
    'LMD_MATCHED_H5_DIR',
    '/home/bbpwn2/Documents/code/midi-dataset/data/lmd_matched_h5')
# The .h5 files are matched three directory levels below the root.
h5_files = glob.glob('{}/*/*/*/*.h5'.format(h5_dir))
# Reference point for the elapsed-time report printed at the end.
start_time = time.time()

Singlethreaded loading

In [32]:
# msd = []
# for i, h5 in enumerate(h5_files):
#     msd.append(process_h5(h5))
#     if i % 100 == 0:
#         print(i)

Parallel loading (uses a `multiprocessing` pool of worker processes)

In [33]:
# `ThreadPool` is the notebook's alias for `multiprocessing.Pool`, so this is
# the number of worker *processes* used to parse the HDF5 files.
num_threads = 8
pool = ThreadPool(num_threads)
try:
    # One dict per file, in the same order as h5_files.
    msd = pool.map(process_h5, h5_files)
finally:
    # Shut the workers down whether or not parsing succeeded, so re-running
    # this cell does not pile up idle worker processes.
    pool.close()
    pool.join()

Load/save from pickle

In [34]:
## Save msd as pickle
# Pickle data is binary, so the file must be opened in binary mode ('wb').
with open('msd.pickle', 'wb') as f:
    pickle.dump(msd, f)

## Uncomment to load the cached msd instead of re-parsing the HDF5 files
# NOTE: unpickling can execute arbitrary code -- only load files you created.
# with open('msd.pickle', 'rb') as f:
#     msd = pickle.load(f)

In [35]:
# Report the wall-clock time elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print('Finished in {:.2f} seconds'.format(elapsed_seconds))

Loaded in 145.92 seconds
