In [1]:
"""
This file is for playing around with song data from the Million Song
Dataset (MSD). In particular, we are interested in getting all of the
data out in an exportable manner.

We can't get all of the information from the summary file; we have to
open every track file and extract the data ourselves.
"""

import fnmatch
import glob
import os
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import hdf5_getters


In [2]:
# Locations of the Million Song Subset on disk; adjust for your own checkout.
msd_subset_path = '../MillionSongSubset'
msd_subset_data_path, msd_subset_addf_path = (
    os.path.join(msd_subset_path, subdir)
    for subdir in ('data', 'AdditionalFiles')
)

In [3]:
# Build the export schema: map each attribute name we want in the output to
# the name of the hdf5_getters function that produces it, e.g.
# 'title' -> 'get_title'.
#
# The prefix used for filtering and the prefix stripped from the name must
# agree, otherwise a name that merely starts with 'get' (without the
# underscore) would produce a mangled key. Non-callable attributes are skipped
# because every mapped name is later called as a function.

_getter_prefix = 'get_'
getter_func_names = [
    name for name in dir(hdf5_getters)
    if name.startswith(_getter_prefix)
    and callable(getattr(hdf5_getters, name))
]
getter_mapping = {
    name[len(_getter_prefix):]: name for name in getter_func_names
}
pprint(getter_mapping)


In [26]:
# functions

def apply_to_all_tracks(basedir, func, ext='.h5', verbose=True):
    """
    Walk a directory tree and apply a function to every track file in it.

    Files are matched against the pattern ``'*' + ext`` by name only, so
    directories whose path contains glob metacharacters (``[``, ``]``,
    ``*``, ``?``) are handled correctly. As with ``glob``, names starting
    with a dot are skipped. Only entries that ``os.walk`` reports as files
    are visited. Directories and files are processed in sorted order so
    repeated runs return the results in the same order.

    Parameters
    ----------
    basedir : str
        Root directory to search recursively.
    func : callable
        Called once per matching file with the file's path (``root`` joined
        with the file name); its return value is collected.
    ext : str, optional
        Suffix pattern appended to ``'*'`` to select files. Defaults to
        ``'.h5'``.
    verbose : bool, optional
        When true (the default), print the running count after each file.

    Returns
    -------
    tuple
        ``(data, cnt)`` where ``data`` is the list of ``func`` return values
        and ``cnt`` is the number of files processed.
    """
    cnt = 0
    data = []
    pattern = '*' + ext
    for root, dirs, files in os.walk(basedir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for name in sorted(fnmatch.filter(files, pattern)):
            # Keep glob's convention of ignoring hidden (dot-prefixed) files.
            if name.startswith('.'):
                continue
            data.append(func(os.path.join(root, name)))
            cnt += 1
            if verbose:
                print(cnt)
    return (data, cnt)


def get_song_attr(file_name):
    """
    Export every available attribute of a single track file.

    Opens ``file_name`` with ``hdf5_getters.open_h5_file_read`` and calls
    each getter named in the module-level ``getter_mapping`` on the open
    handle.

    Parameters
    ----------
    file_name : str
        Path of the track file to read.

    Returns
    -------
    dict
        Maps each key of ``getter_mapping`` to the value returned by the
        corresponding ``hdf5_getters`` function.

    Notes
    -----
    The file handle is closed even when a getter raises; the exception then
    propagates to the caller.
    """
    f = hdf5_getters.open_h5_file_read(file_name)
    try:
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        return {
            attr_name: getattr(hdf5_getters, func_name)(f)
            for attr_name, func_name in getter_mapping.items()
        }
    finally:
        f.close()
    


In [22]:
# Smoke test: export one known track from the subset and inspect the result.

FILE_PATH = os.path.join(
    msd_subset_data_path,
    'A/A/A/TRAAAAW128F429D538.h5',
)
song = get_song_attr(FILE_PATH)
pprint(song)

In [28]:
# Now let's create a single JSON file containing all of the data.
# This will be used to seed our Postgres database. This will
# take forever; don't do this in a notebook.

#data, cnt = apply_to_all_tracks(msd_subset_data_path, get_song_attr)
#print 'Exported {} songs'.format(cnt)