### Before Starting
Download the "MILLION SONG SUBSET" (the 1.8 GB archive) from the link below, then extract it to `millionsongsubset/` in this directory.

http://millionsongdataset.com/pages/getting-dataset/#subset

In [3]:
import os
import numpy as np
import pandas as pd
import hd5f_getters as GETTERS


def test_one_file(filepath: str) -> None:
    """
    Print a handful of fields read from one .h5 file, as a quick sanity check.

    Reads the song count, track id, title, artist name and song_hotttnesss
    (reported as ``play_count``) and prints the string fields both as returned
    by the getters and after ``decode()``. The getters are called without a
    ``songidx``, so they report whichever song they default to.

    Args:
        filepath: Path to a single .h5 file from the dataset.

    Example usage: 
        >>> filepath = './millionsongsubset/A/A/A/TRAAAAW128F429D538.h5'
        >>> test_one_file(filepath)
    """
    h5 = GETTERS.open_h5_file_read(filepath)
    try:
        num_songs = GETTERS.get_num_songs(h5)
        track_id = GETTERS.get_track_id(h5)
        track_title = GETTERS.get_title(h5)
        # song_hotttnesss stands in for a play count here
        play_count = GETTERS.get_song_hotttnesss(h5)
        artist_name = GETTERS.get_artist_name(h5)

        print(f'BEFORE decode(): {track_id = }, {track_title = }, {artist_name = }, {play_count = }')
        print(f'AFTER decode(): {track_id.decode() = }, {track_title.decode() = }, {artist_name.decode() = }, {play_count = }')
        print(f'{num_songs = } for {filepath = }')
    finally:
        # Always release the file handle, even if a getter or decode() fails.
        h5.close()

def process_files(base_dir: str) -> pd.DataFrame:
    """
    Walk ``base_dir`` and collect one row per song found in its .h5 files.

    Every file ending in '.h5' under ``base_dir`` (searched recursively) is
    opened, each song it contains is read, and the file is closed again even
    if reading fails. song_hotttnesss is used as a proxy for play counts; a
    NaN value is recorded as 0.

    This will take a few minutes on the full subset.

    Args:
        base_dir: Root directory of the extracted dataset.

    Returns:
        A dataframe with columns
        ['track_id', 'track_title', 'artist_name', 'play_count'], one row per
        song. It is empty (but still has these columns) when no .h5 files
        are found.
    """

    play_data = []
    print('Processing files...')
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            # Only .h5 files hold song data; skip anything else.
            if not file.endswith('.h5'):
                continue

            file_path = os.path.join(root, file)
            h5 = GETTERS.open_h5_file_read(file_path)
            try:
                # loop over all the songs in this one h5 file
                num_songs = GETTERS.get_num_songs(h5)
                for song_idx in range(num_songs):
                    # the getters return bytes for text fields, hence decode()
                    track_id = GETTERS.get_track_id(h5, songidx=song_idx).decode()
                    track_title = GETTERS.get_title(h5, songidx=song_idx).decode()
                    artist_name = GETTERS.get_artist_name(h5, songidx=song_idx).decode()

                    # using song_hotttnesss as a proxy for play counts
                    play_count = GETTERS.get_song_hotttnesss(h5, songidx=song_idx)
                    if np.isnan(play_count):
                        play_count = 0

                    # Record the row inside the per-song loop so that every
                    # song is kept, and a file with zero songs adds nothing.
                    play_data.append([track_id, track_title, artist_name, play_count])
            finally:
                h5.close()

    return pd.DataFrame(play_data, columns=['track_id', 'track_title', 'artist_name', 'play_count'])

# playing around with one file to see what we can pull out
# filepath = './millionsongsubset/A/A/A/TRAAAAW128F429D538.h5'
# test_one_file(filepath)

# Build the dataframe once and cache it as a CSV; later runs reload the cache
# instead of re-reading every .h5 file.
csv_path = 'million_song_data.csv'
if not os.path.exists(csv_path):
    # no cache yet: read the extracted dataset and save the result
    base_dir = "./millionsongsubset"
    million_song_df = process_files(base_dir)
    million_song_df.to_csv(csv_path, index=False)
else:
    print(f'Loading data from {csv_path}')
    million_song_df = pd.read_csv(csv_path)

# last expression in the cell, so the notebook displays the dataframe
million_song_df


Processing files...


Unnamed: 0,track_id,track_title,artist_name,play_count
0,TRAAAAW128F429D538,I Didn't Mean To,Casual,0.602120
1,TRAAABD128F429CF47,Soul Deep,The Box Tops,0.000000
2,TRAAADZ128F9348C2E,Amor De Cabaret,Sonora Santanera,0.000000
3,TRAAAEF128F4273421,Something Girls,Adam Ant,0.000000
4,TRAAAFD128F92F423A,Face the Ashes,Gob,0.604501
...,...,...,...,...
9995,TRBIJMU12903CF892B,The Hanged Man,Moonspell,0.594080
9996,TRBIJNF128F14815A7,The Wonderful World Of The Young,Danny Williams,0.334707
9997,TRBIJNK128F93093EC,Sentimental Man,Winston Reedy,0.000000
9998,TRBIJRN128F425F3DD,Zydeco In D-Minor,"Myrick ""Freeze"" Guillory",0.000000
