### Before Starting
Download the "MILLION SONG SUBSET" (the 1.8GB one) from the link below and extract it to `millionsongsubset/` in this directory.

http://millionsongdataset.com/pages/getting-dataset/#subset

In [1]:
import os
import numpy as np
import pandas as pd
import hd5f_getters as GETTERS


def test_one_file(filepath: str) -> None:
    """
    Playing around to see what we can pull out of one file.
    Example usage: 
        >>> filepath = './millionsongsubset/A/A/A/TRAAAAW128F429D538.h5'
        >>> test_one_file(filepath)
    """
    h5 = GETTERS.open_h5_file_read(filepath)
    num_songs = GETTERS.get_num_songs(h5)
    track_id = GETTERS.get_track_id(h5)
    song_id = GETTERS.get_song_id(h5)
    track_title = GETTERS.get_title(h5)
    play_count = GETTERS.get_song_hotttnesss(h5)
    artist_name = GETTERS.get_artist_name(h5)

    print(f'BEFORE decode(): {track_id = }, {track_title = }, {artist_name = }, {play_count = }, {song_id = }')
    print(f'AFTER decode(): {track_id.decode() = }, {track_title.decode() = }, {artist_name.decode() = }, {play_count = }, {song_id.decode() = }')
    print(f'{num_songs = } for {filepath = }')

def process_files(base_dir: str) -> pd.DataFrame:
    """
    Process all the millionsongsubset files and return a dataframe
        with columns: ['track_id', 'track_title', 'artist_name', 'play_count'].
    This will take a few minutes.

    Walks ``base_dir`` recursively, opens every ``.h5`` file it finds and
    records one row per song stored in that file.

    Notes:
        * The 'track_id' column is filled from ``GETTERS.get_song_id`` (not
          ``get_track_id``); the column name is kept as-is because the rest of
          the notebook and the cached CSV rely on it.
        * 'play_count' is really ``song_hotttnesss``, used as a proxy for play
          counts. NaN values are replaced with 0.

    Args:
        base_dir: root directory that is searched for ``.h5`` files.

    Returns:
        A DataFrame with one row per song (empty, but with the four columns,
        when no ``.h5`` file is found).
    """
    columns = ['track_id', 'track_title', 'artist_name', 'play_count']
    play_data = []
    print('Processing files...')
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            # only .h5 files hold song data
            if not file.endswith('.h5'):
                continue

            file_path = os.path.join(root, file)
            h5 = GETTERS.open_h5_file_read(file_path)
            try:
                # loop over all the songs in this one h5 file
                for song_idx in range(GETTERS.get_num_songs(h5)):
                    # every getter is indexed by song_idx so a multi-song file
                    # yields the fields of the song currently being visited
                    song_id = GETTERS.get_song_id(h5, songidx=song_idx).decode()
                    track_title = GETTERS.get_title(h5, songidx=song_idx).decode()
                    artist_name = GETTERS.get_artist_name(h5, songidx=song_idx).decode()

                    # using song_hotttnesss as a proxy for play counts
                    play_count = GETTERS.get_song_hotttnesss(h5, songidx=song_idx)
                    if np.isnan(play_count):
                        play_count = 0

                    # append inside the loop: one row per song, and nothing at
                    # all for a file that reports zero songs
                    play_data.append([song_id, track_title, artist_name, play_count])
            finally:
                h5.close()

    return pd.DataFrame(play_data, columns=columns)

# playing around with one file to see what we can pull out
# filepath = './millionsongsubset/A/A/A/TRAAAAW128F429D538.h5'
# filepath = './millionsongsubset/A/A/A/TRAAAVO128F93133D4.h5'
# test_one_file(filepath)

# Build the song table once and cache it as a CSV; later runs reuse the cache
# instead of re-reading every .h5 file.
csv_path = 'million_song_data.csv'
if not os.path.exists(csv_path):
    # no cache yet: specify the path to the dataset and process it
    base_dir = "./millionsongsubset"
    million_song_df = process_files(base_dir)
    million_song_df.to_csv(csv_path, index=False)
else:
    print(f'Loading data from {csv_path}')
    million_song_df = pd.read_csv(csv_path)

# highest play_count first, then display the frame
million_song_df = million_song_df.sort_values(by="play_count", ascending=False)
million_song_df


Processing files...


Unnamed: 0,track_id,track_title,artist_name,play_count
6437,SOULTKQ12AB018A183,Nothin' On You [feat. Bruno Mars] (Album Version),B.o.B,1.000000
9137,SOAAXAK12A8C13C030,Immigrant Song (Album Version),Led Zeppelin,1.000000
539,SOTRSHW12A58A79E7C,This Christmas (LP Version),Donny Hathaway,0.997758
7847,SOWFUUS12AB01800E7,If Today Was Your Last Day (Album Version),Nickelback,0.984347
5699,SOOXLKF12A6D4F594A,Harder To Breathe,Maroon 5,0.979837
...,...,...,...,...
4644,SOUZQFG12AB017F6B1,No Hace Falta Que Lo Digas,Alejandro Lerner,0.000000
4643,SOSWHER12A81C21BFF,Organic Echo (Part II),Lester Bowie,0.000000
543,SOYOZCL12AB0189C16,Heart it Races (Frank Tetaz Remix),Architecture In Helsinki,0.000000
544,SOHFVFR12AB0186969,Very Well,Wailing Souls,0.000000


In [2]:
# Read in the data triplets: a tab-separated file without a header row.
# Be sure to download "train_triplets.txt" (the 500MB file) from http://millionsongdataset.com/tasteprofile/
df_train_triplets = pd.read_csv(
    'train_triplets.txt',
    sep='\t',
    header=None,
    names=['userID', 'itemID', 'rating'],
)

In [32]:
# Collect the distinct IDs on each side (triplets vs. million_song_df),
# then keep only the IDs that appear in both.
unique_triplet_item_ids = set(df_train_triplets["itemID"])
unique_million_song_subset_ids = set(million_song_df["track_id"])
items_in_both = unique_triplet_item_ids.intersection(unique_million_song_subset_ids)
print(f'found {len(items_in_both)} songs')


found 3675 songs


In [31]:
# Keep only the rows whose track_id is also present in the triplets, then preview them.
million_song_df = million_song_df.loc[million_song_df['track_id'].isin(items_in_both)]
million_song_df.head()

Unnamed: 0,track_id,track_title,artist_name,play_count
6437,SOULTKQ12AB018A183,Nothin' On You [feat. Bruno Mars] (Album Version),B.o.B,1.0
539,SOTRSHW12A58A79E7C,This Christmas (LP Version),Donny Hathaway,0.997758
7847,SOWFUUS12AB01800E7,If Today Was Your Last Day (Album Version),Nickelback,0.984347
5699,SOOXLKF12A6D4F594A,Harder To Breathe,Maroon 5,0.979837
9724,SOMKGQN12A8C1339D2,Blue Orchid,The White Stripes,0.972387
