# Configuration

In [8]:
# directory where MSD audio samples are stored (root directory)
MSD_SAMPLE_DIR                      = "/mnt/dataset_storage/audio/music/MSD/audio/"

# path to "msd_amglabels_all.h5" - leave blank to skip
MSD_AMG_TAGS_PATH                   = "/home/schindlera/experiments/representation_from_album_review/data/msd_amglabels_all.h5"

# Album Reviews - leave blank to skip
MSD_AMG_REVIEW_TRACKID_MAPPING_PATH = "/home/schindlera/experiments/representation_from_album_review/data/MSD_AMG_REVIEW_TRACKID_MAPPING.h5"

# directory to store intermediate and final results of the experiment
EXPERIMENT_DIR                      = "/home/schindlera/experiments/ismir2020_reviews/"

# Imports

In [9]:
import os
import pandas as pd
import numpy as np

# Load Data

## Find all MSD audio samples

In [3]:
sample_fullpaths = []
sample_track_ids = []

for root, dirs, files in os.walk(MSD_SAMPLE_DIR):
    for file in files:
        if file.endswith(".mp3"):
            sample_fullpaths.append(os.path.join(root, file))
            sample_track_ids.append(file.replace(".mp3", ""))

metadata_audio          = pd.DataFrame(sample_fullpaths, index=sample_track_ids, columns=["audio_path"])
metadata_audio["audio"] = True

print("%d MSD audio samples found" % metadata_audio.shape[0])

995000 MSD audio samples found


## Join with Tagset Data

In [10]:
if os.path.exists(MSD_AMG_TAGS_PATH):

    # load labelsets from hdf5 archive
    msd_amgtags_all         = pd.read_hdf(MSD_AMG_TAGS_PATH, "data")

    # only use instances where at least one genre tag is provided
    msd_amgtags_all         = msd_amgtags_all[~msd_amgtags_all.genres.isna()]
    msd_amgtags_all["tags"] = True

    # join with audio metadata
    metadata_final          = metadata_audio.join(msd_amgtags_all, how="inner")

    print("Intersection size: %d" % metadata_final.shape[0])
    
else:
    
    metadata_final = metadata_audio

Intersection size: 504992


## Join with Album Review Dataset

In [11]:
if os.path.exists(MSD_AMG_REVIEW_TRACKID_MAPPING_PATH):

    MSD_lastfmid_trackid_mapping            = pd.read_hdf(MSD_AMG_REVIEW_TRACKID_MAPPING_PATH, "data")
    MSD_lastfmid_trackid_mapping            = MSD_lastfmid_trackid_mapping.set_index("track_id")
    MSD_lastfmid_trackid_mapping["reviews"] = True

    metadata_final = metadata_final.join(MSD_lastfmid_trackid_mapping, how="left")

    print("Intersection size: %d" % metadata_final.shape[0])

Intersection size: 504992


# Store Data

Store Metadata

In [12]:
metadata_final.to_hdf(EXPERIMENT_DIR + "/experiment_partition.h5", "data")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['audio_path', 'genres', 'styles', 'moods', 'themes', 'artist_name',
       'title', 'release', 'reviews'],
      dtype='object')]

  encoding=encoding,


Store trackids for feature extraction

In [13]:
metadata_final.reset_index()[["index","audio_path"]].to_csv(EXPERIMENT_DIR + "/eval_trackids.csv", header=None, index=None)