### This notebook acquires song titles and audio features for 10,000 songs, a subset of the Million Song Dataset. 


#### I initially set out to produce a project based on these features. Because too many values were missing from this dataset, I instead used the Spotify API to acquire audio features for the same 10,000 songs.

In [1]:
import datetime
import glob
import os
import pickle
import sys
import time

import h5py
import numpy as np
import tables as tables


### The instructions and functions for reading HDF5 files are taken from the MSongsDB repository: https://github.com/tbertinmahieux/MSongsDB

In [2]:
def get_all_files(basedir,ext='.h5') :
    """
    Walk ``basedir`` and all of its subdirectories and collect every file
    whose name matches ``*<ext>``.

    basedir -- root directory of the search
    ext     -- filename suffix to match, including the dot (default '.h5')

    Returns a list of absolute paths (empty if nothing matches or if
    ``basedir`` does not exist).
    """
    allfiles = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so that characters such as '[' or '*' in a
        # folder name are matched literally instead of acting as glob wildcards
        # (unescaped, such folders were silently skipped).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(match) for match in glob.glob(pattern))

    return allfiles


In [3]:
# Absolute paths of every .h5 file found under the MillionSongSubset directory.
allh5 = get_all_files('MillionSongSubset',ext='.h5')


In [21]:
def open_h5_file_read(h5filename):
    """
    Open an existing HDF5 file read-only and hand back the resulting handle.
    Duplicates the helper of the same name in hdf5_utils so that module does
    not have to be imported. The handle is not closed here; the caller must
    close it.
    """
    read_handle = tables.File(h5filename, mode='r')
    return read_handle

def get_artist_name(h5, songidx=0):
    """
    Look up the artist name of one song in an open HDF5 song file.
    Reads column ``artist_name`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_name[songidx]

def get_title(h5, songidx=0):
    """
    Look up the title of one song in an open HDF5 song file.
    Reads column ``title`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.title[songidx]

def get_num_songs(h5):
    """
    Count the songs stored in this h5 file, i.e. the number of rows of the
    /metadata/songs table (the table holding name, artist, ...).
    """
    songs_table = h5.root.metadata.songs
    return songs_table.nrows

def get_artist_familiarity(h5, songidx=0):
    """
    Look up the artist familiarity of one song in an open HDF5 song file.
    Reads column ``artist_familiarity`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_familiarity[songidx]

def get_artist_hotttnesss(h5, songidx=0):
    """
    Look up the artist hotttnesss of one song in an open HDF5 song file.
    Reads column ``artist_hotttnesss`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_hotttnesss[songidx]

def get_artist_id(h5, songidx=0):
    """
    Look up the artist id of one song in an open HDF5 song file.
    Reads column ``artist_id`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_id[songidx]

def get_artist_mbid(h5, songidx=0):
    """
    Look up the artist musicbrainz id of one song in an open HDF5 song file.
    Reads column ``artist_mbid`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_mbid[songidx]

def get_artist_playmeid(h5, songidx=0):
    """
    Look up the artist playme id of one song in an open HDF5 song file.
    Reads column ``artist_playmeid`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_playmeid[songidx]

def get_artist_7digitalid(h5, songidx=0):
    """
    Look up the artist 7digital id of one song in an open HDF5 song file.
    Reads column ``artist_7digitalid`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_7digitalid[songidx]

def get_artist_latitude(h5, songidx=0):
    """
    Look up the artist latitude of one song in an open HDF5 song file.
    Reads column ``artist_latitude`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_latitude[songidx]

def get_artist_longitude(h5, songidx=0):
    """
    Look up the artist longitude of one song in an open HDF5 song file.
    Reads column ``artist_longitude`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_longitude[songidx]

def get_artist_location(h5, songidx=0):
    """
    Look up the artist location of one song in an open HDF5 song file.
    Reads column ``artist_location`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.artist_location[songidx]

def get_release(h5, songidx=0):
    """
    Look up the release of one song in an open HDF5 song file.
    Reads column ``release`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.release[songidx]

def get_release_7digitalid(h5, songidx=0):
    """
    Look up the release 7digital id of one song in an open HDF5 song file.
    Reads column ``release_7digitalid`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.release_7digitalid[songidx]

def get_song_id(h5, songidx=0):
    """
    Look up the song id of one song in an open HDF5 song file.
    Reads column ``song_id`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.song_id[songidx]

def get_song_hotttnesss(h5, songidx=0):
    """
    Look up the song hotttnesss of one song in an open HDF5 song file.
    Reads column ``song_hotttnesss`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.song_hotttnesss[songidx]

def get_track_7digitalid(h5, songidx=0):
    """
    Look up the track 7digital id of one song in an open HDF5 song file.
    Reads column ``track_7digitalid`` of /metadata/songs at row ``songidx``
    (the first song by default).
    """
    song_columns = h5.root.metadata.songs.cols
    return song_columns.track_7digitalid[songidx]

def get_similar_artists(h5, songidx=0):
    """
    Return the part of /metadata/similar_artists that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_similar_artists`` offset and ends at
    the next song's offset; for the last row of /metadata/songs it runs to the
    end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    metadata = h5.root.metadata
    offsets = metadata.songs.cols.idx_similar_artists
    is_last_song = metadata.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return metadata.similar_artists[offsets[songidx]:stop]

def get_artist_terms(h5, songidx=0):
    """
    Return the part of /metadata/artist_terms that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_artist_terms`` offset and ends at the
    next song's offset; for the last row of /metadata/songs it runs to the end
    of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    metadata = h5.root.metadata
    offsets = metadata.songs.cols.idx_artist_terms
    is_last_song = metadata.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return metadata.artist_terms[offsets[songidx]:stop]

def get_artist_terms_freq(h5, songidx=0):
    """
    Return the part of /metadata/artist_terms_freq that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice is located with the ``idx_artist_terms`` offsets: it begins at
    the song's offset and ends at the next song's offset; for the last row of
    /metadata/songs it runs to the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    metadata = h5.root.metadata
    offsets = metadata.songs.cols.idx_artist_terms
    is_last_song = metadata.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return metadata.artist_terms_freq[offsets[songidx]:stop]

def get_artist_terms_weight(h5, songidx=0):
    """
    Return the part of /metadata/artist_terms_weight that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice is located with the ``idx_artist_terms`` offsets: it begins at
    the song's offset and ends at the next song's offset; for the last row of
    /metadata/songs it runs to the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    metadata = h5.root.metadata
    offsets = metadata.songs.cols.idx_artist_terms
    is_last_song = metadata.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return metadata.artist_terms_weight[offsets[songidx]:stop]

def get_analysis_sample_rate(h5, songidx=0):
    """
    Look up the analysis sample rate of one song in an open HDF5 song file.
    Reads column ``analysis_sample_rate`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.analysis_sample_rate[songidx]

def get_audio_md5(h5, songidx=0):
    """
    Look up the audio MD5 of one song in an open HDF5 song file.
    Reads column ``audio_md5`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.audio_md5[songidx]

def get_danceability(h5, songidx=0):
    """
    Look up the danceability of one song in an open HDF5 song file.
    Reads column ``danceability`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.danceability[songidx]

def get_duration(h5, songidx=0):
    """
    Look up the duration of one song in an open HDF5 song file.
    Reads column ``duration`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.duration[songidx]

def get_end_of_fade_in(h5, songidx=0):
    """
    Look up the end of fade in of one song in an open HDF5 song file.
    Reads column ``end_of_fade_in`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.end_of_fade_in[songidx]

def get_energy(h5, songidx=0):
    """
    Look up the energy of one song in an open HDF5 song file.
    Reads column ``energy`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.energy[songidx]

def get_key(h5, songidx=0):
    """
    Look up the key of one song in an open HDF5 song file.
    Reads column ``key`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.key[songidx]

def get_key_confidence(h5, songidx=0):
    """
    Look up the key confidence of one song in an open HDF5 song file.
    Reads column ``key_confidence`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.key_confidence[songidx]

def get_loudness(h5, songidx=0):
    """
    Look up the loudness of one song in an open HDF5 song file.
    Reads column ``loudness`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.loudness[songidx]

def get_mode(h5, songidx=0):
    """
    Look up the mode of one song in an open HDF5 song file.
    Reads column ``mode`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.mode[songidx]

def get_mode_confidence(h5, songidx=0):
    """
    Look up the mode confidence of one song in an open HDF5 song file.
    Reads column ``mode_confidence`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.mode_confidence[songidx]

def get_start_of_fade_out(h5, songidx=0):
    """
    Look up the start of fade out of one song in an open HDF5 song file.
    Reads column ``start_of_fade_out`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.start_of_fade_out[songidx]

def get_tempo(h5, songidx=0):
    """
    Look up the tempo of one song in an open HDF5 song file.
    Reads column ``tempo`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.tempo[songidx]

def get_time_signature(h5, songidx=0):
    """
    Look up the time signature of one song in an open HDF5 song file.
    Reads column ``time_signature`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.time_signature[songidx]

def get_time_signature_confidence(h5, songidx=0):
    """
    Look up the time signature confidence of one song in an open HDF5 song file.
    Reads column ``time_signature_confidence`` of /analysis/songs at row
    ``songidx`` (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.time_signature_confidence[songidx]

def get_track_id(h5, songidx=0):
    """
    Look up the track id of one song in an open HDF5 song file.
    Reads column ``track_id`` of /analysis/songs at row ``songidx``
    (the first song by default).
    """
    analysis_columns = h5.root.analysis.songs.cols
    return analysis_columns.track_id[songidx]

def get_segments_start(h5, songidx=0):
    """
    Return the part of /analysis/segments_start that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_segments_start`` offset and ends at
    the next song's offset; for the last row of /analysis/songs it runs to the
    end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_start[offsets[songidx]:stop]
    
def get_segments_confidence(h5, songidx=0):
    """
    Return the part of /analysis/segments_confidence that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_segments_confidence`` offset and ends
    at the next song's offset; for the last row of /analysis/songs it runs to
    the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_confidence
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_confidence[offsets[songidx]:stop]

def get_segments_pitches(h5, songidx=0):
    """
    Return the rows of /analysis/segments_pitches that belong to one song
    (the first song by default), which also works for aggregate files.
    Rows start at the song's ``idx_segments_pitches`` offset and stop at the
    next song's offset; for the last row of /analysis/songs they run to the
    end of the array. The second axis is always taken in full.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_pitches
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_pitches[offsets[songidx]:stop, :]

def get_segments_timbre(h5, songidx=0):
    """
    Return the rows of /analysis/segments_timbre that belong to one song
    (the first song by default), which also works for aggregate files.
    Rows start at the song's ``idx_segments_timbre`` offset and stop at the
    next song's offset; for the last row of /analysis/songs they run to the
    end of the array. The second axis is always taken in full.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_timbre
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_timbre[offsets[songidx]:stop, :]

def get_segments_loudness_max(h5, songidx=0):
    """
    Return the part of /analysis/segments_loudness_max that belongs to one
    song (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_segments_loudness_max`` offset and
    ends at the next song's offset; for the last row of /analysis/songs it
    runs to the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_loudness_max
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_loudness_max[offsets[songidx]:stop]

def get_segments_loudness_max_time(h5, songidx=0):
    """
    Return the part of /analysis/segments_loudness_max_time that belongs to
    one song (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_segments_loudness_max_time`` offset
    and ends at the next song's offset; for the last row of /analysis/songs it
    runs to the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_loudness_max_time
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_loudness_max_time[offsets[songidx]:stop]

def get_segments_loudness_start(h5, songidx=0):
    """
    Return the part of /analysis/segments_loudness_start that belongs to one
    song (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_segments_loudness_start`` offset and
    ends at the next song's offset; for the last row of /analysis/songs it
    runs to the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_loudness_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_loudness_start[offsets[songidx]:stop]

def get_sections_start(h5, songidx=0):
    """
    Return the part of /analysis/sections_start that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_sections_start`` offset and ends at
    the next song's offset; for the last row of /analysis/songs it runs to the
    end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_sections_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.sections_start[offsets[songidx]:stop]

def get_sections_confidence(h5, songidx=0):
    """
    Return the part of /analysis/sections_confidence that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_sections_confidence`` offset and ends
    at the next song's offset; for the last row of /analysis/songs it runs to
    the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_sections_confidence
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.sections_confidence[offsets[songidx]:stop]

def get_beats_start(h5, songidx=0):
    """
    Return the part of /analysis/beats_start that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_beats_start`` offset and ends at the
    next song's offset; for the last row of /analysis/songs it runs to the end
    of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_beats_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.beats_start[offsets[songidx]:stop]

def get_beats_confidence(h5, songidx=0):
    """
    Return the part of /analysis/beats_confidence that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_beats_confidence`` offset and ends at
    the next song's offset; for the last row of /analysis/songs it runs to the
    end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_beats_confidence
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.beats_confidence[offsets[songidx]:stop]

def get_bars_start(h5, songidx=0):
    """
    Return the part of /analysis/bars_start that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_bars_start`` offset and ends at the
    next song's offset; for the last row of /analysis/songs it runs to the end
    of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_bars_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.bars_start[offsets[songidx]:stop]

def get_bars_confidence(h5, songidx=0):
    """
    Return the part of /analysis/bars_confidence that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_bars_confidence`` offset and ends at
    the next song's offset; for the last row of /analysis/songs it runs to the
    end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_bars_confidence
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.bars_confidence[offsets[songidx]:stop]

def get_tatums_start(h5, songidx=0):
    """
    Return the part of /analysis/tatums_start that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_tatums_start`` offset and ends at the
    next song's offset; for the last row of /analysis/songs it runs to the end
    of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_tatums_start
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.tatums_start[offsets[songidx]:stop]

def get_tatums_confidence(h5, songidx=0):
    """
    Return the part of /analysis/tatums_confidence that belongs to one song
    (the first song by default), which also works for aggregate files.
    The slice begins at the song's ``idx_tatums_confidence`` offset and ends
    at the next song's offset; for the last row of /analysis/songs it runs to
    the end of the array.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_tatums_confidence
    is_last_song = analysis.songs.nrows == songidx + 1
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.tatums_confidence[offsets[songidx]:stop]

def get_artist_mbtags(h5,songidx=0):
    """
    Get artist musicbrainz tag array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )

    The slice of /musicbrainz/artist_mbtags starts at the song's
    ``idx_artist_mbtags`` offset and ends at the next song's offset; for the
    last row of /musicbrainz/songs it runs to the end of the array.
    """
    musicbrainz = h5.root.musicbrainz
    # Offsets live in /musicbrainz/songs. The previous non-last-song branch
    # looked them up in /metadata/songs, unlike the last-song branch.
    offsets = musicbrainz.songs.cols.idx_artist_mbtags
    if musicbrainz.songs.nrows == songidx + 1:
        return musicbrainz.artist_mbtags[offsets[songidx]:]
    return musicbrainz.artist_mbtags[offsets[songidx]:offsets[songidx+1]]

def get_artist_mbtags_count(h5,songidx=0):
    """
    Get artist musicbrainz tag count array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )

    The slice of /musicbrainz/artist_mbtags_count is located with the
    ``idx_artist_mbtags`` offsets: it starts at the song's offset and ends at
    the next song's offset; for the last row of /musicbrainz/songs it runs to
    the end of the array.
    """
    musicbrainz = h5.root.musicbrainz
    # Offsets live in /musicbrainz/songs. The previous non-last-song branch
    # looked them up in /metadata/songs, unlike the last-song branch.
    offsets = musicbrainz.songs.cols.idx_artist_mbtags
    if musicbrainz.songs.nrows == songidx + 1:
        return musicbrainz.artist_mbtags_count[offsets[songidx]:]
    return musicbrainz.artist_mbtags_count[offsets[songidx]:offsets[songidx+1]]

def get_year(h5, songidx=0):
    """
    Look up the release year of one song in an open HDF5 song file.
    Reads column ``year`` of /musicbrainz/songs at row ``songidx``
    (the first song by default).
    """
    mb_columns = h5.root.musicbrainz.songs.cols
    return mb_columns.year[songidx]


### Loop through the files, call each getter function, and zip the results into dictionaries for a pandas DataFrame

In [34]:
# (column name, getter) pairs in output order. Keeping each name next to its
# getter prevents the two from drifting apart. The array-valued getters
# (similar artists, musicbrainz tags and tag counts) are not collected.
song_fields = [
    ('artist', get_artist_name),
    ('title', get_title),
    ('familiarity', get_artist_familiarity),
    ('hotttnesss_artist', get_artist_hotttnesss),
    ('artist_id', get_artist_id),
    ('mbid', get_artist_mbid),
    ('playmeid', get_artist_playmeid),
    ('seven_digital', get_artist_7digitalid),
    ('latitude', get_artist_latitude),
    ('longitude', get_artist_longitude),
    ('location', get_artist_location),
    ('release', get_release),
    ('song_id', get_song_id),
    ('hotttness_song', get_song_hotttnesss),
    ('seven_trackid', get_track_7digitalid),
    ('sample_rate_analysis', get_analysis_sample_rate),
    ('danceability', get_danceability),
    ('duration', get_duration),
    ('end_of_fade', get_end_of_fade_in),
    ('energy', get_energy),
    ('key', get_key),
    ('key_confidence', get_key_confidence),
    ('loudness', get_loudness),
    ('mode', get_mode),
    ('mode_confidence', get_mode_confidence),
    ('start_fade', get_start_of_fade_out),
    ('tempo', get_tempo),
    ('time_signature', get_time_signature),
    ('time_signature_confidence', get_time_signature_confidence),
    ('year', get_year),
]

# Column names alone, in the same order as before.
headers = [name for name, _ in song_fields]

song_data = []

for f in allh5:
    h5 = open_h5_file_read(f)
    try:
        # Every getter is called with its default songidx=0, i.e. the first
        # song stored in the file.
        record = {name: getter(h5) for name, getter in song_fields}
    finally:
        # Close the handle even when a getter raises, so no HDF5 file stays open.
        h5.close()

    song_data.append(record)

# Show a small sample only; the full list holds one dict per input file.
song_data[:2]

[{'artist': b'Mastodon',
  'artist_id': b'ARMQHX71187B9890D3',
  'danceability': 0.0,
  'duration': 280.21505999999999,
  'end_of_fade': 0.23799999999999999,
  'energy': 0.0,
  'familiarity': 0.78046174877704066,
  'hotttness_song': 0.59764079771477685,
  'hotttnesss_artist': 0.57427473051685607,
  'key': 5,
  'key_confidence': 0.55500000000000005,
  'latitude': nan,
  'location': b'Atlanta, GA',
  'longitude': nan,
  'loudness': -3.306,
  'mbid': b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727',
  'mode': 1,
  'mode_confidence': 0.5,
  'playmeid': -1,
  'release': b'Call of the Mastodon',
  'sample_rate_analysis': 22050,
  'seven_digital': 29785,
  'seven_trackid': 2442524,
  'song_id': b'SOVLGJY12A8C13FBED',
  'start_fade': 275.52800000000002,
  'tempo': 173.20500000000001,
  'time_signature': 5,
  'time_signature_confidence': 0.12,
  'title': b'Deep Sea Creature',
  'year': 2001},
 {'artist': b'Casual',
  'artist_id': b'ARD7TVE1187B99BFB1',
  'danceability': 0.0,
  'duration': 218.9317900000

In [35]:
import pandas as pd

# Build a DataFrame from the list of per-song dicts (one row per dict).
df = pd.DataFrame(song_data)

In [36]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,artist,artist_id,danceability,duration,end_of_fade,energy,familiarity,hotttness_song,hotttnesss_artist,key,...,sample_rate_analysis,seven_digital,seven_trackid,song_id,start_fade,tempo,time_signature,time_signature_confidence,title,year
0,b'Mastodon',b'ARMQHX71187B9890D3',0.0,280.21506,0.238,0.0,0.780462,0.597641,0.574275,5,...,22050,29785,2442524,b'SOVLGJY12A8C13FBED',275.528,173.205,5,0.12,b'Deep Sea Creature',2001
1,b'Casual',b'ARD7TVE1187B99BFB1',0.0,218.93179,0.247,0.0,0.581794,0.60212,0.401998,1,...,22050,165270,3401791,b'SOMZWCG12A8C13C480',218.932,92.198,4,0.778,"b""I Didn't Mean To""",0
2,b'The Box Tops',b'ARMJAGH1187FB546F3',0.0,148.03546,0.148,0.0,0.63063,,0.4175,6,...,22050,1998,3400270,b'SOCIWDW12A8C13D406',137.915,121.274,4,0.384,b'Soul Deep',1969
3,b'Sonora Santanera',b'ARKRRTF1187B9984DA',0.0,177.47546,0.282,0.0,0.487357,,0.343428,8,...,22050,290021,5703798,b'SOXVLOJ12AB0189215',172.304,100.07,1,0.0,b'Amor De Cabaret',0
4,b'Adam Ant',b'AR7G5I41187FB4CE6C',0.0,233.40363,0.0,0.0,0.630382,,0.454231,0,...,22050,19072,3226795,b'SONHOTT12A8C13493C',217.124,119.293,4,0.0,b'Something Girls',1982


In [40]:
# Reload the previously saved list of per-song dicts.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# pickle files that you created yourself.
with open('dictionaries/songdata1.pkl', 'rb') as picklefile:
    data_dict = pickle.load(picklefile)

# Show a small sample instead of dumping the whole list into the cell output.
data_dict[:2]

[{'artist': b'Mastodon',
  'artist_id': b'ARMQHX71187B9890D3',
  'danceability': 0.0,
  'duration': 280.21505999999999,
  'end_of_fade': 0.23799999999999999,
  'energy': 0.0,
  'familiarity': 0.78046174877704066,
  'hotttness_song': 0.59764079771477685,
  'hotttnesss_artist': 0.57427473051685607,
  'key': 5,
  'key_confidence': 0.55500000000000005,
  'latitude': nan,
  'location': b'Atlanta, GA',
  'longitude': nan,
  'loudness': -3.306,
  'mbid': b'bc5e2ad6-0a4a-4d90-b911-e9a7e6861727',
  'mode': 1,
  'mode_confidence': 0.5,
  'playmeid': -1,
  'release': b'Call of the Mastodon',
  'sample_rate_analysis': 22050,
  'seven_digital': 29785,
  'seven_trackid': 2442524,
  'song_id': b'SOVLGJY12A8C13FBED',
  'start_fade': 275.52800000000002,
  'tempo': 173.20500000000001,
  'time_signature': 5,
  'time_signature_confidence': 0.12,
  'title': b'Deep Sea Creature',
  'year': 2001},
 {'artist': b'Casual',
  'artist_id': b'ARD7TVE1187B99BFB1',
  'danceability': 0.0,
  'duration': 218.9317900000

In [43]:
# Reload the previously saved DataFrame.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# pickle files that you created yourself.
with open('dataframes/df1.pkl', 'rb') as picklefile:
    df1 = pickle.load(picklefile)
df1.head()

Unnamed: 0,artist,artist_id,danceability,duration,end_of_fade,energy,familiarity,hotttness_song,hotttnesss_artist,key,...,sample_rate_analysis,seven_digital,seven_trackid,song_id,start_fade,tempo,time_signature,time_signature_confidence,title,year
0,b'Mastodon',b'ARMQHX71187B9890D3',0.0,280.21506,0.238,0.0,0.780462,0.597641,0.574275,5,...,22050,29785,2442524,b'SOVLGJY12A8C13FBED',275.528,173.205,5,0.120,b'Deep Sea Creature',2001
1,b'Casual',b'ARD7TVE1187B99BFB1',0.0,218.93179,0.247,0.0,0.581794,0.602120,0.401998,1,...,22050,165270,3401791,b'SOMZWCG12A8C13C480',218.932,92.198,4,0.778,"b""I Didn't Mean To""",0
2,b'The Box Tops',b'ARMJAGH1187FB546F3',0.0,148.03546,0.148,0.0,0.630630,,0.417500,6,...,22050,1998,3400270,b'SOCIWDW12A8C13D406',137.915,121.274,4,0.384,b'Soul Deep',1969
3,b'Sonora Santanera',b'ARKRRTF1187B9984DA',0.0,177.47546,0.282,0.0,0.487357,,0.343428,8,...,22050,290021,5703798,b'SOXVLOJ12AB0189215',172.304,100.070,1,0.000,b'Amor De Cabaret',0
4,b'Adam Ant',b'AR7G5I41187FB4CE6C',0.0,233.40363,0.000,0.0,0.630382,,0.454231,0,...,22050,19072,3226795,b'SONHOTT12A8C13493C',217.124,119.293,4,0.000,b'Something Girls',1982
5,b'Gob',b'ARXR32B1187FB57099',0.0,209.60608,0.066,0.0,0.651046,0.604501,0.401724,2,...,22050,30973,6795666,b'SOFSOCN12A8C143F5D',198.699,129.738,4,0.562,b'Face the Ashes',2007
6,b'Jeff And Sheri Easter',b'ARKFYS91187B98E58F',0.0,267.70240,2.264,0.0,0.535293,,0.385471,5,...,22050,432935,444964,b'SOYMRWW12A6D4FAB14',254.270,147.782,3,0.454,b'The Moon And I (Ordinary Day Album Version)',0
7,b'Rated R',b'ARD0S291187B9B7BF5',0.0,114.78159,0.096,0.0,0.556496,,0.261941,1,...,22050,17970,276593,b'SOMJBYD12A6D4F8557',114.782,111.787,1,0.000,b'Keepin It Real (Skit)',0
8,b'Tweeterfriendly Music',b'AR10USD1187B99F3F1',0.0,189.57016,0.319,0.0,0.801136,,0.605507,4,...,22050,21128,90004,b'SOHKNRJ12A6701D1F8',181.023,101.430,3,0.408,b'Drop of Rain',0
9,b'Planet P Project',b'AR8ZCNI1187B9A069B',0.0,269.81832,5.300,0.0,0.426668,0.265861,0.332276,4,...,22050,276891,3996579,b'SOIAZJW12AB01853F1',258.990,86.643,4,0.487,b'Pink World',1984


In [None]:
def text_clean(x):
    """
    Decode a bytes value to a str using UTF-8; return any other value unchanged.

    Bytes that are not valid UTF-8 are returned unchanged as well (best
    effort), so one bad value does not abort a whole conversion.
    """
    if not isinstance(x, (bytes, bytearray)):
        return x
    try:
        return x.decode('utf-8')
    except UnicodeDecodeError:
        # Keep the raw bytes; only decoding failures are tolerated here, unlike
        # the former bare ``except`` which also hid unrelated errors.
        return x

# Run text_clean on every individual cell of df1 and keep the result as df1.
# NOTE(review): applymap is deprecated in newer pandas releases in favour of
# DataFrame.map; confirm the installed pandas version before switching.
df1 = df1.applymap(text_clean)

In [None]:
# Save df1 to dataframes/msd_clean.pkl with pickle.
with open('dataframes/msd_clean.pkl', 'wb') as picklefile:
    pickle.dump(df1, picklefile)