# Create general scripts to process entire dataset

*Andrea Soto*  
*MIDS W205 Final Project*  
*Project Name: Graph Model of the Million Song Dataset*

---

In [17]:
import sys
import glob
import os

In [37]:
# Scratch check: an empty list is falsy, so negating it gives True
not []

True

In [36]:
# Scratch check: glob.glob returns a list, and bool() of it is False when nothing matched
bool(glob.glob('MillionSongSubset/*/*/*/*.h5') )

False

In [42]:
# Scratch check: os.path.exists reports a missing path by returning False (it does not raise)
os.path.exists('MillionSongSubset/')

True

# Step 1: Create a list of HDF5 and JSON files

In [52]:
%%writefile scripts/list_MDS_files.py
#!/usr/bin/env python
import os
import glob
import sys
import shutil

def main(inFile, outFile, overwrite = False):
    '''
    Writes the paths of all HDF5 song files found under `inFile` to the
    text file 'list_hdf5_files.txt', one path per line.

    inFile:    root directory of the songs; files are searched at
               <inFile>/*/*/*/*.h5
    outFile:   directory where 'list_hdf5_files.txt' is saved
    overwrite: when true, an existing list file is replaced

    Nothing is returned; the outcome is reported on standard output.
    '''
    # os.path.exists() returns False for a missing path instead of raising,
    # so its result has to be tested explicitly.
    if not os.path.exists(inFile):
        print("Input file: '%s' does not exist"%(inFile))
        return

    outFile = os.path.join(outFile, 'list_hdf5_files.txt')
    if os.path.exists(outFile) and not overwrite:
        print("File '%s' already exists"%(outFile))
        return

    # List all paths of songs (sorted so the list is the same on every run)
    get_song_paths = sorted(glob.glob(os.path.join(inFile, '*', '*', '*', '*.h5')))

    if not get_song_paths:
        print("No HDF5 (.h5) files found in '%s'"%(inFile))
        print("Check that the file structure under '%s' is /*/*/*/song_files.h5"%(inFile))
        return

    # The 'with' block closes the file, no explicit close() is needed
    with open(outFile,'w') as f:
        f.write('\n'.join(get_song_paths))
    print("File '%s' successfully created"%(outFile))
    
if __name__ == '__main__':
    '''
    Creates the file 'list_hdf5_files.txt' with the list of HDF5 files
    
    USE:
    python list_MDS_files.py <path to songs> <save list path> <OPTIONAL overwrite>
    
    Paths should NOT include '/' at the end
    If the file already exists, it will not be overwritten. Send 'True' to overwrite
    '''
    
    # Both paths are required: show the usage instead of failing with an IndexError
    if len(sys.argv) < 3:
        sys.exit("USE: python %s <path to songs> <save list path> <OPTIONAL overwrite>"%(sys.argv[0]))
    
    input_path = sys.argv[1]  
    output_path = sys.argv[2]
    
    # Option to overwrite current file
    # Command-line arguments are strings and every non-empty string is truthy,
    # so values such as 'False' have to be recognised explicitly
    overwrite = False
    if len(sys.argv) > 3:
        overwrite = sys.argv[3].strip().lower() not in ('', 'false', '0', 'no')
    
    main(input_path, output_path, overwrite)

Overwriting scripts/list_MDS_files.py


In [45]:
%%writefile scripts/list_LastFM_files.py
#!/usr/bin/env python
import os
import glob
import sys
import shutil

def main(inFile, outFile, overwrite = False):
    '''
    Writes the paths of all JSON files found under `inFile` to the text
    file 'list_lastfm_files.txt', one path per line.

    inFile:    root directory of the JSON files; files are searched at
               <inFile>/*/*/*/*.json
    outFile:   directory where 'list_lastfm_files.txt' is saved
    overwrite: when true, an existing list file is replaced

    Nothing is returned; the outcome is reported on standard output.
    '''
    # os.path.exists() returns False for a missing path instead of raising,
    # so its result has to be tested explicitly.
    if not os.path.exists(inFile):
        print("Input file: '%s' does not exist"%(inFile))
        return

    outFile = os.path.join(outFile, 'list_lastfm_files.txt')
    if os.path.exists(outFile) and not overwrite:
        print("File '%s' already exists"%(outFile))
        return

    # List all paths of songs (sorted so the list is the same on every run)
    get_song_paths = sorted(glob.glob(os.path.join(inFile, '*', '*', '*', '*.json')))

    if not get_song_paths:
        print("No JSON files found in '%s'"%(inFile))
        print("Check that the file structure under '%s' is /*/*/*/song_files.json"%(inFile))
        return

    # The 'with' block closes the file, no explicit close() is needed
    with open(outFile,'w') as f:
        f.write('\n'.join(get_song_paths))
    print("File '%s' successfully created"%(outFile))
    
if __name__ == '__main__':
    '''
    Creates the file 'list_lastfm_files.txt' with the list of JSON files
    
    USE:
    python list_LastFM_files.py <path to JSON files> <save list path> <OPTIONAL overwrite>
    
    Paths should NOT include '/' at the end
    If the file already exists, it will not be overwritten. Send 'True' to overwrite
    '''
    # Both paths are required: show the usage instead of failing with an IndexError
    if len(sys.argv) < 3:
        sys.exit("USE: python %s <path to JSON files> <save list path> <OPTIONAL overwrite>"%(sys.argv[0]))
    
    input_path = sys.argv[1]  
    output_path = sys.argv[2]
    
    # Option to overwrite current file
    # Command-line arguments are strings and every non-empty string is truthy,
    # so values such as 'False' have to be recognised explicitly
    overwrite = False
    if len(sys.argv) > 3:
        overwrite = sys.argv[3].strip().lower() not in ('', 'false', '0', 'no')
    
    main(input_path, output_path, overwrite)

Writing scripts/list_LastFM_files.py


In [40]:
!python scripts/list_MDS_files.py MillionSongSubset/data test

File 'test/list_hdf5_files.txt' successfully created


In [56]:
!python scripts/list_LastFM_files.py MillionSongSubset/lastfm_subset test

File 'test/list_lastfm_files.txt' successfully created


In [59]:
!ls -l test

total 1068
-rw-rw-r-- 1 asoto asoto 509999 Dec 14 07:01 list_hdf5_files.txt
-rw-rw-r-- 1 asoto asoto 578459 Dec 14 07:13 list_lastfm_files.txt


# Step 2: Load and Transform Data in Spark

In [None]:
%%writefile scripts/list_LastFM_files.py
#!/usr/bin/env python
import os
import glob
import sys
import shutil
from pyspark import SparkContext
import numpy as np
import h5py
import json

def main(inFile, outFile, mismatchFile):
    '''
    Loads the song data with Spark and saves the artist nodes as text files.

    inFile:       directory that holds 'list_hdf5_files.txt' and 'list_lastfm_files.txt'
    outFile:      directory where the Spark output folder 'nodes_artists' is written
                  (an existing 'nodes_artists' folder is deleted first)
    mismatchFile: directory that holds 'sid_mismatches.txt'

    The elapsed time is printed once the artist nodes have been saved.
    '''
    # Local import: 'time' is not part of this script's import section
    import time

    # get_h5_info looks up songsToRemove as a global name, so the broadcast
    # has to be bound at module level rather than as a local of main()
    global songsToRemove

    #Start Time
    t1 = time.time()

    # === Start Spark Context ===
    sc = SparkContext(appName="SparkProcessing")
    try:
        # === Load mismatches ===
        toRemoveRDD = sc.textFile('file://'+mismatchFile+'/sid_mismatches.txt').map(parse_mismatches)
        # Stored as a set of tuples: get_h5_info tests (song_id, track_id) tuples
        # for membership, and a tuple never compares equal to a list
        songsToRemove = sc.broadcast(set(tuple(pair) for pair in toRemoveRDD.collect()))

        # === Load list of files ===
        song_pathsRDD   = sc.textFile('file://' + inFile + '/list_hdf5_files.txt')
        lastfm_pathsRDD = sc.textFile('file://' + inFile + '/list_lastfm_files.txt')

        # === Extract Song Data ===
        # get_h5_info returns None for mismatched songs; drop those records
        songsRDD = song_pathsRDD.map(get_h5_info).filter(lambda song: song is not None).cache()
        lastfmRDD = lastfm_pathsRDD.map(get_json_info).cache()

        # === Create Nodes ===

        # === ARTISTS ===
        # CSV Format: artist_id, artist_mb_id, artist_7d_id, artist_name
        fields = ['artist_id', 'artist_mbid', 'artist_7did', 'artist_name']
        fieldsBrC = sc.broadcast(fields)
        # NOTE(review): makeCSVline is not defined in this script and fieldsBrC is
        # only a local name here -- confirm how the helper receives the field list
        # If directory already exists, delete it
        if os.path.exists(outFile+'/nodes_artists'):
            shutil.rmtree(outFile+'/nodes_artists')
        # Process and save
        songsRDD.map(makeCSVline).distinct().saveAsTextFile('file://'+outFile+'/nodes_artists')

        # ----------------------

        #End Time
        t2 = time.time()
        sec = t2-t1
        print("Run Time: %0.2f sec = %.2f min = %.2f h"%(sec,sec/60.0,sec/3600.0))
    finally:
        # === Stop Spark Context ===
        # Runs even when a step above fails, so the context is always released
        sc.stop()


def parse_mismatches(line):
    '''
    This function extracts the songID and trackID of the mismatched records.
    Returned value: ('songID', 'trackID')
    '''
    # Characters 8 to 44 are split on whitespace.
    # Assumes lines look like "ERROR: <songID trackID> ..." with 18-character
    # IDs -- TODO confirm against sid_mismatches.txt
    # A tuple (not a list) is returned so the pair can be compared with other
    # (song_id, track_id) tuples and stored in a set
    return tuple(line[8:45].split())


def get_h5_info(path):
    '''
    Reads one song stored as an HDF5 file and collects, in a dictionary, the
    fields that will be included in the graph.

    Returns None when the (song_id, track_id) pair of the file is found in
    songsToRemove.value.
    '''
    with h5py.File(path, 'r') as h5:
        meta_songs = h5['metadata']['songs']
        song_id = meta_songs['song_id'][0]
        analysis_songs = h5['analysis']['songs']
        track_id = analysis_songs['track_id'][0]

        # Records flagged for removal produce no dictionary
        if (song_id, track_id) in songsToRemove.value:
            return None

        info = {
            # --- Artist Info ---
            'artist_id': meta_songs['artist_id'][0],
            'artist_mbid': meta_songs['artist_mbid'][0],
            'artist_7did': meta_songs['artist_7digitalid'][0],
            'artist_name': meta_songs['artist_name'][0],
            # --- Song Info ---
            'song_id': song_id,
            'track_id': track_id,
            'title': meta_songs['title'][0],
            'dance': analysis_songs['danceability'][0],
            'dur': analysis_songs['duration'][0],
            'energy': analysis_songs['energy'][0],
            'loudness': analysis_songs['loudness'][0],
            # --- Year ---
            'year': h5['musicbrainz']['songs']['year'][0],
            # --- Album ---
            'album': meta_songs['release'][0],
            # --- Similar Artist ---
            'a_similar': np.array(h5['metadata']['similar_artists']),
            # --- Artist Terms ---
            'a_terms': np.array(h5['metadata']['artist_terms']),
            'a_tfrq': np.array(h5['metadata']['artist_terms_freq']),
            'a_tw': np.array(h5['metadata']['artist_terms_weight']),
        }
        return info

def get_json_info(path):
    '''
    Parses the JSON file found at `path` and returns the decoded object.
    '''
    with open(path) as json_handle:
        decoded = json.load(json_handle)
    return decoded
    
    
    
if __name__ == '__main__':
    '''
    input_path: path to where the list of hdf5 and json files was created
    output_path: a temporary directory where the Spark CSV files separated as part-000xx files will be stored
    mismatch_path: path to where the mismatches file is located
    
    DO NOT INCLUDE '/' AT THE END OF PATH
    Cannot change file names
    '''
    # At least the first two paths are required: show the usage instead of
    # failing with an IndexError
    if len(sys.argv) < 3:
        sys.exit("USE: python %s <input_path> <output_path> <mismatch_path>"%(sys.argv[0]))
    
    input_path = sys.argv[1]  
    output_path = sys.argv[2]
    # The mismatch path is the THIRD argument. When it is not given, the output
    # path is used, which is what earlier two-argument invocations resolved to
    mismatch_path = sys.argv[3] if len(sys.argv) > 3 else output_path
    
    
    main(input_path, output_path, mismatch_path)

In [None]:

import numpy as np

import h5py
import json



---

---

# General script structure with a Spark context

In [None]:
%%writefile test_code/count_h5.py
#!/usr/bin/env python
from pyspark import SparkContext
import time
import h5py

def read_h5_file(path):
    '''
    Opens the HDF5 file at `path` read-only and returns the first entry of
    the 'title' field stored under metadata/songs.
    '''
    with h5py.File(path, 'r') as h5:
        song_table = h5['metadata']['songs']
        return song_table['title'][0]
#Start Time
t1 = time.time()

# --- Process files ----
sc = SparkContext(appName="SparkHDF5")
try:
    file_paths = sc.textFile('file:///data/asoto/projectW205/data/list_files.txt')

    # count() is the action that makes Spark read every file in the list
    songs = file_paths.map(read_h5_file)
    songs.count()
    # ----------------------

    #End Time
    t2 = time.time()
    sec = t2-t1

    # One hour has 3600 seconds
    print("Run Time: %0.2f sec = %.2f min = %.2f h"%(sec,sec/60.0,sec/3600.0))
finally:
    # Release the Spark context even when the job fails
    sc.stop()

In [None]:
!spark-submit test_code/count_h5.py