# Graph Model of the Million Song Dataset

*Andrea Soto*  
*MIDS W205 Final Project*

# Download the Subset Data - 10,000 songs

In [None]:
#Create a project directory
!mkdir msd_project
!cd msd_project

In [2]:
%%writefile download_subsetdata.sh
#!/usr/bin/env bash
#
# Download the Million Song Dataset subset plus the auxiliary artist/term
# lists into ./data (relative to the directory the script is run from).

# Stop at the first failing command so a broken download is not silently
# followed by an extraction or line count on missing files.
set -euo pipefail

# Create a directory for the data (no error if it already exists)
mkdir -p data
cd data

# Download data subset of 10,000 songs, ~1GB to develop and test code
wget http://static.echonest.com/millionsongsubset_full.tar.gz

tar xvzf millionsongsubset_full.tar.gz

# Download list of all artist ID
# The format is: artist id<SEP>artist mbid<SEP>track id<SEP>artist name
wget http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_artists.txt
wc -l unique_artists.txt #44745 unique_artists.txt

# Download list of all unique artist terms (Echo Nest tags)
wget http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_terms.txt
wc -l unique_terms.txt #7643 unique_terms.txt

# Download list of all unique artist musicbrainz tags
wget http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_mbtags.txt
wc -l unique_mbtags.txt #2321 unique_mbtags.txt

cd ..

Writing download_subsetdata.sh


# Data Preparation

The Million Song Dataset is stored in HDF5 files. The data is transformed into CSV files, which can then be loaded into Neo4j to create the nodes and relationships of the graph.

In [None]:
# List the files and store them in a separate .txt file
import glob

# One .h5 file per track, nested three directory levels deep
song_files = glob.glob('./data/MillionSongSubset/data/*/*/*/*.h5')
list_file = './data/list_files.txt'

# Write one path per line (no trailing newline); the `with` block closes the
# file on exit, so no explicit close() is needed.
with open(list_file, 'w') as f:
    f.write('\n'.join(song_files))

Start Spark
---

In [1]:
import os
import sys

# Point this kernel (and any child process it spawns) at a specific Spark
# installation; this deliberately overrides any inherited SPARK_HOME.
spark_home = os.environ['SPARK_HOME'] = '/data/spark15'

# The value above is always non-empty, so validate the installation itself
# instead of testing the string for truthiness.
if not os.path.isdir(spark_home):
    raise ValueError('SPARK_HOME directory does not exist: %s' % spark_home)

# Make pyspark and its bundled py4j importable, then run the PySpark shell
# bootstrap script in this namespace.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.5.0
      /_/

Using Python version 2.7.10 (default, Oct 19 2015 18:04:42)
SparkContext available as sc, HiveContext available as sqlContext.


In [None]:
# Display the SparkContext to confirm it is available as `sc`
sc

In [2]:
# Read the list of HDF5 paths from the local filesystem, one path per line
list_files_uri = 'file:///data/asoto/projectW205/data/list_files.txt'
file_paths = sc.textFile(list_files_uri)

In [3]:
# Peek at the first three listed paths
file_paths.take(3)

[u'./data/MillionSongSubset/data/B/B/O/TRBBOPX12903D106F7.h5',
 u'./data/MillionSongSubset/data/B/B/O/TRBBOKQ128F933AE7C.h5',
 u'./data/MillionSongSubset/data/B/B/O/TRBBOPV12903CFB50F.h5']

In [13]:
import h5py
import numpy as np

def get_h5_info(path):
    """Extract selected artist, song, and tag fields from one HDF5 song file.

    Opens ``path`` read-only with h5py and returns a dict holding element 0
    of each scalar field together with numpy copies of the array datasets,
    so the values stay usable after the file is closed.

    Args:
        path: Path of the ``.h5`` file to read.

    Returns:
        dict with the keys ``artist_id``, ``artist_mbid``, ``artist_7did``,
        ``artist_name``, ``song_id``, ``track_id``, ``title``, ``dance``,
        ``dur``, ``energy``, ``loudness``, ``year``, ``album``,
        ``a_similar``, ``a_terms``, ``a_tfrq`` and ``a_tw``.
    """
    with h5py.File(path, 'r') as f:
        # Look up each group/table once instead of once per field
        metadata = f['metadata']
        meta_songs = metadata['songs']
        analysis_songs = f['analysis']['songs']

        return {
            # --- Artist Info -----------------------------
            'artist_id': meta_songs['artist_id'][0],
            'artist_mbid': meta_songs['artist_mbid'][0],
            'artist_7did': meta_songs['artist_7digitalid'][0],
            'artist_name': meta_songs['artist_name'][0],

            # --- Song Info -----------------------------
            'song_id': meta_songs['song_id'][0],
            'track_id': analysis_songs['track_id'][0],
            'title': meta_songs['title'][0],
            'dance': analysis_songs['danceability'][0],
            'dur': analysis_songs['duration'][0],
            'energy': analysis_songs['energy'][0],
            'loudness': analysis_songs['loudness'][0],

            # --- Year -----------------------------
            'year': f['musicbrainz']['songs']['year'][0],

            # --- Album -----------------------------
            'album': meta_songs['release'][0],

            # --- Similar Artist -----------------------------
            'a_similar': np.array(metadata['similar_artists']),

            # --- Artist Terms -----------------------------
            'a_terms': np.array(metadata['artist_terms']),
            'a_tfrq': np.array(metadata['artist_terms_freq']),
            'a_tw': np.array(metadata['artist_terms_weight']),
        }

In [14]:
# Apply get_h5_info to every listed path
dics = file_paths.map(get_h5_info)
# Fetch a single result to inspect the extracted fields
dics.take(1)

[{'a_similar': array(['ARRGFFD1187B9AF330', 'ARIVAXF122BCFCACF3', 'AR6LT5K1187FB562A9',
         'ARI8PQM1187B99577F', 'ARHYS6D1187FB5BBA4', 'AR1XPEO1187B9B560E',
         'AREUFRU1187FB49BEF', 'AR41E9U1187FB5573B', 'AR6AD5N1187FB52F22',
         'ARCF9FU119B866967B', 'ARBVIM21187FB520A2', 'ARISRD71187FB57AE8',
         'ARAMB6Q1187B99DE68', 'ARE3JFT1187FB589B6', 'ARJMAW61187B9A6148',
         'ARP6QCL1187FB36142', 'ARJ41O41187B9A0F53', 'AR1P7OW1187FB5B3E1',
         'ARVMRVW1187FB392FF', 'ARA8DDQ1187B9AE3A0', 'AR3QE2N1187FB588CA',
         'AROF8OV1187FB55B85', 'AR9JJ761187B9AF496', 'ARWCIR91187FB55D30',
         'ARXWXEB1187B9A8592', 'AR0WGKH11C8A414A0F', 'ARJ8S571187FB4550A',
         'ARWY36G11A348EFDFC', 'AR5SZEA1187B9BA0AA', 'ARPFC0M1187B9B969D',
         'ARAEZVZ1187FB573A8', 'AR52O1K1187FB4C98D', 'ARDEOJT1187B990229',
         'ARKWACN11A348F0476', 'ARL26PR1187FB576E5', 'ARE3RNX1187B9ADD8B',
         'AROLJZM1187B994C58', 'ARXOPQ911C8A41568B', 'ARAFF5A1187FB56142',
         'AR

In [1]:
import time

In [2]:
# Record a start timestamp (seconds since the epoch)
t1 = time.time()

In [3]:
# Record an end timestamp (seconds since the epoch)
t2 = time.time()

In [12]:
%%writefile test_code/count_h5.py
#!/usr/bin/env python
from pyspark import SparkContext
import time
import h5py

def read_h5_file(path):
    """Open the HDF5 file at ``path`` read-only and return element 0 of the
    ``title`` field in its ``metadata/songs`` table."""
    with h5py.File(path, 'r') as h5:
        titles = h5['metadata']['songs']['title']
        return titles[0]
#Start Time
t1 = time.time()

# --- Process files ----
sc = SparkContext(appName="SparkHDF5")
try:
    file_paths = sc.textFile('file:///data/asoto/projectW205/data/list_files.txt')

    # count() is the action that makes Spark open and read every listed file
    songs = file_paths.map(read_h5_file)
    songs.count()
    # ----------------------

    #End Time
    t2 = time.time()
    sec = t2-t1

    # 60 seconds per minute, 3600 seconds per hour
    print("Time: %0.2f sec = %.2f min = %.2f h" % (sec, sec/60.0, sec/3600.0))
finally:
    # Always release the SparkContext, even if the job fails
    sc.stop()

Overwriting test_code/count_h5.py


In [None]:
# Run the timing script as a standalone Spark job.
# NOTE(review): the captured log reports Spark 1.3.0 while the notebook kernel
# started Spark 1.5.0 — the spark-submit on PATH may belong to a different
# installation; confirm which one is intended.
!spark-submit test_code/count_h5.py

15/11/29 07:08:53 INFO spark.SparkContext: Running Spark version 1.3.0
15/11/29 07:08:53 INFO spark.SecurityManager: Changing view acls to: asoto
15/11/29 07:08:53 INFO spark.SecurityManager: Changing modify acls to: asoto
15/11/29 07:08:53 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(asoto); users with modify permissions: Set(asoto)
15/11/29 07:08:54 INFO slf4j.Slf4jLogger: Slf4jLogger started
15/11/29 07:08:54 INFO Remoting: Starting remoting
15/11/29 07:08:54 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@ip-10-149-10-206.ec2.internal:60992]
15/11/29 07:08:54 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkDriver@ip-10-149-10-206.ec2.internal:60992]
15/11/29 07:08:54 INFO util.Utils: Successfully started service 'sparkDriver' on port 60992.
15/11/29 07:08:54 INFO spark.SparkEnv: Registering MapOutputTracker
15/11/29 07:08:54 INFO spark.SparkEnv: Registerin