In [1]:
from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores
# NOTE(review): the master below is "local[1]", which does not match the
# sizing note above -- confirm which one is intended.

# New API: build (or reuse) the session with a single parenthesised chain.
spark_session = (
    SparkSession.builder
    .master("local[1]")
    .appName("Project_19")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the context that belongs to the session created above.
spark_context = spark_session.sparkContext

In [137]:
import h5py


# Sample file from http://millionsongdataset.com/pages/getting-dataset/
# Open read-only with an explicit mode, and keep the File handle under its own
# name so it is not lost when `songs` is bound to the dataset further down.
songs_file = h5py.File('b.h5', 'r') # HDF5 file.

# Despite the label, this lists the top-level group objects (the values).
print(f'keys are {list(songs_file.values())}')
group_analysis = songs_file['metadata'] # HDF5 group.
print()
print(group_analysis.keys()) # Names of the members stored in the 'metadata' group.

#bars_start = songs_file['analysis']['bars_start'] # Dataset.

# Lets look at a more complex table elsewhere in the tree...

songs = songs_file['metadata']['songs'] # Dataset.

# This dataset has rows and columns. Read all of it into a NumPy structured array.
# `Dataset.value` was removed in h5py 3.0; indexing with `()` is the replacement.
song_row = songs[()]
print()
# print the column names
print(song_row.dtype.names)
print(song_row)
print(song_row.dtype.names[3])
print(song_row[0][3])
#print([song.decode("ASCII") for song in song_row])



# Note: these numpy and h5py data types will likely not survive or behave strangely when passed through Java.
# You may need to extract the information you need, and convert it to regular python lists to 'survive the trip'.

keys are [<HDF5 group "/analysis" (16 members)>, <HDF5 group "/metadata" (5 members)>, <HDF5 group "/musicbrainz" (3 members)>]

<KeysViewHDF5 ['artist_terms', 'artist_terms_freq', 'artist_terms_weight', 'similar_artists', 'songs']>

('analyzer_version', 'artist_7digitalid', 'artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_latitude', 'artist_location', 'artist_longitude', 'artist_mbid', 'artist_name', 'artist_playmeid', 'genre', 'idx_artist_terms', 'idx_similar_artists', 'release', 'release_7digitalid', 'song_hotttnesss', 'song_id', 'title', 'track_7digitalid')
[(b'', 165270, 0.58179377, 0.40199754, b'ARD7TVE1187B99BFB1', nan, b'California - LA', nan, b'e77e51a5-4761-45b3-9847-2051f811e366', b'Casual', 4479, b'', 0, 0, b'Fear Itself', 300848, 0.60211999, b'SOMZWCG12A8C13C480', b"I Didn't Mean To", 3401791)]
artist_hotttnesss
0.4019975433642836




In [153]:
# Re-open the sample file read-only with an explicit mode. The File handle
# keeps its own name so it is not lost when `songs` is bound to the
# 'metadata' group, and can be closed later with `songs_file.close()`.
songs_file = h5py.File('b.h5', 'r')
songs = songs_file["metadata"]
print(songs["songs"][0])             # first row of the 'songs' table
print(songs['similar_artists'][:])   # entire 'similar_artists' dataset


(b'', 165270, 0.58179377, 0.40199754, b'ARD7TVE1187B99BFB1', nan, b'California - LA', nan, b'e77e51a5-4761-45b3-9847-2051f811e366', b'Casual', 4479, b'', 0, 0, b'Fear Itself', 300848, 0.60211999, b'SOMZWCG12A8C13C480', b"I Didn't Mean To", 3401791)
[b'ARV4KO21187FB38008' b'ARWHM281187FB3D381' b'ARJGOG11187B98D89F'
 b'AR9ODB41187FB459B2' b'ARXM6VQ1187FB5B1E0' b'ARNWZ1N1187B9B71BA'
 b'ARDWYZZ11F4C8413FA' b'ARTP3H51187B98FB75' b'ARWCDXN12454A4D1E8'
 b'ARJ54S61187B9ACD39' b'AR5PF241187B989C1D' b'ARR7MLL1187B99B636'
 b'ARLMHFV1187B9A3833' b'ARPRERY1187B99E2DC' b'AR34BCQ1187B9A68E4'
 b'ARFWBUC11F4C8413DA' b'ARPWGMN1187FB560E3' b'ARVCIVW12454A4D1E7'
 b'ARG89HY1187FB3CA15' b'AR9IGU51187FB40D6B' b'ARNNOYR11F4C845127'
 b'ARZMFNT11F4C8413DD' b'ARPR9W71187FB3723A' b'AR5VBGP1187B98EB43'
 b'ARFHDOI1187FB57230' b'ARBSQPF11F4C8413E0' b'AROYGID11F4C8413DB'
 b'ARDXUGZ11F4C84452F' b'ARMW4I01187B98AEF8' b'AR7AYQG1187B994B3F'
 b'ARHVZEM11F4C841FF9' b'ARP9H0U1187FB3FEA7' b'ARVSIGU11F4C8413E6'
 b'AROWKNS1187

In [154]:
import io, time
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id

# Load every file matching the glob as a (path, raw bytes) pair.
# Alternative source on HDFS:
#   "hdfs://host-192-168-1-153-ldsa:9000/millionsongs/data/A/B/*"
rdd = spark_context.binaryFiles(path="/mnt/ms/data/A/B/*")

def f(x):
    """Extract one song tuple and one artist tuple from a single HDF5 song file.

    Parameters
    ----------
    x : tuple
        A pair whose second element, ``x[1]``, holds the raw bytes of one
        HDF5 file (the shape of the records yielded by ``binaryFiles``).
        ``x[0]`` is not used.

    Returns
    -------
    tuple
        ``(song, artist)`` where

        * ``song`` is ``(song_id, title, artist_name, release, duration,
          song_hotttnesss, danceability, loudness, tempo, year)``
        * ``artist`` is ``(artist_id, artist_name, artist_location,
          artist_hotttnesss)``

        Every element is converted to a plain ``str``, ``float`` or ``int``
        so no numpy/h5py objects leave this function.
    """
    # Explicit read-only mode; the handle has its own name instead of
    # shadowing this function's name `f`.
    with h5py.File(io.BytesIO(x[1]), 'r') as h5_file:
        # Read row 0 of each table exactly once; indexing the dataset again
        # for every field would repeat the HDF5 read.
        meta = h5_file['metadata']["songs"][0]
        analysis = h5_file['analysis']["songs"][0]
        brainz = h5_file['musicbrainz']["songs"][0]

        # Metadata fields are addressed by name (same columns the positional
        # indices -3, -2, 9, -6, -4, 4, 6, 3 selected in the 20-column table).
        # The analysis/musicbrainz field names are not shown in this notebook,
        # so those keep their positional indices; the trailing labels are the
        # column names they are given downstream -- TODO confirm against the
        # dataset's dtype.
        song = (
            meta['song_id'].decode(),
            meta['title'].decode(),
            meta['artist_name'].decode(),
            meta['release'].decode(),
            float(analysis[3]),             # duration
            float(meta['song_hotttnesss']),
            float(analysis[2]),             # danceability
            float(analysis[23]),            # loudness
            float(analysis[-4]),            # tempo
            int(brainz[1]),                 # year
        )
        artist = (
            meta['artist_id'].decode(),
            meta['artist_name'].decode(),
            meta['artist_location'].decode(),
            float(meta['artist_hotttnesss']),
        )
        return (song, artist)

# Parse every file into a (song tuple, artist tuple) pair ...
rdd = rdd.map(f)
# ... then split the pairs: first elements are songs, second are artists.
rdd_songs = rdd.keys()
rdd_artists = rdd.values()

In [116]:
# Wrap each 10-element song tuple in a Row, keeping the element order.
attributes = rdd_songs.map(lambda song: Row(*song))

# Column name / Spark type pairs, in tuple order; every column is nullable.
fields = [
    StructField(column, dtype, True)
    for column, dtype in (
        ("song_id", StringType()),
        ("title", StringType()),
        ("artist_name", StringType()),
        ("release_album", StringType()),
        ("duration", FloatType()),
        ("hotness", FloatType()),
        ("danceability", FloatType()),
        ("loudness", FloatType()),
        ("tempo", FloatType()),
        ("year", IntegerType()),
    )
]
schema = StructType(fields)

df_songs = spark_session.createDataFrame(attributes, schema)
df_songs.show()

+------------------+--------------------+--------------------+--------------------+---------+----------+------------+--------+-------+----+
|           song_id|               title|         artist_name|       release_album| duration|   hotness|danceability|loudness|  tempo|year|
+------------------+--------------------+--------------------+--------------------+---------+----------+------------+--------+-------+----+
|SOPLQMB12AC4686313|Girl Of Mysteriou...|           MARC COHN|    Burning The Daze| 257.5669|       0.0|         0.0| -13.107|102.073|1998|
|SOFJWKK12AB01826CD|The Cookie Bakers...|      Laurie Berkner|           Buzz Buzz|233.45587|       0.0|         0.0| -16.263| 144.71|2001|
|SOHNDZB12A8C13EDB8|      Shooting Stars|           Cauterize|    So Far From Real|240.43057|0.71432906|         0.0|  -4.714| 97.148|2003|
|SOAPGDK12A6D4FAF5E|        The Blessing|                 Joi|        Without Zero|333.24362|0.44369507|         0.0|  -4.141| 95.973|   0|
|SOUYWLC12AB0181B41|

In [155]:
# Wrap each 4-element artist tuple in a Row, keeping the element order.
attributes = rdd_artists.map(lambda artist: Row(*artist))

# Column name / Spark type pairs, in tuple order; every column is nullable.
fields = [
    StructField(column, dtype, True)
    for column, dtype in (
        ("id", StringType()),
        ("name", StringType()),
        ("location", StringType()),
        ("hotness", FloatType()),
    )
]
schema = StructType(fields)

# The same artist appears once per song file, so drop fully identical rows.
artists_with_duplicates = spark_session.createDataFrame(attributes, schema)
df_artists = artists_with_duplicates.dropDuplicates()
df_artists.show()

+------------------+--------------------+--------------------+----------+-----------+
|                id|                name|            location|   hotness| id_created|
+------------------+--------------------+--------------------+----------+-----------+
|AR53H581187B9A93F9|        Sylvie Lewis|     California - LA|0.31954274|          0|
|ARX4VP21187FB37BF9|      The Alter Boys|                    |0.43073228|          1|
|ARV2MS01187FB39026|          Les Baxter|         Detroit, MI|0.41340813|          2|
|ARK893O1187B994919|    The Mayflies USA|                    |0.24001746|          3|
|AREC15G1187B9B84F2|           Sly Stone|   San Francisco, CA|0.47389132| 8589934592|
|AR6QMID1187FB53957|             Caravan|             England|0.41063988| 8589934593|
|ARN94E81187B9A124F|              Soweto|                    | 0.3330376| 8589934594|
|ARWXN5D1187FB470CF|             Buck 65|Sackville, Nova S...| 0.4442687| 8589934595|
|ARENGZO1187FB4CEB4|    Angelic Upstarts|Sheffield, Yo

In [114]:
# Bring the first 50 artist rows to the driver as a list of Row objects.
df_artists.head(50)

[Row(id='3058822f-8e58-4427-820a-8f7a279887c1', name='Heinz Rudolf Kunze', location='Espelkamp-Mittwald, Germany', hotness=0.38667646050453186, id_created=0),
 Row(id='b4806361-a71f-4bf3-981b-e65412b5e536', name='Shola Ama_ Wiley_ Devlin & J2k', location='', hotness=0.4184960424900055, id_created=1),
 Row(id='1f57081f-3478-4824-ab72-8f1b419ea330', name='Neil Norman', location='', hotness=0.2580132484436035, id_created=2),
 Row(id='79051997-aa72-4c72-8993-33e78fe72668', name='Tsujiko Noriko', location='日本の大阪市 (Osaka, Japan)', hotness=0.4096820652484894, id_created=3),
 Row(id='01a5a6cb-f361-4738-bcba-f40bd5553f3c', name='Kelly Llorenna', location='', hotness=0.4227825701236725, id_created=4),
 Row(id='5ca3f318-d028-4151-ac73-78e2b2d6cdcc', name='Tom Petty', location='Gainesville, FL', hotness=0.5683504939079285, id_created=5),
 Row(id='3531715d-fee3-4c5b-82ff-5d85865b598b', name='Reinhard Mey', location='Berlin, Germany', hotness=0.3998229503631592, id_created=6),
 Row(id='e77e51a5-4761

# Show the artists whose name contains the (case-sensitive) substring "MARC".
df_artists.filter(df_artists["name"].like("%MARC%")).show()

In [None]:
# `df` is never defined in this notebook, so `df.count()` fails on a fresh
# kernel. Count the two DataFrames that do exist: (songs, distinct artists).
df_songs.count(), df_artists.count()

In [None]:
# Shut down the context behind the session so its cores are freed for
# other applications.
spark_session.sparkContext.stop()