# <center> Million Songs Analysis</center>

## Starting the Spark session

In [1]:
from pyspark.sql import SparkSession

spark_session = SparkSession\
        .builder\
        .master("local[1]") \
        .appName("Project_19")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()
            
spark_context = spark_session.sparkContext

## Importations

In [2]:
import h5py
import io, time
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, when

## Loading the data into RDDs (Skip)

In [3]:
""" This function turns a list of binary elements to a string with the element separated by the character ',' """
def list_bin_to_str(list_terms) :
    string=""
    for term in list_terms :
        string += term.decode()+','
    return string[1:-1]   

In [37]:
""" We prepare first a RDD containing tuples (song_information, artist_information) """
#rdd = spark_context.binaryFiles("hdfs://host-192-168-1-153-ldsa:9000/millionsongs/data/A/B/*")
rdd = spark_context.binaryFiles("/mnt/ms/data/A/B/*")

# A function to open the h5 files one by one and take the information of interest for the song and the artist
def f(x):
    with h5py.File(io.BytesIO(x[1])) as f:
        
        f_meta = f['metadata']["songs"]
        f_analys = f['analysis']["songs"]
        f_brainz = f['musicbrainz']["songs"]
        
        artist_terms = list_bin_to_str(f['metadata']['artist_terms'][:])
        similar_artists = list_bin_to_str(f['metadata']['similar_artists'][:])
        
        return ((f_meta[0][-3].decode(), f_meta[0][-2].decode(),
                f_meta[0][9].decode(), f_meta[0][4].decode(),
                f_meta[0][-6].decode(),
                float(f_analys[0][3]), float(f_meta[0][-4]),
                float(f_analys[0][2]), float(f_analys[0][23]), 
                float(f_analys[0][-4]), int(f_brainz[0][1])), 
               
                (f_meta[0][4].decode(), f_meta[0][9].decode(),
                f_meta[0][6].decode(), float(f_meta[0][3]),
                artist_terms, similar_artists))

rdd = rdd.map(f)

""" We create two RDDs (songs, artists) from the general one """
rdd_songs = rdd.map(lambda x : x[0])
rdd_artists = rdd.map(lambda x : x[1])

## Turning RDDs to Data frames (Skip)

<b> *   Table Songs </b>

In [7]:
attributes = rdd_songs.map(lambda p: Row(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10]))
fields = [StructField("song_id", StringType(), True), StructField("title", StringType(), True), 
          StructField("artist_name", StringType(), True), StructField("artist_id", StringType(), True),
          StructField("release_album", StringType(), True), 
          StructField("duration", FloatType(), True), StructField("hotness", FloatType(), True), 
          StructField("danceability", FloatType(), True), StructField("loudness", FloatType(), True), 
          StructField("tempo", FloatType(), True), StructField("year", IntegerType())]
songs_schema = StructType(fields)

df_songs = spark_session.createDataFrame(attributes, songs_schema)
df_songs.show()

+------------------+--------------------+--------------------+------------------+--------------------+---------+----------+------------+--------+-------+----+
|           song_id|               title|         artist_name|         artist_id|       release_album| duration|   hotness|danceability|loudness|  tempo|year|
+------------------+--------------------+--------------------+------------------+--------------------+---------+----------+------------+--------+-------+----+
|SOPLQMB12AC4686313|Girl Of Mysteriou...|           MARC COHN|ARWRVSR1187FB575FB|    Burning The Daze| 257.5669|       0.0|         0.0| -13.107|102.073|1998|
|SOFJWKK12AB01826CD|The Cookie Bakers...|      Laurie Berkner|ARC1CJB1187B99FCA4|           Buzz Buzz|233.45587|       0.0|         0.0| -16.263| 144.71|2001|
|SOHNDZB12A8C13EDB8|      Shooting Stars|           Cauterize|ARF0COO1187B99B9EF|    So Far From Real|240.43057|0.71432906|         0.0|  -4.714| 97.148|2003|
|SOAPGDK12A6D4FAF5E|        The Blessing|     

<b> *   Table Artists </b>

In [38]:
attributes = rdd_artists.map(lambda p: Row(p[0], p[1], p[2], p[3], p[4], p[5]))
fields = [StructField("id", StringType(), True), StructField("name", StringType(), True), 
          StructField("location", StringType(), True), StructField("hotness", FloatType(), True),
          StructField("terms", StringType(), True), 
          StructField("similar_artists", StringType(), True)]
artists_schema = StructType(fields)

df_artists = spark_session.createDataFrame(attributes, artists_schema)\
                          .dropDuplicates()
df_artists.show()

+------------------+--------------------+------------------+----------+--------------------+--------------------+
|                id|                name|          location|   hotness|               terms|     similar_artists|
+------------------+--------------------+------------------+----------+--------------------+--------------------+
|ARPET7C1187B996C8B|          Bajja Jedd|                  |       0.0|ancehall,reggae,j...|RCQ6GG1187B9A39DB...|
|ARTBBJ51187FB498F5|      Barrett Strong|     Westpoint, MS|0.38878328|otown,jazz funk,l...|RV03D41187FB413E8...|
|ARH39ZK1187B9ABC22|           Girl Talk|   California - SF| 0.5783318|litch,smooth jazz...|RFL5HZ1187B9AC31B...|
|ARYEUNC1187B9A3648|     utopia:banished|                  |0.33196872|roken beat,grunge...|R7TT521187B99DF76...|
|AR6AYVY1187B990E7C|          Tony Rebel|                  | 0.4064398|ancehall,roots re...|RVJGJE1187FB436EB...|
|ARX8MDO1187FB5A1B9| Metabass 'N' Breath|                  |       0.0|ip hop,breakbeat,

## Saving data frames in csv files (Skip)

<b> *   Table Songs </b>

In [8]:
df_songs.write.format('com.databricks.spark.csv')\
        .option("header", "true")\
        .save('/home/ubuntu/MySongs.csv')


<b> *   Table Artists </b>

In [40]:
df_artists.write.format('com.databricks.spark.csv')\
        .option("header", "true")\
        .save('/home/ubuntu/MyArtists.csv')


## Loading data frames from csv files

<b> *   Table Songs </b>

In [4]:
fields = [StructField("song_id", StringType(), True), StructField("title", StringType(), True), 
          StructField("artist_name", StringType(), True), StructField("artist_id", StringType(), True),
          StructField("release_album", StringType(), True), 
          StructField("duration", FloatType(), True), StructField("hotness", FloatType(), True), 
          StructField("danceability", FloatType(), True), StructField("loudness", FloatType(), True), 
          StructField("tempo", FloatType(), True), StructField("year", IntegerType())]
songs_schema = StructType(fields)

df_songs = spark_session.read.load('file:/home/ubuntu/MySongs.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          schema=songs_schema)\
                        .cache()


<b> *   Table Artists </b>

In [5]:
fields = [StructField("id", StringType(), True), StructField("name", StringType(), True), 
          StructField("location", StringType(), True), StructField("hotness", FloatType(), True),
          StructField("terms", StringType(), True), 
          StructField("similar_artists", StringType(), True)]
artists_schema = StructType(fields)

df_artists = spark_session.read.load('file:/home/ubuntu/MyArtists.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          schema=artists_schema)\
                          .cache()

# Functions

In [6]:
""" This function asks the user for the title of a song and returns its attributes in the database if it exists """
def ask_for_song() :
    song_title = input(" Title of the song : ")
    song = df_songs.filter(df_songs["title"].like("%" + song_title + "%"))
    while song.count() == 0 :
        print(" This song doesn't exist in the database")
        song = df_songs.filter(df_songs["title"].like("%" + input("Title of the song : ") + "%"))
    print(" The song is : ", song.first()["title"])
    return song.first()

In [7]:
""" This function returns the songs which belong to the same artist as the input song(Row) """
def same_artist(song) :
    similar_songs = df_songs.filter(df_songs.artist_name == song["artist_name"])
    return similar_songs.filter(similar_songs.title != song["title"])
  

In [75]:
""" This function returns the songs, from the dataframe, which have similar tempo as the input song"""
def similar_tempo(song, dataframe, margin) : 
    similar_songs = dataframe.filter(dataframe.tempo.between(song["tempo"] - margin, song["tempo"] + margin))\
                             .filter(dataframe.title != song["title"])
 
    return similar_songs

In [76]:
""" This function returns the songs, from the dataframe, which have similar loudness as the input song"""
def similar_loudness(song, dataframe, margin) : 
    similar_songs = dataframe.filter(dataframe.loudness.between(song["loudness"] - margin, song["loudness"] + margin))\
                             .filter(dataframe.title != song["title"])

    return similar_songs

In [77]:
""" This function returns the songs, from the dataframe, which have similar hotness as the input song"""
def similar_hotness(song, dataframe, margin) : 
    similar_songs = dataframe.filter(dataframe.hotness.between(song["hotness"] - margin, song["hotness"] + margin))\
                             .filter(dataframe.title != song["title"])

    return similar_songs

In [78]:
""" This function returns the songs, from the dataframe, which have similar danceability as the input song"""
def similar_danceability(song, dataframe, margin) : 
    similar_songs = dataframe.filter(dataframe.danceability.between(song["danceability"] - margin, song["danceability"] + margin))\
                             .filter(dataframe.title != song["title"])

    return similar_songs

In [88]:
""" This function returns the songs, from the dataframe, which are similar to the input song"""
def similar_song_1(song) :
    x = 1
    similar_songs = spark_session.createDataFrame(spark_context.emptyRDD(), songs_schema)
    while similar_songs.count() == 0 :
        similar_songs_inter = similar_tempo(song, df_songs, 0.1*x)
        similar_songs_inter = similar_loudness(song, similar_songs_inter, 0.1*x)
        similar_songs_inter = similar_danceability(song, similar_songs_inter, 0.1*x)
        similar_songs = similar_hotness(song, similar_songs_inter, 0.1*x)
        x *= 2
    return similar_songs

## Core Analysis

In [10]:
user_song = ask_for_song()

 Title of the song : Rattle
 The song is :  Rattlesnake!


In [17]:
songs_same_artist = same_artist(user_song)
if songs_same_artist != None :
    some_songs = songs_same_artist.take(5)
    print(" Songs from the same artist :", end ='\t')
    for song in some_songs : 
        print(song["title"], sep = ", ")
    

 Songs from the same artist :	God Bless You (Goddamn it)


In [21]:
songs_similar_tempo = similar_tempo(user_song, df_songs, 1).collect()
if songs_similar_tempo != None :
    print(" Songs with similar tempo :", end ='\t')
    for song in songs_similar_tempo : 
        print(song["title"], end = ", ")

 Songs with similar tempo :	Fast Girls, High Tonight, Ma Promese, Road Song, My Heart is Yours, Chlebnikov, 

In [43]:
songs_similar_loudness = similar_loudness(user_song, df_songs, 0.1).collect()
if songs_similar_loudness != None :
    print(" Songs with similar loudness :", end ='\t')
    for song in songs_similar_loudness : 
        print(song["title"], end = ", ")

 Songs with similar loudness :	Guajira, I'll Be Autumn_ You Be Winter, Je Me Lâche, Something, Sunshine After The Rain, The Lair Of The White Worm, Heaven, Kinda New (We All Live & Die), She's Gone, Psychobilly Is All Around, The Nothing Doctrine, Medley Leïla Chico, Other Girls, Kids In America, Stanotte, Self Destruct, Ringa Ding Ding, 

In [66]:
songs_similar_hotness = similar_hotness(user_song, df_songs, 0.1).collect()
if songs_similar_hotness != None :
    print(" Songs with similar hotness :", end ='\t')
    for song in songs_similar_hotness : 
        print(song["title"], end = ", ")

 Songs with similar hotness :	Forgotten, The Story, Abominations, Rigor Mortis, When You Wake For Certain, Talk About, Almost Genuine, Stuldt Håjt, Farlig (feat. Tshawe), Is That All You've Got For Me (Album Version), We're Back, Den zu´ich ihm ´rein, Welcome To Goodbye, Love Has Passed Away, The Red The White The Black The Blue, Es dificil, Eyes Of A Child (Album), The Modern Leper, Closing Scene, Over To You, Un Tipo Como Yo, Ladies in Their Sensitivities, Get It For Free, I Love The Lovers, Landing Feet First (Album Version), Into A Swan, Two Good Reasons, Baddest Of The Bad (Album), Guajira, Je Me Lâche, Compulsory Resurrection, Two Hands, Fuzzy, A Heart Without A Home, The Rainbow Connection, Guiding Light (LP Version), The Sound Of Goodbye (Above & Beyond Vocal Mix), Walk the Edge, Come Home, Sooner Or Later, Lake Of Dreams, Kater Street Rag, Avalanche, Heiliges Herz (Thomas Rainer - Remix), Show Me What You Got, Chapter, The Melting Moon (Edit), Chapter III: La Terra Santa, Cras

In [68]:
songs_similar_danceability = similar_danceability(user_song, df_songs, 0.1).collect()
if songs_similar_danceability != None :
    print(" Songs with similar danceability :", end ='\t')
    for song in songs_similar_danceability : 
        print(song["title"], end = ", ")

 Songs with similar danceability :	Girl Of Mysterious Sorrow (LP Version), The Cookie Bakers of the Night, Shooting Stars, The Blessing, Sappy Love Song, Forgotten, Fast Girls, Our Day Will Come, The Story, The Creatures Are Having Fun ..., Falstaff, Doctor Of Hearts (Album Version), Feel It, (There'll Be) Peace On The Valley (For Me), Qui Es-tu ?, Torturing My Soul, La Vida En Rosa, Paginas Muertas - Original, Come See About Me, Little Bitty Pretty One, My Romance, Una Foto En Blanco Y Negro, Digging For Gold, Concluding Speech, Born In The Famine feat. Sizzla & Jesse Jendah, The Fisherman, Whip It, Tir Na Nog, Touching Ground (Prognosis), ILL 04, Crying Time, Checkin' It Out, Viagem, Por um dia e nada mais, Rehtorin luiseva Salli, Abominations, Bit By Bit (Baxter Baxter Remix), Is It In? (Remix), Love Me Now (Rockwilder Remix) (Feat. Wyclef And Redman), Cincinnati Blues, Rigor Mortis, Stark Honesty, Boogie, Girls From The North Country, Citizen Of The Planet, Jenny Had A Birthday, Gl

In [64]:
songs_similar = similar_hotness(user_song, similar_tempo(user_song, similar_loudness(user_song, df_songs, 0.2), 6), 0.5).collect()
if songs_similar != None :
    print(" Songs with similar loudness, tempo & hotness :", end ='\t')
    for song in songs_similar : 
        print(song["title"], end = ", ")

 Songs with similar loudness, tempo & hotness :	Guajira, 

In [89]:
similar_song_1(user_song).collect()

[Row(song_id='SOLKEAF12AB0186985', title='Yoko Mono', artist_name='Rui Reininho', artist_id='AR595Y91187B9A4EF0', release_album='Companhia Das Indias', duration=192.62649536132812, hotness=0.0, danceability=0.0, loudness=-7.446000099182129, tempo=200.07400512695312, year=2008),
 Row(song_id='SORSTBQ12A8C144986', title='Tumba Tumba', artist_name='Charanga Forever', artist_id='ARQ7XJG1187B991CEC', release_album='La Charanga Soy Yo', duration=367.4378967285156, hotness=0.0, danceability=0.0, loudness=-6.622000217437744, tempo=203.96400451660156, year=2000),
 Row(song_id='SOTIFMI12AB0186852', title='K#k* On The Mic (feat. Paul Shapiro)', artist_name='Hip Hop Hoodios', artist_id='ARJ6QUG1187B98BE13', release_album="Agua Pa' La Gente", duration=233.92608642578125, hotness=0.2836011052131653, danceability=0.0, loudness=-4.146999835968018, tempo=200.22000122070312, year=2005),
 Row(song_id='SODPLYE12AB01898FB', title='A New Hope', artist_name='Blink-182', artist_id='ARA3I0J1187FB57869', releas

In [87]:
user_song

Row(song_id='SOLUIMX12AB0189599', title='Rattlesnake!', artist_name='A Static Lullaby', artist_id='ARQ05GM1187B990512', release_album='Rattlesnake!', duration=213.49832153320312, hotness=0.590252697467804, danceability=0.0, loudness=-5.5370001792907715, tempo=202.23300170898438, year=2008)

## Tests (To delete) 

In [4]:
songs = h5py.File('b.h5') # type = HDF5 file.

print(f'keys are {list(songs.values())}'
group_analysis = songs['metadata'] # type: HDF5 group.
print()
print(group_analysis.keys()) # AttributeManager -- these are like a small metadata dictionary attached anywhere in the tree.

#bars_start = songs['analysis']['bars_start'] # Dataset.

# Lets look at a more complex table elsewhere in the tree...

songs = songs['metadata']['songs'] # Dataset.

# This dataset has rows and columns. Lets get the first row...
song_row = songs.value
print()
# print the column names
print(song_row.dtype.names)
print(song_row)
print(song_row.dtype.names[3])
print(song_row[0][3])
#print([song.decode("ASCII") for song in song_row])


keys are [<HDF5 group "/analysis" (16 members)>, <HDF5 group "/metadata" (5 members)>, <HDF5 group "/musicbrainz" (3 members)>]

<KeysViewHDF5 ['artist_terms', 'artist_terms_freq', 'artist_terms_weight', 'similar_artists', 'songs']>

('analyzer_version', 'artist_7digitalid', 'artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_latitude', 'artist_location', 'artist_longitude', 'artist_mbid', 'artist_name', 'artist_playmeid', 'genre', 'idx_artist_terms', 'idx_similar_artists', 'release', 'release_7digitalid', 'song_hotttnesss', 'song_id', 'title', 'track_7digitalid')
[(b'', 165270, 0.58179377, 0.40199754, b'ARD7TVE1187B99BFB1', nan, b'California - LA', nan, b'e77e51a5-4761-45b3-9847-2051f811e366', b'Casual', 4479, b'', 0, 0, b'Fear Itself', 300848, 0.60211999, b'SOMZWCG12A8C13C480', b"I Didn't Mean To", 3401791)]
artist_hotttnesss
0.4019975433642836




## Releasing the resources

In [106]:
spark_context.stop()