In [1]:
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
import pandas as pd
import hdf5getters as GETTERS
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

# Build (or reuse) the Spark session for this notebook.  The builder chain is
# wrapped in parentheses so no line-continuation backslashes are needed.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.35:7077")               # standalone cluster master URL
    .appName("Jenny")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.shuffle.service.enabled", False)
    .getOrCreate()
)

# Low-level context handle used for the RDD work further down.
sc = spark_session.sparkContext

In [2]:
# --- Million Song Dataset helper code -------------------------------------
msd_code_path = '/home/ubuntu/MSongsDB'
dbfile = '/home/ubuntu/MSongsDB/Tasks_Demos/SQLite/track_metadata.db'

# Make the MSongsDB Python sources importable.
# NOTE(review): the first cell imports hdf5getters *before* this line runs, so
# on a fresh kernel that import presumably only succeeds if the module is
# already on the path by other means -- confirm.
sys.path.append(os.path.join(msd_code_path, 'PythonSrc'))

# --- Million Song Subset data layout --------------------------------------
msd_subset_path = '/home/ubuntu/MillionSongSubset/'
msd_subset_data_path = os.path.join(msd_subset_path, 'data')
msd_subset_addf_path = os.path.join(msd_subset_path, 'AdditionalFiles')

def apply_to_all_files(basedir, func=lambda x: x, ext='.h5'):
    """Call ``func`` on every file below ``basedir`` whose name matches ``*ext``.

    Parameters
    ----------
    basedir : str
        Root directory; it is walked recursively with ``os.walk``.
    func : callable, optional
        Invoked once per matching file with the file's path.  Its return
        value is ignored.  Defaults to the identity function.
    ext : str, optional
        Suffix appended to ``*`` to form the per-directory glob pattern.

    Returns
    -------
    int
        Number of files that matched (and were passed to ``func``).
    """
    cnt = 0
    # Only the directory path from os.walk is needed; the file listing comes
    # from glob so that ``ext`` keeps its glob semantics.
    for root, _dirs, _names in os.walk(basedir):
        # Escape the directory part: without this, a folder called e.g.
        # "a[1]" is interpreted as a character class and its files are
        # silently skipped (or files of a sibling "a1" are picked up instead).
        matches = glob.glob(os.path.join(glob.escape(root), '*' + ext))
        cnt += len(matches)
        for path in matches:
            func(path)
    return cnt

# Sanity check: walk the subset with the default no-op callback and report how many track files exist.
print('number of song files:', apply_to_all_files(basedir=msd_subset_data_path))

number of song files: 10000


In [None]:
# Accumulator for (year, song hotness, artist hotness, loudness, tempo, key)
# tuples.  Because it is a set, two files that yield an identical tuple are
# stored only once.
all_the_info = set()

def func_to_get_all_the_info(filename):
    """Read six track attributes from one HDF5 file into ``all_the_info``.

    Parameters
    ----------
    filename : str
        Path of the track file, handed to ``GETTERS.open_h5_file_read``.

    Side effects
    ------------
    Adds one ``(year, hotness, artist_hotness, loudness, tempo, key)`` tuple
    to the module-level ``all_the_info`` set.  The file handle is always
    closed, even when one of the getters raises; in that case nothing is
    added and the exception propagates to the caller.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        record = (
            GETTERS.get_year(h5),
            GETTERS.get_song_hotttnesss(h5),
            GETTERS.get_artist_hotttnesss(h5),
            GETTERS.get_loudness(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_key(h5),
        )
    finally:
        # Previously the handle leaked whenever a getter failed part-way.
        h5.close()

    all_the_info.add(record)

# Visit every track file in the subset once, filling all_the_info as a side effect.
apply_to_all_files(msd_subset_data_path, func_to_get_all_the_info)

In [None]:
# Distribute the harvested tuples across the cluster.
rdd = sc.parallelize(list(all_the_info))

# Normalise each record to a 6-element list whose year is a plain int:
#   [year, song hotness, artist hotness, loudness, tempo, key]
rdd1 = rdd.map(lambda rec: [int(rec[0]), rec[1], rec[2], rec[3], rec[4], rec[5]])

# Keep tracks released in 1980 or later that have a strictly positive song
# hotness.  The comparison is also False for NaN, so rows with a NaN hotness
# (if any) are dropped here as well.  The former `len(x) == 6` and
# `type(x[0]) == int` checks were always true after the map above.
# rdd2 is cached because it is the RDD that gets reused: it feeds rdd3 below
# and is the source of several actions in the correlation cell.
rdd2 = rdd1.filter(lambda rec: rec[0] >= 1980 and rec[1] > 0).cache()

# (year, [song hotness, artist hotness, loudness, tempo, key, 1]) -- the
# trailing 1 is a per-record counter used to turn sums into means.
rdd3 = rdd2.map(lambda rec: (rec[0], [rec[1], rec[2], rec[3], rec[4], rec[5], 1]))

# Sum element-wise per year, then divide the five feature sums by the count.
# NOTE(review): this also averages `key`; confirm a mean is meaningful for it.
rdd4 = rdd3.reduceByKey(lambda a, b: [p + q for p, q in zip(a, b)])\
    .map(lambda kv: (kv[0], [total / kv[1][5] for total in kv[1][:5]]))
rdd4.cache()

In [None]:
from scipy.stats import pearsonr

# Pull the four columns in a single Spark job and split them locally.  This
# replaces four separate collect() calls and guarantees that the i-th entry of
# every list comes from the same record.
_feature_rows = rdd2.map(lambda rec: (rec[1], rec[2], rec[3], rec[4])).collect()

list_song_hot = [row[0] for row in _feature_rows]
list_artist_hot = [row[1] for row in _feature_rows]
list_loudness = [row[2] for row in _feature_rows]
list_tempo = [row[3] for row in _feature_rows]

# Pearson correlation of song hotness against each of the other attributes.
pearson_corr1, p_value1 = pearsonr(list_song_hot, list_artist_hot)
pearson_corr2, p_value2 = pearsonr(list_song_hot, list_loudness)
pearson_corr3, p_value3 = pearsonr(list_song_hot, list_tempo)

print('Compared attribute             Pearson correlation coefficient    p-value')
print(f'Song hotness vs artist hotness {pearson_corr1}                {p_value1}')
print(f'Song hotness vs loudness       {pearson_corr2}                {p_value2}')
print(f'Song hotness vs tempo          {pearson_corr3}                {p_value3}')


In [3]:
# Stop the SparkContext obtained from spark_session in the first cell.
# NOTE(review): this cell shows execution count 3 while the analysis cells
# above it are unexecuted (In [None]) -- presumably it was run before them;
# confirm the notebook still works under Restart & Run All.
sc.stop()