In [1]:
import findspark
findspark.init() 

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType, StringType
import matplotlib.pyplot as plt 
import seaborn as sns
import pyspark.sql.types as T


In [4]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

def fudf(val):
    return functools.reduce(lambda x, y:x+y, val)

In [5]:
df = spark.read.json('cleaned_dataset')

In [6]:
df.toPandas()

Unnamed: 0,acousticness,danceability,duration_ms,energy,genres,id_track,instrumentalness,key,liveness,loudness,mode,popularity_track,release_date,speechiness,sum_artist_followers,sum_artist_popularity,tempo,time_signature,valence
0,0.658000,0.602,156067,0.552,"[classic czech pop, czech folk, czech rock]",00AeAaSNbe92PRrstQskvH,0.000000,0,0.0972,-6.667,1,3,1980-01-01,0.4040,10807,80,182.229,3,0.650
1,0.543000,0.770,220133,0.891,"[afrobeat, afropop, world]",00DJt4PjkzeXhKKVDekw2n,0.000796,1,0.0684,-7.306,1,9,1976-01-01,0.1720,19833,43,135.573,4,0.898
2,0.000048,0.212,250960,0.986,"[alternative metal, gothenburg metal, melodic ...",00HgVIkZrAL8WjAN9Et6WW,0.918000,0,0.3240,-6.690,0,33,1996-02-20,0.1400,874600,68,140.917,4,0.231
3,0.144000,0.362,457040,0.453,"[corrosion, dark wave, ethereal wave, gothabil...",00Lx2f8NRiFKMGWa0GBGEt,0.827000,11,0.1170,-17.744,0,35,1990-01-01,0.0398,69129,42,118.853,4,0.257
4,0.957000,0.343,282891,0.225,"[brazilian rock, mpb, pop rock brasileiro, roc...",00fzPML4lrNag28OPzsB93,0.000249,1,0.6610,-14.937,0,52,2017-09-22,0.0384,1709414,68,144.533,4,0.101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583358,0.158000,0.775,198549,0.694,[],7zQEJoYjxZBKUfzGO57isE,0.000000,1,0.1000,-8.266,1,37,2019-04-30,0.1920,0,0,88.022,4,0.866
583359,0.757000,0.544,213787,0.844,[hungarian pop],7zSqJS5YBEzpeLxqR6WNnm,0.000005,10,0.2020,-3.735,1,28,2012-11-12,0.0967,4286,24,79.836,4,0.620
583360,0.001150,0.580,279786,0.877,"[chilean indie, chilean rock, latin alternativ...",7zZESEhPkn3qWqfsgcdiQT,0.024200,7,0.6220,-9.005,1,46,2001-10-08,0.0454,200001,56,89.983,4,0.671
583361,0.000092,0.621,421907,0.544,"[alternative rock, big beat, britpop, dance ro...",7zxDWJtcsVvXsbMBxxbSba,0.579000,8,0.8890,-14.754,1,57,1991-09-23,0.0514,362972,60,91.147,4,0.885


# Data Transformation

droppo le colonne inutili ai fini del clustering e la colonna energy per l'alta correlazione con loudness (che è più correlata a popularity track)

In [6]:
df=df.drop('id_track','popularity_track','sum_artist_followers','sum_artist_popularity','energy')

Vector Assembler. A vector assembler is a transformer that converts a set of features into a single vector column often referred to as an array of features 

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["duration_ms", "danceability", "loudness", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature"],
    outputCol="features")

output = assembler.transform(df)

In [8]:
print("Assembled columns")
output.select("features").show(truncate=False)

Assembled columns
+---------------------------------------------------------------------+
|features                                                             |
+---------------------------------------------------------------------+
|[156067.0,0.602,-6.667,0.404,0.658,0.0,0.0972,0.65,182.229,3.0]      |
|[220133.0,0.77,-7.306,0.172,0.543,7.96E-4,0.0684,0.898,135.573,4.0]  |
|[250960.0,0.212,-6.69,0.14,4.8E-5,0.918,0.324,0.231,140.917,4.0]     |
|[457040.0,0.362,-17.744,0.0398,0.144,0.827,0.117,0.257,118.853,4.0]  |
|[282891.0,0.343,-14.937,0.0384,0.957,2.49E-4,0.661,0.101,144.533,4.0]|
|[217000.0,0.893,-5.331,0.0362,0.119,9.76E-4,0.0952,0.958,120.021,4.0]|
|[545107.0,0.351,-17.481,0.0413,0.96,0.0339,0.083,0.237,107.86,4.0]   |
|[256933.0,0.422,-3.806,0.0799,0.168,0.0,0.129,0.616,97.573,4.0]      |
|[249133.0,0.627,-5.604,0.0694,0.775,0.0,0.0886,0.891,152.277,4.0]    |
|[189800.0,0.554,-14.1,0.0438,0.971,0.0,0.142,0.549,103.292,1.0]      |
|[181733.0,0.189,-14.585,0.0359,0.945,0.831,0.

# Problema con dimensionality reduction, come faccio a modificare il dataset?

variance treshold per eliminare gli attributi con alta collinearità (a quanto si setta la treshold?)

In [16]:
from pyspark.ml.feature import VarianceThresholdSelector


selector = VarianceThresholdSelector(varianceThreshold=8.0, outputCol="selectedFeatures")

result = selector.fit(output).transform(output)

print("Output: Features with variance lower than %f are removed." %
      selector.getVarianceThreshold())


Output: Features with variance lower than 8.000000 are removed.


In [11]:
result.show()

+------------+------------+-----------+--------------------+----------------+---+--------+--------+----+------------+-----------+-------+--------------+-------+--------------------+--------------------+
|acousticness|danceability|duration_ms|              genres|instrumentalness|key|liveness|loudness|mode|release_date|speechiness|  tempo|time_signature|valence|            features|    selectedFeatures|
+------------+------------+-----------+--------------------+----------------+---+--------+--------+----+------------+-----------+-------+--------------+-------+--------------------+--------------------+
|       0.658|       0.602|     156067|[classic czech po...|             0.0|  0|  0.0972|  -6.667|   1|  1980-01-01|      0.404|182.229|             3|   0.65|[156067.0,0.602,-...|[156067.0,-6.667,...|
|       0.543|        0.77|     220133|[afrobeat, afropo...|         7.96E-4|  1|  0.0684|  -7.306|   1|  1976-01-01|      0.172|135.573|             4|  0.898|[220133.0,0.77,-7...|[220133

PCA sugli attributi continui per trovare nuovi attributi che riassumano (catturando la varianza) quelli presenti


In [17]:
from pyspark.ml.feature import PCA


pca = PCA(k=3, inputCol="selectedFeatures", outputCol="pcaFeatures")
model = pca.fit(result)

pca_result = model.transform(result).select("pcaFeatures")

In [18]:
pca_result.toPandas()

Unnamed: 0,pcaFeatures
0,"[-156066.9999696477, -181.93801029787437, 12.6..."
1,"[-220132.99997738644, -135.2967676931547, 11.7..."
2,"[-250959.99997650666, -140.66320370840626, 11...."
3,"[-457039.99998002214, -118.28254047004285, 21...."
4,"[-282890.9999757969, -144.01197820344618, 19.6..."
...,...
583358,"[-198548.99998527073, -87.73629670743517, 11.1..."
583359,"[-213786.9999866897, -79.70587755182788, 6.356..."
583360,"[-279785.9999849342, -89.6854445497448, 11.957..."
583361,"[-421906.99998466315, -90.68373396645897, 17.7..."


Scalo gli attributi numerici con il codice utilizzato nella correlation

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as f

In [19]:
columns_to_scale = ["duration_ms", "danceability", "loudness", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature","features"]
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(output)
enriched_df = scalerModel.transform(output)

In [39]:
names = {x + "_scaled": x for x in columns_to_scale}
scaledData = enriched_df.select([f.col(c).alias(names[c]) for c in names.keys()])

In [40]:
scaledData.toPandas()

Unnamed: 0,duration_ms,danceability,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,features
0,[0.027185194968772888],[0.6074672048435923],[0.8157886686245716],[0.41606591143151395],[0.6606425702811246],[0.0],[0.0972],[0.65],[0.7396227793539275],[0.6000000000000001],"[0.027185194968772888, 0.6074672048435923, 0.8..."
1,[0.03858915312091372],[0.7769929364278506],[0.8060144395496818],[0.17713697219361482],[0.5451807228915664],[0.000796],[0.0684],[0.898],[0.5502575279749656],[0.8],"[0.03858915312091372, 0.7769929364278506, 0.80..."
2,[0.04407646024100932],[0.21392532795156408],[0.8154368575624081],[0.14418125643666324],[4.8192771084337354e-05],[0.918],[0.324],[0.231],[0.5719475121864105],[0.8],"[0.04407646024100932, 0.21392532795156408, 0.8..."
3,[0.08075937623378523],[0.36528758829465185],[0.6463534018600098],[0.04098867147270855],[0.14457831325301204],[0.827],[0.117],[0.257],[0.48239515222358864],[0.8],"[0.08075937623378523, 0.36528758829465185, 0.6..."
4,[0.049760282982494804],[0.34611503531786075],[0.6892896475770924],[0.03954685890834191],[0.9608433734939759],[0.000249],[0.661],[0.101],[0.5866239685690049],[0.8],"[0.049760282982494804, 0.34611503531786075, 0...."
...,...,...,...,...,...,...,...,...,...,...,...
583358,[0.03474713032011754],[0.7820383451059536],[0.7913301517376407],[0.1977342945417096],[0.15863453815261044],[0.0],[0.1],[0.866],[0.35725969129113044],[0.8],"[0.03474713032011754, 0.7820383451059536, 0.79..."
583359,[0.037459544304482444],[0.5489404641775985],[0.8606369309838472],[0.0995880535530381],[0.7600401606425703],[4.79e-06],[0.202],[0.62],[0.3240347267037637],[0.8],"[0.037459544304482444, 0.5489404641775985, 0.8..."
583360,[0.049207582797335794],[0.5852674066599394],[0.7800263093489964],[0.046755921730175086],[0.0011546184738955823],[0.0242],[0.622],[0.671],[0.3652189089256071],[0.8],"[0.049207582797335794, 0.5852674066599394, 0.7..."
583361,[0.07450558698895704],[0.6266397578203834],[0.6920888399412628],[0.0529351184346035],[9.26706827309237e-05],[0.579],[0.889],[0.885],[0.36994329919920776],[0.8],"[0.07450558698895704, 0.6266397578203834, 0.69..."


# K-Means

In [41]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

confronto silhouette score per scegliere il miglior numero di cluster

In [69]:
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='features', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')
for i in range(2,20):
    
    KMeans_algo=KMeans(featuresCol='features', k=i)
    
    KMeans_fit=KMeans_algo.fit(scaledData)
    
    output=KMeans_fit.transform(scaledData)
    
    
    
    score=evaluator.evaluate(output)
    
    silhouette_score.append(score)
    
    print("k={}, Silhouette Score:{}".format(i,score))

k=2, Silhouette Score:0.428595164302526
k=3, Silhouette Score:0.44395608623094546
k=4, Silhouette Score:0.4763976071468858
k=5, Silhouette Score:0.3993148505654331
k=6, Silhouette Score:0.4359135719740443
k=7, Silhouette Score:0.2988913478266516
k=8, Silhouette Score:0.4146590220089115
k=9, Silhouette Score:0.41355096233907296
k=10, Silhouette Score:0.3846082357256498
k=11, Silhouette Score:0.3667236844525029
k=12, Silhouette Score:0.3023173341414543
k=13, Silhouette Score:0.2996991762944456
k=14, Silhouette Score:0.3016084075579669
k=15, Silhouette Score:0.322677573706461
k=16, Silhouette Score:0.29370696462627294
k=17, Silhouette Score:0.2872023799435611
k=18, Silhouette Score:0.3032156294549406
k=19, Silhouette Score:0.27383824596196477


In [78]:
# Trains a k-means model.
kmeans = KMeans(featuresCol='features', k=4)
model = kmeans.fit(scaledData)

# Make predictions
predictions = model.transform(scaledData)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.4763976071468858
Cluster Centers: 
[0.02837088 0.66888505 0.6849827  0.88449293 0.6375654  0.00304249
 0.3924098  0.56443483 0.41657326 0.71449965]
[0.04412259 0.46102479 0.68943858 0.06213289 0.68555107 0.81509509
 0.19448088 0.4179651  0.45859238 0.76127627]
[0.04148343 0.61430549 0.79879841 0.08037168 0.15493278 0.02941043
 0.20831805 0.62017253 0.50161222 0.79116178]
[0.03909471 0.52963244 0.74418724 0.06305429 0.75766156 0.02147904
 0.20499052 0.5032176  0.4681169  0.76419739]


In [68]:
predictions.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------+-------+--------------------+--------------------+--------------------+----------+
|         duration_ms|        danceability|            loudness|         speechiness|        acousticness|instrumentalness|liveness|valence|               tempo|      time_signature|            features|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------+-------+--------------------+--------------------+--------------------+----------+
|[0.02718519496877...|[0.6074672048435923]|[0.8157886686245716]|[0.41606591143151...|[0.6606425702811246]|           [0.0]|[0.0972]| [0.65]|[0.7396227793539275]|[0.6000000000000001]|[0.02718519496877...|         3|
|[0.03858915312091...|[0.7769929364278506]|[0.8060144395496818]|[0.17713697219361...|[0.5451807228915664]|       [7.96E-4]|[0.0684]|[0.898]|