In [1]:
!pip install pyspark==3.1.2 py4j==0.10.9 



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType

spark = SparkSession.builder\
        .master("local[*]")\
        .appName('spotify_classification')\
        .getOrCreate()

In [3]:
from google.colab import drive               
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# CAMBIARE DATASET

In [4]:
df = spark.read.json("drive/My Drive/spot_data.json")

In [5]:
df.show(5)

+------------+------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+-----------+--------------------+---------------------+-------+--------------+-------+
|acousticness|               age|danceability|duration_ms|energy|              genres|            id_track|instrumentalness|key|liveness|loudness|mode|popularity_track|speechiness|sum_artist_followers|sum_artist_popularity|  tempo|time_signature|valence|
+------------+------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+-----------+--------------------+---------------------+-------+--------------+-------+
|       0.658|41.794520547945204|       0.602|     156067| 0.552|[classic czech po...|00AeAaSNbe92PRrst...|             0.0|  0|  0.0972|  -6.667|   1|               3|      0.404|               10807|                   80|182.229|    

# Preprocessing

In [6]:
from pyspark.ml.feature import QuantileDiscretizer 

qds = QuantileDiscretizer(relativeError=0.0001, handleInvalid="error", numBuckets=10, inputCol="popularity_track", outputCol="label")

df = qds.setHandleInvalid("keep").fit(df).transform(df)

In [7]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler( 
inputCols=[
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'sum_artist_followers',
 'sum_artist_popularity'], 
outputCol="feat")
df=assembler.transform(df)

In [8]:
from pyspark.ml.feature import StandardScaler


scaler = StandardScaler(inputCol="feat", outputCol="scaledFeatures")


scalerModel = scaler.fit(df)


df = scalerModel.transform(df)

In [9]:
from pyspark.ml.feature import VarianceThresholdSelector

selector = VarianceThresholdSelector(varianceThreshold=0.1,featuresCol='scaledFeatures', outputCol="features")

df = selector.fit(df).transform(df)

print("Output: Features with variance lower than %f are removed." %
      selector.getVarianceThreshold())

Output: Features with variance lower than 0.100000 are removed.


In [10]:
df.show(5)

+------------+------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+-----------+--------------------+---------------------+-------+--------------+-------+-----+--------------------+--------------------+--------------------+
|acousticness|               age|danceability|duration_ms|energy|              genres|            id_track|instrumentalness|key|liveness|loudness|mode|popularity_track|speechiness|sum_artist_followers|sum_artist_popularity|  tempo|time_signature|valence|label|                feat|      scaledFeatures|            features|
+------------+------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+-----------+--------------------+---------------------+-------+--------------+-------+-----+--------------------+--------------------+--------------------+
|       0.658|41.79452054794

# Scegliere bene treshold e printare le features rimaste

In [11]:
final_data = df.select("id_track", "features", "label")

In [12]:
train, test = final_data.randomSplit([0.7, 0.3], seed = 10)

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Naive Bayes

In [14]:
from pyspark.ml.classification import NaiveBayes

In [15]:
nb = NaiveBayes()                               # bisogna togliere i valori negativi, credo siano tutti in loudness, come si fa a toglierli senza rifare tutto da capo a parte solo per il nb?
nbModel = nb.fit(train)
predictions = nbModel.transform(test)

Py4JJavaError: ignored

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

# Random forest

In [17]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features",labelCol="label",maxDepth = 10)
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [45]:
rfModel.featureImportances

SparseVector(15, {0: 0.0683, 1: 0.021, 2: 0.0288, 3: 0.0024, 4: 0.0949, 5: 0.0006, 6: 0.0463, 7: 0.1779, 8: 0.0455, 9: 0.011, 10: 0.0172, 11: 0.0054, 12: 0.0011, 13: 0.2029, 14: 0.2766})

In [19]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.292176987113696

In [21]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.2508571498484747

In [23]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="recall") # fare anche con recall e precision (non trova il nome)
evaluator.evaluate(predictions)

IllegalArgumentException: ignored

# MLP

In [25]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [24]:
layers = [10, 4, 2, 10]

In [26]:
mlp = MultilayerPerceptronClassifier(labelCol='label',
                                            featuresCol='features',
                                            maxIter=100,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)

mlpModel = mlp.fit(train)


predictions = mlpModel.transform(test)

In [29]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

Py4JJavaError: ignored

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

# LSVM one vs others

In [30]:
from pyspark.ml.classification import LinearSVC, OneVsRest

In [31]:
lsvc = LinearSVC(maxIter=10, regParam=0.1)

In [32]:
ovr = OneVsRest(classifier=lsvc)

In [33]:
ovrModel = ovr.fit(train)

In [34]:
predictions = ovrModel.transform(test)

In [35]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.20261717413616148

In [36]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.1471408697855798

# Logistic regression one vs others

In [39]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True) # i parametri vanno bene?

In [40]:
ovr = OneVsRest(classifier=lr)

In [41]:
ovrModel = ovr.fit(train)

In [42]:
predictions = ovrModel.transform(test)

In [43]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.23162846390694491

In [44]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.18375327687152626