In [1]:
!pip install pyspark==3.1.2 py4j==0.10.9

Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 68 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 53.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=a41367fa3a0d0edc3e0913c2df3f922a3f61540605d26d92324186ac51dca98a
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType

spark = SparkSession.builder\
        .master("local[*]")\
        .appName('spotify_classification')\
        .getOrCreate()

In [3]:
from google.colab import drive               
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = spark.read.json("drive/My Drive/dataset.json")

In [5]:
df.show(5)

+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|acousticness|               age|avg_artist_followers|avg_artist_popularity|danceability|duration_ms|energy|              genres|            id_track|instrumentalness|key|liveness|loudness|mode|popularity_track|release_date|speechiness|sum_artist_followers|sum_artist_popularity|  tempo|time_signature|valence|
+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|       0.658| 41.83013698630137|              5403.5|             

In [6]:
from pyspark.ml.feature import QuantileDiscretizer 

qds = QuantileDiscretizer(relativeError=0.0001, handleInvalid="error", numBuckets=3, inputCol="popularity_track", outputCol="label_equi")

df = qds.setHandleInvalid("keep").fit(df).transform(df)

In [7]:
from pyspark.ml.feature import Bucketizer
bucketBorders=[0,33,66,100]
bucketer=Bucketizer().setSplits(bucketBorders).setInputCol("popularity_track").setOutputCol("label")
df = bucketer.transform(df)

In [8]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler( 
inputCols=[
  'age',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'sum_artist_followers',
 'sum_artist_popularity',
 'avg_artist_popularity'], 
outputCol="feat")
df=assembler.transform(df)

In [9]:
from pyspark.ml.feature import StandardScaler


scaler = StandardScaler(inputCol="feat", outputCol="features")


scalerModel = scaler.fit(df)


df = scalerModel.transform(df)

In [10]:
final_data = df.select("id_track", "features", "label_equi","label")

In [11]:
train, test = final_data.randomSplit([0.7, 0.3], seed = 10)

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random forest

In [13]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features",labelCol="label",maxDepth = 10)
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [14]:
rfModel.featureImportances

SparseVector(16, {0: 0.4371, 1: 0.024, 2: 0.0103, 3: 0.0287, 4: 0.0008, 5: 0.0002, 6: 0.0104, 7: 0.0405, 8: 0.0162, 9: 0.0057, 10: 0.0034, 11: 0.0023, 12: 0.0007, 13: 0.1476, 14: 0.1184, 15: 0.1536})

In [15]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.7869412086534296

In [16]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.7781200159234531

In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="recallByLabel") 
evaluator.evaluate(predictions)

0.8672141912275526

In [18]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="precisionByLabel") 
evaluator.evaluate(predictions)

0.820940163240302

In [19]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features",labelCol="label_equi",maxDepth = 10)
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [20]:
rfModel.featureImportances

SparseVector(16, {0: 0.4794, 1: 0.0245, 2: 0.0089, 3: 0.0279, 4: 0.0007, 5: 0.0002, 6: 0.0109, 7: 0.0421, 8: 0.0206, 9: 0.0041, 10: 0.0038, 11: 0.0021, 12: 0.0004, 13: 0.1247, 14: 0.1195, 15: 0.1301})

In [21]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.6821414730891102

In [22]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.6853778456317526

In [23]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="recallByLabel") 
evaluator.evaluate(predictions)

0.7067516508351284

In [24]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="precisionByLabel") 
evaluator.evaluate(predictions)

0.9474264344060624

# MLP

In [25]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [29]:
layers = [16,10,3,3]

In [30]:
mlp = MultilayerPerceptronClassifier(labelCol='label',
                                            featuresCol='features',
                                            maxIter=100,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)

mlpModel = mlp.fit(train)


predictions = mlpModel.transform(test)

In [31]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.7702031139581006

In [32]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.7633414941900989

In [33]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="recallByLabel") 
evaluator.evaluate(predictions)

0.8489041578241133

In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="precisionByLabel") 
evaluator.evaluate(predictions)

0.8114953194508884

In [35]:
mlp = MultilayerPerceptronClassifier(labelCol='label_equi',
                                            featuresCol='features',
                                            maxIter=100,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)

mlpModel = mlp.fit(train)


predictions = mlpModel.transform(test)

In [36]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.6531732281669592

In [37]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="f1")
evaluator.evaluate(predictions)

0.655317192966532

In [38]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="recallByLabel") 
evaluator.evaluate(predictions)

0.7001659663123698

In [39]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_equi", predictionCol="prediction",metricName="precisionByLabel") 
evaluator.evaluate(predictions)

0.7245884266111202