In [1]:
!pip install pyspark==3.1.2 py4j==0.10.9

Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 62 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 43.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=872582a77afb1536900479249b2428d468ec28df55eae8fec376cde52a4d653d
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType

# Create (or reuse) the Spark session for this notebook, running in local mode (`local[*]`).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spotify_classification")
    .getOrCreate()
)

In [3]:
from google.colab import drive               
# Mount Google Drive under /content/drive; the dataset is read from it in the next cell.
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Location of the JSON dataset on the mounted Drive (relative to the working directory).
DATASET_PATH = "drive/My Drive/dataset.json"
df = spark.read.json(DATASET_PATH)

In [5]:
# Preview the first five rows to check the loaded schema.
df.show(n=5)

+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|acousticness|               age|avg_artist_followers|avg_artist_popularity|danceability|duration_ms|energy|              genres|            id_track|instrumentalness|key|liveness|loudness|mode|popularity_track|release_date|speechiness|sum_artist_followers|sum_artist_popularity|  tempo|time_signature|valence|
+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|       0.658| 41.83013698630137|              5403.5|             

In [6]:
from pyspark.ml.feature import QuantileDiscretizer 

# Discretize popularity_track into 3 quantile-based buckets stored as "label_equi".
# handleInvalid="keep" is the setting in effect when the discretizer is fitted.
# NOTE(review): per the Spark docs "keep" puts invalid values in an extra bucket,
# so label_equi may contain a fourth class — confirm.
qds = QuantileDiscretizer(
    numBuckets=3,
    inputCol="popularity_track",
    outputCol="label_equi",
    relativeError=0.0001,
    handleInvalid="keep",
)

df = qds.fit(df).transform(df)

In [7]:
from pyspark.ml.feature import Bucketizer
# Fixed-width popularity classes: split points 0, 33, 66 and 100 define three buckets
# written to the "label" column.
bucketBorders = [0, 33, 66, 100]
bucketer = Bucketizer(
    splits=bucketBorders,
    inputCol="popularity_track",
    outputCol="label",
)
df = bucketer.transform(df)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Columns combined into the single "feat" vector, in this order.
# NOTE(review): avg_artist_followers, loudness and age appear in the displayed schema
# but are not assembled here — confirm this is intentional.
feature_columns = [
    "duration_ms",
    "danceability",
    "energy",
    "key",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
    "sum_artist_followers",
    "sum_artist_popularity",
    "avg_artist_popularity",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="feat")
df = assembler.transform(df)

In [9]:
from pyspark.ml.feature import StandardScaler


# Fit a StandardScaler (default settings) on the assembled "feat" vectors and
# append the scaled vectors as "scaledFeatures".
# NOTE(review): the scaler is fitted on the full DataFrame, before the train/test
# split further down — confirm this is acceptable for the evaluation.
scaler = StandardScaler().setInputCol("feat").setOutputCol("scaledFeatures")
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

In [10]:
from pyspark.ml.feature import VarianceThresholdSelector

# Apply a variance-threshold filter to "scaledFeatures" and store the surviving
# entries as "features", the column the classifiers read.
# NOTE(review): StandardScaler's documented default (withStd=True) scales every
# column to unit variance, so a threshold of 1 on the scaled vectors appears to
# select on floating-point rounding rather than on real variance — confirm, and
# consider selecting on the unscaled "feat" column instead.
selector = VarianceThresholdSelector(
    featuresCol="scaledFeatures",
    outputCol="features",
    varianceThreshold=1,
)

df = selector.fit(df).transform(df)

threshold = selector.getVarianceThreshold()
print("Output: Features with variance lower than %f are removed." % threshold)

Output: Features with variance lower than 1.000000 are removed.


In [11]:
# Display the selected feature vectors without truncating the column.
features_view = df.select("features")
features_view.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1.2361331222982912,2.1964380167735986,2.2425341538161,1.8900121366507203,0.0,2.5283072549510672,6.364381858546468,0.002309747438951722,1.9235483655073946]                    |
|[1.743569701544143,3.545337450987819,0.9547422635058642,1.5596908665673879,0.002985334272949299,3.492953715301628,8.485842478061958,0.004238847132111549,2.067814492920449]    |
|[1.9877358337892006,3.923347616918058,0.7771157958768663,1.3787322577391272E-4,3.442885505738011,0.8985215013

In [12]:
# Keep only the track id, the model input vector and the two candidate label columns.
final_data = df.select(["id_track", "features", "label_equi", "label"])

In [13]:
# 70/30 random train/test split with a fixed seed.
train, test = final_data.randomSplit(weights=[0.7, 0.3], seed=10)

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Naive Bayes

In [None]:
from pyspark.ml.classification import NaiveBayes

In [None]:
# Naive Bayes with default hyper-parameters; the default input columns
# ("features" / "label") are spelled out so the training target is visible.
nb = NaiveBayes(featuresCol="features", labelCol="label")
nbModel = nb.fit(dataset=train)
predictions = nbModel.transform(dataset=test)

In [None]:
# Accuracy of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.6759979027514988

In [None]:
# Accuracy of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the classifier is fitted with the default label column ("label"),
# so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.41662867303439943

In [None]:
# F1 score of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.6714989388255861

In [None]:
# F1 score of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the classifier is fitted with the default label column ("label"),
# so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.3364840077465312

In [None]:
# Recall for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here. This is not an
# average over all classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.8070784838074211

In [None]:
# Recall for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi".
# NOTE(review): the classifier is fitted with the default label column ("label"),
# so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.9035100109467142

In [None]:
# Precision for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.7348072199081491

In [None]:
# Precision for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi".
# NOTE(review): the classifier is fitted with the default label column ("label"),
# so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.4461265976004842

# Linear SVM (one-vs-rest)

In [17]:
from pyspark.ml.classification import LinearSVC, OneVsRest

In [None]:
# Binary linear SVM used as the base learner of the one-vs-rest wrapper.
lsvc = LinearSVC(
    maxIter=10,
    regParam=0.1,
)

In [None]:
# One-vs-rest reduction around the linear SVM; the default input columns
# ("features" / "label") are spelled out so the training target is visible.
ovr = OneVsRest(classifier=lsvc, featuresCol="features", labelCol="label")

In [None]:
# Fit the one-vs-rest model on the training split.
ovrModel = ovr.fit(dataset=train)

In [None]:
# Score the held-out split; this rebinds the notebook-wide `predictions` variable.
predictions = ovrModel.transform(dataset=test)

In [None]:
# Accuracy of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.6696149725306039

In [None]:
# Accuracy of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.35328948868169696

In [None]:
# F1 score of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.6213377368097371

In [None]:
# F1 score of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.22944241855101422

In [None]:
# Recall for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here. This is not an
# average over all classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.932812455390706

In [None]:
# Recall for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi".
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.9666301776192662

In [None]:
# Precision for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.6716206216082881

In [None]:
# Precision for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi".
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.3751301869210108

# Logistic regression (one-vs-rest)

In [15]:
from pyspark.ml.classification import LogisticRegression

# Binary logistic regression used as the base learner of the one-vs-rest wrapper.
lr = LogisticRegression(
    maxIter=10,
    tol=1e-6,
    fitIntercept=True,
)

In [18]:
# One-vs-rest reduction around the logistic regression; the default input columns
# ("features" / "label") are spelled out so the training target is visible.
ovr = OneVsRest(classifier=lr, featuresCol="features", labelCol="label")

In [19]:
# Fit the one-vs-rest model on the training split.
ovrModel = ovr.fit(dataset=train)

In [20]:
# Score the held-out split; this rebinds the notebook-wide `predictions` variable.
predictions = ovrModel.transform(dataset=test)

In [21]:
# Accuracy of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.7006234755055053

In [22]:
# Accuracy of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

0.40308204344951787

In [23]:
# F1 score of `predictions` against the fixed-width "label" classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.6901649218364982

In [24]:
# F1 score of `predictions` against the quantile-based "label_equi" classes.
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.3037653707070569

In [25]:
# Recall for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here. This is not an
# average over all classes.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.8070784838074211

In [26]:
# Recall for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi".
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="recallByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.9035100109467142

In [27]:
# Precision for a single class against "label". No metricLabel was given, so the
# documented default (0.0) applies; it is written out here.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.7479165013096277

In [29]:
# Precision for class 0.0 (the documented default metricLabel, written out here)
# against "label_equi". The evaluator is bound to `evaluator`, the name that is
# evaluated on the last line, so this cell scores against label_equi and not
# against whatever evaluator an earlier cell left behind.
# NOTE(review): the one-vs-rest model is fitted with the default label column
# ("label"), so this compares against a target it was not trained on — confirm intent.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_equi",
    predictionCol="prediction",
    metricName="precisionByLabel",
    metricLabel=0.0,
)
evaluator.evaluate(predictions)

0.4512968401372243