In [1]:
!pip install pyspark==3.0.1 py4j==0.10.9

Collecting pyspark==3.0.1
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 33 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 48.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612246 sha256=26acb31d3b1c91df7a832bed0a5f0b4483aaee59525bc0f655d65f068d6c58b5
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .master("local[4]") \
    .config("spark.driver.maxResultSize", "8g") \
    .getOrCreate()

In [3]:
from google.colab import drive    
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df = spark.read.parquet('../../data/cleanedDataset_parquet')

In [3]:
df.show(5)

+--------------------+----------------+-----------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+---------------------+--------------------+---------------------+------------------+
|            id_track|popularity_track|duration_ms|              genres|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|sum_artist_followers|sum_artist_popularity|avg_artist_followers|avg_artist_popularity|               age|
+--------------------+----------------+-----------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+---------------------+--------------------+---------------------+------------------+
|00AeAaSNbe92PRrst...|               3|     156067|[classic czech p

In [4]:
df = df.drop("loudness") #perchè ha valori negativi e il nb non li accetta

In [5]:
from pyspark.ml.feature import QuantileDiscretizer 

qds = QuantileDiscretizer(relativeError=0.0001, handleInvalid="error", numBuckets=10, inputCol="popularity_track", outputCol="label")

df = qds.setHandleInvalid("keep").fit(df).transform(df)

In [6]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, StringType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes

In [8]:
assembler = VectorAssembler( 
inputCols=['age',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'avg_artist_followers',
 'avg_artist_popularity',
 'sum_artist_followers',
 'sum_artist_popularity'], 
outputCol="features")
output=assembler.transform(df)

In [9]:
final_data = output.select( "features", "label")

In [10]:
from pyspark.ml.feature import StandardScaler


scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(final_data)

# Normalize each feature to have unit standard deviation.
df = scalerModel.transform(final_data)

In [11]:
train, test = df.randomSplit([0.7, 0.3], seed = 10)

In [12]:
nb = NaiveBayes()
nbModel = nb.fit(train)
predictions = nbModel.transform(test)

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.14489503626227526

# NB with filtered dataset

In [14]:
df = spark.read.parquet('../../data/cleanedDatasetFiltered_parquet')

In [15]:
df = df.drop("loudness") #perchè ha valori negativi e il nb non li accetta

qds = QuantileDiscretizer(relativeError=0.0001, handleInvalid="error", numBuckets=10, inputCol="popularity_track", outputCol="label")

df = qds.setHandleInvalid("keep").fit(df).transform(df)

final_data = output.select( "features", "label")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(final_data)

# Normalize each feature to have unit standard deviation.
df = scalerModel.transform(final_data)

train, test = df.randomSplit([0.7, 0.3], seed = 10)

nb = NaiveBayes()
nbModel = nb.fit(train)
predictions = nbModel.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
evaluator.evaluate(predictions)

0.14489503626227526