In [1]:
import findspark
findspark.init() 

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType, StringType
import matplotlib.pyplot as plt 
import seaborn as sns
import pyspark.sql.types as T


In [4]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

def fudf(val):
    return functools.reduce(lambda x, y:x+y, val)

In [5]:
artist_df = spark.read.option("header", "true").csv("spotify_dataset/artists.csv")

In [6]:
tracks_df = spark.read.option("header", "true").csv("spotify_dataset/tracks.csv")

In [7]:
artist_df.show(10)

+--------------------+---------+------+--------------------+----------+
|                  id|followers|genres|                name|popularity|
+--------------------+---------+------+--------------------+----------+
|0DheY5irMjBUeLybb...|      0.0|    []|Armid & Amir Zare...|         0|
|0DlhY15l3wsrnlfGi...|      5.0|    []|         ปูนา ภาวิณี|         0|
|0DmRESX2JknGPQyO1...|      0.0|    []|               Sadaa|         0|
|0DmhnbHjm1qw6NCYP...|      0.0|    []|           Tra'gruda|         0|
|0Dn11fWM7vHQ3rinv...|      2.0|    []|Ioannis Panoutsop...|         0|
|0DotfDlYMGqkbzfBh...|      7.0|    []|       Astral Affect|         0|
|0DqP3bOCiC48L8SM9...|      1.0|    []|           Yung Seed|         0|
|0Drs3maQb99iRglyT...|      0.0|    []|               Wi'Ma|         0|
|0DsPeAi1gxPPnYjgp...|      0.0|    []|             lentboy|         0|
|0DtvnTxgZ9K5YaPS5...|     20.0|    []|            addworks|         0|
+--------------------+---------+------+--------------------+----

In [8]:
tracks_df.toPandas()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020-09-26,0.56,0.518,0,-7.471,0,0.0292,0.785,0.0,0.0648,0.211,131.896,4
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,0,-5.223,1,0.0652,0.141,0.000297,0.0924,0.686,150.091,4
586669,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020-09-02,0.535,0.314,7,-12.823,0,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,10,-6.212,1,0.0345,0.206,2.53e-06,0.305,0.438,90.029,4


In [9]:
tracks_df.columns

['id',
 'name',
 'popularity',
 'duration_ms',
 'explicit',
 'artists',
 'id_artists',
 'release_date',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature']

In [10]:
tracks_df.count()

586672

In [11]:
artist_df.count()

1104349

In [10]:
artist_df.toPandas()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0
...,...,...,...,...,...
1104344,6rJIG42vcWAf1UBdRFlQxB,3345.0,[],Cody Longo,8
1104345,1ljurfXKPlGncNdW3J8zJ8,2123.0,['deep acoustic pop'],Right the Stars,18
1104346,2vnT9YhKIvjVo9LnVjWmr2,26.0,[],Jesse Giddings,0
1104347,3ID0E5XCvnJIYZEq043ZoB,406.0,[],The Boy Band Project,0


In [13]:
artist_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: string (nullable = true)



In [14]:
tracks_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)



# Data Cleaning

In [7]:
tracks_df.select("explicit").distinct().show(10)

+---------+
| explicit|
+---------+
|   687600|
|   362760|
|   240200|
|   296467|
|   404000|
|  Melot)"|
|   211160|
|  1191416|
|   185347|
| Hoffmann|
+---------+
only showing top 10 rows



In [8]:
#missing values
tracks_df.select([count(when(isnan(c), c)).alias(c) for c in tracks_df.columns]).show()
artist_df.select([count(when(isnan(c), c)).alias(c) for c in artist_df.columns]).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|   0|         0|          0|       0|      0|         0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [9]:
#null values
tracks_df.select([count(when(col(c).isNull(), c)).alias(c) for c in tracks_df.columns]).show()
artist_df.select([count(when(col(c).isNull(), c)).alias(c) for c in artist_df.columns]).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|         0|          0|       0|      0|        12|          12|          12|    12| 12|      12|  12|         12|          12|              12|      12|     12|   12|            12|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [10]:
artist_df_1 = artist_df.withColumn("followers", artist_df.followers.cast(IntegerType())) \
         .withColumn("popularity", artist_df.popularity.cast(IntegerType()))

In [11]:
tracks_df_1 = tracks_df.withColumn("duration_ms", tracks_df.duration_ms.cast(IntegerType())) \
         .withColumn("popularity", tracks_df.popularity.cast(IntegerType())) \
         .withColumn("explicit", tracks_df.explicit.cast(IntegerType())) \
         .withColumn("release_date", tracks_df.release_date.cast(DateType())) \
         .withColumn("danceability", tracks_df.danceability.cast(FloatType())) \
         .withColumn("energy", tracks_df.energy.cast(FloatType())) \
         .withColumn("key", tracks_df.key.cast(IntegerType())) \
         .withColumn("loudness", tracks_df.loudness.cast(FloatType())) \
         .withColumn("mode", tracks_df.mode.cast(IntegerType())) \
         .withColumn("speechiness", tracks_df.speechiness.cast(FloatType())) \
         .withColumn("acousticness", tracks_df.acousticness.cast(FloatType())) \
         .withColumn("instrumentalness", tracks_df.instrumentalness.cast(FloatType())) \
         .withColumn("liveness", tracks_df.liveness.cast(FloatType())) \
         .withColumn("valence", tracks_df.valence.cast(FloatType())) \
         .withColumn("tempo", tracks_df.tempo.cast(FloatType())) \
         .withColumn("time_signature", tracks_df.time_signature.cast(IntegerType()))  

In [12]:
#null values
tracks_df_1.select([count(when(col(c).isNull(), c)).alias(c) for c in tracks_df_1.columns]).show()
artist_df_1.select([count(when(col(c).isNull(), c)).alias(c) for c in artist_df_1.columns]).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|      1854|        826|     416|      0|        12|        3309|        2286|   859|430|     268| 160|         91|          64|              45|      34|     27|   23|            55|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [13]:
tracks_df_2 = tracks_df_1.filter(col("release_date").isNotNull())
artist_df_2 = artist_df_1.withColumn('popularity', coalesce(artist_df_1['popularity'], lit(0))) \
                         .withColumn('followers', coalesce(artist_df_1['followers'], lit(0)))


In [14]:
#null values
tracks_df_2.select([count(when(col(c).isNull(), c)).alias(c) for c in tracks_df_2.columns]).show()
artist_df_2.select([count(when(col(c).isNull(), c)).alias(c) for c in artist_df_2.columns]).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|         0|          0|       0|      0|         0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [23]:
tracks_df_2.count()

583363

In [24]:
artist_df_2.count()

1104349

# Data Integration

In [15]:
artist_df_3 = artist_df_2.withColumn(
    "genres",
    split(regexp_replace(col("genres"), r"(^\[)|(\]$)|(')", ""), ", ")
)

In [16]:
tracks_df_wk0= tracks_df_2.withColumn(
    "id_artists",
    split(regexp_replace(col("id_artists"), r"(^\[)|(\]$)|(')", ""), ", ")
)
tracks_df_wk0

DataFrame[id: string, name: string, popularity: int, duration_ms: int, explicit: int, artists: string, id_artists: array<string>, release_date: date, danceability: float, energy: float, key: int, loudness: float, mode: int, speechiness: float, acousticness: float, instrumentalness: float, liveness: float, valence: float, tempo: float, time_signature: int]

In [17]:
windowSpec = Window.partitionBy("id_track") 

In [18]:
tracks_df_wk1 = tracks_df_wk0.select(col("id").alias("id_track"), "duration_ms", col("popularity").alias("popularity_track"),"explicit", explode(tracks_df_wk0.id_artists).alias("id_artist"),"release_date","danceability","energy","key","loudness","mode", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature")



tracks_df_wk2 = tracks_df_wk1.join(artist_df_3, tracks_df_wk1.id_artist==artist_df.id,"left") \
           .withColumn("sum_artist_followers",sum(col("followers")).over(windowSpec)) \
           .withColumn("sum_artist_popularity",sum(col("popularity")).over(windowSpec)) \
           .withColumn("collect_list_genres", collect_list("genres").over(windowSpec)) \
           .withColumn("collect_list_genres", flatten(col("collect_list_genres"))) \
           .withColumn("collect_list_genres", array_distinct("collect_list_genres")) \
           .withColumn("genres", array_remove("collect_list_genres", "")) \
           .drop("collect_list_genres") \
           .select("id_track", "popularity_track",  "duration_ms", "genres", "release_date","danceability","energy","key","loudness","mode", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature", "sum_artist_followers", "sum_artist_popularity").distinct()



In [19]:
tracks_df_wk2.select("genres").distinct().show(10, truncate=False)

+----------------------------------------------------------------------------------+
|genres                                                                            |
+----------------------------------------------------------------------------------+
|[mariachi, ranchera]                                                              |
|[chanson, french jazz, french pop]                                                |
|[czech folk, czech rock]                                                          |
|[downtempo, new age]                                                              |
|[colombian rock, latin, latin alternative, latin pop, latin rock, rock en espanol]|
|[peruvian rock, pop peruano, pop reggaeton]                                       |
|[adult standards, vocal jazz]                                                     |
|[irish country, irish folk]                                                       |
|[anime]                                                         

In [20]:
tracks_df_wk2.printSchema()

root
 |-- id_track: string (nullable = true)
 |-- popularity_track: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- genres: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- release_date: date (nullable = true)
 |-- danceability: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- sum_artist_followers: long (nullable = true)
 |-- sum_artist_popularity: long (nullable = true)



In [20]:
tracks_df_wk2.select([count(when(col(c).isNull(), c)).alias(c) for c in tracks_df_wk2.columns]).show()

+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+
|id_track|popularity_track|duration_ms|genres|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|sum_artist_followers|sum_artist_popularity|
+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+
|       0|               0|          0|     0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|               11038|                11038|
+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+-------

In [21]:
df = tracks_df_wk2.withColumn('sum_artist_followers', coalesce(tracks_df_wk2['sum_artist_followers'], lit(0))) \
                  .withColumn('sum_artist_popularity', coalesce(tracks_df_wk2['sum_artist_popularity'], lit(0)))

In [22]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in tracks_df_wk2.columns]).show()

+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+
|id_track|popularity_track|duration_ms|genres|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|sum_artist_followers|sum_artist_popularity|
+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+
|       0|               0|          0|     0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|                   0|                    0|
+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+-------

# Correlation

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

In [29]:
from pyspark.sql import functions as f

### Normalization

In [24]:
columns_to_scale = ["popularity_track",  "duration_ms", "danceability","energy", "loudness", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature", "sum_artist_followers", "sum_artist_popularity"]
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df)
enriched_df = scalerModel.transform(df)

In [25]:

names = {x + "_scaled": x for x in columns_to_scale}
scaledData = enriched_df.select([f.col(c).alias(names[c]) for c in names.keys()])

In [51]:
scaledData.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
|    popularity_track|         duration_ms|        danceability|              energy|            loudness|         speechiness|        acousticness|    instrumentalness|            liveness|             valence|               tempo|      time_signature|sum_artist_followers|sum_artist_popularity|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
|               [0.0]|[0.3012114547246877]|[0.6125126088483968]|[0.48399999737739...|[0.7306350931069846]|[0.

In [52]:
scaledData.printSchema()

root
 |-- popularity_track: vector (nullable = true)
 |-- duration_ms: vector (nullable = true)
 |-- danceability: vector (nullable = true)
 |-- energy: vector (nullable = true)
 |-- loudness: vector (nullable = true)
 |-- speechiness: vector (nullable = true)
 |-- acousticness: vector (nullable = true)
 |-- instrumentalness: vector (nullable = true)
 |-- liveness: vector (nullable = true)
 |-- valence: vector (nullable = true)
 |-- tempo: vector (nullable = true)
 |-- time_signature: vector (nullable = true)
 |-- sum_artist_followers: vector (nullable = true)
 |-- sum_artist_popularity: vector (nullable = true)



CORRELATION

In [33]:
from pyspark.ml.stat import Correlation

In [34]:
import pandas as pd

In [28]:
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=scaledData.columns, outputCol=vector_col)
df_vector = assembler.transform(scaledData).select(vector_col)


matrix = Correlation.corr(df_vector, vector_col)
corrmatrix = matrix.collect()[0]["pearson({})".format(vector_col)].values

In [29]:
pd.DataFrame(corrmatrix.reshape(-1, len(scaledData.columns)), columns=scaledData.columns, index=scaledData.columns)

Unnamed: 0,popularity_track,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,sum_artist_followers,sum_artist_popularity
popularity_track,1.0,0.028081,0.18243,0.29859,0.324094,-0.048209,-0.367301,-0.237153,-0.0482,-0.00105,0.06931,0.08555,0.23849,0.298819
duration_ms,0.028081,1.0,-0.120047,0.025354,0.000329,-0.126162,-0.065286,0.068597,0.002507,-0.163324,-0.000705,0.037696,0.02675,0.067169
danceability,0.18243,-0.120047,1.0,0.235541,0.244946,0.198637,-0.237201,-0.22512,-0.106398,0.524472,-0.044804,0.144939,0.035361,-0.019843
energy,0.29859,0.025354,0.235541,1.0,0.763324,-0.054681,-0.713503,-0.196341,0.125956,0.367742,0.228457,0.188273,0.088351,0.005348
loudness,0.324094,0.000329,0.244946,0.763324,1.0,-0.169364,-0.516493,-0.330341,0.030301,0.269785,0.187479,0.163391,0.116417,-0.021696
speechiness,-0.048209,-0.126162,0.198637,-0.054681,-0.169364,1.0,0.07035,-0.102147,0.20766,0.045526,-0.087689,-0.115037,-0.020422,0.053293
acousticness,-0.367301,-0.065286,-0.237201,-0.713503,-0.516493,0.07035,1.0,0.204924,-0.00549,-0.17523,-0.193245,-0.173144,-0.109548,-0.023411
instrumentalness,-0.237153,0.068597,-0.22512,-0.196341,-0.330341,-0.102147,0.204924,1.0,-0.038537,-0.174835,-0.055352,-0.042454,-0.05027,0.005374
liveness,-0.0482,0.002507,-0.106398,0.125956,0.030301,0.20766,-0.00549,-0.038537,1.0,0.000301,-0.014603,-0.023988,0.001988,0.02834
valence,-0.00105,-0.163324,0.524472,0.367742,0.269785,0.045526,-0.17523,-0.174835,0.000301,1.0,0.132874,0.103954,-0.03362,-0.101226


# Correlation filtered dataframe

In [30]:
df.approxQuantile("popularity_track", [0.25, 0.5, 0.75], 0)

[13.0, 27.0, 41.0]

In [24]:
from pyspark.sql.functions import col
df1=df.where(col('popularity_track')>40)

In [25]:
dates = ("2015-01-01",  "2021-01-01")
df2=df1.where(col('release_date').between(*dates))

In [26]:
df2.count()

43512

In [27]:
columns_to_scale = ["popularity_track",  "duration_ms", "danceability","energy", "loudness", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature", "sum_artist_followers", "sum_artist_popularity"]
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df2)
enriched_df = scalerModel.transform(df2)

In [30]:
names = {x + "_scaled": x for x in columns_to_scale}
scaledData1 = enriched_df.select([f.col(c).alias(names[c]) for c in names.keys()])

In [31]:
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=scaledData1.columns, outputCol=vector_col)
df_vector = assembler.transform(scaledData1).select(vector_col)


matrix = Correlation.corr(df_vector, vector_col)
corrmatrix = matrix.collect()[0]["pearson({})".format(vector_col)].values

In [36]:
pd.DataFrame(corrmatrix.reshape(-1, len(scaledData1.columns)), columns=scaledData1.columns, index=scaledData1.columns)

Unnamed: 0,popularity_track,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,sum_artist_followers,sum_artist_popularity
popularity_track,1.0,-0.063758,0.058898,-0.035807,0.054327,0.022227,-0.025542,-0.022962,-0.014753,-0.030491,-0.002092,-0.006934,0.366034,0.262075
duration_ms,-0.063758,1.0,-0.17481,0.009745,0.058064,-0.118995,0.008331,-0.073686,0.032909,-0.10191,0.010779,0.000298,0.067448,0.169288
danceability,0.058898,-0.17481,1.0,0.160955,0.229642,0.224786,-0.207106,-0.175529,-0.115868,0.417518,-0.058508,0.211214,0.043887,0.120018
energy,-0.035807,0.009745,0.160955,1.0,0.690609,0.076605,-0.611425,-0.176833,0.176583,0.389824,0.148339,0.161743,0.046581,0.10973
loudness,0.054327,0.058064,0.229642,0.690609,1.0,0.005195,-0.451958,-0.470965,0.068536,0.326284,0.147739,0.186049,0.113654,0.133085
speechiness,0.022227,-0.118995,0.224786,0.076605,0.005195,1.0,-0.076903,-0.078186,0.00328,0.069485,0.059352,0.073266,0.010823,0.125766
acousticness,-0.025542,0.008331,-0.207106,-0.611425,-0.451958,-0.076903,1.0,0.112701,-0.070802,-0.183041,-0.102992,-0.12546,-0.074183,-0.103132
instrumentalness,-0.022962,-0.073686,-0.175529,-0.176833,-0.470965,-0.078186,0.112701,1.0,0.012656,-0.17064,-0.070849,-0.138824,-0.072379,-0.101938
liveness,-0.014753,0.032909,-0.115868,0.176583,0.068536,0.00328,-0.070802,0.012656,1.0,0.036381,0.008276,-0.013218,0.020023,0.024843
valence,-0.030491,-0.10191,0.417518,0.389824,0.326284,0.069485,-0.183041,-0.17064,0.036381,1.0,0.082515,0.097848,0.0011,0.045904
