# Data Understanding

In this section we'll present how we've studied, cleaned, integrated and explored our datasets.

## Importing Libraries 



In [2]:
import matplotlib.pyplot as plt
from collections import OrderedDict
import seaborn as sns

import pandas as pd
import numpy as np

#### Pyspark tools

In [8]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType, StringType
from  pyspark.sql.functions import *

### Starting Pyspark Session

The configuration used runs Spark locally on 4 cores (`local[4]`) and caps the total size of the results that can be collected to the driver (`spark.driver.maxResultSize`) at 8 GB.

In [9]:
# Create (or reuse) a local Spark session running on 4 threads.
# `spark.driver.maxResultSize` bounds the size of results sent back to the driver.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .master("local[4]")
    .config("spark.driver.maxResultSize", "8g")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark


## Data Loading

### Artists and Tracks


In [10]:
# Load the artists CSV, using its first row as the header.
# No schema is given, so every column is read as a string (see printSchema below).
artist_df = spark.read.csv("../data/artists.csv", header=True)
print(artist_df.count())
artist_df.printSchema()
artist_df.show(10)

1104349
root
 |-- id: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: string (nullable = true)

+--------------------+---------+------+--------------------+----------+
|                  id|followers|genres|                name|popularity|
+--------------------+---------+------+--------------------+----------+
|0DheY5irMjBUeLybb...|      0.0|    []|Armid & Amir Zare...|         0|
|0DlhY15l3wsrnlfGi...|      5.0|    []|         ปูนา ภาวิณี|         0|
|0DmRESX2JknGPQyO1...|      0.0|    []|               Sadaa|         0|
|0DmhnbHjm1qw6NCYP...|      0.0|    []|           Tra'gruda|         0|
|0Dn11fWM7vHQ3rinv...|      2.0|    []|Ioannis Panoutsop...|         0|
|0DotfDlYMGqkbzfBh...|      7.0|    []|       Astral Affect|         0|
|0DqP3bOCiC48L8SM9...|      1.0|    []|           Yung Seed|         0|
|0Drs3maQb99iRglyT...|      0.0|    []|               Wi'Ma|         0|
|0D

In [11]:
# Load the tracks CSV, using its first row as the header.
# NOTE(review): the later cast step turns many values into nulls, which may mean
# some quoted fields end up in the wrong columns -- consider checking the
# `quote` / `escape` / `multiLine` reader options. TODO confirm.
tracks_df = spark.read.csv("../data/tracks.csv", header=True)
tracks_df.show()
tracks_df.printSchema()
tracks_df.count()

+--------------------+--------------------+----------+-----------+--------+-------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|                  id|                name|popularity|duration_ms|explicit|            artists|          id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|
+--------------------+--------------------+----------+-----------+--------+-------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|35iwgR4jXetI318WE...|               Carve|         6|     126903|       0|            ['Uli']|['45tIt06XoI0Iio4...|  1922-02-22|       0.645| 0.445|  0| -13.338|   1|      0.451|       0.674|           0.744|   0.151|  0.127|104.851|             3|


586672

## Data Cleaning
Both datasets have been cleaned of null values: tracks records containing nulls were deleted, since they are few compared to the total number of records, while missing `followers` and `popularity` values in the artists dataset were replaced with 0. We also observed that the tracks dataset contained unexpected values in the "Explicit" attribute for some records.

In [12]:

# Per-column count of NaN entries, first for tracks and then for artists.
for frame in (tracks_df, artist_df):
    nan_counts = [count(when(isnan(name), name)).alias(name) for name in frame.columns]
    frame.select(nan_counts).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|   0|         0|          0|       0|      0|         0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [13]:
# Per-column count of null entries in the raw data, first tracks and then artists.
for frame in (tracks_df, artist_df):
    null_counts = [count(when(col(name).isNull(), name)).alias(name) for name in frame.columns]
    frame.select(null_counts).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|         0|          0|       0|      0|        12|          12|          12|    12| 12|      12|  12|         12|          12|              12|      12|     12|   12|            12|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

### Casting the Columns Types

In [14]:
# Cast the numeric artist attributes from string to integer.
for numeric_column in ("followers", "popularity"):
    artist_df = artist_df.withColumn(numeric_column, col(numeric_column).cast(IntegerType()))

In [15]:
# Target type for every tracks column that is not kept as a string.
tracks_column_types = {
    "duration_ms": IntegerType(),
    "popularity": IntegerType(),
    "explicit": IntegerType(),
    "release_date": DateType(),
    "danceability": FloatType(),
    "energy": FloatType(),
    "key": IntegerType(),
    "loudness": FloatType(),
    "mode": IntegerType(),
    "speechiness": FloatType(),
    "acousticness": FloatType(),
    "instrumentalness": FloatType(),
    "liveness": FloatType(),
    "valence": FloatType(),
    "tempo": FloatType(),
    "time_signature": IntegerType(),
}

# Replace each column in place with its cast version; column order is unchanged.
for column_name, target_type in tracks_column_types.items():
    tracks_df = tracks_df.withColumn(column_name, col(column_name).cast(target_type))

After the cast, the number of null values increased because some values could not be converted to the target type.

In [16]:

# Recount nulls per column after the cast, first tracks and then artists.
for frame in (tracks_df, artist_df):
    null_counts = [count(when(col(name).isNull(), name)).alias(name) for name in frame.columns]
    frame.select(null_counts).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|      1854|        826|     416|      0|        12|        3309|        2286|   859|430|     268| 160|         91|          64|              45|      34|     27|   23|            55|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [17]:
# Keep only the tracks that still have a release date after the cast.
tracks_df = tracks_df.where(col("release_date").isNotNull())

# For artists, replace missing popularity / followers with 0 instead of dropping rows.
artist_df = (
    artist_df
    .withColumn('popularity', coalesce(col('popularity'), lit(0)))
    .withColumn('followers', coalesce(col('followers'), lit(0)))
)


In [18]:
# Recount nulls per column after the cleaning step, first tracks and then artists.
for frame in (tracks_df, artist_df):
    null_counts = [count(when(col(name).isNull(), name)).alias(name) for name in frame.columns]
    frame.select(null_counts).show()

+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
| id|name|popularity|duration_ms|explicit|artists|id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+
|  0|  71|         0|          0|       0|      0|         0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|
+---+----+----------+-----------+--------+-------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+

+---+---------+------+----+-------

In [19]:
import datetime

# Track age in approximate years: days since release divided by 365.
# The result depends on the day the notebook is executed (current_date()).
days_since_release = datediff(current_date(), col("release_date"))
tracks_df = tracks_df.withColumn('age', days_since_release / 365)

In [20]:
# List the distinct values of `explicit` (up to 500 rows) to check for unexpected entries.
tracks_df.select("explicit").distinct().show(500)

+--------+
|explicit|
+--------+
|       1|
|       0|
+--------+



## Data Integration

In [49]:
# `genres` is a list-like string: strip the enclosing brackets and the single
# quotes, then split on ", " to get an array of genre names.
genres_as_text = regexp_replace(col("genres"), r"(^\[)|(\]$)|(')", "")
artist_df = artist_df.withColumn("genres", split(genres_as_text, ", "))

In [50]:
# `id_artists` is a list-like string: strip the enclosing brackets and the single
# quotes, then split on ", " to get an array of artist ids.
id_artists_as_text = regexp_replace(col("id_artists"), r"(^\[)|(\]$)|(')", "")
tracks_df_wk0 = tracks_df.withColumn("id_artists", split(id_artists_as_text, ", "))

# Show the resulting DataFrame signature as the cell output.
tracks_df_wk0

DataFrame[id: string, name: string, popularity: int, duration_ms: int, explicit: int, artists: string, id_artists: array<string>, release_date: date, danceability: float, energy: float, key: int, loudness: float, mode: int, speechiness: float, acousticness: float, instrumentalness: float, liveness: float, valence: float, tempo: float, time_signature: int, age: double]

In [51]:
# Window grouping all rows that share the same track id; used for the per-track artist aggregates.
windowSpec = Window.partitionBy("id_track") 

In [53]:
# Explode the artist-id array so each track yields one row per credited artist.
# The track-level `id` and `popularity` are renamed so they do not clash with
# the artist columns of the same name after the join.
tracks_df_wk1 = tracks_df_wk0.select(
    col("id").alias("id_track"),
    "duration_ms",
    col("popularity").alias("popularity_track"),
    "explicit",
    explode(tracks_df_wk0.id_artists).alias("id_artist"),
    "release_date", "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
    "tempo", "time_signature", "age",
)

# Attach artist attributes and aggregate them per track over `windowSpec`.
# `sum`, `avg`, `collect_list`, ... are the PySpark column functions brought in
# by `from pyspark.sql.functions import *`; no `F` alias exists in this
# notebook, so the aggregates are called unqualified.
tracks_df_wk2 = (
    tracks_df_wk1
    .join(artist_df, tracks_df_wk1.id_artist == artist_df.id, "left")
    # Rows with no matching artist carry null artist columns after the left join.
    .filter(col("popularity").isNotNull())
    .filter(col("followers").isNotNull())
    .withColumn("sum_artist_followers", sum(col("followers")).over(windowSpec))
    .withColumn("sum_artist_popularity", sum(col("popularity")).over(windowSpec))
    .withColumn("avg_artist_followers", avg(col("followers")).over(windowSpec))
    .withColumn("avg_artist_popularity", avg(col("popularity")).over(windowSpec))
    # Merge the genre arrays of all the track's artists into one de-duplicated
    # array, dropping the empty string left behind by artists with no genres.
    .withColumn("collect_list_genres", collect_list("genres").over(windowSpec))
    .withColumn("collect_list_genres", flatten(col("collect_list_genres")))
    .withColumn("collect_list_genres", array_distinct("collect_list_genres"))
    .withColumn("genres", array_remove("collect_list_genres", ""))
    .drop("collect_list_genres")
    # Keep track-level columns only; distinct() collapses the per-artist rows
    # into one row per track.
    .select(
        "id_track", "popularity_track", "duration_ms", "genres", "release_date",
        "danceability", "energy", "key", "loudness", "mode", "speechiness",
        "acousticness", "instrumentalness", "liveness", "valence", "tempo",
        "time_signature", "sum_artist_followers", "sum_artist_popularity",
        "avg_artist_followers", "avg_artist_popularity", "age",
    )
    .distinct()
)



In [54]:
# Preview 10 distinct genre arrays without truncating the cell text.
tracks_df_wk2.select("genres").distinct().show(10, truncate=False)

+----------------------------------------------------------------------------------+
|genres                                                                            |
+----------------------------------------------------------------------------------+
|[mariachi, ranchera]                                                              |
|[chanson, french jazz, french pop]                                                |
|[czech folk, czech rock]                                                          |
|[downtempo, new age]                                                              |
|[colombian rock, latin, latin alternative, latin pop, latin rock, rock en espanol]|
|[peruvian rock, pop peruano, pop reggaeton]                                       |
|[adult standards, vocal jazz]                                                     |
|[irish country, irish folk]                                                       |
|[anime]                                                         

In [55]:
# Inspect the column types of the integrated dataset.
tracks_df_wk2.printSchema()

root
 |-- id_track: string (nullable = true)
 |-- popularity_track: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- genres: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- release_date: date (nullable = true)
 |-- danceability: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- sum_artist_followers: long (nullable = true)
 |-- sum_artist_popularity: long (nullable = true)
 |-- avg_artist_followers: double (nullable = true)
 |-- avg_artist_popularity: double (nullable = true)
 |-- age: double (nullable = true)



In [56]:
# Per-column count of null entries in the integrated dataset.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in tracks_df_wk2.columns
]
tracks_df_wk2.select(null_counts).show()

+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+--------------------+---------------------+---+
|id_track|popularity_track|duration_ms|genres|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|sum_artist_followers|sum_artist_popularity|avg_artist_followers|avg_artist_popularity|age|
+--------+----------------+-----------+------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+---------------------+--------------------+---------------------+---+
|       0|               0|          0|     0|           0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|                   

In [57]:
# Shorter alias for the integrated dataset, used by the saving and correlation cells below.
df = tracks_df_wk2

# dataset saving

In [58]:
# Persist the integrated dataset as Parquet, replacing any previous export.
df.write.parquet('../data/cleanedDataset_parquet/', mode="overwrite")

In [59]:
# Keep only the tracks with a strictly positive popularity.
has_popularity = col('popularity_track') > 0
df_filtered = df.where(has_popularity)

In [60]:
# Persist the popularity-filtered dataset as Parquet, replacing any previous export.
df_filtered.write.parquet('../data/cleanedDatasetFiltered_parquet/', mode="overwrite")

# Correlation

In [61]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

In [62]:
from pyspark.sql import functions as f

### Normalization

In [63]:
# Numeric columns to rescale before computing correlations.
columns_to_scale = [
    "popularity_track", "duration_ms", "danceability", "energy", "loudness",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
    "tempo", "time_signature", "avg_artist_followers", "avg_artist_popularity",
    "sum_artist_followers", "sum_artist_popularity", "age",
]

# Each column is first assembled into its own single-column vector
# (`<name>_vec`), then min-max scaled into `<name>_scaled`.
assemblers = [
    VectorAssembler(inputCols=[name], outputCol=name + "_vec")
    for name in columns_to_scale
]
scalers = [
    MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled")
    for name in columns_to_scale
]

# Run all assemblers, then all scalers, fitted and applied on the full dataset.
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df)
enriched_df = scalerModel.transform(df)

In [64]:

# Map every scaled column name back to its original name ...
names = {original + "_scaled": original for original in columns_to_scale}

# ... and keep only the scaled columns, exposed under the original names.
scaledData = enriched_df.select(
    [f.col(scaled).alias(original) for scaled, original in names.items()]
)

In [65]:
# Preview the scaled columns.
scaledData.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+
|    popularity_track|         duration_ms|        danceability|              energy|            loudness|         speechiness|        acousticness|    instrumentalness|            liveness|             valence|               tempo|      time_signature|avg_artist_followers|avg_artist_popularity|sum_artist_followers|sum_artist_popularity|                 age|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [66]:
# Inspect the column types of the scaled dataset.
scaledData.printSchema()

root
 |-- popularity_track: vector (nullable = true)
 |-- duration_ms: vector (nullable = true)
 |-- danceability: vector (nullable = true)
 |-- energy: vector (nullable = true)
 |-- loudness: vector (nullable = true)
 |-- speechiness: vector (nullable = true)
 |-- acousticness: vector (nullable = true)
 |-- instrumentalness: vector (nullable = true)
 |-- liveness: vector (nullable = true)
 |-- valence: vector (nullable = true)
 |-- tempo: vector (nullable = true)
 |-- time_signature: vector (nullable = true)
 |-- avg_artist_followers: vector (nullable = true)
 |-- avg_artist_popularity: vector (nullable = true)
 |-- sum_artist_followers: vector (nullable = true)
 |-- sum_artist_popularity: vector (nullable = true)
 |-- age: vector (nullable = true)



CORRELATION

In [67]:
from pyspark.ml.stat import Correlation

In [68]:
import pandas as pd

In [69]:
# Assemble all scaled columns into a single feature vector column.
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=scaledData.columns, outputCol=vector_col)
df_vector = assembler.transform(scaledData).select(vector_col)

# Correlation.corr returns a one-row DataFrame; the matrix sits in the column
# named "pearson(<vector_col>)". `.values` is its flat array of entries.
matrix = Correlation.corr(df_vector, vector_col)
pearson_column = "pearson({})".format(vector_col)
first_row = matrix.collect()[0]
corrmatrix = first_row[pearson_column].values

In [70]:
# Reshape the flat correlation values into a square table labelled by feature name.
feature_names = scaledData.columns
pd.DataFrame(
    corrmatrix.reshape(-1, len(feature_names)),
    columns=feature_names,
    index=feature_names,
)

Unnamed: 0,popularity_track,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,avg_artist_followers,avg_artist_popularity,sum_artist_followers,sum_artist_popularity,age
popularity_track,1.0,0.036382,0.187216,0.308021,0.332595,-0.050129,-0.379428,-0.23668,-0.049918,-0.00315,0.071875,0.089161,0.239132,0.560317,0.238146,0.289265,-0.60958
duration_ms,0.036382,1.0,-0.125574,0.023709,-0.000336,-0.135599,-0.065325,0.070266,0.001978,-0.16748,-0.000753,0.040369,0.019331,0.006293,0.028669,0.074758,-0.056375
danceability,0.187216,-0.125574,1.0,0.235468,0.24443,0.199716,-0.235509,-0.231624,-0.10523,0.526467,-0.04831,0.141694,0.021801,0.039417,0.0363,-0.017918,-0.22418
energy,0.308021,0.023709,0.235468,1.0,0.76476,-0.054952,-0.714489,-0.201543,0.125767,0.369151,0.227515,0.187793,0.094059,0.16846,0.089817,0.007919,-0.462166
loudness,0.332595,-0.000336,0.24443,0.76476,1.0,-0.170993,-0.518274,-0.331347,0.029592,0.268675,0.186477,0.162221,0.120709,0.144123,0.117405,-0.024699,-0.454457
speechiness,-0.050129,-0.135599,0.199716,-0.054952,-0.170993,1.0,0.070266,-0.101961,0.208763,0.045535,-0.089222,-0.116968,-0.02931,0.052484,-0.020968,0.052302,0.082275
acousticness,-0.379428,-0.065325,-0.235509,-0.714489,-0.518274,0.070266,1.0,0.215057,-0.006041,-0.176285,-0.192577,-0.173623,-0.119497,-0.219779,-0.11197,-0.030148,0.526501
instrumentalness,-0.23668,0.070266,-0.231624,-0.201543,-0.331347,-0.101961,0.215057,1.0,-0.037321,-0.170356,-0.056607,-0.042552,-0.055176,-0.124316,-0.048134,0.023272,0.244716
liveness,-0.049918,0.001978,-0.10523,0.125767,0.029592,0.208763,-0.006041,-0.037321,1.0,0.000399,-0.014151,-0.023823,0.006899,0.05042,0.001481,0.026381,0.018199
valence,-0.00315,-0.16748,0.526467,0.369151,0.268675,0.045535,-0.176285,-0.170356,0.000399,1.0,0.131939,0.102432,-0.035995,-0.056572,-0.034816,-0.10846,0.026984


# Correlation and Normalization Filtered dataset

In [71]:
# Numeric columns to rescale before computing correlations.
columns_to_scale = [
    "popularity_track", "duration_ms", "danceability", "energy", "loudness",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
    "tempo", "time_signature", "avg_artist_followers", "avg_artist_popularity",
    "sum_artist_followers", "sum_artist_popularity", "age",
]

# Each column is first assembled into its own single-column vector
# (`<name>_vec`), then min-max scaled into `<name>_scaled`.
assemblers = [
    VectorAssembler(inputCols=[name], outputCol=name + "_vec")
    for name in columns_to_scale
]
scalers = [
    MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled")
    for name in columns_to_scale
]

# Same pipeline as for the full dataset, fitted and applied on the filtered dataset.
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df_filtered)
enriched_df = scalerModel.transform(df_filtered)

# Keep only the scaled columns, exposed under their original names.
names = {original + "_scaled": original for original in columns_to_scale}
scaledData = enriched_df.select(
    [f.col(scaled).alias(original) for scaled, original in names.items()]
)



In [72]:
# Preview the scaled columns of the filtered dataset.
scaledData.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+
|    popularity_track|         duration_ms|        danceability|              energy|            loudness|         speechiness|        acousticness|    instrumentalness|            liveness|             valence|               tempo|      time_signature|avg_artist_followers|avg_artist_popularity|sum_artist_followers|sum_artist_popularity|                 age|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [73]:
# Assemble all scaled columns of the filtered dataset into a single feature vector column.
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=scaledData.columns, outputCol=vector_col)
df_vector = assembler.transform(scaledData).select(vector_col)

# Correlation.corr returns a one-row DataFrame; the matrix sits in the column
# named "pearson(<vector_col>)". `.values` is its flat array of entries.
matrix = Correlation.corr(df_vector, vector_col)
pearson_column = "pearson({})".format(vector_col)
first_row = matrix.collect()[0]
corrmatrix = first_row[pearson_column].values

In [74]:
# Reshape the flat correlation values into a square table labelled by feature name.
feature_names = scaledData.columns
pd.DataFrame(
    corrmatrix.reshape(-1, len(feature_names)),
    columns=feature_names,
    index=feature_names,
)

Unnamed: 0,popularity_track,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,avg_artist_followers,avg_artist_popularity,sum_artist_followers,sum_artist_popularity,age
popularity_track,1.0,0.033927,0.183881,0.250134,0.305949,-0.022462,-0.307831,-0.16583,-0.055666,-0.016078,0.051712,0.076138,0.250309,0.513139,0.252914,0.275534,-0.557065
duration_ms,0.033927,1.0,-0.123365,0.017221,-0.002648,-0.149233,-0.058292,0.0716,0.000515,-0.157342,0.001088,0.040379,0.020172,-0.012053,0.028716,0.057902,-0.03923
danceability,0.183881,-0.123365,1.0,0.225898,0.235053,0.190714,-0.227859,-0.230043,-0.10663,0.519523,-0.068337,0.140885,0.014272,0.032054,0.031979,2.4e-05,-0.237185
energy,0.250134,0.017221,0.225898,1.0,0.763668,-0.033219,-0.700022,-0.169866,0.129641,0.381403,0.215175,0.185575,0.082891,0.110304,0.081107,-0.010588,-0.409324
loudness,0.305949,-0.002648,0.235053,0.763668,1.0,-0.143297,-0.518267,-0.325237,0.028035,0.263791,0.17363,0.155545,0.115361,0.111662,0.114957,-0.028688,-0.449045
speechiness,-0.022462,-0.149233,0.190714,-0.033219,-0.143297,1.0,0.067606,-0.09875,0.224581,0.038608,-0.089718,-0.113521,-0.025477,0.109719,-0.016189,0.090585,0.036217
acousticness,-0.307831,-0.058292,-0.227859,-0.700022,-0.518267,0.067606,1.0,0.164367,-0.006411,-0.192153,-0.18297,-0.174568,-0.106299,-0.14226,-0.101054,0.001825,0.447438
instrumentalness,-0.16583,0.0716,-0.230043,-0.169866,-0.325237,-0.09875,0.164367,1.0,-0.033033,-0.177621,-0.043464,-0.038565,-0.044391,-0.059807,-0.039885,0.050047,0.178743
liveness,-0.055666,0.000515,-0.10663,0.129641,0.028035,0.224581,-0.006411,-0.033033,1.0,-0.000846,-0.014522,-0.024734,0.009899,0.0617,0.004407,0.030925,0.023298
valence,-0.016078,-0.157342,0.519523,0.381403,0.263791,0.038608,-0.192153,-0.177621,-0.000846,1.0,0.124335,0.103256,-0.040722,-0.060194,-0.037706,-0.093217,0.032377
