In [1]:
import findspark
findspark.init()  # called ahead of the pyspark import that follows

from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that already exists.
# 'local[*]' asks Spark to run locally; the app name labels the session.
session_spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Recomendador PySpark')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
session_spark

In [2]:
from pyspark import SparkFiles

# Remote location of the CSV that is loaded into `data` further down.
url_data = 'https://github.com/IgorNascAlves/dados/blob/main/dados_musicas.csv?raw=true'

# Register the remote file with the Spark context, then ask SparkFiles
# where the copy named 'dados_musicas.csv' lives and build a file URI from it.
spark_context = session_spark.sparkContext
spark_context.addFile(url_data)
path_data_file = 'file:///' + SparkFiles.get('dados_musicas.csv')

In [3]:
# Read the ';'-separated CSV: first line is the header, column types are inferred.
data = session_spark.read.csv(
    path_data_file,
    header=True,
    sep=';',
    inferSchema=True,
)

In [4]:
# Print the leading rows of `data` as a text table.
data.show()

+------------------+----+------------+--------------+------------------+-----------+------------------+--------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+------------------+--------------------+
|           valence|year|acousticness|       artists|      danceability|duration_ms|            energy|explicit|                  id|    instrumentalness|key|liveness|           loudness|mode|                name|popularity|speechiness|             tempo|        artists_song|
+------------------+----+------------+--------------+------------------+-----------+------------------+--------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+------------------+--------------------+
|             0.285|2000|     0.00239|      Coldplay|             0.429|     266773|0.6609999999999999|       0|3AJwUDP919kvQ9Qco...|             1.21E-4| 11|   0.234|  

In [5]:
# Number of rows in `data`.
data.count()

20311

In [6]:
# Number of columns in `data`.
len(data.columns)

19

In [7]:
import pyspark.sql.functions as f

In [8]:
# Build a one-row projection holding, per column, how many values are null.
# Nothing is computed here: without an action only the DataFrame repr is shown.
data.select(
    *(
        f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in data.columns
    )
)

DataFrame[valence: bigint, year: bigint, acousticness: bigint, artists: bigint, danceability: bigint, duration_ms: bigint, energy: bigint, explicit: bigint, id: bigint, instrumentalness: bigint, key: bigint, liveness: bigint, loudness: bigint, mode: bigint, name: bigint, popularity: bigint, speechiness: bigint, tempo: bigint, artists_song: bigint]

In [9]:
# Count the null values of every column and print the single resulting row.
(
    data
    .select(
        *(
            f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
            for column_name in data.columns
        )
    )
    .show()
)

+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+
|valence|year|acousticness|artists|danceability|duration_ms|energy|explicit| id|instrumentalness|key|liveness|loudness|mode|name|popularity|speechiness|tempo|artists_song|
+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+
|      0|   0|           0|      0|           0|          0|     0|       0|  0|               0|  0|       0|       0|   0|   0|         0|          0|    0|           0|
+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+



In [10]:
# Bring the distinct values of the `year` column to the driver as a list of Rows
# (the order of the returned rows is not sorted).
data.select('year').distinct().collect()

[Row(year=2003),
 Row(year=2007),
 Row(year=2018),
 Row(year=2015),
 Row(year=2006),
 Row(year=2013),
 Row(year=2014),
 Row(year=2019),
 Row(year=2004),
 Row(year=2020),
 Row(year=2012),
 Row(year=2009),
 Row(year=2016),
 Row(year=2001),
 Row(year=2005),
 Row(year=2000),
 Row(year=2010),
 Row(year=2011),
 Row(year=2008),
 Row(year=2017),
 Row(year=2002)]

In [11]:
# Same distinct years as above, sorted on the driver and printed on one line.
print(
    sorted(
        data
        .select('year')
        .distinct()
        .collect()
    )
)

[Row(year=2000), Row(year=2001), Row(year=2002), Row(year=2003), Row(year=2004), Row(year=2005), Row(year=2006), Row(year=2007), Row(year=2008), Row(year=2009), Row(year=2010), Row(year=2011), Row(year=2012), Row(year=2013), Row(year=2014), Row(year=2015), Row(year=2016), Row(year=2017), Row(year=2018), Row(year=2019), Row(year=2020)]


In [12]:
# Remote location of the CSV that is loaded into `data_year` further down.
url_year_data = 'https://github.com/IgorNascAlves/dados/blob/main/dados_musicas_ano.csv?raw=true'

# Register the remote file with the Spark context and resolve its local path.
spark_context = session_spark.sparkContext
spark_context.addFile(url_year_data)
# NOTE(review): this overwrites the `path_data_file` used to load `data`;
# re-running that earlier read cell after this one would load this file instead.
path_data_file = 'file:///' + SparkFiles.get('dados_musicas_ano.csv')

In [13]:
# Read the ','-separated CSV: first line is the header, column types are inferred.
data_year = session_spark.read.csv(
    path_data_file,
    header=True,
    sep=',',
    inferSchema=True,
)
# Print the leading rows as a text table.
data_year.show()

+----+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+---+
|mode|year|      acousticness|       danceability|       duration_ms|             energy|   instrumentalness|           liveness|           loudness|        speechiness|             tempo|            valence|         popularity|key|
+----+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+---+
|   1|1921|0.8868960000000005| 0.4185973333333336|260537.16666666663|0.23181513333333334|0.34487805886666656|            0.20571| -17.04866666666665|           0.073662|101.53149333333329|0.37932666666666665| 0.6533333333333333|  2|
|   1|1922|0.9385915492957748| 0.4820422535211267|165469.74647887325

In [14]:
# Keep only the rows whose year is 2000 or later (where() is an alias of filter()).
data_year = data_year.where('year >= 2000')
# Print the leading rows of the filtered frame.
data_year.show()

+----+----+-------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+---+
|mode|year|       acousticness|      danceability|       duration_ms|            energy|   instrumentalness|           liveness|           loudness|        speechiness|             tempo|           valence|        popularity|key|
+----+----+-------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+---+
|   1|2000|0.28932270051635994| 0.590918047034764| 242724.6426380368|0.6254128323108387|0.10116776879345596| 0.1976860429447853| -8.247765848670758|0.08920541922290394| 118.9993231083843|0.5594754601226991|  46.6840490797546|  7|
|   1|2001| 0.2868424748428934|0.5833178553615969|240307.79600997505|0.626985522

In [15]:
# Number of rows left in `data_year` after the year filter.
data_year.count()

21

In [16]:
# Number of columns in `data_year`.
len(data_year.columns)

14

In [21]:
import plotly.express as px

# Line chart of `loudness` against `year`, with a marker drawn on each point.
# The Spark DataFrame is converted to pandas so plotly can consume it.
fig = px.line(
    data_year.toPandas(),
    x='year',
    y='loudness',
    markers=True,
    title='Variação do loudness conforme os anos',
)
fig.show()

In [47]:
import plotly.graph_objects as go

# Convert the Spark DataFrame to pandas so its columns can be handed to plotly.
temp = data_year.toPandas()

# Figure with a single trace: acousticness per year.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=temp['year'],
        y=temp['acousticness'],
        name='Acousticness',
    )
)
fig.show()

In [48]:
import plotly.graph_objects as go

# (column of the yearly data, legend label) for each trace, in drawing order.
trace_specs = (
    ('acousticness', 'Acousticness'),
    ('valence', 'Valence'),
    ('danceability', 'Danceability'),
    ('energy', 'Energy'),
    ('instrumentalness', 'Instrumentalness'),
    ('liveness', 'Liveness'),
    ('speechiness', 'Speechiness'),
)

# Convert the Spark DataFrame to pandas so its columns can be handed to plotly.
temp = data_year.toPandas()

# One trace per feature, all sharing the year on the x axis.
fig = go.Figure()
for column_name, legend_label in trace_specs:
    fig.add_trace(
        go.Scatter(x=temp['year'], y=temp[column_name], name=legend_label)
    )

fig.show()

In [52]:
# Heatmap of the pairwise column correlations of the yearly data;
# text_auto=True writes each value inside its cell.
fig = px.imshow(
    data_year.toPandas().corr(),
    text_auto=True,
)
fig.show()

In [54]:
# Same correlation heatmap, with the `mode` column dropped before converting to pandas.
fig = px.imshow(
    data_year.drop('mode').toPandas().corr(),
    text_auto=True,
)
fig.show()