In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

import os
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import plotly.graph_objs as go
import chart_studio.plotly as py
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [3]:
df = pd.read_csv('spotify_data_cleaned.zip')

In [4]:
df.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,duration_min_secs,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,E,-10.058,Major,0.0429,0.694,0.0,0.115,0.139,133.406,240166,4:00,3.0
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,D#,-10.286,Major,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,3:36,4.0
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,D#,-13.711,Major,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,2:38,4.0
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,A#,-9.845,Major,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,5:04,4.0
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,F#,-5.419,Minor,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4:04,4.0


In [5]:
def clean_outliers(df_aux, columns: list)->pd.DataFrame:
    """
    Función que elimina los valores atípicos de un dataframe en base a los cuartiles.
    
    Args: df_aux: DataFrame a limpiar.
        columns: Lista de columnas a limpiar.
    Returns: DataFrame sin valores atípicos
    """
    for column in columns:
        Q1 = df_aux[column].quantile(0.25)
        Q3 = df_aux[column].quantile(0.75)
        IQR = Q3 - Q1
        df_aux = df_aux[(df_aux[column] >= Q1-1.5*IQR) & (df_aux[column] <= Q3 + 1.5*IQR)]
    return df_aux

In [None]:
fig, ax = plt.subplots(5, 2)
ax[0, 0].hist(df['popularity'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[0,0].title.set_text('Popularity')
ax[0, 1].hist(df['energy'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[0,1].title.set_text('Energy')
ax[1, 0].hist(df['danceability'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[1,0].title.set_text('Danceability')
ax[1, 1].hist(df['loudness'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[1,1].title.set_text('Loudness')
ax[2, 0].hist(df['speechiness'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[2,0].title.set_text('Speachiness')
ax[2, 1].hist(df['acousticness'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[2,1].title.set_text('Acousticness')
ax[3, 0].hist(df['instrumentalness'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[3,0].title.set_text('Instrumentalness')
ax[3, 1].hist(df['liveness'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[3,1].title.set_text('Liveness')
ax[4, 0].hist(df['valence'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[4,0].title.set_text('Valence')
ax[4, 1].hist(df['tempo'], bins=20, color="skyblue", edgecolor='black', linewidth=0.8)
ax[4,1].title.set_text('Tempo')
fig.set_size_inches(10, 6)
fig.tight_layout()

In [None]:
fig = px.histogram(y=df['popularity'], x=df['genre'], histfunc="avg"
            , animation_frame=df["year"], title="Popularity by Genre and Year"
            , labels={"y": "Popularidad", "x": "Género", "animation_frame": "Año"}
            , category_orders={"animation_frame": [2000, 2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023]}
            )
fig.update_layout(xaxis_tickfont_size=11)
fig.update_xaxes(categoryorder="total ascending", tickangle=-35, title_standoff=0)
fig.show()

In [6]:
top=30
df_aux = df.sort_values(by='popularity', ascending=False).head(top)

In [None]:
px.area(df_aux, x='track_name', y='danceability', title=f'Top {top} canciones con mayor popularidad y su danceability'
        , hover_data=["artist_name", "popularity"], labels={"danceability": "Danceability", "track_name": "Canción", "artist_name": "Artista", "popularity": "Popularidad"}
        , markers=True)

In [10]:
df_aux = clean_outliers(df, ['tempo'])

In [None]:
px.histogram(df_aux, x='tempo', y='danceability', title=f'Tempo vs Danceability'
        , hover_data=["artist_name", "popularity"]
        , labels={"danceability": "Danceability", "track_name": "Canción", "artist_name": "Artista", "popularity": "Popularidad"}
        , histfunc="avg")

In [None]:
ax = sns.histplot(df, x="tempo", y="danceability", bins=30
                , cbar=True
                , cbar_kws=dict(shrink=.75))
ax.add_line(plt.axvline(df['tempo'].mean(), color='red', linestyle='--', label='Media', alpha=0.2))
ax.xaxis.set_label_text("Tempo")
ax.yaxis.set_label_text("Danceability");

In [None]:
px.histogram(df, x='popularity', y="energy", title='Media de la energía en base a la popularidad según el año'
        , labels={"popularity": "Popularidad", "energy": "Energía", "year": "Año"}
        , animation_frame="year"
        , category_orders={"year": [2000, 2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023]}
        , range_y=[0, 1]
        , range_x=[0, 100]
        , histfunc="avg")

In [None]:
px.histogram(df, x='popularity', y="valence", title='Media de la positividad en base a la popularidad según el año'
        , labels={"popularity": "Popularidad", "valence": "Positividad", "year": "Año"}
        , animation_frame="year"
        , category_orders={"year": [2000, 2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023]}
        , range_y=[0, 1]
        , range_x=[0, 100]
        , histfunc="avg")

In [None]:
px.histogram(df, x='mode', y="popularity", title='Media de la popularidad en base al modo'
            , labels={"popularity": "Popularidad", "mode": "Modo"}
            , histfunc="avg")

In [None]:
px.histogram(df, x='key', y="popularity", title='Media de la popularidad en base a la escala de la canción'
            , labels={"popularity": "Popularidad", "key": "Escala"}
            , color="key"
            , histfunc="avg"
            , category_orders={"key": ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]})

In [None]:
df_aux = df.sort_values(by='popularity', ascending=False).head(top)
fig = px.parallel_categories(df_aux, dimensions=['genre', 'key', 'mode', 'popularity'], color="popularity", color_continuous_scale=px.colors.sequential.Agsunset
                            , title=f'Top {top} canciones y su camino hacia la popularidad')