In [1]:
from getpass import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd

# Prompt for database credentials; getpass keeps the password out of the
# notebook source and out of the echoed cell output.
username = input("Username: ")
password = getpass("Password: ")

# Percent-encode both credentials before embedding them in the connection URI.
# A raw '@', ':', '/', '%' or '#' in the password would otherwise be parsed as
# URI syntax, so the connection would fail or point at the wrong host.
# safe="" also encodes '/', and spaces become %20 rather than '+'.
dburi = (
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}'
    '@codd.mines.edu:5433/csci403'
)

# Drop the plain-text copy. NOTE: `dburi` still embeds the (encoded) password,
# so avoid displaying or printing it.
del password

Username: ccrandall
Password: ········


## Questions to answer
- Has the proportion of popular explicit songs increased in the past X years? Does this change by genre? Region?
- Are more popular songs more energetic? faster?
- Compare music profiles between regions

## Worldwide proportion of popular explicit songs
* Get a timeseries of top songs (Billboard?)
* Record the proportion of ranked songs that are explicit 
    * (or, record the proportion of explicit streams/purchases? $\to$ assume small time range)
* weekly granularity? daily?

In [2]:
# Weekly explicit-song counts: Spotify Top Weekly rows matched to song
# attributes on (name, artist), summing the explicit flag and counting the
# matched rows for each week.
weekly_explicit_query = '''
SELECT sp.week, sum(sa.explicit) as explicit_songs, count(*) as total
FROM colinsiles.song_attributes sa, colinsiles.spotify_top_weekly sp
WHERE sa.name = sp.name and sa.artist = sp.artist
GROUP BY sp.week;
                            '''
explicit = pd.read_sql_query(sql=weekly_explicit_query, con=dburi)

# Share of each week's matched chart entries that are flagged explicit.
explicit = explicit.assign(proportion=explicit['explicit_songs'] / explicit['total'])

In [3]:
# Display the per-week explicit count, total count, and derived proportion.
explicit

Unnamed: 0,week,explicit_songs,total,proportion
0,2016-12-30,65,156,0.416667
1,2017-01-06,68,124,0.548387
2,2017-01-13,62,122,0.508197
3,2017-01-20,63,123,0.512195
4,2017-01-27,60,120,0.500000
...,...,...,...,...
127,2019-06-07,47,81,0.580247
128,2019-06-14,47,91,0.516484
129,2019-06-21,46,86,0.534884
130,2019-06-28,44,82,0.536585


In [7]:
import plotly.express as px

# Line chart of the weekly explicit proportion; the title is passed straight
# to the figure constructor.
fig = px.line(
    explicit,
    x='week',
    y="proportion",
    title="Proportion of Explicit Songs in Spotify Global Top 200 (Jan 2017 - July 2019)",
)
fig.show()

This looks moderately cyclical -- the last weeks of the year have the lowest proportion of popular explicit songs (Christmas music!), while the rest of the year enjoys a much higher proportion of explicit songs.

---

## Proportion of popular explicit songs by country
The worldwide proportion of explicit songs shows some seasonal variation, and seasonal listening habits (e.g., holiday music) are not the same everywhere. It is therefore worth checking whether the proportion also differs significantly between countries.

In [8]:
# Same explicit/total counts as the weekly query, but broken down by region
# and at daily rather than weekly granularity.
daily_region_query = '''
SELECT wr.date, wr.region, sum(sa.explicit) as explicit_songs, count(*) as total
FROM colinsiles.song_attributes sa, colinsiles.world_rankings wr
WHERE sa.name = wr.track_name and sa.artist = wr.artist
GROUP BY wr.date, wr.region;
                            '''
df = pd.read_sql_query(sql=daily_region_query, con=dburi)


In [9]:
# Fraction of each (date, region) group's matched songs that are explicit.
df = df.assign(proportion=df['explicit_songs'] / df['total'])

In [10]:
# The region column holds ISO 3166-1 alpha-2 codes; convert them to alpha-3
# codes (and country names) for mapping with plotly express.
# Requires the third-party `iso3166` package (pip install iso3166).

from iso3166 import countries_by_alpha2

# Build the result as one non-mutating chain. Assigning new columns onto a
# boolean-filtered slice (df = df[...]; df['col'] = ...) triggers pandas'
# SettingWithCopyWarning; assign() always returns a fresh frame, and the
# cell stays safe to re-run.
df = (
    df.assign(region=df['region'].str.upper())
      # 'GLOBAL' is an aggregate chart, not a country code, so it is dropped
      # before the per-country lookup below.
      .loc[lambda frame: frame['region'] != 'GLOBAL']
      .assign(
          iso_code=lambda frame: frame['region'].map(
              lambda code: countries_by_alpha2[code].alpha3),
          country_name=lambda frame: frame['region'].map(
              lambda code: countries_by_alpha2[code].name),
      )
)
df



Unnamed: 0,date,region,explicit_songs,total,proportion,iso_code,country_name
0,2017-01-01,AR,7,57,0.122807,ARG,Argentina
1,2017-01-01,AT,15,87,0.172414,AUT,Austria
2,2017-01-01,AU,31,120,0.258333,AUS,Australia
3,2017-01-01,BE,15,100,0.150000,BEL,Belgium
4,2017-01-01,BO,5,32,0.156250,BOL,"Bolivia, Plurinational State of"
...,...,...,...,...,...,...,...
19670,2018-01-09,SV,5,33,0.151515,SLV,El Salvador
19671,2018-01-09,TR,12,37,0.324324,TUR,Turkey
19672,2018-01-09,TW,7,43,0.162791,TWN,"Taiwan, Province of China"
19673,2018-01-09,US,59,111,0.531532,USA,United States of America


In [11]:
# Plotly animation workaround: the animation frame column must contain
# strings rather than date values, and the rows are ordered by date first.
df = (
    df.sort_values(by='date')  # order rows by date
      .assign(timestamp=lambda frame: frame['date'].map(str))  # string copy of the date
)
df


Unnamed: 0,date,region,explicit_songs,total,proportion,iso_code,country_name,timestamp
0,2017-01-01,AR,7,57,0.122807,ARG,Argentina,2017-01-01
30,2017-01-01,IT,10,96,0.104167,ITA,Italy,2017-01-01
31,2017-01-01,JP,10,49,0.204082,JPN,Japan,2017-01-01
32,2017-01-01,LT,3,14,0.214286,LTU,Lithuania,2017-01-01
33,2017-01-01,LU,1,1,1.000000,LUX,Luxembourg,2017-01-01
...,...,...,...,...,...,...,...,...
19642,2018-01-09,GB,28,91,0.307692,GBR,United Kingdom of Great Britain and Northern I...,2018-01-09
19644,2018-01-09,GR,17,38,0.447368,GRC,Greece,2018-01-09
19645,2018-01-09,GT,11,47,0.234043,GTM,Guatemala,2018-01-09
19647,2018-01-09,HN,4,24,0.166667,HND,Honduras,2018-01-09


In [12]:
# Animated world map: one frame per timestamp string, countries located by
# their ISO alpha-3 code and coloured by the explicit proportion.
choropleth_options = dict(
    locations="iso_code",
    color="proportion",  # colour encodes the proportion of explicit songs
    hover_name="country_name",  # shown in the hover tooltip
    color_continuous_scale=px.colors.sequential.Plasma,
    animation_frame='timestamp',  # animate along the timestamp
    range_color=[0, 0.7],  # fixed colour range across all frames
)
fig = px.choropleth(df, **choropleth_options)

# Set the transition duration and the title in a single layout update.
fig.update_layout(
    transition={'duration': 0.001},
    title='Proportion of Daily Popular Explicit Songs by Country on Spotify (2017)',
)

fig.show()

In [24]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Countries (by their ISO 3166 names in `country_name`) that each get a line
# plot of the daily explicit proportion.
selected_countries = [
    'Argentina',
    'Brazil',
    'Colombia',
    'France',
    'Portugal',
    'Turkey',
    'United Kingdom of Great Britain and Northern Ireland',
    'United States of America',
    'Canada',
    'Malaysia',
    'Indonesia',
    'Japan',
]

# Short labels used in plot titles and output file names for the two long names.
short_labels = {
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
}

for full_name in selected_countries:
    country_rows = df[df.country_name == full_name]
    fig = px.line(country_rows, x='timestamp', y='proportion')
    # Any country without a short label keeps its full name.
    country = short_labels.get(full_name, full_name)
    fig.update_layout(title=f'Proportion of Daily Popular Explicit Songs in {country} on Spotify (2017)')
    fig.show()
    fig.write_image(f'{country}_explicit_line.png')

