In [5]:
import warnings
import pandas as pd
import numpy as np
import datetime
from datetime import datetime, timedelta
from dateutil import tz
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:99.5% !important;} </style>"))
%config Completer.use_jedi = False

In [29]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import (ColumnDataSource, Label, Range1d, 
                          BasicTickFormatter, LabelSet, DatetimeTickFormatter, 
                          DataRange1d, Span, ColorBar, Title)
from bokeh.models.tools import HoverTool
from bokeh.layouts import layout, column, row
from bokeh.io import save
plot_width = 700
plot_height = 500

In [7]:
import matplotlib as mpl
rcparams = {'font.family':         'sans-serif',
            'font.sans-serif':     ['Helvetica'],
            'axes.labelsize':      28,
            'axes.titlesize':      28,
            'legend.fontsize':     20,
            'ytick.right':         'off',
            'xtick.top':           'off',
            'ytick.left':          'on',
            'xtick.bottom':        'on',
            'xtick.labelsize':     '28',
            'ytick.labelsize':     '28',
            'axes.linewidth':       2.5,
            'xtick.major.width':    1.8,
            'xtick.minor.width':    1.8,
            'xtick.major.size':     14,
            'xtick.minor.size':     7,
            'xtick.major.pad':      10,
            'xtick.minor.pad':      10,
            'ytick.major.width':    1.8,
            'ytick.minor.width':    1.8,
            'ytick.major.size':     14,
            'ytick.minor.size':     7,
            'ytick.major.pad':      10,
            'ytick.minor.pad':      10,
            'axes.labelpad':        15,
            'axes.titlepad':        15,
            'axes.spines.right':    False,
            'axes.spines.top':      False,
#             'axes.spines.left':      False
           }
mpl.rcParams.update(rcparams)

In [8]:
from_zone = tz.tzutc()
to_zone = tz.tzlocal()

# Data

In [9]:
df = pd.read_csv('data/scrobbles_ansesu.csv')
from_zone = tz.tzutc()
to_zone = tz.tzlocal()
df['date'] = [
    datetime.strptime(
        date, '%d %b %Y, %H:%M'
    ).replace(tzinfo=from_zone).astimezone(to_zone) for date in df.utc_time
]
df['date']= pd.to_datetime(df['date'].values)
df = df.set_index('date')

for idx in df[df.artist == '水曜日のカンパネラ'].index:
    df.at[idx, 'artist'] = 'Wednesday Campanella'
for idx in df[df.artist == '水曜日のカンパネラ×shu uemura'].index:
    df.at[idx, 'artist'] = 'Wednesday Campanella'    
for idx in df[df.artist == 'Frederic'].index:
    df.at[idx, 'artist'] = 'フレデリック'    

In [10]:
year = 2019
# Create dataframe of specific year
df['year'] = [date.year for date in df.index]
df_year = df[df.year==2019].copy(deep=True)

# Create a column 'track - artist'
which_ = 'track'
l=[]
for i in df_year[f'{which_}'].values:
    artist = df[df[f'{which_}'] == i].artist.values[0]
    l.append(f'{i} - {artist}')
df_year[f'{which_}_w_artist'] =l

# Create a column 'album - artist'
which_ = 'album'
l=[]
for i in df_year[f'{which_}'].values:
    try:
        artist = df[df[f'{which_}'] == i].artist.values[0]
        l.append(f'{i} - {artist}')
    except:
        artist = ''
        l.append(f'{i} - {artist}')        
df_year[f'{which_}_w_artist'] =l

In [84]:
f'{len(df_year):,} plays, {df_year.track.nunique():,} músicas únicas,\
 {df_year.artist.nunique():,} artistas e\
 {df_year.album.nunique():,} álbuns'

'10,215 plays, 2,270 músicas únicas, 555 artistas e 1,027 álbuns'

# Prepare data for flourish.io

## All-time artist

In [6]:
which_ = 'artist'
groupby_ = df.groupby(by=[df.index, which_]).agg('count')
groupby_.index.names = ['date', which_]
groupby_ = groupby_.reset_index()
min_year, min_month = groupby_.date.min().year, groupby_.date.min().month
max_year, max_month = groupby_.date.max().year, groupby_.date.max().month
f'starts at {min_month}/{min_year} and ends at {max_month}/{max_year}'

'starts at 7/2012 and ends at 7/2020'

In [7]:
start_date = datetime(min_year, min_month-1, 1)
end_date = datetime(max_year, max_month+1, 1)
date_range = pd.date_range(start_date, end_date, freq='M', closed='right').tolist()

In [8]:
all_artists = groupby_.artist.unique()
df_ = pd.DataFrame({'artist': all_artists})

for date1, date2 in zip(date_range[:-2], date_range[1:]):
    l=[]
    for artist in all_artists:
        mask = (groupby_.date >= date1) & (groupby_.date < date2) & (groupby_.artist==artist)
        l.append(groupby_[mask].track.sum())
    month = date1.month + 1
    year = date1.year
    if month == 13:
        month = 1
        year = year + 1
    df_[f'{month}/{year}']=l

l=[]
date1 = date_range[-2]
date2 = date_range[-1]
for artist in all_artists:
    mask = (groupby_.date >= date1) & (groupby_.date <= date2) & (groupby_.artist==artist)
    l.append(groupby_[mask].track.sum())
month = date1.month + 1
year = date1.year
if month == 13:
    month = 1
    year = year + 1
df_[f'{month}/{year}']=l

In [9]:
del df_['artist']
df_ = df_.cumsum(axis=1)
df_['artist'] = all_artists
df_.to_csv('ansesu_alltime.csv')

## Yearly artist

In [10]:
which_ = 'artist'
year = 2019
groupby_ = df.groupby(by=[df.index, which_]).agg('count')
groupby_.index.names = ['date', which_]
groupby_ = groupby_.reset_index()
groupby_['year'] = [date.year for date in groupby_.date]
groupby_= groupby_[groupby_['year']==year]

In [11]:
start_date = datetime(year, 1, 1)
end_date = datetime(year+1, 1, 2)
date_range = pd.date_range(start_date, end_date, freq='d', closed='left').tolist()

In [12]:
all_artists = groupby_.artist.unique()
df_ = pd.DataFrame({'artist': all_artists})

for date1, date2 in zip(date_range[:-2], date_range[1:]):
    l=[]
    for artist in all_artists:
        mask = (groupby_.date >= date1) & (groupby_.date < date2) & (groupby_.artist==artist)
        l.append(groupby_[mask].track.sum())
    day = date1.day    
    month = date1.month
    year = date1.year
    df_[f'{day}/{month}/{year}']=l
    
l=[]
date1 = date_range[-2]
date2 = date_range[-1]
for artist in all_artists:
    mask = (groupby_.date >= date1) & (groupby_.date <= date2) & (groupby_.artist==artist)
    l.append(groupby_[mask].track.sum())
day = date1.day 
month = date1.month
year = date1.year
df_[f'{day}/{month}/{year}']=l

In [13]:
del df_['artist']
df_ = df_.cumsum(axis=1)
df_['artist'] = all_artists
df_.to_csv('ansesu_artist_2019.csv')

## Yearly tracks

In [14]:
which_ = 'track'
year = 2019
groupby_ = df.groupby(by=[df.index, which_]).agg('count')
groupby_.index.names = ['date', which_]
groupby_ = groupby_.reset_index()
groupby_['year'] = [date.year for date in groupby_.date]
groupby_= groupby_[groupby_['year']==year]

In [15]:
start_date = datetime(year, 1, 1)
end_date = datetime(year+1, 1, 2)
date_range = pd.date_range(start_date, end_date, freq='d', closed='left').tolist()

In [16]:
all_ = groupby_[which_].unique()
df_ = pd.DataFrame({which_: all_})

for date1, date2 in zip(date_range[:-2], date_range[1:]):
    l=[]
    for i in all_:
        mask = (groupby_.date >= date1) & (groupby_.date < date2) & (groupby_[which_]==i)
        l.append(groupby_[mask].artist.sum())
    day = date1.day    
    month = date1.month
    year = date1.year
    df_[f'{day}/{month}/{year}']=l
    
l=[]
date1 = date_range[-2]
date2 = date_range[-1]
for i in all_:
    mask = (groupby_.date >= date1) & (groupby_.date <= date2) & (groupby_[which_]==i)
    l.append(groupby_[mask].artist.sum())
day = date1.day 
month = date1.month
year = date1.year
df_[f'{day}/{month}/{year}']=l

In [17]:
del df_[which_]
df_ = df_.cumsum(axis=1)
df_[which_] = all_
l = []
for i in df_[f'{which_}'].values:
    artist = df[df[f'{which_}'] == i].artist.values[0]
    l.append(f'{i} - {artist}')
df_['full_name'] = l
df_.to_csv(f'ansesu_{which_}_2019.csv')

## Yearly albums

In [18]:
which_ = 'album'
year = 2019
groupby_ = df.groupby(by=[df.index, which_]).agg('count')
groupby_.index.names = ['date', which_]
groupby_ = groupby_.reset_index()
groupby_['year'] = [date.year for date in groupby_.date]
groupby_= groupby_[groupby_['year']==year]

In [19]:
start_date = datetime(year, 1, 1)
end_date = datetime(year+1, 1, 2)
date_range = pd.date_range(start_date, end_date, freq='d', closed='left').tolist()

In [20]:
all_ = groupby_[which_].unique()
df_ = pd.DataFrame({which_: all_})

for date1, date2 in zip(date_range[:-2], date_range[1:]):
    l=[]
    for i in all_:
        mask = (groupby_.date >= date1) & (groupby_.date < date2) & (groupby_[which_]==i)
        l.append(groupby_[mask].artist.sum())
    day = date1.day    
    month = date1.month
    year = date1.year
    df_[f'{day}/{month}/{year}']=l
    
l=[]
date1 = date_range[-2]
date2 = date_range[-1]
for i in all_:
    mask = (groupby_.date >= date1) & (groupby_.date <= date2) & (groupby_[which_]==i)
    l.append(groupby_[mask].artist.sum())
day = date1.day 
month = date1.month
year = date1.year
df_[f'{day}/{month}/{year}']=l

In [21]:
del df_[which_]
df_ = df_.cumsum(axis=1)
df_[which_] = all_
l = []
for i in df_[f'{which_}'].values:
    artist = df[df[f'{which_}'] == i].artist.values[0]
    l.append(f'{i} - {artist}')
df_['full_name'] = l
df_.to_csv(f'ansesu_{which_}_2019.csv')

# My yearly top artists

In [86]:
which_='artist'
artist, counts = np.unique(df_year[f'artist'], return_counts=True)
df_track = pd.DataFrame({
    'artist': artist,
    'count': counts
})

In [87]:
for j, i in enumerate(df_track.sort_values('count', ascending=False)[['artist','count']].values[:10]):
    print(f'{j+1}) {i[0]} ({i[1]} plays)')

1) Vulfpeck (1785 plays)
2) Foster the People (1711 plays)
3) Saint Motel (427 plays)
4) The Kooks (377 plays)
5) Khruangbin (264 plays)
6) Red Hot Chili Peppers (247 plays)
7) The Strokes (201 plays)
8) Jamiroquai (185 plays)
9) J Rabbit (180 plays)
10) Wednesday Campanella (162 plays)


# My yearly top tracks

In [95]:
which_='track'
track, counts = np.unique(df_year[f'{which_}_w_artist'], return_counts=True)
df_track = pd.DataFrame({
    'track': track,
    'count': counts
})

In [107]:
for j, i in enumerate(df_track.sort_values('count', ascending=False)[['track','count']].values[:10]):
    print(f'{j+1}) {i[0]} ({i[1]} plays)')

1) Pick U Up - Foster the People (159 plays)
2) Imagination - Foster the People (102 plays)
3) Style - Foster the People (65 plays)
4) Half of the Way - Vulfpeck (61 plays)
5) Wait For The Moment - Vulfpeck (57 plays)
6) Worst Nites - Foster the People (52 plays)
7) 1612 - Vulfpeck (51 plays)
8) Don't Stop (Color on the Walls) - Foster the People (49 plays)
9) Goats in Trees - Foster the People (49 plays)
10) Helena Beat - Foster the People (48 plays)


# My yearly top albums

In [117]:
which_='album'
album, counts = np.unique(df_year[f'{which_}_w_artist'], return_counts=True)
df_album = pd.DataFrame({
    'album': album,
    'count': counts
})

In [118]:
for j, i in enumerate(df_album.sort_values('count', ascending=False)[['album','count']].values[:10]):
    print(f'{j+1}) {i[0]} ({i[1]} plays)')

1) Torches - Foster the People (418 plays)
2) Supermodel - Foster the People (339 plays)
3) Thrill of the Arts - Vulfpeck (312 plays)
4) Sacred Hearts Club - Foster the People (302 plays)
5) hill climber - Vulfpeck (286 plays)
6) The Beautiful Game - Vulfpeck (280 plays)
7) Saintmotelevision - Saint Motel (236 plays)
8) Pick U Up - Foster the People (216 plays)
9) Fugue State - Vulfpeck (213 plays)
10) Mr Finish Line - Vulfpeck (194 plays)


# My yearly top new artists

In [212]:
new_artists, count = np.unique(df_year[~df_year.artist.isin(df[df.year<year].artist.unique())].artist, return_counts=True)

In [223]:
for i, j in enumerate(sorted(zip(count[count>10],new_artists[count>10]), reverse=True)):
    print(f'{i+1}) {j[1]} ({j[0]} plays)')

1) Theo Katzman (66 plays)
2) Beach Fossils (64 plays)
3) Ripe (40 plays)
4) DJ Bean Ornish (28 plays)
5) Breakestra (23 plays)
6) Michael Haydn (22 plays)
7) おかん (19 plays)
8) Madeon (18 plays)
9) Scary Goldings (17 plays)
10) Ozawa Kenji (17 plays)
11) 小沢健二 (14 plays)
12) Aqueous (12 plays)


# Yearly listening time distribution

In [89]:
df_year['hour'] = [date.hour for date in df_year.index]
hour, count = np.unique(df_year['hour'], return_counts=True)
df_hour = pd.DataFrame(
    {'hour': hour,
    'count': count}
)

In [96]:
filename = 'daily_hours'

labels_dict = {
    'pt': {
        'y': 'Número de músicas',
        'x': 'Horário, h',
        'hover_1': 'Horário',
        'hover_2': 'Quantidade'
    },
    'en': {
        'y': 'Song count',
        'x': 'Hour, h',
        'hover_1': 'Hour',
        'hover_2': 'Count'
    } 
}  

for lang in ['en', 'pt']:
    fig = figure(plot_width=plot_width, plot_height=plot_height, tools="", 
                 sizing_mode='stretch_both')
    source = ColumnDataSource(df_hour)

    bar = fig.vbar(
        x='hour', top='count', 
        width=0.9, 
        source=source, 
        fill_color='#F8766D', 
        line_width=0, 
        alpha=0.85
    )  
    
    hover = HoverTool(renderers=[bar])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], '@hour'+':00h'),
        (labels_dict[lang]['hover_2'], '@count{,}'),
    ]
    fig.add_tools(hover)
    
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']
    
    fig.xaxis.ticker = source.data['hour']
    fig.xaxis.formatter = BasicTickFormatter(use_scientific=False)

    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.y_range.start = 0
    fig.x_range.start = -.5
    fig.x_range.end = 23.5
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html")

# Yearly month count distribution

In [97]:
df_year['month'] = [date.month for date in df_year.index]
month, count = np.unique(df_year['month'], return_counts=True)
df_month = pd.DataFrame(
    {'month': month,
     'count': count}
)
df_month['month_str_pt'] = [
    'Jan', 'Fev', 'Mar', 'Abr', 'Mai', 'Jun', 
    'Jul', 'Ago', 'Set', 'Out', 'Nov', 'Dez'
]
df_month['month_str_en'] = [
    'Jan', 'Feb', 'Mar', 'Apr', 'Mai', 'Jun', 
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
]
df_month['month_str_complete_pt'] = [
    'Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 
    'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'
]
df_month['month_str_complete_en'] = [
    'January', 'February', 'March', 'April', 'May', 'June', 
    'July', 'August', 'September', 'October', 'November', 'December'
]

In [98]:
filename = 'month_counts'

labels_dict = {
        'pt': {
            'y': 'Número de músicas',
            'x': 'Mês',
            'hover_1': 'Mês',
            'hover_2': 'Quantidade'
        },
        'en': {
            'y': 'Song count',
            'x': 'Month',
            'hover_1': 'Month',
            'hover_2': 'Count'
        } 
    }   

for lang in ['en', 'pt']:
    fig = figure(
        plot_width=plot_width, 
        plot_height=plot_height, 
        tools="", 
        sizing_mode='stretch_both'
    )
    source = ColumnDataSource(df_month)

    bar = fig.vbar(
        x='month', top='count', 
        width=0.9, 
        source=source, 
        fill_color='#756bb1', 
        line_width=0, 
        alpha=0.85
    )

    hover = HoverTool(renderers=[bar])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], f'@month_str_complete_{lang}'),
        (labels_dict[lang]['hover_2'], '@count{,}'),
    ]
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']     

    fig.add_tools(hover)
    fig.xaxis.ticker = source.data['month']
    fig.xaxis.major_label_overrides = {int(i): month for i, month in zip(df_month['month'], df_month[f'month_str_{lang}'])}

    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.y_range.start = 0
    fig.x_range.start = .5
    fig.x_range.end = 12.5
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html") 

# Yearly weekdays count distribution

In [99]:
df_year['weekday'] = [date.weekday() for date in df_year.index]
weekday, count = np.unique(df_year['weekday'], return_counts=True)
df_weekday = pd.DataFrame(
    {'weekday': weekday,
     'count': count}
)
df_weekday['weekday_str_en'] = [
    'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
]
df_weekday['weekday_str_pt'] = [
    'Seg', 'Ter', 'Qua', 'Qui', 'Sex', 'Sáb', 'Dom'
]
df_weekday['weekday_str_complete_en'] = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
df_weekday['weekday_str_complete_pt'] = [
    'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado', 'Domingo'
]

In [100]:
filename = 'weekday_counts'

labels_dict = {
    'pt': {
        'y': 'Número de músicas',
        'x': 'Dia da semana',
        'hover_1': 'Dia da semana',
        'hover_2': 'Quantidade'
    },
    'en': {
        'y': 'Song count',
        'x': 'Weekday',
        'hover_1': 'Weekday',
        'hover_2': 'Count'
    } 
} 

for lang in ['en', 'pt']:
    fig = figure(plot_width=plot_width, plot_height=plot_height, tools="", sizing_mode='stretch_both')
    source = ColumnDataSource(df_weekday)

    bar = fig.vbar(
        x='weekday', top='count', 
        width=0.9, 
        source=source, 
        fill_color='#2ca25f', 
        line_width=0, 
        alpha=0.85
    )
   
    hover = HoverTool(renderers=[bar])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], f'@weekday_str_complete_{lang}'),
        (labels_dict[lang]['hover_2'], '@count{,}'),
    ]
    fig.add_tools(hover)
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']  
    
    fig.xaxis.ticker = source.data['weekday']
    fig.xaxis.major_label_overrides = {int(i): month for i, month in zip(df_weekday['weekday'], df_weekday[f'weekday_str_{lang}'])}

    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.y_range.start = 0
    fig.x_range.start = -.5
    fig.x_range.end = 6.5
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html") 

# Yearly Diversity

In [101]:
def shannon(p_list,n):
    '''
    Calculates Shannon's entropy for a system with n possible outcomes

    input:
        * p_list - list, probability list
        * n - int, number of possible outcomes
    output:
        * entropy - float, Shannon's entropy
    '''
    ps = p_list[p_list>0]
    return np.sum(-ps*np.log(ps))/np.log(n)

In [102]:
start = datetime(year,1,1)
n_weeks = round(365/7)+1
dates = [start] + [start + timedelta(days=7*delta) for delta in range(1, n_weeks)]
dates = dates[:-1] + [datetime(year+1,1,1)]

In [103]:
l=[]
l_dates=[]
for date1, date2 in zip(dates[:-1], dates[1:]):
    mask = (df_year.index < date2) & (df_year.index >= date1)
    l_entropy = []
    counts_artist = np.unique(df_year[mask].artist, return_counts=True)[1]
    p_artist = counts_artist/np.sum(counts_artist)
    entropy = shannon(p_artist, len(p_artist))
    l.append(entropy)
    l_dates.append(f'{date1:%d/%m/%Y} - {date2:%d/%m/%Y}')
    
df_entropy_weeks = pd.DataFrame(
    {'entropy': l,
     'week_str': l_dates,
     'week': np.arange(len(l))}
)    

In [104]:
filename='yearly_entropy'

labels_dict = {
    'pt': {
        'y': 'Diversidade',
        'x': 'Semana do ano',
        'hover_1': 'Data',
        'hover_2': 'Semana',
        'hover_3': 'Diversidade'
    },
    'en': {
        'y': 'Diversity',
        'x': 'Week',
        'hover_1': 'Date',
        'hover_2': 'Week',
        'hover_3': 'Diversity'
    } 
} 

for lang in ['en', 'pt']:
    fig = figure(
        plot_width=plot_width, 
        plot_height=plot_height, 
        tools="", 
        sizing_mode='stretch_both'
    )

    source = ColumnDataSource(df_entropy_weeks)

    line = fig.line(
        x='week', y='entropy',
        source=source,
        line_width=2.5, 
        color='#d53e4f'
    )

    c = fig.circle(
        x='week', 
        y='entropy',
        source=source,
        size=10, 
        color='#d53e4f'
    )   
    
    hover = HoverTool(formatters=formatters, renderers=[c])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], '@week_str'),
        (labels_dict[lang]['hover_2'], '@week'),
        (labels_dict[lang]['hover_3'], '@entropy{1.111}')
    ]
    fig.add_tools(hover)
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']    
    
    fig.yaxis.formatter = BasicTickFormatter(use_scientific=False)
    fig.y_range = Range1d(0, 1)
    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None

    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html") 

# Yearly diversity through weekdays

In [105]:
l=[]

weekday_str_en = [
    'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
]
weekday_str_pt = [
    'Seg', 'Ter', 'Qua', 'Qui', 'Sex', 'Sáb', 'Dom'
]
weekday_str_complete_en = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
weekday_str_complete_pt = [
    'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado', 'Domingo'
]

for weekday in np.arange(7):
    mask = df_year.weekday == weekday
    counts_artist = np.unique(df_year[mask].artist, return_counts=True)[1]
    p_artist = counts_artist/np.sum(counts_artist)
    entropy = shannon(p_artist, len(p_artist))
    l.append(entropy)
    
df_entropy_weekday = pd.DataFrame(
    {'entropy': l,
    'weekday': np.arange(len(l)),
    'weekday_str_en': weekday_str_en,
    'weekday_str_complete_en': weekday_str_complete_en,
    'weekday_str_pt': weekday_str_pt,
    'weekday_str_complete_pt': weekday_str_complete_pt,}
)

In [106]:
filename='weekday_entropy'

labels_dict = {
    'pt': {
        'y': 'Diversidade',
        'x': 'Dia da semana',
        'hover_1': 'Dia da semana',
        'hover_2': 'Diversidade'
    },
    'en': {
        'y': 'Diversity',
        'x': 'Weekday',
        'hover_1': 'Weekday',
        'hover_2': 'Diversity'
    } 
}
    
for lang in ['en', 'pt']:
    fig = figure(
        plot_width=plot_width, 
        plot_height=plot_height, 
        tools="", 
        sizing_mode='stretch_both'
    )

    source = ColumnDataSource(df_entropy_weekday)

    line = fig.line(
        x='weekday', y='entropy',
        source=source,
        line_width=3.5, 
        color='#7bccc4'
    )

    c = fig.circle(
        x='weekday', 
        y='entropy',
        source=source,
        size=12, 
        color='#7bccc4'
    )
    
    hover = HoverTool(formatters=formatters, renderers=[c])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], f'@weekday_str_complete_{lang}'),
        (labels_dict[lang]['hover_2'], '@entropy{1.111}')
    ]
    fig.add_tools(hover)
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']
    
    fig.xaxis.ticker = source.data['weekday']
    fig.xaxis.major_label_overrides = {int(i): month for i, month in zip(df_entropy_weekday['weekday'], df_entropy_weekday[f'weekday_str_{lang}'])}

    fig.yaxis.formatter = BasicTickFormatter(use_scientific=False)

    fig.y_range = Range1d(0, 1)
    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None

    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html") 

# Yearly listening streaks

In [111]:
counter = 1
streaks = []
artists = []
dates = []
for artist1, artist2, date in zip(df_year.artist.values[:-1], 
                                  df_year.artist.values[1:],
                                  df_year.index[:-1]):
    if artist1 != artist2:
        streaks.append(counter)
        artists.append(artist1)
        dates.append(date)
        counter = 1 
    else:
        counter += 1
        
df_streak = pd.DataFrame({
    'streak': streaks,
    'artist': artists,
    'date': dates
})

In [112]:
filename='listening_streaks'

labels_dict = {
    'pt': {
        'y': 'Sequência',
        'x': 'Data',
        'hover_1': 'Data',
        'hover_2': 'Sequência',
        'hover_3': 'Artista'
    },
    'en': {
        'y': 'Streak',
        'x': 'Date',
        'hover_1': 'Date',
        'hover_2': 'Streak',
        'hover_3': 'Artist'
    } 
}    

for lang in ['en', 'pt']:
    fig = figure(x_axis_type="datetime", plot_width=plot_width, plot_height=plot_height, tools="", sizing_mode='stretch_both')

    source = ColumnDataSource(df_streak)

    c = fig.circle(
        x='date', y='streak',
        source=source,
        size='streak', 
        color='#f768a1',
        alpha=0.5
    )
    
    formatters={'@date': 'datetime'}
    hover = HoverTool(formatters=formatters, renderers=[c])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], '@date{%d/%m/%Y}'),
        (labels_dict[lang]['hover_2'], '@streak'),
        (labels_dict[lang]['hover_3'], '@artist')
    ]
    fig.add_tools(hover)
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']
    
    fig.xaxis.formatter=DatetimeTickFormatter(days=['%d/%m', '%a %d'], months=['%m/%Y', '%b %y'])
    fig.yaxis.formatter = BasicTickFormatter(use_scientific=False)

    fig.y_range = Range1d(0, 102)
    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html")

In [109]:
counter = 1
streaks = []
tracks = []
dates = []
for track1, track2, date in zip(df_year.track_w_artist.values[:-1], 
                                df_year.track_w_artist.values[1:],
                                df_year.index[:-1]):
    if track1 != track2:
        streaks.append(counter)
        tracks.append(track1)
        dates.append(date)
        counter = 1 
    else:
        counter += 1
        
df_streak = pd.DataFrame({
    'streak': streaks,
    'track': tracks,
    'date': dates
})

In [110]:
filename='listening_streaks_track'

labels_dict = {
    'pt': {
        'y': 'Sequência',
        'x': 'Data',
        'hover_1': 'Data',
        'hover_2': 'Sequência',
        'hover_3': 'Música'
    },
    'en': {
        'y': 'Streak',
        'x': 'Date',
        'hover_1': 'Date',
        'hover_2': 'Streak',
        'hover_3': 'Track'
    } 
}

for lang in ['en', 'pt']:
    fig = figure(
        x_axis_type="datetime", 
        plot_width=plot_width, 
        plot_height=plot_height, 
        tools="", 
        sizing_mode='stretch_both'
    )

    source = ColumnDataSource(df_streak)

    c = fig.circle(
        x='date', 
        y='streak',
        source=source,
        size='streak', 
        color='#f1a340',
        alpha=0.5
    )    
    
    formatters={'@date': 'datetime'}
    hover = HoverTool(formatters=formatters, renderers=[c])
    hover.tooltips=[
        (labels_dict[lang]['hover_1'], '@date{%d/%m/%Y}'),
        (labels_dict[lang]['hover_2'], '@streak'),
        (labels_dict[lang]['hover_3'], '@track')
    ]
    fig.add_tools(hover)
    fig.yaxis.axis_label = labels_dict[lang]['y']
    fig.xaxis.axis_label = labels_dict[lang]['x']
    
    fig.xaxis.formatter=DatetimeTickFormatter(days=['%d/%m', '%a %d'], months=['%m/%Y', '%b %y'])
    fig.yaxis.formatter = BasicTickFormatter(use_scientific=False)

    fig.y_range = Range1d(0, 102)
    fig.toolbar.logo = None
    fig.toolbar_location = None
    fig.xaxis.axis_label_text_font_size = "12pt"
    fig.yaxis.axis_label_text_font_size = "12pt"
    fig.xaxis.major_label_text_font_size = "10pt"
    fig.yaxis.major_label_text_font_size = "10pt"
    fig.xaxis.axis_label_text_font_style = 'normal'
    fig.yaxis.axis_label_text_font_style = 'normal'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    output_file(f"html/{filename}_{year}_{lang}.html")
    save(fig,f"html/{filename}_{year}_{lang}.html")

# Stream chart

In [40]:
start = datetime(year,1,1)
n_weeks = round(365/7)+1
dates = [start] + [start + timedelta(days=7*delta) for delta in range(1, n_weeks)]
dates = dates[:-1] + [datetime(year+1,1,1)]

df = pd.DataFrame()
for date1, date2 in zip(dates[:-1], dates[1:]):
    mask = (df_year.index < date2) & (df_year.index >= date1)
    artist, counts_artist = np.unique(df_year[mask].artist, return_counts=True)
    df = pd.concat(
        [df, pd.DataFrame({
                'artist': artist,
                'count_': counts_artist,
                'date': [date1]*len(artist)
             })
        ]
    )

In [75]:
df_alt = df[df.artist.isin(
                df[df['count_']>30].artist.unique()
         )]

In [115]:
import altair as alt

selection = alt.selection_multi(fields=['artist'], bind='legend')
filename='stream_artist_chart'

labels_dict = {
    'pt': {
        'legend': 'Artista',
        'x': 'Mês',
    },
    'en': {
        'legend': 'Artist',
        'x': 'Month',
    } 
}   

for lang in ['en', 'pt']:
    alt.Chart(df_alt).mark_area().encode(
        alt.X('yearmonth(date):T',
              axis=alt.Axis(format='%m', domain=False, 
                            tickSize=0, title=labels_dict[lang]['x'])
        ),
        alt.Y('sum(count_):Q', stack='center', axis=None),
        alt.Color('artist:N',
                  scale=alt.Scale(scheme='category20b'),
                  legend=alt.Legend(title=labels_dict[lang]['legend']) 
        ),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
    ).add_selection(
        selection
    ).save(f"html/{filename}_{year}_{lang}.html")