# Altair Decluttering

## Load and Clean the Dataset

In [188]:
import pandas as pd

df = pd.read_excel('../sources/eu_live_births.xlsx', header=7, sheet_name="Sheet 1", na_values=':')
df.head(10)

Unnamed: 0,TIME,2009,Unnamed: 2,2010,Unnamed: 4,2011,Unnamed: 6,2012,Unnamed: 8,2013,...,2016,Unnamed: 16,2017,Unnamed: 18,2018,Unnamed: 20,2019,Unnamed: 22,2020,Unnamed: 24
0,GEO (Labels),,,,,,,,,,...,,,,,,,,,,
1,European Union - 27 countries (from 2020),4622368.0,,4603858.0,,4458386.0,,4417656.0,,4303313.0,...,4379549.0,,4328560.0,,4245710.0,,4168656.0,,4047432.0,p
2,European Union - 28 countries (2013-2020),5412572.0,,5411129.0,,5266162.0,,5230626.0,,5081671.0,...,5153935.0,,5083314.0,,4976628.0,,4881355.0,p,,
3,European Union - 27 countries (2007-2013),5367995.0,,5367768.0,,5224965.0,,5188855.0,,5041732.0,...,5116398.0,,5046758.0,,4939683.0,,4845220.0,p,,
4,Euro area - 19 countries (from 2015),3467449.0,,3472945.0,,3394195.0,,3349465.0,,3272880.0,...,3301935.0,,3229613.0,,3170835.0,,3115288.0,,3042405.0,p
5,Euro area - 18 countries (2014),3435284.0,,3442269.0,,3363927.0,,3319006.0,,3242995.0,...,3271312.0,,3200917.0,,3142686.0,,3087895.0,,3017261.0,p
6,Belgium,127198.0,,130100.0,,128705.0,,128051.0,,125606.0,...,121896.0,,119690.0,,118319.0,,117695.0,,114350.0,p
7,Bulgaria,80956.0,,75513.0,,70846.0,,69121.0,,66578.0,...,64984.0,,63955.0,,62197.0,,61538.0,,59086.0,
8,Czechia,118348.0,,117153.0,,108673.0,,108576.0,,106751.0,...,112663.0,,114405.0,,114036.0,,112231.0,,110200.0,
9,Denmark,62818.0,,63411.0,,58998.0,,57916.0,,55873.0,...,61614.0,,61397.0,,61476.0,,61167.0,,60937.0,


Select only interesting columns.

In [189]:
df = df[['TIME', '2019', '2020']]
df.head()

Unnamed: 0,TIME,2019,2020
0,GEO (Labels),,
1,European Union - 27 countries (from 2020),4168656.0,4047432.0
2,European Union - 28 countries (2013-2020),4881355.0,
3,European Union - 27 countries (2007-2013),4845220.0,
4,Euro area - 19 countries (from 2015),3115288.0,3042405.0


List columns

In [190]:
df.columns

Index(['TIME', '2019', '2020'], dtype='object')

Rename the columns that will be used

In [191]:
df.rename(columns={'TIME' : 'Country'}, inplace=True)

Drop NaN values for column 2020 and 2019

In [192]:
df.dropna(subset=['2020'],inplace=True)
df.dropna(subset=['2019'],inplace=True)

Drop first six rows, which are relative to total Europe

In [193]:
df = df.iloc[6:]

In [194]:
df.shape

(35, 3)

## Basic Bar Chart
Build a first raw bar chart

In [198]:
import altair as alt
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('2020:Q'),
    y=alt.Y('Country:N'),
)
bars.save('raw_bar_chart.png')
bars

Order bars DESC (use `x` to order ASC).

In [199]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('2020:Q'),
    y=alt.Y('Country:N', sort='-x'),
)
bars.save('ordered_raw_bar_chart.png')
bars

In [48]:
df_regions = pd.read_csv('../sources/eu_regions.csv', sep=';')
df_regions.head(5)

Unnamed: 0,#,Country,Population (2020),Subregion
0,1,Russia,145934462,Eastern Europe
1,2,Germany,83783942,Western Europe
2,3,United Kingdom,67886011,Northern Europe
3,4,France,65273511,Western Europe
4,5,Italy,60461826,Southern Europe


In [62]:
def get_region(x):
    south_exceptions = ['Cyprus', 'Turkey']
    east_exceptions = ['Armenia', 'Azerbaijan']
    if x in south_exceptions:
        return 'Southern Europe'
    if x in east_exceptions:
        return 'Eastern Europe'
    row = df_regions[df_regions['Country'] == x]
    return row['Subregion'].iloc[0]

In [201]:
df['Region'] = df['Country'].apply(lambda x: get_region(x))

In [202]:
df.head(5)

Unnamed: 0,Country,2019,2020,Region
9,Denmark,61167.0,60937.0,Northern Europe
10,Germany,778090.0,773144.0,Western Europe
11,Estonia,14099.0,13209.0,Northern Europe
12,Ireland,59289.0,55959.0,Northern Europe
13,Greece,83763.0,84625.0,Southern Europe


Group Countries by regions

In [203]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q'),
    y=alt.Y('Region:N')
).transform_aggregate(
   avg_value = 'average(2020)', groupby = ['Region']
)
bars.save('aggregated_bar_chart.png')
bars

Calculate percentage decrease with respect to the previous year

In [205]:
import math

df['2020_2019'] = -(df['2020'] - df['2019'])/df['2019']


In [206]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q'),
    y=alt.Y('Region:N')
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
)
bars.save('aggregated_percentage_bar_chart.png')
bars

## Final Decluttering
Set the graph size

In [207]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q'),
    y=alt.Y('Region:N')
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
).properties(
     height = 300
)
bars.save('size_bar_chart.png')
bars

Put label values near the bar

In [210]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q'),
    y=alt.Y('Region:N')
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    fontStyle='bold',
    fontSize=20
).encode(
    text=alt.Text('avg_value:Q', format=",.3f")
)

final_bar = (bars + text).properties(
     height = 300
)
final_bar.save('label_bar_chart.png')
final_bar

Remove axes

In [211]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q', axis=None),
    y=alt.Y('Region:N', title='')
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    fontStyle='bold',
    fontSize=20
).encode(
    text=alt.Text('avg_value:Q', format=",.3f")
)

final_bar = (bars + text).properties(
     height = 300
)
final_bar.save('no_axis_bar_chart.png')
final_bar

Add title

In [213]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q', axis=None),
    y=alt.Y('Region:N', title='')
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    fontStyle='bold',
    fontSize=20
).encode(
    text=alt.Text('avg_value:Q', format=",.3f"),
    
)

final_bar = (bars + text).properties(
    height = 300,
    title = 'Percentage Decrease in Live Births (2020-2019)'
).configure_title(
    fontSize=24
)
final_bar.save('title_bar_chart.png')
final_bar

Focus on a single region, to tell a story.

In [214]:
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('avg_value:Q', axis=None),
    y=alt.Y('Region:N', title=''),
    color=alt.condition(alt.datum.Region == 'Southern Europe', alt.value('#8B0000'), alt.value('grey'))
).transform_aggregate(
   avg_value = 'average(2020_2019)', groupby = ['Region']
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    fontStyle='bold',
    fontSize=20
).encode(
    text=alt.Text('avg_value:Q', format=",.3f"),
    
)

final_bar = (bars + text).properties(
    height = 300,
    title = 'Percentage Decrease in Live Births (2020-2019)'
).configure_title(
    fontSize=24
)
final_bar.save('final_bar_chart.png')
final_bar