In [635]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np

# Make Plotly's built-in dark theme the default template, so figures created
# later in the notebook pick it up without setting `template=` individually.
pio.templates.default = "plotly_dark"

In [636]:
# Country reference table; the file is semicolon-separated.
countries_path = '../data/WID_countries.csv'
countries_df = pd.read_csv(countries_path, sep=';')

# Preview the first two rows.
countries_df.head(n=2)

Unnamed: 0,alpha2,titlename,shortname,region,region2
0,AD,Andorra,Andorra,Europe,Western Europe
1,AE,the United Arab Emirates,United Arab Emirates,Asia,West Asia


In [637]:
# Observations for country code AF; the file is semicolon-separated.
af_path = '../data/WID_data_AF.csv'
af = pd.read_csv(af_path, sep=';')

# Preview the first two rows.
af.head(n=2)

Unnamed: 0,country,variable,percentile,year,value,age,pop
0,AF,ehfghgi999,p0p100,1980,13.085614,999,i
1,AF,ehfghgi999,p0p100,1981,12.898053,999,i


In [638]:
# Variable-level metadata (names, types, units, ...) for country code AF;
# the file is semicolon-separated.
af_meta_path = '../data/WID_metadata_AF.csv'
af_meta = pd.read_csv(filepath_or_buffer=af_meta_path, sep=';')

# Preview the first two rows.
af_meta.head(n=2)

Unnamed: 0,country,variable,age,pop,countryname,shortname,simpledes,technicaldes,shorttype,longtype,shortpop,longpop,shortage,longage,unit,source,method,extrapolation,data_points
0,AF,acitgri992,992,i,Afghanistan,Corporate income tax,,,Average,Average income or wealth between two percentil...,individuals,The base unit is the individual (rather than t...,Adults,The population is comprised of individuals ove...,AFN,,,,
1,AF,acitgri999,999,i,Afghanistan,Corporate income tax,,,Average,Average income or wealth between two percentil...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AFN,,,,


In [639]:
# Basic size check of both files, plus a comparison of how many distinct
# `variable` codes each one contains.
data_rows, data_cols = af.shape
meta_rows, meta_cols = af_meta.shape
data_vars = af['variable'].nunique()
meta_vars = af_meta['variable'].nunique()

print(f"The data file contains {data_cols} columns and {data_rows} rows")
print(f"The metadata file contains {meta_cols} columns and {meta_rows} rows")
print(f"There are {data_vars} and {meta_vars} unique values for 'variable' in the data and metadata files respectively.")

The data file contains 7 columns and 191381 rows
The metadata file contains 19 columns and 552 rows
There are 552 and 552 unique values for 'variable' in the data and metadata files respectively.


In [640]:
# Per-column overview of the metadata: number of missing values and number of
# distinct (non-null) values.
#
# Both inputs are Series indexed by column name, so pandas aligns them by label
# rather than relying on parallel lists staying in the same order.  The index
# is named 'cols' so the table renders the same way as before, and no frame is
# mutated in place.
summary = pd.DataFrame(
    {
        'no_of_nulls': af_meta.isna().sum(),
        'no_of_unique': af_meta.nunique(),
    }
).rename_axis('cols')
summary

Unnamed: 0_level_0,no_of_nulls,no_of_unique
cols,Unnamed: 1_level_1,Unnamed: 2_level_1
country,0,1
variable,0,552
age,0,38
pop,0,4
countryname,0,1
shortname,0,136
simpledes,342,32
technicaldes,414,35
shorttype,0,14
longtype,0,14


In [641]:
# Attach the descriptive metadata to every observation.
#
# `validate='many_to_one'` makes pandas raise a MergeError if the metadata ever
# contains more than one row per (country, variable); without it such duplicates
# would silently multiply the data rows.
#
# Both frames also carry `age` and `pop` columns.  They are not join keys, so
# they come out of the merge as `age_x`/`age_y` and `pop_x`/`pop_y`.
df = pd.merge(
    left=af,
    right=af_meta,
    how='left',
    on=['country', 'variable'],
    validate='many_to_one',
)

In [642]:
# GDP rows, restricted to the series whose unit is '% of national income'.
gdp_columns = ['shortname', 'year', 'value', 'shortage', 'shorttype', 'longtype', 'unit']
is_gdp = df['shortname'].eq('Gross domestic product') & df['unit'].eq('% of national income')
gdp = df.loc[is_gdp, gdp_columns].copy()

# Population rows counted in individuals, all ages.
is_total_pop = (
    df['shortname'].eq('Population')
    & df['shortpop'].eq('individuals')
    & df['shortage'].eq('All Ages')
)
total_pop = df.loc[is_total_pop, ['year', 'value', 'shortname']]
total_pop.head(n=1)

Unnamed: 0,year,value,shortname
143943,1950,7480461.0,Population


In [653]:
# Line chart of the GDP series selected into `gdp`.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=gdp['year'],
        y=gdp['value']
    )
)

# `gdp` was filtered to rows whose unit is '% of national income', so the
# y-axis states that unit explicitly; the title alone does not convey it.
fig.update_layout(
    title='Gross Domestic Product over time',
    xaxis_title='Year',
    yaxis_title='GDP (% of national income)'
)

fig.show()

In [643]:
# Public-spending categories to compare.
shortname_list = [
    'Defense',
    'Economic affairs',
    'Education',
    'Environmental protection',
    'Health',
    'Housing and community amenities',
    'Public order and safety',
    'Recreation and culture',
    'Social protection'
]

cols_to_keep = ['year', 'value', 'shortname']

# Rows of type 'Total' (used by several selections below).
is_total_type = df['shorttype'] == 'Total'

# One row per (category, year) for the categories listed above.
is_listed_category = df['shortname'].isin(shortname_list)
a = df.loc[is_listed_category & is_total_type, cols_to_keep]

# Population counted in individuals, all ages.
is_total_pop = (
    (df['shortname'] == 'Population')
    & (df['shortpop'] == 'individuals')
    & (df['shortage'] == 'All Ages')
)
total_pop = df.loc[is_total_pop, ['year', 'value']]

# PPP conversion factor, with `value` renamed to say what it holds.
is_fx = df['shortname'] == 'PPP conversion factor, LCU per USD'
fx = df.loc[is_fx, ['year', 'value']].rename(columns={'value': 'local_currency_per_usd'})

# Total public spending excluding interest payments.
is_public_spending = df['shortname'] == 'Total Public Spending (excluding interest payment)'
public_spending = df.loc[is_public_spending & is_total_type, ['year', 'value']].rename(
    columns={'value': 'total_public_spending'}
)

In [658]:
# Stacked area chart with one band per spending category.
fig = go.Figure()

# The groupby key is already the category name, so it is used directly as the
# trace name instead of being recovered from the group's rows.
for name, group in a.groupby('shortname'):
    fig.add_trace(
        go.Scatter(
            x=group['year'],
            y=group['value'],
            name=name,
            mode='lines',
            stackgroup='one',
            # Normalises the stack so the bands are shown as percentages of
            # the stack total at each x position.
            groupnorm='percent'
        )
    )

fig.update_layout(
    title='Percentage of total public spending',
    xaxis_title='Year',
    yaxis_title='Share of stacked categories (%)'
)

fig.show()

In [646]:
# Per-year denominators (population and PPP conversion factor), built once
# instead of once per category.  Only `year` and `value` are taken from
# `total_pop`, so the merge gives the same columns whether or not the frame
# also carries a `shortname` column.
per_year = (
    total_pop[['year', 'value']]
    .rename(columns={'value': 'value_pop'})
    .merge(fx, how='inner', on='year')
)

# Column to plot.  Switch to 'per_capita_usd' for the PPP-converted series.
# NOTE(review): the axis labels assume `value` is in local currency units, as
# the column names suggest - confirm against the metadata `unit` column.
y_column = 'per_capita_local'
y_axis_titles = {
    'per_capita_local': 'Spending per capita (local currency)',
    'per_capita_usd': 'Spending per capita (USD, PPP)',
}

fig = go.Figure()

# The groupby key is the category name; using it directly also avoids an
# IndexError when the inner merge leaves a category with no rows.
for name, group in a.groupby('shortname'):
    # Inner joins keep only years that have both a population figure and a
    # conversion factor.
    data = group.merge(per_year, how='inner', on='year')
    data['per_capita_local'] = data['value'] / data['value_pop']
    data['per_capita_usd'] = data['per_capita_local'] / data['local_currency_per_usd']

    fig.add_trace(
        go.Scatter(
            x=data['year'],
            y=data[y_column],
            name=name,
            mode='lines+markers'
        )
    )

fig.update_layout(
    title='Per capita public spending',
    xaxis_title='Year',
    yaxis_title=y_axis_titles[y_column]
)

fig.show()

In [647]:
# List the distinct type descriptions and methods recorded for the
# 'Total Public Spending (excluding interest payment)' rows.
cols = ['shorttype', 'longtype', 'method']
is_public_spending = df['shortname'] == 'Total Public Spending (excluding interest payment)'
info = df.loc[is_public_spending, cols]

for column_name in cols:
    distinct_values = info[column_name].unique()
    print(distinct_values)

['Average' 'Total' 'Wealth-income ratio']
["Average income or wealth between two percentiles. When the associated percentile is of the form 'pX', intermediary average returns the average between percentile pX and the next consecutive percentile. When the associated percentile is of the form 'pXpY', the variable returns the average between percentiles pX and pY."
 "Macroeconomic variable (i.e. corresponding to national economy rather than to a given group of individuals). The associated percentile is of the form 'pall'."
 'Ratio of net wealth (of a given sector) to net national income']
[nan]


In [655]:
# Wealth-to-income ratio series to compare.
net_wealth_list = [
    'Net Public Wealth to Net National Income Ratio',
    'Net Personal Wealth to Net National Income Ratio',
    'Net national wealth to Net National Income Ratio',
    'Net Private Wealth to Net National Income Ratio'
]

is_net_wealth = df['shortname'].isin(net_wealth_list)
wealth = df.loc[is_net_wealth]

# List the distinct type descriptions and methods recorded for these series.
cols = ['shorttype', 'longtype', 'method']
for column_name in cols:
    distinct_values = wealth[column_name].unique()
    print(distinct_values)

['Wealth-income ratio']
['Ratio of net wealth (of a given sector) to net national income']
['Household wealth-to-income ratio defined as the ratio of household wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-world-inequality-database/[/URL_LINK][URL_TEXT]DINA Guidelines[/URL_TEXT][/URL] for details.'
 'Public wealth-to-income ratio defined as the ratio of government wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-world-inequality-database/[/URL_LINK][URL_TEXT]DINA Guidelines[/URL_TEXT][/URL] for details.'
 'National wealth-to-income ratio defined as the ratio of market-price national wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-w

In [657]:
# One line per wealth series selected into `wealth`.
fig = go.Figure()

for name, data in wealth.groupby('shortname'):
    fig.add_trace(
        go.Scatter(
            x=data['year'],
            y=data['value'],
            name=name
        )
    )

# Every selected series is a ratio of net wealth to net national income, so
# the title and y-axis say so; the figure previously had neither.
fig.update_layout(
    title='Net wealth to net national income ratios',
    xaxis_title='Year',
    yaxis_title='Ratio to net national income'
)

fig.show()