In [328]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np

# Set the default template to dark
pio.templates.default = "plotly_dark"

In [329]:
countries_path = '../data/unprocessed/WID_countries.csv'
countries_df = pd.read_csv(filepath_or_buffer=countries_path, delimiter=';')
#countries_df['region2'].unique()
countries_df[countries_df['region2'].isna()]

Unnamed: 0,alpha2,titlename,shortname,region,region2
45,CN-RU,rural China,Rural China,,
46,CN-UR,urban China,Urban China,,
57,DE-BD,Baden,Baden,,
58,DE-BY,Bavaria,Bavaria,,
59,DE-HB,Bremen,Bremen,,
...,...,...,...,...,...
377,XQ-MER,,,,
378,XR,Russia & Central Asia,Russia & Central Asia,,
379,XR-MER,Russia & Central Asia (at market exchange rate),Russia & Central Asia (at market exchange rate),,
380,XS,South & South-East Asia,South & South-East Asia,,


In [330]:
country_code = 'AU'
country_path = f"../data/unprocessed/WID_data_{country_code}.csv"
country = pd.read_csv(filepath_or_buffer=country_path, delimiter=';')
country.head(1)

Unnamed: 0,country,variable,percentile,year,value,age,pop
0,AU,ehfcari999,p0p100,1990,100.573195,999,i


In [331]:
meta_path = f"../data/unprocessed/WID_metadata_{country_code}.csv"
meta = pd.read_csv(meta_path, delimiter=';').drop(columns=['age', 'pop'])
meta.head(1)

Unnamed: 0,country,variable,countryname,shortname,simpledes,technicaldes,shorttype,longtype,shortpop,longpop,shortage,longage,unit,source,method,extrapolation,data_points
0,AU,accmhni992,Australia,Consumption of fixed capital attributable to m...,,,Average,Average income or wealth between two percentil...,individuals,The base unit is the individual (rather than t...,Adults,The population is comprised of individuals ove...,AUD,See [URL][URL_LINK]https://wid.world/document/...,WID.world estimations as a proportion of GDP b...,,


In [332]:
print(f"The data file contains {country.shape[1]} columns and {country.shape[0]} rows")
print(f"The metadata file contains {meta.shape[1]} columns and {meta.shape[0]} rows")
print(f"There are {country.variable.nunique()} and {meta.variable.nunique()} unique values for 'variable' in the data and metadata files respectively.")

The data file contains 7 columns and 521651 rows
The metadata file contains 17 columns and 1138 rows
There are 1138 and 1138 unique values for 'variable' in the data and metadata files respectively.


In [333]:
cols = meta.columns
nulls = [val for val in meta.isna().sum().values]
unique_vals = [meta[col].nunique() for col in meta.columns]

summary_dict = {
    'cols': cols,
    'no_of_nulls': nulls,
    'no_of_unique': unique_vals
}

summary = pd.DataFrame(summary_dict)
summary.set_index('cols', inplace=True)
summary

Unnamed: 0_level_0,no_of_nulls,no_of_unique
cols,Unnamed: 1_level_1,Unnamed: 2_level_1
country,0,1
variable,0,1138
countryname,0,1
shortname,0,286
simpledes,632,105
technicaldes,735,107
shorttype,0,14
longtype,0,14
shortpop,0,5
longpop,0,5


In [334]:
df = pd.merge(left=country, right=meta, how='left', on=['country', 'variable'], suffixes=('', '_x'))
df.nunique()

country               1
variable           1138
percentile          389
year                127
value            290527
age                  38
pop                   5
countryname           1
shortname           286
simpledes           105
technicaldes        107
shorttype            14
longtype             14
shortpop              5
longpop               5
shortage             38
longage              38
unit                  8
source               32
method               49
extrapolation         8
data_points           1
dtype: int64

In [335]:
ls = []
for variable, data in df.groupby('variable'):
    summary = {
        'variable': variable,
        'shortname': data['shortname'].unique()[0],
        'pop': data['pop'].unique()[0],
        'shortpop': data['shortpop'].unique()[0],
        'shortage': data['shortage'].unique()[0],
        'unit': data['unit'].unique()[0],
        'shorttype': data['shorttype'].unique()[0],
        'longtype': data['longtype'].unique()[0]
    }
    ls.append(summary)

var_df = pd.DataFrame(ls)

vars_of_interest = [
    'wpwdebi999',
    'wpweali999',
    'wexpgoi999',
    'wgdproi999',
    'mgninci999',
    'agninci999',
    'agdproi999',
    'mgdproi999',
    'enfcari999',
    'enfghgi999',
    'knfcari999',
    'knfghgi999',
    'iqualii999',
    'xlcuspi999',
    'xlcusxi999',
    'npopuli999',
    'apwdebi999',
    'mpwdebi999'
]


var_descriptions = var_df[(var_df['variable'].isin(vars_of_interest)) | ((var_df['variable'].str.contains('gei', case=False)) & (var_df['shortage'] == 'All Ages'))]
var_descriptions.to_csv('../data/reference/variable_descriptions.csv', index=False)
vars = pd.Series(var_descriptions['variable'].unique())
vars.to_csv('../data/reference/variables_to_analyze.csv', header=['variable'], index=False)
final_df = df[df['variable'].isin(vars)].copy()

In [336]:
vars_for_currency_conversion_avg = pd.Series(final_df[(final_df['unit'] == 'AUD') & (final_df['shorttype'] == 'Average')]['variable'].unique())
vars_for_currency_conversion_avg.to_csv('../data/reference/vars_for_currency_conversion_avg.csv', header=['variable'], index=False)

vars_for_currency_conversion_total = pd.Series(final_df[(final_df['unit'] == 'AUD') & (final_df['shorttype'] == 'Total')]['variable'].unique())
vars_for_currency_conversion_total.to_csv('../data/reference/vars_for_currency_conversion_total.csv', header=['variable'], index=False)

final_df[final_df['unit'] == 'AUD']['shorttype'].unique()

array(['Average', 'Total'], dtype=object)

In [337]:
gdp = df[(df['shortname'] == 'Gross domestic product') & (df['unit'] == '% of national income')][['shortname', 'year', 'value', 'shortage', 'shorttype', 'longtype', 'unit']].copy()
total_pop = df[(df['shortname'] == 'Population') & (df['shortpop'] == 'individuals') & (df['shortage'] == 'All Ages')][['year', 'value', 'shortname']]
total_pop.head(1)

Unnamed: 0,year,value,shortname
419920,1820,331000.0,Population


In [338]:
fig = go.Figure()

fig.add_trace(
        go.Scatter(
        x=gdp['year'],
        y=gdp['value']
        )
    )

fig.update_layout(
    title='Gross Domestic Product over time'
)

fig.show()

In [339]:
shortname_list = [
    'Defense',
    'Economic affairs',
    'Education',
    'Environmental protection',
    'Health',
    'Housing and community amenities',
    'Public order and safety',
    'Recreation and culture',
    'Social protection'
]

cols_to_keep = ['year', 'value', 'shortname', 'shorttype', 'longtype', 'variable']

a = df[(df['shortname'].isin(shortname_list)) & (df['shorttype'] == 'Total')][cols_to_keep]

total_pop = df[(df['shortname'] == 'Population') & (df['shortpop'] == 'individuals') & (df['shortage'] == 'All Ages')][['year', 'value']]
fx = df[df['shortname'] == 'PPP conversion factor, LCU per USD'][['year', 'value']].rename(columns={'value': 'local_currency_per_usd'})
public_spending = df.loc[(df['shortname'] == 'Total Public Spending (excluding interest payment)') & (df['shorttype'] == 'Total')][['year', 'value']].rename(columns={'value': 'total_public_spending'})

In [340]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=public_spending['year'],
    y=public_spending['total_public_spending']
))

fig.update_layout(
    title='Total public spending'
)

fig.show()

In [341]:
fig = go.Figure()

for _, group in a.groupby('shortname'):

    if len(group['variable'].unique()) > 1:
        group = group[group['variable'] == group['variable'].unique()[0]]

    name = group['shortname'].unique()[0]
    fig.add_trace(
        go.Scatter(
            x=group['year'],
            y=group['value'],
            name=name,
            mode='lines',
            stackgroup='one',
            groupnorm='percent'
        )
    )

fig.update_layout(
    title='Share of public spending'
)

fig.show()

In [342]:
fig = go.Figure()

for _, group in a.groupby('shortname'):

    if len(group['variable'].unique()) > 1:
        group = group[group['variable'] == group['variable'].unique()[0]]
        
    data = group.merge(total_pop, how='inner', on='year', suffixes=('', '_pop')).merge(fx, how='inner', on='year')
    data['per_capita_local'] = data['value'] / data['value_pop']
    data['per_capita_usd'] = data['per_capita_local'] / data['local_currency_per_usd']

    name = data['shortname'].unique()[0]
    fig.add_trace(
        go.Scatter(
            x=data['year'],
            y=data['per_capita_local'],
            name=name,
            mode='lines+markers'
        )
    )

fig.update_layout(
    title='Per capita public spending'
)

fig.show()

In [355]:
net_wealth_list = [
    'Net Public Wealth to Net National Income Ratio',
    'Net Personal Wealth to Net National Income Ratio',
    'Net national wealth to Net National Income Ratio',
    'Net Private Wealth to Net National Income Ratio'
]

wealth = df[df['shortname'].isin(net_wealth_list)]

cols = ['shorttype', 'longtype', 'method', 'variable']
for col in cols:
    print(wealth[col].unique())

['Wealth-income ratio']
['Ratio of net wealth (of a given sector) to net national income']
['Household wealth-to-income ratio defined as the ratio of household wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-world-inequality-database/[/URL_LINK][URL_TEXT]DINA Guidelines[/URL_TEXT][/URL] for details.'
 'Public wealth-to-income ratio defined as the ratio of government wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-world-inequality-database/[/URL_LINK][URL_TEXT]DINA Guidelines[/URL_TEXT][/URL] for details.'
 'National wealth-to-income ratio defined as the ratio of market-price national wealth to market-price national income. See [URL][URL_LINK]https://wid.world/document/distributional-national-accounts-guidelines-2020-concepts-and-methods-used-in-the-w

In [344]:
fig = go.Figure()

for name, data in wealth.groupby('shortname'):
    fig.add_trace(
        go.Scatter(
            x=data['year'],
            y=data['value'],
            name=name
        )
    )

fig.show()

In [345]:
variables = [
    'wpwdebi999',
    'wpweali999'
]

private_debt_wealth = df.loc[df['variable'].isin(variables)].copy()

fig = go.Figure()

for _, group in private_debt_wealth.groupby('variable'):
    fig.add_trace(
        go.Scatter(
            x=group['year'],
            y=group['value'],
            name=group['shortname'].unique()[0]
        )
    )

fig.update_layout(
    title='Private debt and private wealth as a % of national income'
)

fig.show()

In [360]:
wealth = [
    'Net public wealth',
    'Net personal wealth',
    'Net private wealth'
]

df[(df['shortname'].isin(wealth)) & (df['shortage'] == 'All Ages')]['shorttype'].unique()
df[(df['shortname'].isin(wealth)) & (df['shortage'] == 'Adults')]['shorttype'].unique()
#df[df['shortname'].isin(wealth)]['shortage'].unique()
#df[df['shortname'] == 'Net private wealth']['simpledes'].unique()

array(['Average', 'Beta coefficient', 'Gini coefficient',
       'Top 10/Bottom 50 ratio', 'Share', 'Threshold'], dtype=object)

In [365]:
df_list = []

for name, data in df.groupby('shortname'):
    if np.all(np.isin(['Average', 'Total'], data['shorttype'].unique())):
        print(name)
    shorttype_avg = data[data['shorttype'] == 'Average'][['shortname', 'year', 'value']]
    shorttype_total = data[data['shorttype'] == 'Total']
    combined = pd.merge(left=shorttype_total, right=shorttype_avg, on=['shortname', 'year'], how='left')
    df_list.append(combined)

df_list[0]

Book value of corporations
Book-value national wealth
Capital Account
Capital transfers paid to  the rest of the world
Capital transfers received from the rest of the world
Collective consumption expenditure
Compensation of employees 
Compensation of employees paid from the rest of the world
Compensation of employees received from the rest of the world
Consumption of fixed capital
Consumption of fixed capital attributable to mixed income
Consumption of fixed capital attributable to operating surplus
Consumption of fixed capital of corporations
Consumption of fixed capital of financial coporations
Consumption of fixed capital of households and NPISH
Consumption of fixed capital of non-financial coporations
Consumption of fixed capital of the general goverment
Corporate agricultural land
Corporate bonds & loans
Corporate business and other non-financial assets 
Corporate currency & deposits
Corporate currency, deposits, bonds & loans
Corporate debt (non-equity liability)
Corporate dwelli

Unnamed: 0,country,variable,percentile,year,value_x,age,pop,countryname,shortname,simpledes,...,shortpop,longpop,shortage,longage,unit,source,method,extrapolation,data_points,value_y
0,AU,mcwbooi999,p0p100,1989,1.491867e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,127768.5
1,AU,mcwbooi999,p0p100,1989,1.491867e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,88725.5
2,AU,mcwbooi999,p0p100,1990,1.432512e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,120301.0
3,AU,mcwbooi999,p0p100,1990,1.432512e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,83943.8
4,AU,mcwbooi999,p0p100,1991,1.389266e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,114489.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,AU,mcwbooi999,p0p100,2020,4.633597e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,180462.0
64,AU,mcwbooi999,p0p100,2021,4.852993e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,246658.8
65,AU,mcwbooi999,p0p100,2021,4.852993e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,187176.2
66,AU,mcwbooi999,p0p100,2022,5.028304e+12,999,i,Australia,Book value of corporations,The corporate sector - in the national accoun...,...,individuals,The base unit is the individual (rather than t...,All Ages,The population is comprised of individuals of ...,AUD,[URL][URL_LINK]http://wid.world/document/revis...,,,,252604.9
