In [156]:
import json
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

# Make Plotly's "plotly_dark" template the default, so every figure created
# in this notebook uses it without setting a template per figure.
pio.templates.default = "plotly_dark"

In [157]:
# Reference table of WID countries, indexed by the `alpha2` country code.
countries_path = '../data/reference/WID_countries.csv'

# Treat only empty fields as missing. With pandas' default NA markers the
# literal code "NA" would be parsed as NaN and lose its place in the index;
# the other WID files in this notebook are read with the same two options.
countries_df = pd.read_csv(
    countries_path,
    sep=';',
    index_col='alpha2',
    keep_default_na=False,
    na_values=[''],
)
countries_df.head()

Unnamed: 0_level_0,titlename,shortname,region,region2
alpha2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AD,Andorra,Andorra,Europe,Western Europe
AE,the United Arab Emirates,United Arab Emirates,Asia,West Asia
AF,Afghanistan,Afghanistan,Asia,South Asia
AG,Antigua and Barbuda,Antigua and Barbuda,Americas,Caribbean
AI,Anguilla,Anguilla,Americas,Caribbean


In [158]:
# Two-letter code of the country whose WID extract is explored here; it is
# substituted into the data file name below.
country_code = 'CD'
country_path = "../data/unprocessed/WID_data_{}.csv".format(country_code)

# Only empty fields count as missing: pandas' default NA markers are switched
# off so literal strings such as "NA" are kept as text.
country = pd.read_csv(
    country_path,
    sep=';',
    na_values=[''],
    keep_default_na=False,
)
country.head()

Unnamed: 0,country,variable,percentile,year,value,age,pop
0,CD,ehfghgi999,p0p100,1980,19.747982,999,i
1,CD,ehfghgi999,p0p100,1981,17.861057,999,i
2,CD,ehfghgi999,p0p100,1982,19.622098,999,i
3,CD,ehfghgi999,p0p100,1983,20.360354,999,i
4,CD,ehfghgi999,p0p100,1984,15.113213,999,i


In [159]:
# Variable metadata for the same country. `age` and `pop` are dropped here
# because the observation file already carries columns with those names.
meta_path = "../data/unprocessed/WID_metadata_{}.csv".format(country_code)
meta = (
    pd.read_csv(meta_path, sep=';', na_values=[''], keep_default_na=False)
    .drop(columns=['age', 'pop'])
)
meta.head(1)

Unnamed: 0,country,variable,countryname,shortname,simpledes,technicaldes,shorttype,longtype,shortpop,longpop,shortage,longage,unit,source,method,extrapolation,data_points
0,CD,acitgri992,the DR Congo,Corporate income tax,,,Average,Average income or wealth between two percentil...,individuals,The base unit is the individual (rather than t...,Adults,The population is comprised of individuals ove...,CDF,,,,


In [150]:
# Per-column overview of the metadata: number of missing values and number
# of distinct (non-missing) values, indexed by column name.
summary = pd.DataFrame(
    {
        'no_of_nulls': meta.isna().sum(),
        'no_of_unique': meta.nunique(),
    }
).rename_axis('cols')
summary

Unnamed: 0_level_0,no_of_nulls,no_of_unique
cols,Unnamed: 1_level_1,Unnamed: 2_level_1
country,0,1
variable,0,552
countryname,0,1
shortname,0,136
simpledes,342,32
technicaldes,414,35
shorttype,0,14
longtype,0,14
shortpop,0,4
longpop,0,4


In [155]:
# Attach the variable metadata to every observation. The left join keeps all
# rows of `country`; any `meta` column whose name collides with a `country`
# column receives the '_x' suffix.
df = pd.merge(
    left=country,
    right=meta,
    how='left',
    on=['country', 'variable'],
    suffixes=('', '_x'),
).sort_values(by=['year', 'variable'])

# Quick look at a single variable's time series. A dedicated name is used
# instead of the scratch name `a`, which the public spending cells further
# down assign and read; sharing it breaks those cells when this one is re-run.
plot_variable = 'xlcusxi999'
plot_series = df[df['variable'] == plot_variable]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=plot_series['year'],
    y=plot_series['value'],
    name=plot_variable,
))
fig.update_layout(
    title=f"{plot_variable} over time",
    xaxis_title='year',
    yaxis_title='value',
)
fig.show()

In [127]:
# Create a summary of all unique variables in the dataset: one row per
# variable code, carrying the descriptive columns of that variable's first
# row in `df`, ordered by variable code.
summary_cols = ['variable', 'shortname', 'pop', 'shortpop', 'shortage', 'unit', 'shorttype', 'longtype']
var_df = (
    df.dropna(subset=['variable'])
    .drop_duplicates(subset='variable', keep='first')
    .sort_values(by='variable')
    .loc[:, summary_cols]
    .reset_index(drop=True)
)

# Collapse variables that share the same description, keeping the first code.
# The key previously listed 'shortage' twice and left out 'shorttype'; it now
# covers every descriptive column exactly once.
subset_cols = ['shortname', 'pop', 'shortpop', 'shortage', 'unit', 'shorttype', 'longtype']
var_df = var_df.drop_duplicates(subset=subset_cols, keep='first').copy()

In [129]:
def vars_type_dict(dataframe, shortname_list):
    """Group variable codes by `shorttype` for the requested short names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'variable', 'shortname' and 'shorttype'.
    shortname_list : list
        Short names to keep; compared for exact equality with 'shortname'.

    Returns
    -------
    dict
        Maps each `shorttype` value to the list of variable codes whose
        short name is in `shortname_list`. Codes appear in the order the
        ('variable', 'shortname') groups are iterated.
    """
    codes_by_type = {}

    for (code, shortname), rows in dataframe.groupby(['variable', 'shortname']):
        if shortname not in shortname_list:
            continue

        # Every row of a group describes the same variable, so the first
        # distinct `shorttype` is used as the bucket key.
        type_label = rows['shorttype'].unique()[0]
        codes_by_type.setdefault(type_label, []).append(code)

    return codes_by_type

In [130]:
# Short names of the public spending categories to analyse. Each entry is
# compared for exact equality with the `shortname` column by vars_type_dict,
# so spelling, case and punctuation must match the metadata.
public_spending_shortnames = [
    'Defense',
    'Economic affairs',
    'Education: Primary',
    'Education: Secondary',
    'Education: Tertiary',
    'Education',
    'Environmental protection',
    'General public services (excluding interest payments)',
    'Health',
    'Housing and community amenities',
    'Public order and safety',
    'Recreation and culture',
    'Social protection: social assistance in cash',
    'Social protection: social assistance in kind',
    'Social protection',
    'Social protection: social insurance',
    'Total Public Spending (excluding interest payment)'
    ]

# Map each `shorttype` found among these variables to its variable codes.
public_spending_dict = vars_type_dict(var_df, public_spending_shortnames)

In [131]:
# Short names of the non public spending variables of interest. Entries are
# compared for exact equality with the `shortname` column by vars_type_dict.
# NOTE(review): the two "footprint" entries end in two spaces - presumably
# this mirrors the raw metadata; confirm, otherwise they will never match.
other_shortnames = [
    'Gross domestic product',
    'Gross national income',
    'Net personal wealth',
    'National CO2 footprint  ',
    'National carbon footprint  ',
    'Inequality transparency index',
    'PPP conversion factor, LCU per USD',
    'Market exchange rate, LCU per USD'
]

# Map each `shorttype` found among these variables to its variable codes.
other_dict = vars_type_dict(var_df, other_shortnames)

In [132]:
# Flatten both {shorttype: [variable codes]} mappings into one list, with the
# "other" variables ahead of the public spending ones.
all_vars = [
    code
    for mapping in (other_dict, public_spending_dict)
    for codes in mapping.values()
    for code in codes
]

# Full list of variables to analyze (with descriptions): selected codes that
# are reported for all ages.
in_all_vars = var_df['variable'].isin(all_vars)
covers_all_ages = var_df['shortage'] == 'All Ages'
var_descriptions = var_df.loc[in_all_vars & covers_all_ages]

# CSV with the descriptions of the variables chosen for analysis.
var_descriptions.to_csv('../data/reference/variable_descriptions.csv', index=False)

# CSV holding only the variable codes, without the descriptive columns.
# NOTE: `vars` shadows the Python builtin; the name is kept because later
# cells read it.
vars = pd.Series(var_descriptions['variable'].unique())
vars.to_csv('../data/reference/variables_to_analyze.csv', header=['variable'], index=False)

# Variables that require currency conversion, i.e. those whose unit is 'AUD'.
vars_for_currency_conversion = var_descriptions.loc[var_descriptions['unit'] == 'AUD', 'variable']
vars_for_currency_conversion.to_csv('../data/reference/vars_for_currency_conversion.csv', header=['variable'], index=False)

# Persist the public spending mapping as indented JSON.
with open('../data/reference/public_spending_vars.json', 'w') as json_file:
    json_file.write(json.dumps(public_spending_dict, indent=4))

In [133]:
# Read the saved public spending mapping back from disk and display it.
with open('../data/reference/public_spending_vars.json', 'r') as json_file:
    my_dict = json.loads(json_file.read())

my_dict

{'Average': ['adefgei992',
  'adefgei999',
  'aecogei992',
  'aecogei999',
  'aedpgei992',
  'aedpgei999',
  'aedsgei992',
  'aedsgei999',
  'aedtgei992',
  'aedtgei999',
  'aedugei992',
  'aedugei999',
  'aenvgei992',
  'aenvgei999',
  'aexpgoi992',
  'aexpgoi999',
  'agpsgei992',
  'agpsgei999',
  'aheagei992',
  'aheagei999',
  'ahougei992',
  'ahougei999',
  'apolgei992',
  'apolgei999',
  'arecgei992',
  'arecgei999',
  'asacgei992',
  'asacgei999',
  'asakgei992',
  'asakgei999',
  'asopgei992',
  'asopgei999',
  'aspigei992',
  'aspigei999'],
 'Total': ['mdefgei999',
  'mecogei999',
  'medpgei999',
  'medsgei999',
  'medtgei999',
  'medugei999',
  'menvgei999',
  'mexpgoi999',
  'mgpsgei999',
  'mheagei999',
  'mhougei999',
  'mpolgei999',
  'mrecgei999',
  'msacgei999',
  'msakgei999',
  'msopgei999',
  'mspigei999'],
 'Wealth-income ratio': ['wdefgei999',
  'wecogei999',
  'wedpgei999',
  'wedsgei999',
  'wedtgei999',
  'wedugei999',
  'wenvgei999',
  'wexpgoi999',
  'wgpsgei9

In [134]:
# Preview the first rows of the selected variable descriptions.
var_descriptions.head()

Unnamed: 0,variable,shortname,pop,shortpop,shortage,unit,shorttype,longtype
77,adefgei999,Defense,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...
83,aecogei999,Economic affairs,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...
87,aedpgei999,Education: Primary,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...
89,aedsgei999,Education: Secondary,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...
91,aedtgei999,Education: Tertiary,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...


In [135]:
# Keep only the observations whose variable code is on the analysis list;
# `.copy()` detaches the result from `df`.
on_analysis_list = df['variable'].isin(vars)
final_df = df.loc[on_analysis_list].copy()

In [136]:
# Variables on the analysis list that are 'Average' figures with unit 'AUD'.
per_capita_mask = (
    var_df['variable'].isin(vars.values)
    & (var_df['unit'] == 'AUD')
    & (var_df['shorttype'] == 'Average')
)
per_capita = var_df[per_capita_mask]

# Show the entry for net personal wealth.
per_capita.loc[per_capita['shortname'] == 'Net personal wealth']

Unnamed: 0,variable,shortname,pop,shortpop,shortage,unit,shorttype,longtype
210,ahweali999,Net personal wealth,i,individuals,All Ages,AUD,Average,Average income or wealth between two percentil...


In [137]:
# Write the variable codes that must be converted from local currency to USD
# during processing, one CSV per `shorttype`.
aud_rows = final_df.loc[final_df['unit'] == 'AUD']

avg_codes = aud_rows.loc[aud_rows['shorttype'] == 'Average', 'variable'].unique()
vars_for_currency_conversion_avg = pd.Series(avg_codes)
vars_for_currency_conversion_avg.to_csv('../data/reference/vars_for_currency_conversion_avg.csv', header=['variable'], index=False)

total_codes = aud_rows.loc[aud_rows['shorttype'] == 'Total', 'variable'].unique()
vars_for_currency_conversion_total = pd.Series(total_codes)
vars_for_currency_conversion_total.to_csv('../data/reference/vars_for_currency_conversion_total.csv', header=['variable'], index=False)

# Display which `shorttype` values occur among the 'AUD' rows.
aud_rows['shorttype'].unique()

array(['Average', 'Total'], dtype=object)

In [138]:
# GDP rows whose unit is "% of national income".
gdp_cols = ['shortname', 'year', 'value', 'shortage', 'shorttype', 'longtype', 'unit']
is_gdp_share = (df['shortname'] == 'Gross domestic product') & (df['unit'] == '% of national income')
gdp = df.loc[is_gdp_share, gdp_cols].copy()

# Population rows for individuals of all ages.
is_total_pop = (
    (df['shortname'] == 'Population')
    & (df['shortpop'] == 'individuals')
    & (df['shortage'] == 'All Ages')
)
total_pop = df.loc[is_total_pop, ['year', 'value', 'shortname']]
total_pop.head(1)

Unnamed: 0,year,value,shortname
419920,1820,331000.0,Population


In [139]:
# Line chart of the `gdp` series: year on x, value on y.
gdp_trace = go.Scatter(x=gdp['year'], y=gdp['value'])

fig = go.Figure(data=[gdp_trace])
fig.update_layout(title='Gross Domestic Product over time')
fig.show()

In [140]:
# Top-level public spending categories shown in the charts below.
shortname_list = [
    'Defense',
    'Economic affairs',
    'Education',
    'Environmental protection',
    'Health',
    'Housing and community amenities',
    'Public order and safety',
    'Recreation and culture',
    'Social protection',
]
cols_to_keep = ['year', 'value', 'shortname', 'shorttype', 'longtype', 'variable']

# 'Total' rows for the categories above.
is_category_total = df['shortname'].isin(shortname_list) & (df['shorttype'] == 'Total')
a = df.loc[is_category_total, cols_to_keep]

# Population rows for individuals of all ages.
is_total_pop = (
    (df['shortname'] == 'Population')
    & (df['shortpop'] == 'individuals')
    & (df['shortage'] == 'All Ages')
)
total_pop = df.loc[is_total_pop, ['year', 'value']]

# PPP conversion factor per year, with `value` renamed for later merges.
is_ppp_factor = df['shortname'] == 'PPP conversion factor, LCU per USD'
fx = df.loc[is_ppp_factor, ['year', 'value']].rename(columns={'value': 'local_currency_per_usd'})

# Total public spending per year, with `value` renamed for plotting.
is_total_spending = (
    (df['shortname'] == 'Total Public Spending (excluding interest payment)')
    & (df['shorttype'] == 'Total')
)
public_spending = df.loc[is_total_spending, ['year', 'value']].rename(columns={'value': 'total_public_spending'})

In [141]:
# Line chart of total public spending per year.
spending_trace = go.Scatter(
    x=public_spending['year'],
    y=public_spending['total_public_spending'],
)

fig = go.Figure(data=[spending_trace])
fig.update_layout(title='Total public spending')
fig.show()

In [142]:
# Stacked area chart, normalised to percent, with one band per category.
fig = go.Figure()

for category, rows in a.groupby('shortname'):
    # When a category is reported under several variable codes, plot only the
    # first one so each year contributes a single value.
    codes = rows['variable'].unique()
    if len(codes) > 1:
        rows = rows[rows['variable'] == codes[0]]

    share_trace = go.Scatter(
        x=rows['year'],
        y=rows['value'],
        name=category,
        mode='lines',
        stackgroup='one',
        groupnorm='percent',
    )
    fig.add_trace(share_trace)

fig.update_layout(title='Share of public spending')
fig.show()

In [143]:
# One line per spending category: category value divided by total population.
fig = go.Figure()

for name, group in a.groupby('shortname'):
    # When a category is reported under several variable codes, keep only the
    # first one so each year contributes a single value.
    codes = group['variable'].unique()
    if len(codes) > 1:
        group = group[group['variable'] == codes[0]]

    # Inner joins keep only the years that have both a population figure and
    # a conversion factor.
    data = (
        group
        .merge(total_pop, how='inner', on='year', suffixes=('', '_pop'))
        .merge(fx, how='inner', on='year')
    )
    if data.empty:
        # No overlapping years for this category: nothing to plot. Reading the
        # trace name from the merged rows used to raise IndexError here, so
        # the groupby key is used as the name instead.
        continue

    data['per_capita_local'] = data['value'] / data['value_pop']
    # Computed for inspection only; the chart below plots the local figure.
    data['per_capita_usd'] = data['per_capita_local'] / data['local_currency_per_usd']

    fig.add_trace(
        go.Scatter(
            x=data['year'],
            y=data['per_capita_local'],
            name=name,
            mode='lines+markers'
        )
    )

fig.update_layout(
    title='Per capita public spending'
)

fig.show()

In [144]:
# Variable codes plotted in this chart.
variables = ['wpwdebi999', 'wpweali999']

is_requested = df['variable'].isin(variables)
private_debt_wealth = df.loc[is_requested].copy()

# One trace per variable code, labelled with the variable's short name.
traces = [
    go.Scatter(
        x=series['year'],
        y=series['value'],
        name=series['shortname'].unique()[0],
    )
    for _, series in private_debt_wealth.groupby('variable')
]

fig = go.Figure(data=traces)
fig.update_layout(title='Private debt and private wealth as a % of national income')
fig.show()

In [164]:
# Resolve the data directories relative to the current working directory.
# `os` is imported with the other modules at the top of the notebook.
script_path = os.getcwd()
data_path = os.path.abspath(os.path.join(script_path, "../data"))
processed_path = os.path.join(data_path, "processed")
reference_path = os.path.join(data_path, "reference")
aggregated_path = os.path.join(data_path, "aggregated")

# Load the saved variable descriptions under their own name. Rebinding `df`
# here would replace the merged observation frame that the earlier cells
# read, so re-running any of them afterwards would fail.
var_descriptions_from_csv = pd.read_csv(os.path.join(reference_path, "variable_descriptions.csv"))

# Convert to a list of dictionaries (one per row) and show the available keys.
list_of_dicts = var_descriptions_from_csv.to_dict(orient='records')
list_of_dicts[0].keys()

dict_keys(['variable', 'shortname', 'pop', 'shortpop', 'shortage', 'unit', 'shorttype', 'longtype'])