In [339]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import os

# Make "plotly_dark" the default template for figures created in this notebook,
# so individual plotting cells do not have to pass a template themselves.
pio.templates.default = "plotly_dark"

In [340]:
# Locate the data directory relative to the current working directory
# (the folder the kernel was started in), then derive its sub-folders.
notebook_path = os.getcwd()
data_path = os.path.abspath(os.path.join(notebook_path, "../data"))

# One sub-folder per stage of the data, all direct children of data_path.
processed_path, unprocessed_path, aggregated_path = (
    os.path.join(data_path, stage_dir)
    for stage_dir in ("processed", "unprocessed", "aggregated")
)

### Topics to analyse:
- Total per capita public spending
- Total per capita private wealth
- Total per capita private debt
- Per capita public spending breakdown across categories - health, education etc.
- GDP per capita and total
- CO2 and carbon footprint totals, per capita, per GDP etc.
- Correlations between variables - debt vs. wealth, population vs. public spending, population vs. national income etc.
### Other questions to ask:
- Which countries spend the most on defense?
- What is the trend in environmental protection spending and how does it correlate with CO2 / carbon footprint?
- Which regions spend more per capita on the various public spending categories?
### Process:
- Aggregate each CSV by 'variable'
### Other notes:
- All currency based variables where 'shorttype' == 'Average' refer to per capita values
- All currency based variables where 'shorttype' == 'Wealth-income ratio' refer to the ratio of variable value to net national income

In [341]:
# Expected dtype of each column in the aggregated CSVs. It is passed to
# pd.read_csv below so column types are explicit instead of inferred
# (e.g. 'age' is kept as a string code rather than parsed as a number).
# Columns not listed here (such as 'value_pct_national_income') are still inferred.
schema = {
    'country': 'str',
    'variable': 'str',
    'year': 'int',
    'value': 'float',
    'age': 'str',
    'pop': 'str',
    'countryname': 'str',
    'shortname': 'str',
    'shorttype': 'str',
    'shortpop': 'str',
    'shortage': 'str',
    'unit': 'str',
    'source': 'str',
    'method': 'str',
    'value_usd': 'float',
    'value_ppp': 'float',
    'region': 'str',
    'subregion': 'str',
    'value_usd_per_capita': 'float'
}

# Build the path with os.path.join, matching how the directory paths are built.
defense_path = os.path.join(aggregated_path, "Defense.csv")

# Keep the unfiltered load under its own name so re-running this cell always
# starts from the file contents rather than from an already-filtered frame.
# NOTE(review): read_csv treats the literal text "NA" as missing by default; if
# the 'country' column uses "NA" as a country code, those rows load as NaN - confirm.
defense_raw = pd.read_csv(defense_path, dtype=schema)

# Drop country code 'RS' and the country named 'Venezuela' from the analysis.
# TODO: the reason for these exclusions is not recorded in this notebook - document it.
is_excluded = (defense_raw['country'] == 'RS') | (defense_raw['countryname'] == 'Venezuela')
defense = defense_raw[~is_excluded]
defense.head()

Unnamed: 0,country,variable,year,value,age,pop,countryname,shortname,shorttype,shortpop,shortage,unit,source,method,value_usd,value_ppp,region,subregion,value_usd_per_capita,value_pct_national_income
0,AD,mdefgei999,1980,20775878.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,48210920.0,62682720.0,Europe,Western Europe,1353.793551,0.017633
1,AD,mdefgei999,1981,21179162.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,38169910.0,62256330.0,Europe,Western Europe,1031.961998,0.017831
2,AD,mdefgei999,1982,21531746.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,32610670.0,59166920.0,Europe,Western Europe,844.809893,0.017519
3,AD,mdefgei999,1983,21826198.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,25319500.0,55705050.0,Europe,Western Europe,626.195461,0.016933
4,AD,mdefgei999,1984,22243598.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,23021900.0,53054230.0,Europe,Western Europe,545.750137,0.015956


In [342]:
# How each defense measure is combined within a group: the two monetary
# totals are summed, the share-of-national-income column is averaged
# (a plain, unweighted mean over the rows in the group).
_defense_agg_spec = {
    'value_ppp': 'sum',
    'value_usd': 'sum',
    'value_pct_national_income': 'mean'
}


def _summarise_defense_by(level):
    """Aggregate `defense` per year and per `level` (a column name).

    Returns a flat DataFrame with columns 'year', `level`, and one column
    per measure in `_defense_agg_spec`.
    """
    per_group = defense.groupby(['year', level])
    measures = per_group[list(_defense_agg_spec)]
    return measures.agg(_defense_agg_spec).reset_index()


defense_region = _summarise_defense_by('region')

defense_country = _summarise_defense_by('countryname')

In [343]:
# Rank the country/year rows from largest to smallest PPP defense total.
# NOTE(review): the saved output of this cell shows totals around 1e22 for
# "the DR Congo" in the early 1980s and negative totals for Sri Lanka in the
# 1970s - these look like data issues worth checking before interpreting.
defense_country_by_ppp = defense_country.sort_values(by='value_ppp', ascending=False)
defense_country_by_ppp

Unnamed: 0,year,countryname,value_ppp,value_usd,value_pct_national_income
821,1981,the DR Congo,5.743919e+22,2.847493e+22,0.005754
567,1980,the DR Congo,4.905420e+22,3.082025e+22,0.004054
1075,1982,the DR Congo,4.496838e+22,2.103483e+22,0.005612
1583,1984,the DR Congo,1.914661e+22,3.965043e+21,0.006141
2091,1986,the DR Congo,1.604569e+22,3.709436e+21,0.009074
...,...,...,...,...,...
251,1977,Sri Lanka,-1.086128e+10,-4.784429e+09,
287,1978,Sri Lanka,-1.110823e+10,-2.882972e+09,
32,1970,Sri Lanka,-1.128594e+10,-5.546447e+09,
216,1976,Sri Lanka,-1.169653e+10,-5.035536e+09,


In [344]:
# Distribution of the yearly regional mean of defense spending as a share of
# national income, one box per region (regions are separated via `color`).
fig = px.box(
    defense_region,
    x='value_pct_national_income',
    color='region',
)

fig.show()

In [345]:
# Same distribution as a graph_objects figure: one Box trace per region.
fig = go.Figure()

for region, data in defense_region.groupby('region'):

    fig.add_trace(
        go.Box(
            x=data['value_pct_national_income'],
            # Label the trace with its region; without a name the legend
            # falls back to generic "trace N" labels.
            name=region,
        )
    )

fig.show()

In [346]:
# Time series of the regional mean share of national income spent on defense:
# one Scatter trace per region, named after the region for the legend.
region_traces = [
    go.Scatter(
        x=region_rows['year'],
        y=region_rows['value_pct_national_income'],
        name=region_name
    )
    for region_name, region_rows in defense_region.groupby('region')
]

fig = go.Figure(data=region_traces)

fig.show()