In [452]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import os

# Set the default template to dark
pio.templates.default = "plotly_dark"

In [453]:
notebook_path = os.getcwd()
data_path = os.path.abspath(os.path.join(notebook_path, "../data"))
processed_path = os.path.join(data_path, "processed")
unprocessed_path = os.path.join(data_path, "unprocessed")
aggregated_path = os.path.join(data_path, "aggregated")

### Topics to analyse:
- Total per capita public spending
- Total per capita private wealth
- Total per capita private debt
- Per capita public spending breakdown across categories - health, education etc.
- GDP per capita and total
- CO2 and carbon footprint totals, per capita, per gdp etc.
- Correlations between variable - debt vs. wealth, population vs. public spending, population vs. national income etc.
### Other questions to ask:
- Which countries spend the most on defense?
- What is the trend in environmental protection spending and how does it correlate with CO2 / carbon footprint?
- Which regions spend more per capita on the various public spending categories?
### Process:
- Aggregate each CSV by 'variable'
### Other notes:
- All currency based variables where 'shorttype' == 'Average' refer to per capita values
- All currency based variables where 'shorttype' == 'Wealth-income ratio' refer to the ratio of variable value to net national income

In [454]:
schema = {
    'country': 'str',
    'variable': 'str',
    'year': 'int',
    'value': 'float',
    'age': 'str',
    'pop': 'str',
    'countryname': 'str',
    'shortname': 'str',
    'shorttype': 'str',
    'shortpop': 'str',
    'shortage': 'str',
    'unit': 'str',
    'source': 'str',
    'method': 'str',
    'value_usd': 'float',
    'value_ppp': 'float',
    'region': 'str',
    'subregion': 'str',
    'value_usd_per_capita': 'float'
}

defense_path = f"{aggregated_path}/Defense.csv"
defense = pd.read_csv(defense_path)
defense = defense[(defense['country'] != 'RS') & (defense['countryname'] != 'Venezuela')]
defense.head(1)

Unnamed: 0,country,variable,year,value,age,pop,countryname,shortname,shorttype,shortpop,shortage,unit,source,method,value_usd,value_ppp,region,subregion,value_usd_per_capita,value_pct_national_income
0,AD,mdefgei999,1980,20775878.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,48210920.0,62682720.0,Europe,Western Europe,1353.793551,0.017633


In [455]:
my_dict = {
    'one': 1,
    'two': 2
}

my_dict.keys()

dict_keys(['one', 'two'])

In [456]:
def group_dataframe(df, indexes):

    agg_dict = {
        'value_ppp': 'sum',
        'value_usd': 'sum',
        'value_usd_per_capita': 'mean',
        'value_pct_national_income': 'mean'
    }

    return df.groupby(by=indexes)[list(agg_dict.keys())].agg(agg_dict).reset_index()

defense_region = group_dataframe(defense, ['year', 'country', 'region'])
defense_country = group_dataframe(defense, ['year', 'country', 'countryname', 'region'])

In [457]:
def filter_dataframe(df, year, column, count=20, subset='top'):
    if subset == 'top':
        return df[df['year'] == year].sort_values(by=column, ascending=False).head(count)
    else:
        return df[df['year'] == year].sort_values(by=column, ascending=True).head(count)

In [458]:
defense_2023_country_top20_pct = filter_dataframe(defense_country, 2023, 'value_pct_national_income')
defense_2023_country_top20_pct_avg = defense_2023_country_top20_pct['value_pct_national_income'].mean()

defense_2023_country_top20_usd_pc = filter_dataframe(defense_country, 2023, 'value_usd_per_capita')
defense_2023_country_top20_usd_pc_avg = defense_2023_country_top20_usd_pc['value_usd_per_capita'].mean()

defense_2023_country_top20_usd_total = filter_dataframe(defense_country, 2023, 'value_usd')
defense_2023_country_top20_usd_total_avg = defense_2023_country_top20_usd_total['value_usd'].mean()

defense_2023_country_top20_ppp_total = filter_dataframe(defense_country, 2023, 'value_ppp')
defense_2023_country_top20_ppp_total_avg = defense_2023_country_top20_ppp_total['value_ppp'].mean()

In [459]:
fig = go.Figure()

year = 2023

for region, data in defense_region.groupby('region'):

    data = data[data['year'] == year]

    fig.add_trace(
        go.Box(
            x=data['value_pct_national_income'],
            name=region
        )
    )

    fig.update_layout(
        title=f"Distribution of Defense spend by region in {year}"
    )

fig.show()

In [463]:
a = defense_region.groupby('region')
a.get_group('Africa')

Unnamed: 0,year,country,region,value_ppp,value_usd,value_usd_per_capita,value_pct_national_income
23,1970,KE,Africa,8.415680e+09,3.514152e+09,3.062919e+02,
26,1970,LY,Africa,5.999305e+10,2.383610e+10,1.248492e+04,
33,1970,SD,Africa,3.584199e+14,3.407100e+14,3.013756e+07,
35,1970,SL,Africa,5.707572e+11,2.660315e+11,9.576004e+04,
36,1970,TG,Africa,1.092707e+08,3.947343e+07,1.796398e+01,
...,...,...,...,...,...,...,...
9698,2023,TZ,Africa,3.056793e+09,9.585424e+08,1.503954e+01,0.014745
9700,2023,UG,Africa,2.345393e+09,7.956963e+08,1.637830e+01,0.016537
9710,2023,ZA,Africa,4.616861e+09,1.785914e+09,2.984003e+01,0.005657
9711,2023,ZM,Africa,8.052488e+08,3.066404e+08,1.490835e+01,0.010257


In [464]:
fig = go.Figure()

for region, data in defense_region.groupby('region'):
    subset = data.groupby('year')['value_usd_per_capita'].mean().reset_index(name='value_usd_per_capita')

    fig.add_trace(
        go.Scatter(
            x=subset['year'],
            y=subset['value_usd_per_capita'],
            name=region
        )
    )

fig.show()