In [580]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import os

# Make "plotly_dark" the default template so figures created below pick it up
# without having to pass a template explicitly.
pio.templates.default = "plotly_dark"

In [581]:
# Locate the data folders relative to the directory the notebook is run from.
notebook_path = os.getcwd()
data_path = os.path.abspath(os.path.join(notebook_path, "../data"))

# The three sub-folders all sit directly under data_path.
processed_path, unprocessed_path, aggregated_path = (
    os.path.join(data_path, folder)
    for folder in ("processed", "unprocessed", "aggregated")
)

### Topics to analyse:
- Total per capita public spending
- Total per capita private wealth
- Total per capita private debt
- Per capita public spending breakdown across categories - health, education etc.
- GDP per capita and total
- CO2 emissions and carbon footprint: totals, per capita, per unit of GDP, etc.
- Correlations between variables - debt vs. wealth, population vs. public spending, population vs. national income, etc.
### Other questions to ask:
- Which countries spend the most on defense?
- What is the trend in environmental protection spending and how does it correlate with CO2 / carbon footprint?
- Which regions spend more per capita on the various public spending categories?
### Process:
- Aggregate each CSV by 'variable'
### Other notes:
- All currency-based variables where 'shorttype' == 'Average' are per capita values
- All currency-based variables where 'shorttype' == 'Wealth-income ratio' are ratios of the variable's value to net national income

In [582]:
defense_path = f"{aggregated_path}/Defense.csv"

# Load the aggregated Defense table and drop the rows for country code 'RS'
# and for the country named 'Venezuela'.
# NOTE(review): the reason for excluding these two is not recorded here - confirm and document.
defense = (
    pd.read_csv(defense_path)
    .loc[lambda frame: frame['country'].ne('RS') & frame['countryname'].ne('Venezuela')]
)
defense.head(1)

Unnamed: 0,country,variable,year,value,age,pop,countryname,shortname,shorttype,shortpop,shortage,unit,source,method,value_usd,value_ppp,region,subregion,value_usd_per_capita,value_pct_national_income
0,AD,mdefgei999,1980,20775878.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,48210920.0,62682720.0,Europe,Western Europe,1353.793551,0.017633


In [613]:
def group_dataframe(df, indexes):
    """Aggregate the four value columns of ``df`` within each ``indexes`` group.

    ``value_ppp`` and ``value_usd`` are summed; ``value_usd_per_capita`` and
    ``value_pct_national_income`` are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four value columns plus the grouping columns.
    indexes : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys restored as ordinary columns.
    """
    aggregations = dict(
        value_ppp='sum',
        value_usd='sum',
        value_usd_per_capita='mean',
        value_pct_national_income='mean',
    )
    measure_columns = [*aggregations]

    grouped = df.groupby(by=indexes)[measure_columns]
    return grouped.agg(aggregations).reset_index()


def filter_dataframe(df, year, column, count=20, subset='top'):
    """Return the ``count`` highest- or lowest-ranked rows of ``df`` for one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column and the ranking ``column``.
    year : scalar
        Value matched against ``df['year']``.
    column : str
        Column used for ranking.
    count : int, default 20
        Number of rows to keep.
    subset : str, default 'top'
        ``'top'`` ranks from largest to smallest; any other value ranks from
        smallest to largest.

    Returns
    -------
    pandas.DataFrame
        At most ``count`` rows, in ranked order, keeping their original index labels.
    """
    rows_for_year = df[df['year'] == year]
    largest_first = subset == 'top'
    ranked = rows_for_year.sort_values(by=column, ascending=not largest_first)
    return ranked.head(count)


def get_country_flag(country_code):
    """Build a flag emoji from the first two letters of ``country_code``.

    The code is expected to be ISO 3166-1 alpha-2 (two letters); it is
    upper-cased first, so lower-case input is accepted. Each letter is shifted
    into the Unicode regional-indicator range and the two symbols are joined.
    """
    # Distance from 'A' to REGIONAL INDICATOR SYMBOL LETTER A (U+1F1E6), i.e. 127397.
    regional_indicator_offset = 0x1F1E6 - ord('A')
    letters = country_code.upper()
    return "".join(chr(ord(letters[position]) + regional_indicator_offset) for position in (0, 1))



In [584]:
# Two aggregations of the same table; they differ only in whether the country
# name is kept as a grouping key. Note that `defense_region` is still one row
# per (year, country, region) - the region is carried along, not collapsed.
defense_region, defense_country = (
    group_dataframe(defense, group_keys)
    for group_keys in (
        ['year', 'country', 'region'],
        ['year', 'country', 'countryname', 'region'],
    )
)

In [585]:
# 2023 top-20 countries (filter_dataframe's default count) under each measure,
# each followed by the mean of that measure across the selected 20 rows.

# Share of national income
defense_2023_country_top20_pct = filter_dataframe(
    defense_country, year=2023, column='value_pct_national_income'
)
defense_2023_country_top20_pct_avg = defense_2023_country_top20_pct.loc[:, 'value_pct_national_income'].mean()

# USD per capita
defense_2023_country_top20_usd_pc = filter_dataframe(
    defense_country, year=2023, column='value_usd_per_capita'
)
defense_2023_country_top20_usd_pc_avg = defense_2023_country_top20_usd_pc.loc[:, 'value_usd_per_capita'].mean()

# Total in USD
defense_2023_country_top20_usd_total = filter_dataframe(
    defense_country, year=2023, column='value_usd'
)
defense_2023_country_top20_usd_total_avg = defense_2023_country_top20_usd_total.loc[:, 'value_usd'].mean()

# Total in PPP terms
defense_2023_country_top20_ppp_total = filter_dataframe(
    defense_country, year=2023, column='value_ppp'
)
defense_2023_country_top20_ppp_total_avg = defense_2023_country_top20_ppp_total.loc[:, 'value_ppp'].mean()

In [620]:
# Top-10 countries of 2023 by value_pct_national_income, with two label columns added.
# The columns are built with .assign() on the filtered result rather than by
# item-assignment on it: filter_dataframe returns a .head() slice, and writing
# columns onto a slice triggers pandas' SettingWithCopyWarning.
defense_2023_country_top10_pct = (
    filter_dataframe(defense_country, 2023, 'value_pct_national_income', count=10)
    .assign(
        flag=lambda frame: frame['country'].apply(get_country_flag),
        # 'country+flag' is not a valid keyword name, hence the dict unpacking.
        **{'country+flag': lambda frame: frame['countryname'] + frame['flag']},
    )
)
defense_2023_country_top10_pct

Unnamed: 0,year,country,countryname,region,value_ppp,value_usd,value_usd_per_capita,value_pct_national_income,flag,country+flag
9699,2023,UA,Ukraine,Asia,137270500000.0,39140940000.0,1065.216459,0.24474,🇺🇦,Ukraine🇺🇦
9649,2023,OM,Oman,Asia,23372470000.0,12346700000.0,2658.538142,0.149595,🇴🇲,Oman🇴🇲
9501,2023,AE,the United Arab Emirates,Asia,55564920000.0,35079050000.0,3685.981009,0.07285,🇦🇪,the United Arab Emirates🇦🇪
9502,2023,AF,Afghanistan,Asia,5429521000.0,951564000.0,22.528004,0.071995,🇦🇫,Afghanistan🇦🇫
9520,2023,BI,Burundi,Africa,819658000.0,229771300.0,17.356228,0.070579,🇧🇮,Burundi🇧🇮
9506,2023,AM,Armenia,Asia,3646722000.0,1401070000.0,504.350363,0.069301,🇦🇲,Armenia🇦🇲
9595,2023,JO,Jordan,Asia,5912149000.0,2542771000.0,224.24242,0.055675,🇯🇴,Jordan🇯🇴
9598,2023,KG,Kyrgyzstan,Asia,2433491000.0,649388400.0,96.414626,0.053665,🇰🇬,Kyrgyzstan🇰🇬
9586,2023,IL,Israel,Asia,22050440000.0,21274130000.0,2318.836745,0.050527,🇮🇱,Israel🇮🇱
9620,2023,MA,Morocco,Africa,17836450000.0,6847558000.0,180.963923,0.049809,🇲🇦,Morocco🇲🇦


In [627]:
# Ten highest 2023 values of value_pct_national_income, re-sorted ascending for plotting.
data = (
    filter_dataframe(defense_country, 2023, 'value_pct_national_income', count=10)
    .sort_values(by='value_pct_national_income')
)

# Bar labels show the actual share; bar lengths show the deviation from the
# mean of these ten rows, so x=0 corresponds to that mean.
text = data['value_pct_national_income'].values
avg_value = data['value_pct_national_income'].mean()
values = text - avg_value

# Axis labels: country name followed by its flag emoji.
flags = data['country'].apply(get_country_flag)
countries = data['countryname'].values + ' ' + flags.values

fig = go.Figure()
fig.add_bar(
    x=values,
    y=countries,
    orientation='h',
    text=text,
    texttemplate='%{text:.1%}',
)

# Tick labels are hidden because the x axis is in "distance from mean" units.
fig.update_layout(
    title='Top 10 Countries by Defense spend (% of country net income)',
    xaxis_showticklabels=False,
)

fig.add_vline(
    x=0,
    annotation_text=f"Mean {avg_value:.1%}",
    annotation_position='top',
)

fig.show()

In [588]:
# Restrict to the ten countries with the highest 2023 value_pct_national_income,
# then rank those same countries by USD per capita.
# The names are looked up directly from `countryname`: the chart labels in
# `countries` have a flag emoji appended, so matching `countryname` against
# them selects no rows when the notebook is run top to bottom.
top10_pct_country_names = filter_dataframe(
    defense_country, 2023, 'value_pct_national_income', count=10
)['countryname']
a = defense_country[defense_country['countryname'].isin(top10_pct_country_names)]

per_capita = filter_dataframe(a, year=2023, column='value_usd_per_capita', count=10)
per_capita_countries = per_capita['countryname'].values
per_capita_mean = per_capita['value_usd_per_capita'].mean()
per_capita_values = per_capita['value_usd_per_capita'].values
# Deviation from the mean of the selected rows; used as bar length in the chart below.
per_capita_data = per_capita_values - per_capita_mean
per_capita

Unnamed: 0,year,country,countryname,region,value_ppp,value_usd,value_usd_per_capita,value_pct_national_income
9501,2023,AE,the United Arab Emirates,Asia,55564920000.0,35079050000.0,3685.981009,0.07285
9649,2023,OM,Oman,Asia,23372470000.0,12346700000.0,2658.538142,0.149595
9586,2023,IL,Israel,Asia,22050440000.0,21274130000.0,2318.836745,0.050527
9699,2023,UA,Ukraine,Asia,137270500000.0,39140940000.0,1065.216459,0.24474
9506,2023,AM,Armenia,Asia,3646722000.0,1401070000.0,504.350363,0.069301
9595,2023,JO,Jordan,Asia,5912149000.0,2542771000.0,224.24242,0.055675
9620,2023,MA,Morocco,Africa,17836450000.0,6847558000.0,180.963923,0.049809
9598,2023,KG,Kyrgyzstan,Asia,2433491000.0,649388400.0,96.414626,0.053665
9502,2023,AF,Afghanistan,Asia,5429521000.0,951564000.0,22.528004,0.071995
9520,2023,BI,Burundi,Africa,819658000.0,229771300.0,17.356228,0.070579


In [608]:


# Side-by-side bars: share of national income (left) and USD per capita (right).
# Both panels plot deviations from their own mean and label bars with the actual value.
# NOTE(review): `countries` carries flag suffixes and is sorted by share ascending,
# whereas `per_capita_countries` has plain names sorted by per-capita descending;
# with shared_yaxes=True the two panels may not line up row by row - confirm intent.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{}, {}]],
    shared_xaxes=True,
    shared_yaxes=True,
)

fig.add_bar(
    x=values,
    y=countries,
    orientation='h',
    text=text,
    texttemplate='%{text:.0%}',
    name='% of national income',
    row=1,
    col=1,
)

fig.add_bar(
    x=per_capita_data,
    y=per_capita_countries,
    orientation='h',
    text=per_capita_values,
    texttemplate='$%{text:,.0f}',
    textposition='inside',
    name='per capita (USD)',
    row=1,
    col=2,
)

fig.add_vline(
    x=0,
    annotation_text='Mean',
    annotation_position='top',
)

fig.update_layout(title='Top 10 Countries by Defense spend (% of country net income)')

# Hide tick labels on both x axes (the only two in this figure).
fig.update_xaxes(showticklabels=False)

fig.show()

In [634]:
def plot_flagpole_chart(dataframe, column_name, column_format, denominator=None, year=2023, count=10):
    """Build a horizontal bar chart of the top countries for one measure.

    Bars are drawn as the deviation of each country's value from the mean of
    the selected rows, so the vertical line at x=0 marks that mean. The text on
    each bar shows the actual value (optionally scaled by ``denominator``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``year``, ``countryname`` and ``column_name`` columns.
    column_name : str
        Measure to rank and plot.
    column_format : {'$', '%'}
        ``'$'`` labels bars as whole-number currency; ``'%'`` labels them as
        whole-number percentages.
    denominator : number, optional
        If given (and non-zero), bar labels are divided by it. For ``'$'``
        labels, 10**3, 10**6, 10**9 and 10**12 add a K / M / B / T suffix;
        any other value adds no suffix.
    year : int, default 2023
        Year to select.
    count : int, default 10
        Number of countries to show.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``column_format`` is neither ``'$'`` nor ``'%'``.
    """
    # Fail early with a clear message; previously an unknown format surfaced
    # as an unbound-variable error when the trace was built.
    if column_format not in ('$', '%'):
        raise ValueError(f"column_format must be '$' or '%', got {column_format!r}")

    data = filter_dataframe(dataframe, year, column_name, count=count).sort_values(by=column_name)
    avg_value = data[column_name].mean()
    countries = data['countryname'].values
    values = data[column_name].values - avg_value
    text = data[column_name].values

    if denominator:
        text = text / denominator

    if column_format == '$':
        # Only claim a magnitude in the label when the values were scaled by it.
        magnitude_suffix = {10**3: 'K', 10**6: 'M', 10**9: 'B', 10**12: 'T'}.get(denominator, '')
        text_format = '$%{text:,.0f}' + magnitude_suffix
    else:
        text_format = '%{text:.0%}'

    fig = go.Figure(
        go.Bar(
            x=values,
            y=countries,
            orientation='h',
            text=text,
            texttemplate=text_format
        )
    )

    fig.update_layout(
        title=f"Top {count} Countries by Defense spend {column_name}",
        xaxis=dict(
            # x is in "distance from mean" units, so tick labels are hidden.
            showticklabels=False
        )
    )

    fig.add_vline(
        x=0,
        annotation_text='Mean',
        annotation_position='top'
    )

    return fig

# Total USD defense spend; labels are scaled to billions.
plot = plot_flagpole_chart(
    defense_country,
    'value_usd',
    '$',
    denominator=1_000_000_000,
)
plot.show()

In [590]:
fig = go.Figure()

year = 2023

# One box per region, built from that region's country rows for the chosen year.
for region, data in defense_region.groupby('region'):

    data = data[data['year'] == year]

    fig.add_trace(
        go.Box(
            x=data['value_pct_national_income'],
            name=region
        )
    )

# The layout does not depend on the region, so it is set once after the loop
# rather than on every iteration.
fig.update_layout(
    title=f"Distribution of Defense spend by region in {year}",
    xaxis_title="Defense spend (share of national income)"
)

fig.show()

In [591]:
fig = go.Figure()

# One line per region: the yearly mean of value_pct_national_income across
# that region's country rows, with no outlier handling applied.
for region, data in defense_region.groupby('region'):
    subset = data.groupby('year')['value_pct_national_income'].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# Give the figure a title like the other charts so it can be read on its own.
fig.update_layout(
    title="Average Defense spend by region over time (raw values, outliers included)"
)

fig.show()

In [592]:
# Five lowest value_pct_national_income rows for the Americas in 2015.
(
    defense_region
    .loc[lambda frame: frame['region'].eq('Americas') & frame['year'].eq(2015)]
    .sort_values(by='value_pct_national_income')
    .head(5)
)

Unnamed: 0,year,country,region,value_ppp,value_usd,value_usd_per_capita,value_pct_national_income
7814,2015,BQ,Americas,5492931.0,7667058.0,331.0,-0.987241
7872,2015,HT,Americas,201912100.0,86952380.0,8.231698,0.001297
7866,2015,GT,Americas,705769200.0,289756200.0,6.780046,0.00156
7940,2015,PA,Americas,225824900.0,120711000.0,30.5,0.002258
7831,2015,CR,Americas,784160600.0,485520700.0,37.211883,0.003016


In [593]:
# Five lowest value_pct_national_income rows for country code 'BQ', across all years.
(
    defense_region
    .loc[lambda frame: frame['country'].eq('BQ')]
    .sort_values(by='value_pct_national_income')
    .head(5)
)

Unnamed: 0,year,country,region,value_ppp,value_usd,value_usd_per_capita,value_pct_national_income
7814,2015,BQ,Americas,5492931.0,7667058.0,331.0,-0.987241
3547,1995,BQ,Americas,5655185.0,7172856.0,543.2,0.011027
3760,1996,BQ,Americas,5945646.0,7642583.0,572.6,0.012377
2695,1991,BQ,Americas,5553944.0,6834262.0,537.0,0.012883
4186,1998,BQ,Americas,5709410.0,7486965.0,548.7,0.013006


In [594]:
def manage_outliers(df, column_name):
    """Return a copy of ``df`` with ``column_name`` capped at 1.5 x IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence and values
    above ``Q3 + 1.5 * IQR`` by the upper fence, where the quartiles are taken
    over the whole column. Missing values are left as they are, and the input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column_name : str
        Numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        A capped copy of ``df``.
    """
    capped = df.copy()
    column = capped[column_name]

    first_quartile, third_quartile = column.quantile([.25, .75])
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # Both masks come from the untouched column; the fences never cross, so a
    # value capped to one fence cannot then fall outside the other.
    too_low = column < fence_low
    too_high = column > fence_high

    capped.loc[too_low, column_name] = fence_low
    capped.loc[too_high, column_name] = fence_high

    return capped

In [595]:
fig = go.Figure()

for region, data in defense_region.groupby('region'):

    # Cap outliers (fences computed over all of the region's rows, every year
    # pooled) and drop missing values before averaging per year.
    data_cleaned = manage_outliers(data, 'value_pct_national_income').dropna(subset='value_pct_national_income')
    subset = data_cleaned.groupby('year')['value_pct_national_income'].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The title is the same for every region, so set it once after the loop.
fig.update_layout(
    title="Average Defense spend by region over time"
)

fig.show()

In [599]:
fig = go.Figure()

for region, data in defense_region.groupby('region'):

    # Cap outliers (fences computed over all of the region's rows, every year
    # pooled) and drop missing values before averaging per year.
    data_cleaned = manage_outliers(data, 'value_ppp').dropna(subset='value_ppp')
    subset = data_cleaned.groupby('year')['value_ppp'].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The plotted series is the per-year mean of the capped country values, not a
# regional sum, so the title says "Average ... per country" instead of "Total".
# It is set once after the loop because it does not depend on the region.
fig.update_layout(
    title="Average Defense spend per country by region over time (PPP)"
)

fig.show()

In [597]:
fig = go.Figure()

for region, data in defense_region.groupby('region'):

    # Cap outliers (fences computed over all of the region's rows, every year
    # pooled) and drop missing values before averaging per year.
    data_cleaned = manage_outliers(data, 'value_usd_per_capita').dropna(subset='value_usd_per_capita')
    subset = data_cleaned.groupby('year')['value_usd_per_capita'].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The title is the same for every region, so set it once after the loop.
fig.update_layout(
    title="Average Defense spend by region over time (USD per capita)"
)

fig.show()