In [164]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import os

# Make Plotly's built-in "plotly_dark" theme the default template, so every
# figure created after this line uses it without setting it per figure.
pio.templates.default = "plotly_dark"

In [165]:
# Resolve the data directories relative to the current working directory.
# NOTE(review): presumably the kernel is started from the notebook's own folder,
# so that "../data" points at the project's data directory - confirm.
notebook_path = os.getcwd()
data_path = os.path.abspath(os.path.join(notebook_path, "../data"))

# One sub-directory of the data folder per stage name.
processed_path, unprocessed_path, aggregated_path = (
    os.path.join(data_path, stage_name)
    for stage_name in ("processed", "unprocessed", "aggregated")
)

In [166]:
# Load the defense file into a dataframe.
# The path is built with os.path.join, like the directory paths it extends.
defense_path = os.path.join(aggregated_path, "Defense.csv")

# keep_default_na=False with na_values=[''] means only empty fields are read as
# missing; literal strings such as "NA" stay as text instead of becoming NaN.
defense = pd.read_csv(defense_path, keep_default_na=False, na_values=[''])

# Show which variable codes the file contains.
defense['variable'].unique()


array(['mdefgei999', 'mdefgoi999'], dtype=object)

In [167]:
# Preview the first five rows of the loaded data.
defense.head()

Unnamed: 0,country,variable,year,value,age,pop,countryname,shortname,shorttype,shortpop,shortage,unit,source,method,value_usd,value_ppp,region,subregion,value_usd_per_capita,value_pct_national_income
0,AD,mdefgei999,1980,20775878.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,48210920.0,62682720.0,Europe,Western Europe,1353.793551,0.017633
1,AD,mdefgei999,1981,21179162.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,38169910.0,62256330.0,Europe,Western Europe,1031.961998,0.017831
2,AD,mdefgei999,1982,21531746.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,32610670.0,59166920.0,Europe,Western Europe,844.809893,0.017519
3,AD,mdefgei999,1983,21826198.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,25319500.0,55705050.0,Europe,Western Europe,626.195461,0.016933
4,AD,mdefgei999,1984,22243598.0,999,i,Andorra,Defense,Total,individuals,All Ages,EUR,,,23021900.0,53054230.0,Europe,Western Europe,545.750137,0.015956


In [168]:
# Create some helper functions to avoid repeating code.

def aggregate_dataframe(df, indexes):
    """
    Groups a DataFrame by the given columns and summarises its four value columns.

    Amount columns are summed within each group, while ratio columns are averaged.

    Parameters:
        df (pandas.DataFrame): Input DataFrame; must contain 'value_ppp',
            'value_usd', 'value_usd_per_capita' and 'value_pct_national_income'
            in addition to the grouping columns
        indexes (list): List of column names to group by

    Returns:
        pandas.DataFrame: One row per group, with:
            - The grouping columns as ordinary columns (not as the index)
            - 'value_ppp' and 'value_usd' summed
            - 'value_usd_per_capita' and 'value_pct_national_income' averaged

    Example:
        >>> yearly = aggregate_dataframe(data, ['country', 'year'])
    """
    # Column name -> aggregation applied to it within each group.
    how_to_aggregate = {
        'value_ppp': 'sum',
        'value_usd': 'sum',
        'value_usd_per_capita': 'mean',
        'value_pct_national_income': 'mean'
    }
    value_columns = list(how_to_aggregate)

    grouped = df.groupby(by=indexes)
    summary = grouped[value_columns].agg(how_to_aggregate)

    # Turn the group keys back into regular columns.
    return summary.reset_index()


def filter_dataframe(df, year, column, count=20, subset='top'):
    """
    Filters a DataFrame to return the top or bottom N rows for a specific year,
    sorted by a specified column.

    Parameters:
        df (pandas.DataFrame): Input DataFrame to filter; must have a 'year' column
        year (int): Year to filter on in the 'year' column
        column (str): Column name to sort by
        count (int, optional): Number of rows to return. Defaults to 20
        subset (str, optional): Whether to return top or bottom rows.
                                Must be 'top' or 'bottom'. Defaults to 'top'

    Returns:
        pandas.DataFrame: Filtered DataFrame containing up to 'count' rows for the
                        specified year, sorted by 'column' in descending order for
                        'top' subset or ascending order for 'bottom' subset.
                        Rows with a missing value in 'column' are sorted last in
                        both cases.

    Raises:
        ValueError: If 'subset' is neither 'top' nor 'bottom'. Any other value
                    used to be treated silently as 'bottom'.

    Example:
        >>> # Get top 10 countries by GDP in 2020
        >>> top_gdp = filter_dataframe(data, 2020, 'gdp', count=10)
        >>> # Get bottom 5 countries by population in 2019
        >>> bottom_pop = filter_dataframe(data, 2019, 'population',
        ...                              count=5, subset='bottom')
    """
    # Reject typos such as 'Top' or 'botom' instead of quietly returning the
    # bottom rows.
    if subset not in ('top', 'bottom'):
        raise ValueError(f"subset must be 'top' or 'bottom', got {subset!r}")

    rows_for_year = df[df['year'] == year]

    # 'bottom' wants the smallest values first, 'top' the largest.
    ranked = rows_for_year.sort_values(by=column, ascending=(subset == 'bottom'))
    return ranked.head(count)


def get_country_flag(country_code):
    """
    Builds the flag emoji for a two-letter country code.

    Parameters:
        country_code (str): ISO 3166-1 alpha-2 country code (2 letters),
                            e.g. 'US', 'GB', 'FR', 'DE'. Case does not matter.

    Returns:
        str: Two Regional Indicator Symbol characters, which render as the
             country's flag where the font supports it

    Notes:
        - The code is upper-cased before conversion
        - Only the first two characters are used; anything after them is ignored
        - A string shorter than two characters raises IndexError

    Example:
        >>> get_country_flag('jp')   # '🇯🇵'
        >>> get_country_flag('IT')   # '🇮🇹'
    """
    # Distance from an ASCII capital letter to its Regional Indicator Symbol:
    # U+1F1E6 (regional indicator "A") minus ord('A') equals 127397.
    regional_indicator_offset = 0x1F1E6 - ord('A')

    upper_code = country_code.upper()
    first_letter, second_letter = upper_code[0], upper_code[1]

    return ''.join(
        chr(ord(letter) + regional_indicator_offset)
        for letter in (first_letter, second_letter)
    )


def manage_outliers(df, column_name, remove=True):
    """
    Handles outliers in a specified DataFrame column using the Interquartile Range (IQR) method.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR,
    where Q1 and Q3 are the 25th and 75th percentiles of the column. Values exactly
    on a bound are not outliers.

    Parameters:
        df (pandas.DataFrame): Input DataFrame containing the column to process
        column_name (str): Name of the column to handle outliers in
        remove (bool, optional): If True (the default), rows holding an outlier are
                                 dropped. If False, every row is kept and outliers
                                 are capped at the nearest bound (winsorization)

    Returns:
        pandas.DataFrame: A new DataFrame (the input is never modified):
            - remove=True: only the rows whose value lies within the bounds. Rows
              with a missing value in the column are dropped too, because NaN
              fails both comparisons
            - remove=False: all rows, with outliers replaced by Q1 - 1.5*IQR or
              Q3 + 1.5*IQR; missing values are left untouched

    Notes:
        - Works on a copy of the DataFrame to avoid modifying the original
        - The bounds are inclusive in both modes, so a column whose values are all
          equal (IQR == 0) is returned unchanged instead of losing every row

    Example:
        >>> data = pd.DataFrame({'values': [1, 2, 3, 100, 4, 5, -50]})
        >>> trimmed = manage_outliers(data, 'values')
        >>> # The rows holding 100 and -50 are dropped
        >>> capped = manage_outliers(data, 'values', remove=False)
        >>> # 100 becomes 9 and -50 becomes -3 (the upper and lower bounds)
    """
    cleaned = df.copy()

    q1 = cleaned[column_name].quantile(.25)
    q3 = cleaned[column_name].quantile(.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    if remove:
        # Inclusive comparisons: a value sitting exactly on a bound is kept, which
        # matches what the capping branch below treats as "not an outlier".
        within_bounds = (
            (cleaned[column_name] >= lower_bound)
            & (cleaned[column_name] <= upper_bound)
        )
        return cleaned[within_bounds]

    # Cap values below the lower bound ...
    cleaned.loc[cleaned[column_name] < lower_bound, column_name] = lower_bound

    # ... and values above the upper bound.
    cleaned.loc[cleaned[column_name] > upper_bound, column_name] = upper_bound

    return cleaned

In [169]:
# Collapse the data to one row per (year, country, region) and one row per
# (year, country, subregion): amounts are summed and ratios averaged by
# aggregate_dataframe.
defense_region = aggregate_dataframe(defense, ['year', 'country', 'region'])
defense_subregion = aggregate_dataframe(defense, ['year', 'country', 'subregion'])

In [170]:
# Top 10 countries in 2023 by defense spend as a share of national income.
metric = 'value_pct_national_income'

# `defense` can hold more than one row for the same country and year (the file
# contains two `variable` codes), so keep only each country's highest row per
# year before ranking; otherwise one country could fill several of the ten slots.
# NOTE(review): confirm that keeping the larger of the duplicate rows is the
# intended choice.
one_row_per_country = (
    defense
    .sort_values(by=metric, ascending=False)
    .drop_duplicates(subset=['country', 'year'])
)

# filter_dataframe returns the rows largest-first; re-sort ascending for plotting.
data = filter_dataframe(one_row_per_country, 2023, metric, count=10).sort_values(by=metric)
avg_value = data[metric].mean()
flags = data['country'].apply(get_country_flag)
countries = data['countryname'].values + ' ' + flags.values

# Bars show the deviation from the mean of these ten countries, so x=0 is that
# mean; the bar labels still show the actual values.
values = data[metric].values - avg_value
text = data[metric].values

fig = go.Figure(
    go.Bar(
        x=values,
        y=countries,
        orientation='h',
        text=text,
        texttemplate='%{text:.1%}'
    )
)

fig.update_layout(
    # The plotted column is the share of *national* income.
    title='Top 10 Countries by Defense spend (% of national income)',
    xaxis=dict(
        # Tick values would be deviations from the mean, which are not meaningful here.
        showticklabels=False
    )
)

fig.add_vline(
    x=0,
    annotation_text=f"Mean {avg_value:.1%}",
    annotation_position='top'
)

fig.show()

In [171]:
# Top 10 countries in 2023 by total defense spend in USD.
metric = 'value_usd'

# `defense` can hold more than one row for the same country and year (the file
# contains two `variable` codes), so keep only each country's highest row per
# year before ranking; otherwise one country could fill several of the ten slots.
# NOTE(review): confirm that keeping the larger of the duplicate rows is the
# intended choice.
one_row_per_country = (
    defense
    .sort_values(by=metric, ascending=False)
    .drop_duplicates(subset=['country', 'year'])
)

# filter_dataframe returns the rows largest-first; re-sort ascending for plotting.
data = filter_dataframe(one_row_per_country, 2023, metric, count=10).sort_values(by=metric)
avg_value = data[metric].mean()
flags = data['country'].apply(get_country_flag)
countries = data['countryname'].values + ' ' + flags.values

# Bars show the deviation from the mean of these ten countries, so x=0 is that
# mean; the bar labels show the actual values in billions.
values = data[metric].values - avg_value
text = data[metric].values / 1_000_000_000

fig = go.Figure(
    go.Bar(
        x=values,
        y=countries,
        orientation='h',
        text=text,
        texttemplate='$%{text:,.2f}B'
    )
)

fig.update_layout(
    title='Top 10 Countries by Defense spend (total USD)',
    xaxis=dict(
        # Tick values would be deviations from the mean, which are not meaningful here.
        showticklabels=False
    )
)

fig.add_vline(
    x=0,
    annotation_text=f"Mean ${avg_value/1000000000:.1f}B",
    annotation_position='top'
)

fig.show()

In [172]:
# Top 10 countries in 2023 by total defense spend in PPP terms.
metric = 'value_ppp'

# `defense` can hold more than one row for the same country and year (the file
# contains two `variable` codes), so keep only each country's highest row per
# year before ranking; otherwise one country could fill several of the ten slots.
# NOTE(review): confirm that keeping the larger of the duplicate rows is the
# intended choice.
one_row_per_country = (
    defense
    .sort_values(by=metric, ascending=False)
    .drop_duplicates(subset=['country', 'year'])
)

# filter_dataframe returns the rows largest-first; re-sort ascending for plotting.
data = filter_dataframe(one_row_per_country, 2023, metric, count=10).sort_values(by=metric)
avg_value = data[metric].mean()
flags = data['country'].apply(get_country_flag)
countries = data['countryname'].values + ' ' + flags.values

# Bars show the deviation from the mean of these ten countries, so x=0 is that
# mean; the bar labels show the actual values in billions.
values = data[metric].values - avg_value
text = data[metric].values / 1_000_000_000

fig = go.Figure(
    go.Bar(
        x=values,
        y=countries,
        orientation='h',
        text=text,
        texttemplate='$%{text:,.2f}B'
    )
)

fig.update_layout(
    title='Top 10 Countries by Defense spend (total PPP)',
    xaxis=dict(
        # Tick values would be deviations from the mean, which are not meaningful here.
        showticklabels=False
    )
)

fig.add_vline(
    x=0,
    annotation_text=f"Mean ${avg_value/1000000000:.1f}B",
    annotation_position='top'
)

fig.show()

In [173]:
# Top 10 countries in 2023 by defense spend per capita in USD.
metric = 'value_usd_per_capita'

# `defense` can hold more than one row for the same country and year (the file
# contains two `variable` codes), so keep only each country's highest row per
# year before ranking; otherwise one country could fill several of the ten slots.
# NOTE(review): confirm that keeping the larger of the duplicate rows is the
# intended choice.
one_row_per_country = (
    defense
    .sort_values(by=metric, ascending=False)
    .drop_duplicates(subset=['country', 'year'])
)

# filter_dataframe returns the rows largest-first; re-sort ascending for plotting.
data = filter_dataframe(one_row_per_country, 2023, metric, count=10).sort_values(by=metric)
avg_value = data[metric].mean()
flags = data['country'].apply(get_country_flag)
countries = data['countryname'].values + ' ' + flags.values

# Bars show the deviation from the mean of these ten countries, so x=0 is that
# mean; the bar labels still show the actual values.
values = data[metric].values - avg_value
text = data[metric].values

fig = go.Figure(
    go.Bar(
        x=values,
        y=countries,
        orientation='h',
        text=text,
        texttemplate='$%{text:,.0f}'
    )
)

fig.update_layout(
    title='Top 10 Countries by Defense spend (total USD per capita)',
    xaxis=dict(
        # Tick values would be deviations from the mean, which are not meaningful here.
        showticklabels=False
    )
)

fig.add_vline(
    x=0,
    annotation_text=f"Mean ${avg_value:,.0f}",
    annotation_position='top'
)

fig.show()

In [174]:
# Per-capita spend from 1980 onwards; the default mode of manage_outliers drops
# the rows whose value falls outside the IQR bounds.
per_capita_columns = ['country', 'countryname', 'region', 'subregion', 'value_usd_per_capita']
df_after_1980_per_capita = defense.loc[defense['year'] >= 1980, per_capita_columns].copy()
df_after_1980_per_capita = manage_outliers(df_after_1980_per_capita, 'value_usd_per_capita')

# One record per country: its mean per-capita value over the remaining rows,
# together with the region and the first subregion listed for it.
mean_list = [
    {
        'country': country,
        'region': region,
        'subregion': country_data['subregion'].unique()[0],
        'mean_per_capita_value': country_data['value_usd_per_capita'].mean()
    }
    for region, region_data in df_after_1980_per_capita.groupby('region')
    for country, country_data in region_data.groupby('country')
]

df = pd.DataFrame(mean_list)

# One histogram per region, laid out side by side in a single row.
fig = make_subplots(rows=1, cols=df['region'].nunique())

for column_number, (region, data) in enumerate(df.groupby('region'), start=1):
    region_histogram = go.Histogram(x=data['mean_per_capita_value'], name=region)
    fig.add_trace(region_histogram, row=1, col=column_number)

fig.update_layout(
    title='Distribution of mean per capita spending within each region (USD)'
)

fig.show()

In [175]:
# Share of national income from 1980 onwards; remove=False keeps every row and
# caps outliers at the IQR bounds instead of dropping them.
pct_columns = ['country', 'countryname', 'region', 'subregion', 'value_pct_national_income']
df_after_1980_pct = defense.loc[defense['year'] >= 1980, pct_columns].copy()
df_after_1980_pct = manage_outliers(df_after_1980_pct, 'value_pct_national_income', remove=False)

# One record per country: its mean share over all rows, together with the
# region and the first subregion listed for it.
mean_list = [
    {
        'country': country,
        'region': region,
        'subregion': country_data['subregion'].unique()[0],
        'mean_value': country_data['value_pct_national_income'].mean()
    }
    for region, region_data in df_after_1980_pct.groupby('region')
    for country, country_data in region_data.groupby('country')
]

df = pd.DataFrame(mean_list)

# One histogram per region. The column count comes from the frame that is
# actually plotted, so it always matches the number of traces added below.
fig = make_subplots(rows=1, cols=df['region'].nunique())

for column_number, (region, data) in enumerate(df.groupby('region'), start=1):
    region_histogram = go.Histogram(x=data['mean_value'], name=region)
    fig.add_trace(region_histogram, row=1, col=column_number)

fig.update_layout(
    title='Distribution of mean spending as % of national income within each region'
)

fig.show()

In [176]:
# Box plot of defense spend (share of national income) per region for one year.
year = 2023

fig = go.Figure()

for region, data in defense.groupby('region'):

    # Keep only the rows of the selected year for this region.
    data = data[data['year'] == year]

    fig.add_trace(
        go.Box(
            x=data['value_pct_national_income'],
            name=region
        )
    )

# The layout does not depend on the region, so set it once after the loop.
fig.update_layout(
    title=f"Distribution of Defense spend by region in {year}",
    xaxis_title='Defense spend (share of national income)'
)

fig.show()

In [177]:
# Yearly mean share of national income per region, computed on the data as
# loaded (no outlier handling); one line per region.
region_lines = []

for region_name, region_rows in defense.groupby('region'):
    yearly_mean = region_rows.groupby('year')['value_pct_national_income'].mean()
    region_lines.append(
        go.Scatter(x=yearly_mean.index, y=yearly_mean.values, name=region_name)
    )

fig = go.Figure(data=region_lines)

fig.show()

In [178]:
# List the five lowest shares of national income recorded for the Americas in 2015.
is_americas_2015 = defense['region'].eq('Americas') & defense['year'].eq(2015)
(
    defense
    .loc[is_americas_2015, ['country', 'countryname', 'value_pct_national_income']]
    .sort_values(by='value_pct_national_income')
    .head(5)
)

Unnamed: 0,country,countryname,value_pct_national_income
1443,BQ,"Bonaire, Sint Eustatius and Saba",-0.987241
4683,HT,Haiti,0.001297
4354,GT,Guatemala,0.00156
4343,GT,Guatemala,0.00156
8726,PA,Panama,0.002258


In [179]:
# List the five lowest shares of national income recorded for country code 'BQ'.
is_bq = defense['country'].eq('BQ')
(
    defense
    .loc[is_bq, ['country', 'countryname', 'value_pct_national_income']]
    .sort_values(by='value_pct_national_income')
    .head(5)
)

Unnamed: 0,country,countryname,value_pct_national_income
1443,BQ,"Bonaire, Sint Eustatius and Saba",-0.987241
1423,BQ,"Bonaire, Sint Eustatius and Saba",0.011027
1424,BQ,"Bonaire, Sint Eustatius and Saba",0.012377
1419,BQ,"Bonaire, Sint Eustatius and Saba",0.012883
1426,BQ,"Bonaire, Sint Eustatius and Saba",0.013006


In [180]:
# Yearly mean share of national income per region, after dropping outliers
# within each region; one line per region.
metric = 'value_pct_national_income'

fig = go.Figure()

for region, data in defense.groupby('region'):

    # Manage outliers and missing values before continuing. The IQR bounds are
    # computed over all of this region's rows. `subset` is passed as a list,
    # which every pandas version accepts.
    data_cleaned = manage_outliers(data, metric).dropna(subset=[metric])
    subset = data_cleaned.groupby('year')[metric].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The layout does not depend on the region, so set it once after the loop.
fig.update_layout(
    title="Average Defense spend by region over time"
)

fig.show()

In [181]:
# Yearly mean PPP spend per region, after dropping outliers within each region;
# one line per region.
metric = 'value_ppp'

fig = go.Figure()

for region, data in defense.groupby('region'):

    # Manage outliers and missing values before continuing. The IQR bounds are
    # computed over all of this region's rows. `subset` is passed as a list,
    # which every pandas version accepts.
    data_cleaned = manage_outliers(data, metric).dropna(subset=[metric])

    # This is the mean over the region's rows for each year, not a regional sum.
    subset = data_cleaned.groupby('year')[metric].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The layout does not depend on the region, so set it once after the loop.
# The title says "Average" because the plotted statistic is a mean.
fig.update_layout(
    title="Average Defense spend by region over time (PPP)"
)

fig.show()

In [182]:
# Yearly mean per-capita spend (USD) per region, after dropping outliers within
# each region; one line per region.
metric = 'value_usd_per_capita'

fig = go.Figure()

for region, data in defense.groupby('region'):

    # Manage outliers and missing values before continuing. The IQR bounds are
    # computed over all of this region's rows. `subset` is passed as a list,
    # which every pandas version accepts.
    data_cleaned = manage_outliers(data, metric).dropna(subset=[metric])
    subset = data_cleaned.groupby('year')[metric].mean()

    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset.values,
            name=region
        )
    )

# The layout does not depend on the region, so set it once after the loop.
fig.update_layout(
    title="Average Defense spend by region over time (USD per capita)"
)

fig.show()