In [26]:
import arviz as az
import io
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pymc as pm
import pytensor.tensor as pt
import requests
import statsmodels.api as sm
import warnings
import xarray as xr
from pathlib import Path
import urllib.request

In [27]:
# Sample of countries, keyed by the country name and mapped to its ISO-3 code.
# Iceland and Luxembourg are left out on purpose: too small.
name_to_iso = {
    'Belgium': 'BEL',
    'Denmark': 'DNK',
    'France': 'FRA',
    'United Kingdom': 'GBR',
    'Italy': 'ITA',
    'Netherlands': 'NLD',
    'Norway': 'NOR',
    'Portugal': 'PRT',
}

# Parallel lists derived from the mapping (dicts preserve insertion order).
countries_names = list(name_to_iso.keys())
countries_iso = list(name_to_iso.values())

Jordà-Schularick-Taylor Macrohistory Database:
- https://www.macrohistory.net/app/download/9834512569/JSTdatasetR6.xlsx?t=1763503850

Long-Term Productivity Database:
- https://www.longtermproductivity.com/download.html

SIPRI Military Expenditure Database:
- https://www.sipri.org/databases/milex

In [28]:
# All raw workbooks live in "<parent of the working directory>/data raw".
raw_dir = Path.cwd().parent / "data raw"

data_path0 = raw_dir / "JSTdatasetR6.xlsx"                    # macro dataset
data_path1 = raw_dir / "BCLDatabase_online_v2.6.xlsx"         # TFP dataset
data_path2 = raw_dir / "SIPRI-Milex-data-1949-2024_2.xlsx"    # defense spending dataset

# Report where each workbook is expected and whether it is actually there.
# A blank line separates consecutive reports (none after the last one).
labelled_paths = [
    ("Macro dataset", data_path0),
    ("TFP dataset", data_path1),
    ("Defense spending dataset", data_path2),
]
for position, (label, workbook) in enumerate(labelled_paths):
    if position > 0:
        print()
    print(label)
    print("Full file path: ", workbook)
    print("File exists: ", workbook.exists())

Macro dataset
Full file path:  /Users/awalters/escp_phd/govt_spending/data raw/JSTdatasetR6.xlsx
File exists:  True

TFP dataset
Full file path:  /Users/awalters/escp_phd/govt_spending/data raw/BCLDatabase_online_v2.6.xlsx
File exists:  True

Defense spending dataset
Full file path:  /Users/awalters/escp_phd/govt_spending/data raw/SIPRI-Milex-data-1949-2024_2.xlsx
File exists:  True


In [29]:
# Load the macro workbook; only its first sheet is read.
df_macro = pd.read_excel(data_path0, sheet_name=0)

In [30]:
# Keep only the rows belonging to the countries in the sample.
in_sample = df_macro["iso"].isin(countries_iso)
df_macro = df_macro[in_sample]

In [31]:
# Express GDP, revenue and expenditure in real terms and convert them to USD
# using each country's 1990 exchange rate.
# NOTE(review): the "1990" in the column names presumes cpi is indexed to
# 1990 = 100 — confirm against the dataset documentation.
nominal_series = ("gdp", "revenue", "expenditure")

# Step 1: deflate the nominal local-currency series by the CPI.
for series in nominal_series:
    df_macro[f"{series}_real_lcu_1990"] = df_macro[series] * 100 / df_macro["cpi"]

# Step 2: one exchange rate per country, taken from the 1990 observations.
xrusd_1990 = (
    df_macro[df_macro["year"] == 1990]
      .groupby("iso")["xrusd"]
      .mean()
      .rename("xrusd_1990")
      .reset_index()
)

# Attach the country-level rate to every row (left merge also resets the index).
df_macro = df_macro.merge(xrusd_1990, on="iso", how="left")

# Step 3: convert the real local-currency series at the fixed 1990 rate.
for series in nominal_series:
    df_macro[f"{series}_real_usd_1990"] = (
        df_macro[f"{series}_real_lcu_1990"] / df_macro["xrusd_1990"]
    )

In [32]:
# Derived series used by the model. bill_rate and debtgdp are rescaled in place
# (multiplied by 100), so this cell must be run exactly once per load.
df_macro = df_macro.assign(
    # real GDP per capita
    rgdp_pc=df_macro["gdp_real_usd_1990"] / df_macro["pop"],
    # revenue minus expenditure as a percent of GDP (negative values = deficit)
    def_gdp=100 * (df_macro["revenue_real_usd_1990"]
                   - df_macro["expenditure_real_usd_1990"]) / df_macro["gdp_real_usd_1990"],
    # real government spending per capita
    gov_pc=df_macro["expenditure_real_usd_1990"] / df_macro["pop"],
    # bill rate converted to percent
    bill_rate=df_macro["bill_rate"] * 100,
    # debt converted to percent of GDP
    debtgdp=df_macro["debtgdp"] * 100,
    # exports minus imports as a percent of (nominal) GDP
    nx_gdp=100 * (df_macro["exports"] - df_macro["imports"]) / df_macro["gdp"],
)

In [33]:
# Keep only the identifiers and the series used downstream.
macro_cols = [
    "iso",        # country code
    "year",       # year
    "rgdp_pc",    # real GDP per capita
    "gov_pc",     # real government spending per capita
    "bill_rate",  # bill rate, in percent
    "def_gdp",    # revenue minus expenditure, percent of GDP (negative = deficit)
    "debtgdp",    # debt as percent of GDP
    "nx_gdp",     # trade balance (exports minus imports) as percent of GDP
]
df_macro = df_macro[macro_cols]

In [34]:
# Display the trimmed macro panel (identifiers plus the six derived series) for a visual check.
df_macro

Unnamed: 0,iso,year,rgdp_pc,gov_pc,bill_rate,def_gdp,debtgdp,nx_gdp
0,BEL,1870,4.019499,0.193015,3.320298,-0.575351,15.1140,-5.924190
1,BEL,1871,4.035578,0.205927,3.326465,-0.643207,16.0160,-9.410814
2,BEL,1872,4.615443,0.208227,3.057484,-0.698213,13.1944,-7.403920
3,BEL,1873,4.557426,0.267734,3.345953,-2.075387,16.1846,-5.232999
4,BEL,1874,4.581273,0.238907,3.357878,-1.018796,17.1812,-4.411558
...,...,...,...,...,...,...,...,...
1203,PRT,2016,12.151540,3.281512,-0.263700,-3.288330,131.5057,1.147418
1204,PRT,2017,12.603878,3.242038,-0.329100,-2.442291,126.1434,1.009392
1205,PRT,2018,13.066035,3.284166,-0.322100,-1.786786,121.4814,0.462677
1206,PRT,2019,13.607829,3.420007,-0.356300,-1.837718,116.6078,0.452126


In [35]:
# Read the sheet at position 3 of the TFP workbook (wide layout: one row per year).
tfp_wide = pd.read_excel(data_path1, sheet_name=3)
year_col = tfp_wide.columns[0]

# Label the first column "year", discard the last two columns, then reshape to
# long form so that each remaining column header becomes a value of "iso".
# NOTE(review): presumably those headers are ISO-3 codes matching df_macro — verify.
df_tfp = (
    tfp_wide
    .rename(columns={year_col: "year"})
    .iloc[:, :-2]
    .melt(id_vars="year", var_name="iso", value_name="tfp")
)

In [36]:
# Read the sheet at position 6 of the SIPRI workbook, skipping the 5 header rows
# and treating "..." as missing.
mil_wide = pd.read_excel(data_path2, sheet_name=6, skiprows=5, na_values=["..."])
country_col, second_col = mil_wide.columns[:2]

# First column holds the country label; the second column is not needed.
# Reshape to long form: one row per (country, year).
df_mil = (
    mil_wide
    .rename(columns={country_col: "iso"})
    .drop(columns=second_col)
    .melt(id_vars="iso", var_name="year", value_name="mil_gdp")
)

# Translate country names to ISO codes; names outside the sample stay unchanged
# and are removed by the filter below.
df_mil["iso"] = df_mil["iso"].replace(name_to_iso)

# Force numeric (anything unparsable becomes NaN) and rescale by 100.
df_mil["mil_gdp"] = pd.to_numeric(df_mil["mil_gdp"], errors="coerce") * 100

df_mil = df_mil[df_mil["iso"].isin(countries_iso)]

In [37]:
# Left-join TFP and military spending onto the macro panel, so every macro
# (iso, year) row is kept even when the other sources have no match.
merge_keys = ["year", "iso"]
df = (
    df_macro
    .merge(df_tfp, on=merge_keys, how="left")
    .merge(df_mil, on=merge_keys, how="left")
)

# Store the country code with pandas' dedicated string dtype.
df["iso"] = df["iso"].astype("string")

In [38]:
# sanity: drop rows missing iso/year
key_cols = ["iso", "year"]

# Rows without a country or a year cannot be placed in the panel.
df = df.dropna(subset=key_cols)

# Each (iso, year) pair must appear once; if some repeat, collapse them by
# averaging the numeric columns (.first() would be an alternative policy).
dups = df.duplicated(key_cols)
if dups.any():
    df = df.groupby(key_cols, as_index=False).mean(numeric_only=True)

In [39]:
# Rows from 1950 onwards that have at least one missing value ...
from_1950 = df["year"] >= 1950
has_gap = df.isna().any(axis=1)
out = df.loc[from_1950 & has_gap]

# ... shown with only the columns in which something is actually missing.
na_cols = [col for col in out.columns if out[col].isna().any()]
out[["year", "iso"] + na_cols]

Unnamed: 0,year,iso,def_gdp,debtgdp,mil_gdp
80,1950,BEL,-5.179133,73.6889,
81,1951,BEL,-3.272222,64.4794,
82,1952,BEL,-5.771846,66.2522,
110,1980,BEL,-9.097519,,3.250195
111,1981,BEL,-14.285372,,3.366049
231,1950,DNK,,,1.664503
232,1951,DNK,,,2.052989
233,1952,DNK,,,2.739504
234,1953,DNK,,31.064528,3.363347
238,1957,DNK,0.389413,,3.078795


In [40]:
# Fill gaps in the value columns separately for every country.
#
# The frame stacks all countries on top of each other, so interpolating or
# back-filling the whole frame at once lets values leak across the boundary
# between two countries (one country's gaps get filled from its neighbour's
# first or last observations). Grouping by "iso" keeps every fill inside the
# country it belongs to.
df = df.sort_values(["iso", "year"])  # interpolation depends on row order

value_cols = list(df.columns.difference(["iso", "year"], sort=False))
df[value_cols] = (
    df.groupby("iso")[value_cols]
      # linear interpolation (also carries the last observation forward),
      # then back-fill whatever is still missing at the start of the country
      .transform(lambda block: block.interpolate().bfill())
)

# Gaps are filled using the full history first; only then restrict to 1950+.
df = df.loc[df["year"] >= 1950].copy()

In [41]:
vars_keep = ["rgdp_pc","gov_pc","bill_rate","def_gdp","debtgdp","tfp","mil_gdp","nx_gdp"]

# Natural logs of the level series. No positivity check is performed here:
# non-positive inputs would produce -inf / NaN.
log_sources = {
    "log_rgdp_pc": "rgdp_pc",
    "log_gov_pc": "gov_pc",
    "log_tfp": "tfp",
}
for log_name, level_name in log_sources.items():
    df[log_name] = np.log(df[level_name])

# Rates and ratios are used as they are; two of them get a new name, while
# def_gdp, mil_gdp and nx_gdp keep the names they already have.
df["tbill"] = df["bill_rate"]
df["debt_gdp"] = df["debtgdp"]

model_vars = ["log_rgdp_pc","log_gov_pc","log_tfp","tbill","def_gdp","debt_gdp","mil_gdp","nx_gdp"]

In [42]:
df = df.sort_values(["iso","year"])

# Build the full country x year grid so that absent rows show up as NaN.
isos  = sorted(df["iso"].unique())
years = np.arange(df["year"].min(), df["year"].max()+1)
grid = pd.MultiIndex.from_product([isos, years], names=["iso","year"]).to_frame(index=False)
dfg = grid.merge(df[["iso","year"] + model_vars], on=["iso","year"], how="left")

# A year is usable only if every country has every model variable in that year:
# first flag complete rows, then require all rows of the year to be complete.
row_complete = dfg[model_vars].notna().all(axis=1)
complete_by_year = row_complete.groupby(dfg["year"]).all()

years_ok = complete_by_year.index[complete_by_year.to_numpy()]
df = dfg.loc[dfg["year"].isin(years_ok)].copy()

# Final check: nothing missing in the retained panel.
assert not df[model_vars].isna().any().any()

In [43]:
# Display the panel after restricting it to years that are complete for every country.
df

Unnamed: 0,iso,year,log_rgdp_pc,log_gov_pc,log_tfp,tbill,def_gdp,debt_gdp,mil_gdp,nx_gdp
0,BEL,1950,1.874636,0.444990,1.354683,1.4000,-5.179133,73.6889,4.862558,-4.414455
1,BEL,1951,1.937201,0.456351,1.403621,1.4000,-3.272222,64.4794,4.862558,1.317299
2,BEL,1952,1.940369,0.556753,1.400661,1.4000,-5.771846,66.2522,4.862558,-0.099480
3,BEL,1953,1.949283,0.487863,1.440028,1.4000,-4.360491,68.6405,4.862558,-1.987730
4,BEL,1954,1.971972,0.490545,1.477510,1.4000,-4.873439,69.6069,4.880254,-2.875324
...,...,...,...,...,...,...,...,...,...,...
563,PRT,2016,2.497456,1.188304,2.127599,-0.2637,-3.288330,131.5057,1.542005,1.147418
564,PRT,2017,2.534005,1.176202,2.143288,-0.3291,-2.442291,126.1434,1.239840,1.009392
565,PRT,2018,2.570016,1.189113,2.149451,-0.3221,-1.786786,121.4814,1.341479,0.462677
566,PRT,2019,2.610645,1.229643,2.165644,-0.3563,-1.837718,116.6078,1.373958,0.452126


In [44]:
# Make sure year is numeric (unparsable values become NaN), order the rows by
# country and year, and use that pair as the index.
df = (
    df.assign(year=pd.to_numeric(df["year"], errors="coerce"))
      .sort_values(["iso", "year"])
      .set_index(["iso", "year"])
)

In [45]:
# Display the panel now indexed by (iso, year).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,log_rgdp_pc,log_gov_pc,log_tfp,tbill,def_gdp,debt_gdp,mil_gdp,nx_gdp
iso,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BEL,1950,1.874636,0.444990,1.354683,1.4000,-5.179133,73.6889,4.862558,-4.414455
BEL,1951,1.937201,0.456351,1.403621,1.4000,-3.272222,64.4794,4.862558,1.317299
BEL,1952,1.940369,0.556753,1.400661,1.4000,-5.771846,66.2522,4.862558,-0.099480
BEL,1953,1.949283,0.487863,1.440028,1.4000,-4.360491,68.6405,4.862558,-1.987730
BEL,1954,1.971972,0.490545,1.477510,1.4000,-4.873439,69.6069,4.880254,-2.875324
...,...,...,...,...,...,...,...,...,...
PRT,2016,2.497456,1.188304,2.127599,-0.2637,-3.288330,131.5057,1.542005,1.147418
PRT,2017,2.534005,1.176202,2.143288,-0.3291,-2.442291,126.1434,1.239840,1.009392
PRT,2018,2.570016,1.189113,2.149451,-0.3221,-1.786786,121.4814,1.341479,0.462677
PRT,2019,2.610645,1.229643,2.165644,-0.3563,-1.837718,116.6078,1.373958,0.452126


In [46]:
# Forward difference of mil_gdp within each country: the value stored at year t
# is mil_gdp(t+1) - mil_gdp(t). The last year of every country has no successor
# and is therefore NaN.
g = df["mil_gdp"].groupby(level="iso")
mil_next_year = g.shift(-1)
df["mil_delta"] = mil_next_year - df["mil_gdp"]

In [47]:
# Remove the mil_gdp level column, then discard every row that still has a
# missing value.
df = df.drop(columns="mil_gdp").dropna()

In [48]:
# Display the panel after mil_gdp was dropped and incomplete rows were removed.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,log_rgdp_pc,log_gov_pc,log_tfp,tbill,def_gdp,debt_gdp,nx_gdp,mil_delta
iso,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BEL,1950,1.874636,0.444990,1.354683,1.4000,-5.179133,73.6889,-4.414455,0.000000
BEL,1951,1.937201,0.456351,1.403621,1.4000,-3.272222,64.4794,1.317299,0.000000
BEL,1952,1.940369,0.556753,1.400661,1.4000,-5.771846,66.2522,-0.099480,0.000000
BEL,1953,1.949283,0.487863,1.440028,1.4000,-4.360491,68.6405,-1.987730,0.017696
BEL,1954,1.971972,0.490545,1.477510,1.4000,-4.873439,69.6069,-2.875324,-1.096842
...,...,...,...,...,...,...,...,...,...
PRT,2015,2.463687,1.153745,2.121243,-0.0200,-3.119415,131.1792,0.739336,0.213077
PRT,2016,2.497456,1.188304,2.127599,-0.2637,-3.288330,131.5057,1.147418,-0.302165
PRT,2017,2.534005,1.176202,2.143288,-0.3291,-2.442291,126.1434,1.009392,0.101639
PRT,2018,2.570016,1.189113,2.149451,-0.3221,-1.786786,121.4814,0.462677,0.032479


In [49]:
# Turn the index levels back into ordinary columns.
# Run once only: on a frame that already has a default index this would add an "index" column.
df = df.reset_index()

In [50]:
# Write the processed panel next to the raw data, without the row index.
# to_csv does not create missing folders, so make sure the target exists first.
processed_dir = Path.cwd().parent / "data processed"
processed_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_dir / "nato_dataset.csv", index=False)