In [40]:
# Dataset links
# https://data.worldbank.org/indicator/MS.MIL.XPND.GD.ZS
# https://www.cato.org/human-freedom-index-new

# Import Modules

In [None]:
# Install the pinned encoder package first so the import below can succeed on a fresh environment.
!pip install category_encoders==2.*

import re

import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Import Files & Wrangle

In [None]:
from glob import glob

# Show which CSV files sit in the working directory before trying to load them.
csv_paths = glob('*.csv')
for csv_path in csv_paths:
    print(csv_path)

In [72]:
# Load both raw tables from the working directory.
HumanFreedom = pd.read_csv(filepath_or_buffer='HumanFreedom.csv')
# The first 4 lines of this file are skipped before the header row is read
# (presumably an export preamble — TODO confirm against the downloaded file).
MilitaryExpenditure = pd.read_csv(filepath_or_buffer='MilitaryExpenditure.csv', skiprows=4)

In [73]:
# Country-name fragments used to pick rows out of both data sets.
# Several states appear under more than one spelling so that either source's
# naming is matched (the wrangle functions do a substring match on these).
ussr = [
    'Russia',
    'Russian Federation',
    'Ukraine',
    'Georgia',
    'Belarus',
    'Uzbekistan',
    'Armenia',
    'Azerbaijan',
    'Kazakhstan',
    'Kyrgyz Republic',
    'Moldova',
    'Turkmenistan',
    'Tajik',
    'Tajikistan',
    'Latvia',
    'Lithuania',
    'Estonia',
    'Kyrgyzstan',
]

In [74]:
# Year columns to discard: 1960 through 2007 (range end is exclusive) plus 2019.
years = [*range(1960, 2008), 2019]
# Column labels in the raw table are strings, so keep a string version too.
yearsStr = list(map(str, years))

In [75]:
# Wrangle MilitaryExpenditure

def wrangleM(df, countries=None, drop_years=None):
  """Reshape the wide military-expenditure table into one row per country and year.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw table with a 'Country Name' column, the metadata columns
      'Country Code', 'Indicator Name', 'Indicator Code', 'Unnamed: 65',
      a '2020' column, and one column per year labelled with the year as a string.
      The input frame is not modified.
  countries : iterable of str, optional
      Name fragments to keep; a row is kept when its country name contains any
      of them (literal substring match). Defaults to the module-level ``ussr`` list.
  drop_years : iterable of str, optional
      Year column labels to discard before reshaping. Defaults to the
      module-level ``yearsStr`` list.

  Returns
  -------
  pandas.DataFrame
      Indexed by integer 'year', with columns 'country' and 'militarySpend'.
      Turkmenistan and Uzbekistan are always excluded.

  Raises
  ------
  KeyError
      If any of the metadata columns or ``drop_years`` labels is missing.
  """
  # Fall back to the notebook-level defaults only when the caller gave nothing.
  if countries is None:
    countries = ussr
  if drop_years is None:
    drop_years = yearsStr

  # Escape each fragment so characters such as '(' or '.' are matched literally
  # instead of being interpreted as regex syntax.
  pattern = '|'.join(re.escape(name) for name in countries)

  tidy = (
      df.rename(columns={'Country Name': 'country'})
        .drop(columns=['Indicator Name', 'Indicator Code', '2020', 'Unnamed: 65', 'Country Code'])
        .drop(columns=list(drop_years))
  )

  # Keep only the requested countries; rows with a missing name are dropped.
  tidy = tidy[tidy['country'].str.contains(pattern, na=False)]

  # Wide -> long: the remaining year columns become (year, militarySpend) pairs.
  tidy = tidy.melt(id_vars=['country'], var_name='year', value_name='militarySpend')

  # Year labels arrive as strings; cast so they line up with integer years elsewhere.
  tidy = tidy.assign(year=tidy['year'].astype(int))

  # Drop countries not included in the human-freedom data set.
  tidy = tidy[~tidy['country'].isin(['Turkmenistan', 'Uzbekistan'])]

  # Use year as the index.
  return tidy.set_index('year')

# Rebinds the raw frame to its wrangled form, so this cell cannot be re-run on its own:
# a second call would fail on the columns that were already dropped. Re-read the CSV first.
MilitaryExpenditure = wrangleM(MilitaryExpenditure)

In [76]:
def wrangleH(df, countries=None):
  """Reduce the human-freedom table to country, year and overall score.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw table containing at least the columns 'countries', 'hf_score' and
      'year'. The input frame is not modified.
  countries : iterable of str, optional
      Name fragments to keep; a row is kept when its country name contains any
      of them (literal substring match). Defaults to the module-level ``ussr`` list.

  Returns
  -------
  pandas.DataFrame
      Indexed by 'year', with columns 'country' and 'hf_score' (in the order
      they appear in the input).
  """
  # Fall back to the notebook-level default only when the caller gave nothing.
  if countries is None:
    countries = ussr

  # Drop every column except the three we need. Keyword `columns=` is used
  # because passing the axis positionally is rejected by pandas >= 2.0.
  wanted = ['countries', 'hf_score', 'year']
  slim = df.drop(columns=df.columns.difference(wanted))

  slim = slim.rename(columns={'countries': 'country'})

  # Escape each fragment so characters such as '(' or '.' are matched literally
  # instead of being interpreted as regex syntax.
  pattern = '|'.join(re.escape(name) for name in countries)

  # Keep only the requested countries; rows with a missing name are dropped.
  slim = slim[slim['country'].str.contains(pattern, na=False)]

  # Use year as the index.
  return slim.set_index('year')

# Rebinds the raw frame to its wrangled form; re-read HumanFreedom.csv before re-running,
# because the wrangled frame no longer has the 'countries' / 'year' columns wrangleH expects.
HumanFreedom = wrangleH(HumanFreedom)

In [53]:
# Inner join: keep only (country, year) pairs present in both tables.
# NOTE(review): the join is on exact country names, so a country spelled differently
# in the two sources (e.g. 'Russia' vs 'Russian Federation') would be silently
# dropped — verify which countries survive the merge.
merged = HumanFreedom.merge(MilitaryExpenditure, how='inner', on=['country', 'year'])

In [None]:
merged = merged[merged['hf_score'].notna()]
merged.reset_index(inplace=True)
merged= merged.drop('index', axis=1)
merged

In [None]:
merged

# Target/Feature Split

In [None]:
target = 'hf_score'
y = merged[target]
X = merged.drop(columns=[target, 'level_0'])

In [87]:
# Time-based hold-out: years before the cutoff train the model, the rest test it.
cutoff = 2016
mask = X.year < cutoff
X_train, X_test = X.loc[mask], X.loc[~mask]
y_train, y_test = y.loc[mask], y.loc[~mask]

# Establish Baseline

In [88]:
# Establish Baseline
# Naive model: always predict the training-set mean; the error is measured on the training rows.
baseline_value = y_train.mean()
y_pred = [baseline_value for _ in range(len(y_train))]
print('Baseline MAE:', mean_absolute_error(y_train, y_pred))

Baseline MAE: 0.7508420099592578


# Linear Regression

In [99]:
# Pipeline stages in order: encode -> impute -> ordinary least squares.
linear_steps = [OneHotEncoder(), SimpleImputer(), LinearRegression()]
linear_model = make_pipeline(*linear_steps)

# Trailing semicolon suppresses the fitted-pipeline repr in the cell output.
linear_model.fit(X_train, y_train);

  elif pd.api.types.is_categorical(cols):


In [100]:
print('Linear Regression training accuracy:', linear_model.score(X_train, y_train))
print('Linear Regression validation accuracy:', linear_model.score(X_test, y_test))

Linear Regression training accuracy: 0.979222398085841
Linear Regression validation accuracy: 0.8896669213818822


In [101]:
# 5-fold cross-validated R^2 over the full data set (all years, not only the training split).
# NOTE(review): the mean below is strongly negative although the hold-out R^2 is high —
# presumably the default fold layout is unrepresentative for this ordered data; confirm the CV scheme.
cv_scores = cross_val_score(estimator=linear_model, X=X, y=y, scoring='r2', cv=5, n_jobs=-1)
cv_scores.mean()

-0.9003577996313291

In [102]:
# Label-based lookup: show every feature of the test row whose index label is 0.
X_test.loc[0, :]

year                2018
country          Armenia
militarySpend    4.89704
Name: 0, dtype: object

In [103]:
# Score one hypothetical observation with the fitted linear pipeline.
linear_query = {'year': 2019, 'country': "Armenia", 'militarySpend': 5.0}
linear_model.predict([linear_query])

array([7.37044974])

# Ridge Regression

In [105]:
# Same preprocessing as the plain linear model, with an L2-penalised regressor at the end.
ridge_steps = [OneHotEncoder(), SimpleImputer(), Ridge()]
linear_model_ridge = make_pipeline(*ridge_steps)

# Trailing semicolon suppresses the fitted-pipeline repr in the cell output.
linear_model_ridge.fit(X_train, y_train);

  elif pd.api.types.is_categorical(cols):


In [106]:
# Compute both scores first, then report them.
ridge_train_r2 = linear_model_ridge.score(X_train, y_train)
ridge_test_r2 = linear_model_ridge.score(X_test, y_test)
print('Ridge training R^2:', ridge_train_r2)
print('Ridge test R^2:', ridge_test_r2)

Ridge training R^2: 0.9667361500912945
Ridge test R^2: 0.8691622065601075


In [107]:
# 5-fold cross-validated R^2 for the ridge pipeline over the full data set.
# NOTE(review): the mean below is strongly negative although the hold-out R^2 is high —
# presumably the default fold layout is unrepresentative for this ordered data; confirm the CV scheme.
cv_scores = cross_val_score(estimator=linear_model_ridge, X=X, y=y, scoring='r2', cv=5, n_jobs=-1)
cv_scores.mean()

-0.9637054545244238

In [108]:
# Score one hypothetical observation with the fitted ridge pipeline.
ridge_query = {'year': 2019, 'country': "Armenia", 'militarySpend': 5.0}
linear_model_ridge.predict([ridge_query])

array([7.29807482])

# Gradient Boost

In [None]:
# Pipeline stages in order: encode -> impute -> scale -> gradient-boosted trees.
# The fixed random_state keeps the boosted model reproducible between runs.
skgb_steps = [
    OneHotEncoder(),
    SimpleImputer(),
    StandardScaler(),
    GradientBoostingRegressor(random_state=42),
]
model_skgb = make_pipeline(*skgb_steps)

# Trailing semicolon suppresses the fitted-pipeline repr in the cell output.
model_skgb.fit(X_train, y_train);

In [148]:
print('sklearn Training Accuracy:', model_skgb.score(X_train, y_train))
print('sklearn Validation Accuracy:', model_skgb.score(X_test, y_test))

sklearn Training Accuracy: 0.9975109851045731
sklearn Validation Accuracy: 0.946247267703106


In [160]:
# 5-fold cross-validated R^2 for the gradient-boosting pipeline over the full data set.
# NOTE(review): the mean below is strongly negative although the hold-out R^2 is high —
# presumably the default fold layout is unrepresentative for this ordered data; confirm the CV scheme.
cv_scores = cross_val_score(estimator=model_skgb, X=X, y=y, scoring='r2', cv=5, n_jobs=-1)
cv_scores.mean()

-1.0343901803348083

# Hyperparameter Tuning w GridSearchCV

In [None]:
# Tune the ridge penalty strength with an exhaustive 5-fold grid search on the training split.

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# The 'ridge__' prefix addresses the Ridge step inside the pipeline.
alpha_candidates = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
params = {'ridge__alpha': alpha_candidates}

gs_rf = GridSearchCV(estimator=linear_model_ridge, param_grid=params, cv=5, n_jobs=-1, verbose=1)

gs_rf.fit(X_train, y_train)


In [110]:
print('sklearn Training Accuracy:', gs_rf.score(X_train, y_train))
print('sklearn Validation Accuracy:', gs_rf.score(X_test, y_test))

sklearn Training Accuracy: 0.9792207986143918
sklearn Validation Accuracy: 0.8895538128886874
