In [1]:
%run -i 'setup.py'

import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from src import country_data
from src import ml_model

In [2]:
# Load the combined country dataset through the project helper. The frame shown
# below is indexed by (country, year) and carries a `code` column, the budget
# columns and the health-outcome columns.
df = country_data.load_combined()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,code,defence_budget,economic_affairs_budget,education_budget,environment_protection_budget,general_public_services_budget,health_budget,housing_and_community_amenities_budget,public_order_and_safety_budget,recreation_culture_and_religion_budget,social_protection_budget,gross_debt,total_expenditure,total_revenue,neonatal_mortality_rate,u5_mortality_rate,maternal_mortality_rate,modern_contraceptive_rate,adolescent_fertility_rate,safely_managed_water_use_rate
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,2000,AUS,810.00,2305.06,2703.66,278.31,2594.59,3054.12,420.02,828.84,468.08,5515.92,22104.04,19676.48,19012.85,3.526892,6.186699,8.552052,70.8,,
Australia,2001,AUS,876.34,2520.57,2873.72,317.33,2384.78,3321.49,487.21,904.87,443.33,5651.10,22934.30,20505.11,20074.95,3.492919,6.078227,7.890079,70.8,,
Australia,2002,AUS,952.50,2471.85,3054.12,359.97,2434.51,3519.82,463.23,978.01,477.97,5818.90,23178.93,21067.50,21494.66,3.437476,5.987457,7.756465,70.8,,
Australia,2003,AUS,941.96,2630.58,3197.41,363.18,2443.23,3783.56,437.90,1040.06,513.95,6425.13,22578.58,22406.06,22920.74,3.371362,5.905259,7.824608,70.8,,
Australia,2004,AUS,954.41,2729.79,3420.64,419.68,2607.13,4202.01,502.36,1085.05,522.06,6607.54,21687.28,23820.31,24456.83,3.302830,5.822850,7.428333,70.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United States,2014,USA,60919.16,59642.97,107557.50,0.00,101519.01,155813.85,9269.78,35493.18,4668.18,136985.51,2376203.67,671869.14,581279.96,3.959782,6.878153,,,,
United States,2015,USA,59874.15,58900.14,111188.11,0.00,101614.44,166514.54,9473.39,36732.33,4880.08,141829.42,2489273.49,691006.65,606661.90,3.867693,6.799975,,,,
United States,2016,USA,59902.90,62828.36,114157.80,0.00,105975.28,174124.38,9745.64,37710.33,5118.04,145200.15,2592405.47,714762.88,615551.38,3.743808,6.725240,,,,
United States,2017,USA,61595.47,64257.30,117005.45,0.00,109919.14,181783.57,11629.72,39115.86,5319.25,148861.74,2644538.41,739487.53,658644.70,3.615545,6.640237,,,,


## Improve data imputation?

In [25]:
def prepare_data(df):
    """Return the ``(features, labels)`` pair used for modelling.

    The index of ``df`` is moved back into ordinary columns and the ``code``
    column is discarded; the feature and label columns are then picked out
    using ``ml_model.FEATURES`` and ``ml_model.LABELS`` respectively.
    """
    flat = df.reset_index().drop(columns="code")
    return flat[ml_model.FEATURES], flat[ml_model.LABELS]


# Model under evaluation, built by the project's `ml_model.pipeline()` factory
# (its implementation is not shown in this notebook).
current_model = ml_model.pipeline()

# Split the combined frame into model inputs and prediction targets.
features, labels = prepare_data(df)

In [26]:
# Baseline: replace every missing label with 0 before splitting.
imputed_labels = labels.fillna(0)

X_train, X_test, y_train, y_test = ml_model.split_data(features, imputed_labels)

current_model.fit(X_train, y_train)
y_pred = current_model.predict(X_test)

# MAE measured against the zero-filled test labels.
mean_absolute_error(y_test, y_pred)

13.099286898202202

In [28]:
# Alternative to zero-filling: model-based imputation of the missing labels.
imputer = IterativeImputer(max_iter=100)

# Split on the raw labels (NaNs still present) so the imputer below is fitted
# on the training labels only.
X_train, X_test, y_train, y_test = ml_model.split_data(features, labels)

imputer.fit(y_train)

current_model.fit(X_train, imputer.transform(y_train))
y_pred = current_model.predict(X_test)

# NOTE(review): the test labels are imputed as well, so this MAE is partly
# measured against imputed values and is not scored on the same targets as the
# zero-fill baseline above.
mean_absolute_error(imputer.transform(y_test), y_pred)

  positive)


3.5120566675688294

### Iterative imputation is a little better than filling with zeros

They're made-up labels, but they should give us more variety than setting every NaN to 0, and we don't have time to find the actual data from other sources. In particular, the range of values is a bit more realistic, because OECD countries probably don't have safe drinking water for only 36.4% of their residents on average.

## Add World Bank data

In [15]:
# NOTE(review): `os` and `BASE_DIR` are not defined in the visible cells;
# presumably they are provided by `%run -i 'setup.py'` — confirm.
gini = os.path.join(BASE_DIR, 'data/country_stats/world_bank/gini_index/API_SI.POV.GINI_DS2_en_csv_v2_247786.csv')
wb_id_vars = ['Country Name', 'Country Code', 'Indicator Name']


def load_world_bank_indicator(csv_path, value_name, first_year=1960, last_year=2018):
    """Load one wide-format World Bank indicator CSV as a long (country, year) frame.

    Parameters
    ----------
    csv_path : str
        Path to the World Bank CSV export. The column header is read from the
        third line of the file (``header=2``).
    value_name : str
        Name given to the single value column of the result.
    first_year, last_year : int, optional
        Inclusive range of year columns to keep. Defaults to 1960-2018.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(country, year)`` with one column, ``value_name``. Within
        each country, gaps are forward-filled and any remaining leading gaps
        are then back-filled; countries with no observations at all stay NaN.
    """
    year_columns = np.arange(first_year, last_year + 1).astype(str)

    return (
        pd
        .read_csv(csv_path, header=2)
        .drop('Indicator Code', axis=1)
        # Wide (one column per year) -> long (one row per country and year).
        .melt(id_vars=wb_id_vars, value_vars=year_columns)
        .rename(columns={'Country Name': 'country', 'variable': 'year', 'value': value_name})
        .astype({'year': int})
        .set_index(['country', 'year'])
        .loc[:, [value_name]]
        # Fill per country so values never leak from one country into another.
        .groupby('country')
        .ffill()
        .groupby('country')
        .bfill()
    )


gini_df = load_world_bank_indicator(gini, 'gini_index')


gini_df

Unnamed: 0_level_0,Unnamed: 1_level_0,gini_index
country,year,Unnamed: 2_level_1
Aruba,1960,
Afghanistan,1960,
Angola,1960,52.0
Albania,1960,27.0
Andorra,1960,
...,...,...
Kosovo,2018,29.0
"Yemen, Rep.",2018,36.7
South Africa,2018,63.0
Zambia,2018,57.1


In [16]:
pop = os.path.join(BASE_DIR, 'data/country_stats/world_bank/population/API_SP.POP.TOTL_DS2_en_csv_v2_247892.csv')

# Reshape the wide World Bank export (one column per year) into a long
# (country, year) frame holding a single `population` column, then fill gaps
# within each country: forward first, then backward for leading gaps.
pop_df = (
    pd
    .read_csv(pop, header=2)
    .drop(columns='Indicator Code')
    .melt(id_vars=wb_id_vars, value_vars=[str(year) for year in range(1960, 2019)])
    .rename(columns={'Country Name': 'country', 'variable': 'year', 'value': 'population'})
    .astype({'year': int})
    .set_index(['country', 'year'])[['population']]
    .groupby('country')
    .ffill()
    .groupby('country')
    .bfill()
)


pop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,population
country,year,Unnamed: 2_level_1
Aruba,1960,54211.0
Afghanistan,1960,8996973.0
Angola,1960,5454933.0
Albania,1960,1608800.0
Andorra,1960,13411.0
...,...,...
Kosovo,2018,1845300.0
"Yemen, Rep.",2018,28498687.0
South Africa,2018,57779622.0
Zambia,2018,17351822.0


In [18]:
# Left-join the two World Bank indicators onto the budget data using the shared
# (country, year) index, so every row of `df` is kept.
# NOTE(review): matching is by country name; any name spelled differently in
# the World Bank files would come through as NaN — verify.
expanded_df = df.join([gini_df, pop_df], how='left')
expanded_df

Unnamed: 0_level_0,Unnamed: 1_level_0,code,defence_budget,economic_affairs_budget,education_budget,environment_protection_budget,general_public_services_budget,health_budget,housing_and_community_amenities_budget,public_order_and_safety_budget,recreation_culture_and_religion_budget,...,total_expenditure,total_revenue,neonatal_mortality_rate,u5_mortality_rate,maternal_mortality_rate,modern_contraceptive_rate,adolescent_fertility_rate,safely_managed_water_use_rate,gini_index,population
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Australia,2000,AUS,810.00,2305.06,2703.66,278.31,2594.59,3054.12,420.02,828.84,468.08,...,19676.48,19012.85,3.526892,6.186699,8.552052,70.8,,,32.6,19153000.0
Australia,2001,AUS,876.34,2520.57,2873.72,317.33,2384.78,3321.49,487.21,904.87,443.33,...,20505.11,20074.95,3.492919,6.078227,7.890079,70.8,,,33.5,19413000.0
Australia,2002,AUS,952.50,2471.85,3054.12,359.97,2434.51,3519.82,463.23,978.01,477.97,...,21067.50,21494.66,3.437476,5.987457,7.756465,70.8,,,33.5,19651400.0
Australia,2003,AUS,941.96,2630.58,3197.41,363.18,2443.23,3783.56,437.90,1040.06,513.95,...,22406.06,22920.74,3.371362,5.905259,7.824608,70.8,,,33.5,19895400.0
Australia,2004,AUS,954.41,2729.79,3420.64,419.68,2607.13,4202.01,502.36,1085.05,522.06,...,23820.31,24456.83,3.302830,5.822850,7.428333,70.8,,,33.1,20127400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United States,2014,USA,60919.16,59642.97,107557.50,0.00,101519.01,155813.85,9269.78,35493.18,4668.18,...,671869.14,581279.96,3.959782,6.878153,,,,,41.0,318386421.0
United States,2015,USA,59874.15,58900.14,111188.11,0.00,101614.44,166514.54,9473.39,36732.33,4880.08,...,691006.65,606661.90,3.867693,6.799975,,,,,41.0,320742673.0
United States,2016,USA,59902.90,62828.36,114157.80,0.00,105975.28,174124.38,9745.64,37710.33,5118.04,...,714762.88,615551.38,3.743808,6.725240,,,,,41.5,323071342.0
United States,2017,USA,61595.47,64257.30,117005.45,0.00,109919.14,181783.57,11629.72,39115.86,5319.25,...,739487.53,658644.70,3.615545,6.640237,,,,,41.5,325147121.0


In [24]:
def prepare_data(df, extra_features=('gini_index', 'population')):
    """Split a combined (country, year) frame into model features and labels.

    This redefinition replaces the earlier ``prepare_data`` in this notebook;
    calling it with ``extra_features=()`` selects only ``ml_model.FEATURES``,
    which is what the earlier definition did.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is moved back into columns and whose ``code`` column
        is dropped. It must contain every column named in
        ``ml_model.FEATURES``, ``ml_model.LABELS`` and ``extra_features``.
    extra_features : iterable of str, optional
        Columns appended to ``ml_model.FEATURES`` when selecting the features.
        Defaults to the two World Bank columns joined in above.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, labels)``.
    """
    prepared_df = df.reset_index().drop("code", axis=1)

    # Build a fresh list so the shared ml_model.FEATURES is never mutated.
    feature_columns = list(ml_model.FEATURES) + list(extra_features)
    features = prepared_df[feature_columns]
    labels = prepared_df[ml_model.LABELS]

    return features, labels


# Rebuild features/labels from the frame that now includes gini_index and
# population, then repeat the iterative-imputation evaluation.
features, labels = prepare_data(expanded_df)

X_train, X_test, y_train, y_test = ml_model.split_data(features, labels)

# Refit the existing imputer on the new training labels only.
imputer.fit(y_train)

current_model.fit(X_train, imputer.transform(y_train))
y_pred = current_model.predict(X_test)

# As before, the score is computed against imputed test labels.
mean_absolute_error(imputer.transform(y_test), y_pred)

  positive)


3.452752410529173

## Try other algorithms

In [36]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [50]:
# Preprocessing shared by every candidate: one-hot encode `country` and
# standardise all remaining feature columns.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output`; adjust when upgrading.
base_pipeline_steps = [
    ColumnTransformer(
        [
            (
                "onehotencoder",
                OneHotEncoder(sparse=False, handle_unknown="ignore"),
                ["country"],
            )
        ],
        remainder=StandardScaler(),
    )
]

# Fixed seed so the tree ensembles (and therefore the ranking below) are
# reproducible across runs.
RANDOM_STATE = 0

regressors = [
    RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    ExtraTreesRegressor(n_estimators=100, random_state=RANDOM_STATE),
    Lasso(),
    Ridge(),
    LinearRegression(),
]

# The imputed targets do not depend on the regressor, so compute them once
# instead of re-running the imputer on every loop iteration.
y_train_imputed = imputer.transform(y_train)
y_test_imputed = imputer.transform(y_test)

for regressor in regressors:
    pipeline_steps = base_pipeline_steps + [regressor]
    pipeline = make_pipeline(*pipeline_steps)

    pipeline.fit(X_train, y_train_imputed)
    y_pred = pipeline.predict(X_test)

    print(regressor.__class__.__name__)
    print(mean_absolute_error(y_test_imputed, y_pred))
    print()

RandomForestRegressor
0.6465553878303332

ExtraTreesRegressor
0.1781782894218609

Lasso
3.668246813640717

Ridge
1.2889855146030529

LinearRegression
1.24439617402233



  positive)


### ExtraTrees has the best MAE

I tend to get better performance from gradient-boosted trees, but they don't support multi-output by default, so I'm sticking with the best of the algorithms that don't require extra work.