## Import

In [1]:
# notebooky stuff
%load_ext autoreload
%autoreload 2
from IPython.display import display

import sys 
sys.path.append('../../modules')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import eumf_data, eumf_eval, eumf_pipeline
from sklearn import preprocessing, linear_model, model_selection, ensemble, feature_selection
import seaborn as sns
import itertools

# pandas pretty output
pd.set_option('display.min_rows', 20)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

## Data

### Load all data

In [2]:
# migration rates
df_values = eumf_data.load_registrations_from_csv(impute_missing=True)

# google trends
df_trends = eumf_data.load_trends_from_csv()

countries = eumf_data.get_countries()
keyword_ids = df_trends.columns.levels[0].tolist()

df_gdp = eumf_data.read_gdp()
df_unempl = eumf_data.read_unempl()


### Join data, combine countries

In [3]:
# list of countries
countries = eumf_data.get_countries()
countries.remove("CY")

# migration rates
df_values = eumf_data.load_registrations_from_csv(impute_missing=True, countries=countries)

# google trends
df_trends = eumf_data.load_trends_from_csv(countries=countries)
keyword_ids = df_trends.columns.levels[0].tolist()

# macroeconomic data
df_gdp = eumf_data.read_gdp(countries=countries)
df_unempl = eumf_data.read_unempl(countries=countries)

country_combinations = [
    # ["GR", "CY"],
    ["LV", "LT", "EE"],
    ["BE", "NL", "LU"],
    ["CZ", "SK"],
    ["SE", "FI", "DK"],
    ["AT", "CH"]
]

panel = df_values.join(df_trends, how="outer")
panel_3m = panel.resample("3M", closed="left").mean()

panel_comb = eumf_data.combine_countries(panel, combinations=country_combinations)
panel_comb_3m = eumf_data.combine_countries(panel_3m, combinations=country_combinations)
df_gdp_comb = eumf_data.combine_countries(df_gdp, combinations=country_combinations)
df_unempl_comb = eumf_data.combine_countries(
    df_unempl, combinations=country_combinations, average=True
)
# note: strictly, unweighted average is wrong for unemployment, but should work in most cases

panel_comb_3m_macro = panel_comb_3m.join(df_gdp_comb).join(df_unempl_comb)

panel_comb_3m_macro["2017":"2018"]



Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,112,112,112,112,112,112,112,112,112,112,112,112,112,112,...,value,value,value,value,value,value,value,value,value,value,value,value,value,value,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,gdp,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl,unempl
country,AT+CH,BE+NL+LU,BG,CZ+SK,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,PT,RO,SE+FI+DK,SI,AT+CH,BE+NL+LU,BG,CZ+SK,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,PT,RO,SE+FI+DK,SI,AT+CH,BE+NL+LU,BG,CZ+SK,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,...,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,PT,RO,SE+FI+DK,SI,AT+CH,BE+NL+LU,BG,CZ+SK,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,PT,RO,SE+FI+DK,SI,AT+CH,BE+NL+LU,BG,CZ+SK,ES,FR,GB,GR,HR,HU,IE,IT,LV+LT+EE,PL,PT,RO,SE+FI+DK,SI
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 100_level_2,Unnamed: 101_level_2
2017-03-31,10.524,16.048,0.0,0.0,26.667,21.857,21.381,0.0,0.0,0.0,0.0,14.095,0.0,6.095,0.0,0.0,21.286,0.0,34.714,60.19,2.19,10.857,38.714,29.81,53.381,10.0,0.0,0.619,11.095,22.048,0.0,11.952,12.714,11.571,26.524,0.0,97.19,178.952,42.238,127.048,33.143,63.333,47.286,9.0,63.095,62.81,59.333,14.905,179.429,39.524,...,1689.0,1268.333,1080.333,2327.333,4459.0,3747.667,156.0,5255.333,1396.333,12264.667,866.333,18186.333,654.333,420.333,28780.0,42970.0,1510.0,7620.0,5950.0,8450.0,9080.0,3760.0,2590.0,2820.0,14640.0,6800.0,10490.0,2760.0,4520.0,1850.0,33780.0,4790.0,2.85,6.433,6.6,6.0,18.2,9.6,4.6,22.4,12.7,4.3,7.4,11.6,7.333,5.3,9.8,5.3,7.167,7.4
2017-06-30,4.952,14.81,0.0,0.0,21.048,13.952,18.857,0.0,0.0,0.0,0.0,15.286,0.0,6.524,0.0,0.0,27.714,0.0,39.286,61.619,10.619,7.714,24.381,27.952,55.714,4.714,2.333,7.095,20.619,17.286,0.0,13.81,23.143,11.143,32.857,0.0,85.619,168.905,35.19,83.429,23.81,57.857,48.0,11.905,33.048,27.905,62.81,14.286,126.667,21.476,...,1427.667,1023.333,1049.0,2113.333,4121.333,3637.667,164.667,4699.0,1407.0,12940.0,609.667,19264.0,520.333,365.0,29250.0,44540.0,1800.0,8470.0,6330.0,8560.0,8980.0,4120.0,2990.0,3200.0,14700.0,7140.0,11780.0,2960.0,4790.0,2200.0,35590.0,5260.0,2.7,5.867,6.2,5.75,17.3,9.6,4.4,21.8,11.5,4.3,6.7,11.3,7.667,5.1,9.2,4.9,7.133,6.6
2017-09-30,4.905,16.095,0.0,0.0,12.429,21.048,16.048,0.0,0.0,0.0,0.0,11.81,0.0,10.762,0.0,0.0,9.667,0.0,35.762,68.952,16.095,8.143,20.476,33.762,56.0,6.762,0.0,9.095,21.333,29.952,0.0,18.048,8.286,11.19,30.619,0.0,89.143,171.19,32.143,75.905,21.81,60.381,42.952,11.429,28.667,28.048,67.143,16.476,145.048,25.571,...,2495.333,1796.333,1691.667,2704.0,5043.0,4514.0,283.333,5486.0,1742.667,14442.333,743.667,20279.0,960.0,388.0,28610.0,43130.0,2010.0,8810.0,6180.0,8400.0,8650.0,4420.0,3380.0,3360.0,16280.0,7120.0,12240.0,3000.0,4850.0,2650.0,34270.0,5330.0,2.75,5.733,6.1,5.3,16.8,9.5,4.2,21.0,10.3,4.1,6.6,11.3,7.067,4.7,8.7,4.8,7.033,6.6
2017-12-31,4.667,18.905,0.0,0.0,15.19,14.143,16.762,0.0,0.0,0.0,0.0,13.952,0.0,10.667,0.0,0.0,22.238,0.0,34.429,42.619,4.0,1.952,17.476,36.667,45.524,1.81,2.19,3.333,23.286,26.667,0.0,9.429,12.381,9.667,22.286,0.0,85.048,146.714,37.381,104.048,29.143,54.81,38.762,10.476,46.333,40.667,59.048,19.429,157.095,27.857,...,1842.333,1358.667,1138.667,2784.667,3961.333,3252.667,167.0,4942.0,1482.333,9845.667,570.0,15346.333,564.333,348.0,29160.0,46720.0,2070.0,9010.0,6520.0,8850.0,9000.0,4170.0,2950.0,3590.0,16920.0,7640.0,12460.0,3440.0,4860.0,2850.0,36230.0,5440.0,2.7,5.367,5.8,5.1,16.6,9.0,4.2,20.9,10.0,3.9,6.3,10.8,6.767,4.4,8.1,4.7,6.8,5.8
2018-03-31,4.429,10.762,0.0,0.0,16.905,18.762,20.429,0.0,0.0,0.0,0.0,13.476,0.0,5.857,0.0,0.0,21.143,0.0,40.0,36.333,8.714,7.952,25.81,25.619,52.286,1.19,0.714,3.857,23.143,33.476,0.0,17.667,34.714,5.429,28.81,0.0,74.0,174.667,48.333,140.286,31.857,59.571,46.952,12.476,65.476,62.0,65.762,23.571,187.333,37.667,...,1615.0,1245.667,1119.667,2325.0,4284.667,3459.667,179.333,5166.0,1590.222,11652.333,699.667,19815.667,640.0,354.333,28150.0,44400.0,1620.0,8350.0,6150.0,8670.0,9020.0,3830.0,2750.0,3090.0,16490.0,6990.0,11300.0,3040.0,4740.0,1990.0,34040.0,5120.0,2.55,5.3,5.5,4.75,16.2,9.2,4.2,20.3,9.3,3.7,6.0,11.0,7.067,4.0,7.6,4.5,6.567,5.6
2018-06-30,7.143,20.381,0.0,0.0,15.333,31.143,22.714,0.0,0.0,0.0,0.0,17.19,0.0,12.143,0.0,0.0,14.81,0.0,36.286,62.429,4.095,4.381,18.19,62.143,55.571,1.619,0.667,13.381,19.81,30.333,0.0,19.19,2.857,16.762,19.048,0.0,73.286,183.143,42.286,94.0,27.286,59.905,62.048,13.19,40.429,31.619,64.81,26.238,137.095,25.095,...,1444.0,1053.667,1067.333,2200.0,4225.667,3464.333,195.0,5036.0,1617.778,12849.333,607.667,21976.0,556.0,305.889,28920.0,46090.0,1940.0,9120.0,6540.0,8740.0,9110.0,4180.0,3210.0,3440.0,16060.0,7320.0,12750.0,3110.0,5000.0,2430.0,35810.0,5570.0,2.35,5.2,5.4,4.55,15.4,9.2,4.0,19.5,8.3,3.6,5.8,11.0,6.233,3.8,7.1,4.2,6.267,5.3
2018-09-30,6.429,12.762,0.0,0.0,28.143,18.714,19.762,0.0,0.0,0.0,0.0,18.476,0.0,6.571,0.0,0.0,17.286,0.0,33.0,49.333,18.476,9.619,35.667,34.238,56.381,2.476,1.81,4.524,30.667,37.81,0.0,15.952,5.905,13.429,43.81,0.0,78.048,175.238,39.619,81.667,25.571,63.905,48.381,27.524,35.429,31.905,63.905,21.476,111.762,30.905,...,2545.333,1759.667,1683.0,2685.0,4854.333,3951.333,289.333,5557.333,1927.667,13522.667,705.667,20945.0,959.0,373.0,29200.0,45050.0,2210.0,9370.0,6360.0,8610.0,9010.0,4500.0,3580.0,3530.0,17470.0,7230.0,13290.0,3190.0,5090.0,2920.0,34280.0,5710.0,2.45,5.167,5.2,4.2,14.9,8.9,4.0,19.0,8.3,3.8,5.6,10.1,6.133,3.9,6.9,3.9,6.167,5.1
2018-12-31,7.286,12.286,0.0,0.0,24.476,26.0,19.524,0.0,0.0,0.0,0.0,21.143,0.0,11.381,0.0,0.0,14.429,0.0,24.619,46.286,6.476,1.571,28.333,37.905,49.952,5.143,0.0,1.619,18.476,15.0,0.0,23.714,19.381,0.0,25.381,0.0,86.857,175.476,49.619,105.0,57.333,81.0,46.762,23.0,51.81,60.619,63.143,22.048,115.238,35.095,...,1915.667,1342.667,1206.667,2703.333,3709.333,2886.667,183.0,5151.667,1467.222,9602.333,596.0,16656.0,551.667,330.0,30490.0,48280.0,2210.0,9420.0,6730.0,9070.0,9280.0,4230.0,3150.0,3840.0,17250.0,7750.0,13690.0,3620.0,5130.0,3150.0,36680.0,5740.0,2.4,4.867,4.8,4.05,14.6,8.8,3.9,18.4,7.7,3.7,5.6,10.5,5.767,3.8,6.7,4.1,6.1,4.5


## Experiments

In [4]:
T_TEST_MIN = "2014"
T_TEST_MAX = "2014"



### Which AR order?

up to 8

In [16]:
### TRAINING
t_min = "2010"
t_max = "2019"

n_years_eff = int(t_max) - int(t_min) - 1

cv_default = model_selection.KFold(n_splits=n_years_eff, shuffle=False)

feature_combinations = [["value"], ["value", "19"], ["19"]]

lag_order = [1, 2, 3, 4, 5, 6, 7, 8]

tuners = []
cv_scores, test_scores = [], []
train_stackeds, test_stackeds, train_unstackeds = [], [], []
model_names = []

# params = {}
params = {
    "randomforestregressor__max_features": ["auto", "sqrt"],
    "randomforestregressor__min_samples_leaf": [1, 2, 4, 8],
    "randomforestregressor__min_samples_split": [2, 4, 8, 16],
}

for features, order in itertools.product(feature_combinations, lag_order):

    model_names.append("+".join(features) + "_" + str(order))

    labeled = eumf_pipeline.prepare_data(
        panel_comb_3m_macro,
        columns=features,
        lags=list(range(1, order + 1)),
        t_min=t_min,
        t_max=t_max,
    )
    transformed = eumf_pipeline.transform_data(labeled)
    train, test = eumf_pipeline.split_data(
        transformed, t_test_min=T_TEST_MIN, t_test_max=T_TEST_MAX
    )
    train_stacked, test_stacked = eumf_pipeline.stack_data(train, test)

    train_stackeds.append(train_stacked)
    train_unstackeds.append(train)
    test_stackeds.append(test_stacked)

    tuner = eumf_pipeline.train_reg_model(
        train_stacked,
        reg=ensemble.RandomForestRegressor(random_state=42),
        extra_pipeline_steps=[preprocessing.StandardScaler()],
        params=params,
        scoring=eumf_eval.scorer_rmse,
    )
    tuners.append(tuner)

    cv_score = eumf_eval.score_cv(tuner.best_estimator_, train_stacked, cv=cv_default,)
    cv_scores.append(cv_score)

    test_score = eumf_eval.score_test(tuner.best_estimator_, test_stacked,)
    test_scores.append(test_score)


In [17]:
pd.DataFrame({k: t.best_params_ for k, t in zip(model_names, tuners)}).transpose()

Unnamed: 0,randomforestregressor__max_features,randomforestregressor__min_samples_leaf,randomforestregressor__min_samples_split
value_1,auto,8,2
value_2,auto,8,2
value_3,auto,8,2
value_4,auto,4,2
value_5,auto,4,2
value_6,auto,4,2
value_7,auto,4,2
value_8,auto,4,2
value+19_1,auto,8,2
value+19_2,auto,8,2


In [18]:
eumf_eval.agg_multiple_cv_scores(cv_scores, model_names)


Unnamed: 0_level_0,fit_time,fit_time,fit_time,score_time,score_time,score_time,test_mae,test_mae,test_mae,test_rmse,test_rmse,test_rmse,test_explained_variance,test_explained_variance,test_explained_variance,test_r2_mod,test_r2_mod,test_r2_mod,test_delta_mae,test_delta_mae,test_delta_mae
Unnamed: 0_level_1,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem
value_1,0.326,0.093,0.033,0.028,0.007,0.003,-0.071,0.018,0.006,-0.102,0.032,0.011,0.3607,0.193,0.068,0.449,0.268,0.095,0.047,0.056,0.02
value_2,0.334,0.059,0.021,0.028,0.006,0.002,-0.07,0.019,0.007,-0.102,0.033,0.012,0.3758,0.172,0.061,0.462,0.241,0.085,0.047,0.057,0.02
value_3,0.31,0.059,0.021,0.025,0.007,0.002,-0.071,0.018,0.006,-0.103,0.033,0.012,0.3747,0.174,0.062,0.455,0.244,0.086,0.047,0.057,0.02
value_4,0.366,0.085,0.03,0.023,0.011,0.004,-0.069,0.015,0.005,-0.099,0.029,0.01,0.3952,0.178,0.063,0.471,0.248,0.088,0.049,0.061,0.022
value_5,0.329,0.038,0.013,0.023,0.009,0.003,-0.068,0.016,0.005,-0.098,0.03,0.011,0.4095,0.168,0.059,0.49,0.23,0.081,0.05,0.06,0.021
value_6,0.372,0.048,0.017,0.021,0.008,0.003,-0.068,0.015,0.005,-0.099,0.031,0.011,0.4119,0.161,0.057,0.49,0.227,0.08,0.05,0.06,0.021
value_7,0.322,0.086,0.03,0.02,0.008,0.003,-0.067,0.015,0.005,-0.098,0.03,0.011,0.4124,0.165,0.058,0.488,0.232,0.082,0.05,0.061,0.021
value_8,0.394,0.084,0.03,0.021,0.005,0.002,-0.067,0.016,0.006,-0.098,0.031,0.011,0.4144,0.168,0.059,0.491,0.232,0.082,0.05,0.06,0.021
value+19_1,0.25,0.05,0.018,0.022,0.005,0.002,-0.068,0.014,0.005,-0.099,0.029,0.01,0.3765,0.197,0.07,0.465,0.267,0.094,0.05,0.061,0.022
value+19_2,0.451,0.09,0.032,0.036,0.011,0.004,-0.066,0.015,0.005,-0.098,0.031,0.011,0.4034,0.162,0.057,0.491,0.231,0.082,0.052,0.06,0.021


up to 4, one more year in training

In [19]:
### TRAINING
t_min = "2009"
t_max = "2019"

n_years_eff = int(t_max) - int(t_min) - 1

cv_default = model_selection.KFold(n_splits=n_years_eff, shuffle=False)

feature_combinations = [["value"], ["value", "19"], ["19"]]

lag_order = [1, 2, 3, 4]

tuners = []
cv_scores, test_scores = [], []
train_stackeds, test_stackeds, train_unstackeds = [], [], []
model_names = []

# params = {}
params = {
    "randomforestregressor__max_features": ["auto", "sqrt"],
    "randomforestregressor__min_samples_leaf": [1, 2, 4, 8],
    "randomforestregressor__min_samples_split": [2, 4, 8, 16],
}

for features, order in itertools.product(feature_combinations, lag_order):

    model_names.append("+".join(features) + "_" + str(order))

    labeled = eumf_pipeline.prepare_data(
        panel_comb_3m_macro,
        columns=features,
        lags=list(range(1, order + 1)),
        t_min=t_min,
        t_max=t_max,
    )
    transformed = eumf_pipeline.transform_data(labeled)
    train, test = eumf_pipeline.split_data(
        transformed, t_test_min=T_TEST_MIN, t_test_max=T_TEST_MAX
    )
    train_stacked, test_stacked = eumf_pipeline.stack_data(train, test)

    train_stackeds.append(train_stacked)
    train_unstackeds.append(train)
    test_stackeds.append(test_stacked)

    tuner = eumf_pipeline.train_reg_model(
        train_stacked,
        reg=ensemble.RandomForestRegressor(random_state=42),
        extra_pipeline_steps=[preprocessing.StandardScaler()],
        params=params,
        scoring=eumf_eval.scorer_rmse,
    )
    tuners.append(tuner)

    cv_score = eumf_eval.score_cv(tuner.best_estimator_, train_stacked, cv=cv_default,)
    cv_scores.append(cv_score)

    test_score = eumf_eval.score_test(tuner.best_estimator_, test_stacked,)
    test_scores.append(test_score)


In [20]:
pd.DataFrame({k: t.best_params_ for k, t in zip(model_names, tuners)}).transpose()

Unnamed: 0,randomforestregressor__max_features,randomforestregressor__min_samples_leaf,randomforestregressor__min_samples_split
value_1,auto,4,16
value_2,auto,4,16
value_3,auto,4,16
value_4,auto,4,16
value+19_1,auto,2,16
value+19_2,auto,2,16
value+19_3,auto,4,16
value+19_4,auto,4,16
19_1,auto,8,2
19_2,auto,8,2


In [21]:
eumf_eval.agg_multiple_cv_scores([df[1:] for df in cv_scores], model_names)


Unnamed: 0_level_0,fit_time,fit_time,fit_time,score_time,score_time,score_time,test_mae,test_mae,test_mae,test_rmse,test_rmse,test_rmse,test_explained_variance,test_explained_variance,test_explained_variance,test_r2_mod,test_r2_mod,test_r2_mod,test_delta_mae,test_delta_mae,test_delta_mae
Unnamed: 0_level_1,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem
value_1,0.455,0.157,0.055,0.042,0.02,0.007,-0.069,0.014,0.005,-0.101,0.027,0.01,0.355,0.238,0.084,0.432,0.312,0.11,0.049,0.061,0.022
value_2,0.224,0.038,0.013,0.019,0.004,0.002,-0.068,0.014,0.005,-0.099,0.028,0.01,0.383,0.21,0.074,0.457,0.282,0.1,0.05,0.062,0.022
value_3,0.466,0.101,0.036,0.044,0.017,0.006,-0.069,0.014,0.005,-0.1,0.028,0.01,0.379,0.21,0.074,0.45,0.282,0.1,0.049,0.062,0.022
value_4,0.38,0.228,0.081,0.03,0.023,0.008,-0.066,0.013,0.005,-0.097,0.028,0.01,0.41,0.182,0.064,0.478,0.258,0.091,0.052,0.064,0.023
value+19_1,0.604,0.171,0.06,0.049,0.019,0.007,-0.066,0.011,0.004,-0.098,0.024,0.009,0.361,0.259,0.091,0.436,0.32,0.113,0.051,0.066,0.023
value+19_2,0.416,0.05,0.018,0.028,0.007,0.002,-0.065,0.011,0.004,-0.097,0.025,0.009,0.386,0.236,0.084,0.458,0.293,0.104,0.053,0.066,0.023
value+19_3,0.32,0.04,0.014,0.02,0.005,0.002,-0.066,0.012,0.004,-0.096,0.026,0.009,0.407,0.204,0.072,0.482,0.265,0.094,0.052,0.066,0.023
value+19_4,0.41,0.1,0.035,0.021,0.006,0.002,-0.065,0.012,0.004,-0.094,0.027,0.01,0.432,0.177,0.062,0.51,0.237,0.084,0.052,0.064,0.023
19_1,0.263,0.08,0.028,0.02,0.006,0.002,-0.105,0.031,0.011,-0.145,0.042,0.015,-0.125,0.408,0.144,-0.162,0.609,0.215,0.012,0.046,0.016
19_2,0.216,0.053,0.019,0.022,0.013,0.005,-0.096,0.026,0.009,-0.138,0.043,0.015,-0.047,0.372,0.132,-0.032,0.525,0.186,0.022,0.051,0.018


### Add absolute values to GT

In [22]:
### TRAINING

t_min = "2010"
t_max = "2019"
n_years_eff = int(t_max) - int(t_min) - 1
cv_default = model_selection.KFold(n_splits=n_years_eff, shuffle=False)

feature_combinations = [["value", "19"], ["19"], ["19", "gdp", "unempl"]]
w_absolutes = [False, True]

tuners = []
cv_scores, test_scores = [], []
train_stackeds, test_stackeds, train_unstackeds = [], [], []
model_names = []

# params = {}
params = {
    "randomforestregressor__max_features": ["auto", "sqrt"],
    "randomforestregressor__min_samples_leaf": [1, 2, 4, 8],
    "randomforestregressor__min_samples_split": [2, 4, 8, 16],
}

for features, w_absolutes in itertools.product(feature_combinations, w_absolutes):

    model_names.append("+".join(features) + "_" + str(w_absolutes))

    labeled = eumf_pipeline.prepare_data(
        panel_comb_3m_macro, columns=features, lags=list(range(1, 6)), t_min="2010"
    )
    transformed = eumf_pipeline.transform_data(labeled)
    if w_absolutes:
        extra_cols = [c for c in labeled.x.columns if c[0].startswith("19")]
        transformed.x = transformed.x.join(labeled.x[extra_cols], rsuffix="_abs")
    train, test = eumf_pipeline.split_data(
        transformed, t_test_min=T_TEST_MIN, t_test_max=T_TEST_MAX
    )
    train_stacked, test_stacked = eumf_pipeline.stack_data(train, test)

    train_stackeds.append(train_stacked)
    train_unstackeds.append(train)
    test_stackeds.append(test_stacked)

    tuner = eumf_pipeline.train_reg_model(
        train_stacked,
        reg=ensemble.RandomForestRegressor(random_state=42),
        extra_pipeline_steps=[preprocessing.StandardScaler()],
        params=params,
        scoring=eumf_eval.scorer_rmse,
    )
    tuners.append(tuner)

    cv_score = eumf_eval.score_cv(tuner.best_estimator_, train_stacked, cv=cv_default,)
    cv_scores.append(cv_score)

    test_score = eumf_eval.score_test(tuner.best_estimator_, test_stacked,)
    test_scores.append(test_score)


In [23]:
pd.DataFrame({k: t.best_params_ for k, t in zip(model_names, tuners)}).transpose()

Unnamed: 0,randomforestregressor__max_features,randomforestregressor__min_samples_leaf,randomforestregressor__min_samples_split
value+19_False,auto,4,2
value+19_True,auto,8,2
19_False,auto,1,16
19_True,sqrt,1,2
19+gdp+unempl_False,sqrt,1,2
19+gdp+unempl_True,sqrt,4,2


In [24]:
eumf_eval.agg_multiple_cv_scores(cv_scores, model_names)


Unnamed: 0_level_0,fit_time,fit_time,fit_time,score_time,score_time,score_time,test_mae,test_mae,test_mae,test_rmse,test_rmse,test_rmse,test_explained_variance,test_explained_variance,test_explained_variance,test_r2_mod,test_r2_mod,test_r2_mod,test_delta_mae,test_delta_mae,test_delta_mae
Unnamed: 0_level_1,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem
value+19_False,0.38,0.064,0.023,0.018,0.008,0.002699,-0.065,0.014,0.005,-0.095,0.03,0.011,0.437,0.147,0.052,0.522,0.214,0.076,0.053,0.061,0.022
value+19_True,0.476,0.145,0.051,0.02,0.007,0.002453,-0.065,0.015,0.005,-0.097,0.032,0.011,0.434,0.14,0.05,0.52,0.209,0.074,0.052,0.06,0.021
19_False,0.358,0.072,0.025,0.02,0.007,0.002512,-0.086,0.026,0.009,-0.129,0.043,0.015,-0.005,0.404,0.143,0.113,0.505,0.179,0.031,0.049,0.017
19_True,0.27,0.062,0.022,0.023,0.009,0.003178,-0.087,0.03,0.011,-0.127,0.049,0.017,0.11,0.276,0.097,0.188,0.391,0.138,0.031,0.047,0.017
19+gdp+unempl_False,0.15,0.02,0.007,0.012,0.001,0.000465,-0.083,0.027,0.01,-0.118,0.044,0.016,0.202,0.18,0.064,0.311,0.282,0.1,0.035,0.049,0.017
19+gdp+unempl_True,0.135,0.025,0.009,0.011,0.002,0.0006386,-0.085,0.027,0.01,-0.118,0.042,0.015,0.185,0.192,0.068,0.295,0.302,0.107,0.033,0.049,0.017


### Normalization scheme of x values

In [9]:
### TRAINING

t_min = "2010"
t_max = "2019"
n_years_eff = int(t_max) - int(t_min) - 1
cv_default = model_selection.KFold(n_splits=n_years_eff, shuffle=False)

feature_combinations = [["19"], ["19", "gdp", "unempl"], ["19", "gdp", "unempl", "value"]]
epsilons = [0.1, 1., 5., 10.]

tuners = []
cv_scores, test_scores = [], []
train_stackeds, test_stackeds, train_unstackeds = [], [], []
model_names = []

# params = {}
params = {
}

for features, eps in itertools.product(feature_combinations, epsilons):

    model_names.append("+".join(features) + "_" + str(eps))

    labeled = eumf_pipeline.prepare_data(
        panel_comb_3m_macro, columns=features, lags=list(range(1, 6)), t_min="2010"
    )
    transformed = eumf_pipeline.transform_data(labeled, eps_x=eps)
    train, test = eumf_pipeline.split_data(
        transformed, t_test_min=T_TEST_MIN, t_test_max=T_TEST_MAX
    )
    train_stacked, test_stacked = eumf_pipeline.stack_data(train, test)

    train_stackeds.append(train_stacked)
    train_unstackeds.append(train)
    test_stackeds.append(test_stacked)

    tuner = eumf_pipeline.train_reg_model(
        train_stacked,
        reg=ensemble.RandomForestRegressor(random_state=42),
        extra_pipeline_steps=[preprocessing.StandardScaler()],
        params=params,
        scoring=eumf_eval.scorer_rmse,
    )
    tuners.append(tuner)

    cv_score = eumf_eval.score_cv(tuner.best_estimator_, train_stacked, cv=cv_default,)
    cv_scores.append(cv_score)

    test_score = eumf_eval.score_test(tuner.best_estimator_, test_stacked,)
    test_scores.append(test_score)


In [10]:
eumf_eval.agg_multiple_cv_scores(cv_scores, model_names)


Unnamed: 0_level_0,fit_time,fit_time,fit_time,score_time,score_time,score_time,test_mae,test_mae,test_mae,test_rmse,test_rmse,test_rmse,test_explained_variance,test_explained_variance,test_explained_variance,test_r2_mod,test_r2_mod,test_r2_mod,test_delta_mae,test_delta_mae,test_delta_mae
Unnamed: 0_level_1,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem,mean,std,sem
19_0.1,0.242,0.015,0.005,0.012,0.001572,0.0005558,-0.088,0.026,0.009,-0.129,0.041,0.015,-0.017,0.388,0.137,0.105,0.497,0.176,0.03,0.049,0.017
19_1.0,0.28,0.02,0.007,0.013,0.0007809,0.0002761,-0.088,0.026,0.009,-0.13,0.041,0.014,-0.031,0.371,0.131,0.089,0.487,0.172,0.03,0.049,0.017
19_5.0,0.4,0.145,0.051,0.021,0.01061,0.00375,-0.09,0.028,0.01,-0.134,0.044,0.016,-0.039,0.225,0.08,0.093,0.364,0.129,0.028,0.048,0.017
19_10.0,0.488,0.176,0.062,0.025,0.01079,0.003817,-0.089,0.026,0.009,-0.135,0.045,0.016,-0.068,0.313,0.111,0.059,0.438,0.155,0.028,0.05,0.018
19+gdp+unempl_0.1,0.612,0.076,0.027,0.018,0.003609,0.001276,-0.087,0.027,0.01,-0.123,0.041,0.015,0.084,0.189,0.067,0.223,0.336,0.119,0.031,0.047,0.017
19+gdp+unempl_1.0,0.558,0.114,0.04,0.017,0.003558,0.001258,-0.086,0.027,0.01,-0.122,0.041,0.014,0.089,0.182,0.064,0.231,0.335,0.119,0.032,0.047,0.017
19+gdp+unempl_5.0,0.477,0.037,0.013,0.014,0.001012,0.000358,-0.085,0.028,0.01,-0.121,0.044,0.016,0.128,0.171,0.06,0.271,0.305,0.108,0.033,0.047,0.017
19+gdp+unempl_10.0,0.542,0.113,0.04,0.016,0.003041,0.001075,-0.086,0.03,0.011,-0.121,0.045,0.016,0.132,0.157,0.056,0.274,0.297,0.105,0.032,0.045,0.016
19+gdp+unempl+value_0.1,0.534,0.024,0.009,0.013,0.0004607,0.0001629,-0.065,0.013,0.005,-0.097,0.029,0.01,0.393,0.2,0.071,0.485,0.262,0.092,0.052,0.062,0.022
19+gdp+unempl+value_1.0,0.745,0.244,0.086,0.018,0.005618,0.001986,-0.066,0.015,0.005,-0.097,0.029,0.01,0.387,0.197,0.07,0.481,0.261,0.092,0.051,0.06,0.021
