In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import pylab
import copy 

from sklearn import ensemble
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.model_selection import ParameterGrid
from sklearn import metrics

from sklearn.linear_model import ElasticNet

In [2]:
# Load the full training set from a pickled pandas object.
# NOTE(review): the path is absolute and machine-specific, and unpickling can
# execute arbitrary code, so this must only ever point at a trusted file.
data_training_full = pd.read_pickle('/mnt/disks/disk1/data_train_1209.pkl')

In [3]:
# Draw a reproducible 10% sample from the full set and from each half of it
# (split by row position), restoring index order after sampling.
half_row = data_training_full.shape[0] // 2
sampling = dict(frac=0.1, random_state=123)

data_training = data_training_full.sample(**sampling).sort_index()
data_training_h1 = data_training_full.iloc[:half_row, :].sample(**sampling).sort_index()
data_training_h2 = data_training_full.iloc[half_row:, :].sample(**sampling).sort_index()

In [4]:
def model_wf_cv(alg, dtrain, predictors, target, n_splits):
    """Cross-validate ``alg`` on ``dtrain`` and return averaged fold scores.

    For every fold produced by ``KFold(n_splits=n_splits)`` the estimator is
    refit on the training rows and scored on both the training rows and the
    held-out rows.

    Parameters
    ----------
    alg : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit
        in place on every fold.
    dtrain : frame indexed positionally via ``.iloc`` and by column label.
    predictors : column labels used as features.
    target : column label of the response.
    n_splits : number of folds.

    Returns
    -------
    tuple
        ``(mean train RMSE, mean test RMSE, mean test R2)`` over the folds.
    """
    splitter = KFold(n_splits=n_splits)  # TimeSeriesSplit

    def _rmse(actual, predicted):
        # Root of the mean squared error for one fold.
        return np.sqrt(metrics.mean_squared_error(actual, predicted))

    rmse_train, rmse_test, r2_test = [], [], []

    for fit_rows, holdout_rows in splitter.split(dtrain):
        fit_frame = dtrain.iloc[fit_rows]
        holdout_frame = dtrain.iloc[holdout_rows]

        alg.fit(fit_frame[predictors], fit_frame[target])

        holdout_pred = alg.predict(holdout_frame[predictors])
        fit_pred = alg.predict(fit_frame[predictors])

        rmse_test.append(_rmse(holdout_frame[target].values, holdout_pred))
        rmse_train.append(_rmse(fit_frame[target].values, fit_pred))
        r2_test.append(metrics.r2_score(holdout_frame[target].values, holdout_pred))

    return np.mean(rmse_train), np.mean(rmse_test), np.mean(r2_test)

def grid_search_rmse(alg, dtrain, predictors, target, n_splits, parameters):
    """Try every combination in ``parameters`` and keep the lowest CV test RMSE.

    Each candidate is applied to ``alg`` with ``set_params`` and evaluated with
    ``model_wf_cv``; the candidate and its scores are printed as it goes.
    ``alg`` is mutated in place and is left configured with the *last*
    candidate tried, not the best one.

    Parameters
    ----------
    alg : estimator exposing ``set_params``, ``fit`` and ``predict``.
    dtrain, predictors, target, n_splits : forwarded to ``model_wf_cv``.
    parameters : grid specification accepted by ``ParameterGrid``.

    Returns
    -------
    tuple
        ``(best_score, best_grid)``. ``best_grid`` is ``None`` (and
        ``best_score`` is ``inf``) when no candidate scored strictly below
        ``inf``, e.g. an empty grid or NaN scores.
    """
    best_score = np.inf
    # Initialised up front so the return below cannot hit an unbound local
    # when the loop never improves on +inf.
    best_grid = None
    for g in ParameterGrid(parameters):
        print(g)
        alg.set_params(**g)
        wf_cv_score_train, wf_cv_score_test, wf_cv_score_test_r2 = model_wf_cv(
            alg, dtrain, predictors, target, n_splits
        )
        if wf_cv_score_test < best_score:
            best_score = wf_cv_score_test
            best_grid = g
        print("\tCV score test: %f (R2 %f)\tCV score train: %f"%(wf_cv_score_test, wf_cv_score_test_r2, wf_cv_score_train))
    return best_score, best_grid

In [5]:
# Feature columns: every column of the sampled frame except the target 'y',
# kept as a numpy array of labels.
all_columns = list(data_training)
is_feature = [name != 'y' for name in all_columns]
predictors = np.array(all_columns)[is_feature]

# Three independent, identically configured ElasticNet models: one for the
# full sample and one for each half.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the installed version still accepts it.
regr, regr_h1, regr_h2 = (
    ElasticNet(normalize=True, random_state=0) for _ in range(3)
)

In [7]:
%%time

# Hyper-parameter grid for the model trained on the full 10% sample.
params_elastnet = dict(
    alpha=[7e-6, 9e-6, 1e-5],
    l1_ratio=[0.5, 0.6, 0.7, 0.8, 0.9],
)

# Search on a deep copy so `regr` itself is not reconfigured by the search.
search_result_full = grid_search_rmse(
    copy.deepcopy(regr), data_training, predictors, 'y', 3, params_elastnet
)
print(search_result_full)

{'alpha': 7e-06, 'l1_ratio': 0.5}
	CV score test: 0.696736 (R2 0.018406)	CV score train: 0.698550
{'alpha': 7e-06, 'l1_ratio': 0.6}
	CV score test: 0.696729 (R2 0.018434)	CV score train: 0.698530
{'alpha': 7e-06, 'l1_ratio': 0.7}
	CV score test: 0.696728 (R2 0.018445)	CV score train: 0.698504
{'alpha': 7e-06, 'l1_ratio': 0.8}
	CV score test: 0.696740 (R2 0.018421)	CV score train: 0.698470
{'alpha': 7e-06, 'l1_ratio': 0.9}
	CV score test: 0.696797 (R2 0.018266)	CV score train: 0.698418
{'alpha': 9e-06, 'l1_ratio': 0.5}
	CV score test: 0.696736 (R2 0.018409)	CV score train: 0.698768
{'alpha': 9e-06, 'l1_ratio': 0.6}
	CV score test: 0.696724 (R2 0.018450)	CV score train: 0.698751
{'alpha': 9e-06, 'l1_ratio': 0.7}
	CV score test: 0.696722 (R2 0.018467)	CV score train: 0.698729
{'alpha': 9e-06, 'l1_ratio': 0.8}
	CV score test: 0.696723 (R2 0.018473)	CV score train: 0.698696
{'alpha': 9e-06, 'l1_ratio': 0.9}
	CV score test: 0.696757 (R2 0.018394)	CV score train: 0.698642
{'alpha': 1e-05, 'l1

In [9]:
%%time

# Hyper-parameter grid for the model trained on the first-half sample.
params_elastnet_h1 = dict(
    alpha=[5e-6, 1e-5, 1e-4],
    l1_ratio=[0.5, 0.7, 0.9],
)

# Search on a deep copy so `regr_h1` itself is not reconfigured by the search.
search_result_h1 = grid_search_rmse(
    copy.deepcopy(regr_h1), data_training_h1, predictors, 'y', 3, params_elastnet_h1
)
print(search_result_h1)

{'alpha': 5e-06, 'l1_ratio': 0.5}
	CV score test: 0.769017 (R2 0.016065)	CV score train: 0.764399
{'alpha': 5e-06, 'l1_ratio': 0.7}
	CV score test: 0.769133 (R2 0.015766)	CV score train: 0.764311
{'alpha': 5e-06, 'l1_ratio': 0.9}
	CV score test: 0.769342 (R2 0.015234)	CV score train: 0.764163
{'alpha': 1e-05, 'l1_ratio': 0.5}
	CV score test: 0.768405 (R2 0.017640)	CV score train: 0.765108
{'alpha': 1e-05, 'l1_ratio': 0.7}
	CV score test: 0.768392 (R2 0.017676)	CV score train: 0.765071
{'alpha': 1e-05, 'l1_ratio': 0.9}
	CV score test: 0.768474 (R2 0.017468)	CV score train: 0.764965
{'alpha': 0.0001, 'l1_ratio': 0.5}
	CV score test: 0.769509 (R2 0.014833)	CV score train: 0.769164
{'alpha': 0.0001, 'l1_ratio': 0.7}
	CV score test: 0.769668 (R2 0.014430)	CV score train: 0.769452
{'alpha': 0.0001, 'l1_ratio': 0.9}
	CV score test: 0.769605 (R2 0.014589)	CV score train: 0.769428
(0.7683916538448633, {'alpha': 1e-05, 'l1_ratio': 0.7})
CPU times: user 6min 48s, sys: 41.1 s, total: 7min 29s
Wall

In [12]:
%%time

# Hyper-parameter grid for the model trained on the second-half sample.
params_elastnet_h2 = dict(
    alpha=[5e-6, 1e-5, 1e-4],
    l1_ratio=[0.3, 0.5, 0.7],
)

# Search on a deep copy so `regr_h2` itself is not reconfigured by the search.
search_result_h2 = grid_search_rmse(
    copy.deepcopy(regr_h2), data_training_h2, predictors, 'y', 3, params_elastnet_h2
)
print(search_result_h2)

{'alpha': 5e-06, 'l1_ratio': 0.3}
	CV score test: 0.628012 (R2 0.012688)	CV score train: 0.624125
{'alpha': 5e-06, 'l1_ratio': 0.5}
	CV score test: 0.627973 (R2 0.012811)	CV score train: 0.624095
{'alpha': 5e-06, 'l1_ratio': 0.7}
	CV score test: 0.628039 (R2 0.012587)	CV score train: 0.624044
{'alpha': 1e-05, 'l1_ratio': 0.3}
	CV score test: 0.627394 (R2 0.014687)	CV score train: 0.624728
{'alpha': 1e-05, 'l1_ratio': 0.5}
	CV score test: 0.627295 (R2 0.015000)	CV score train: 0.624718
{'alpha': 1e-05, 'l1_ratio': 0.7}
	CV score test: 0.627322 (R2 0.014901)	CV score train: 0.624695
{'alpha': 0.0001, 'l1_ratio': 0.3}
	CV score test: 0.627687 (R2 0.014018)	CV score train: 0.628591
{'alpha': 0.0001, 'l1_ratio': 0.5}
	CV score test: 0.627958 (R2 0.013184)	CV score train: 0.629083
{'alpha': 0.0001, 'l1_ratio': 0.7}
	CV score test: 0.628095 (R2 0.012766)	CV score train: 0.629381
(0.6272953815035068, {'alpha': 1e-05, 'l1_ratio': 0.5})
CPU times: user 6min 48s, sys: 41.8 s, total: 7min 30s
Wall

In [6]:
# Configure each model with its hard-coded hyper-parameters. For the two
# half-sample models these equal the best grids printed by the searches
# earlier in this notebook.
regr.set_params(alpha=9e-06, l1_ratio=0.7)
regr_h1.set_params(alpha=1e-05, l1_ratio=0.7)
regr_h2.set_params(alpha=1e-05, l1_ratio=0.5)

# Fit every model on features and targets taken from the SAME sample.
regr.fit(data_training[predictors], data_training['y'])
regr_h1.fit(data_training_h1[predictors], data_training_h1['y'])
# Bug fix: the features used to come from data_training_h1 while the targets
# came from data_training_h2, pairing unrelated rows.
regr_h2.fit(data_training_h2[predictors], data_training_h2['y'])

# Persist the fitted models and the feature-column order they were trained on.
# Context managers make sure both files are flushed and closed.
with open("./core_models/regr1.pkl", "wb") as models_file:
    pickle.dump([regr, regr_h1, regr_h2], models_file)
with open("./core_models/regr1_columns.pkl", "wb") as columns_file:
    pickle.dump(predictors, columns_file)