In [9]:
import pathlib

import numpy as np
import pandas as pd

import datetime as dt

import itertools

import holidays_co

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, TimeSeriesSplit

from sklearn import preprocessing

# from sklearnex import patch_sklearn
# patch_sklearn()

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from joblib import dump, load

# Datos

In [10]:
# Load the processed dataset (first CSV column becomes the index) and discard
# the 'value_no_cl' column, which is not used further in this notebook.
data = (
    pd.read_csv('./data/processed/dataset.csv', index_col=0)
    .drop(columns=['value_no_cl'])
)

# The index is read as plain labels; convert it to datetimes so the
# date-string slices used later (.loc['2012-01-01':...]) work.
data.index = pd.to_datetime(data.index)

# Show floats with five decimals in every rendered frame.
pd.set_option('display.float_format', lambda number: '%.5f' % number)

In [11]:
# Sanity check of the loaded frame: displays (n_rows, n_columns).
data.shape

(2557, 17)

## Explora correlación

In [12]:
# Disabled exploratory cell: Spearman correlation heatmap of `data`
# restricted to 2012-01-01..2017-12-31. Uncomment the lines below to render it.
# data_corr = data.loc['2012-01-01':'2017-12-31'].corr(method='spearman')
#
# fig = plt.figure(figsize=(12, 12))
# ax = sns.heatmap(data_corr, annot=True, linewidths=.5, vmin=-1, vmax=1, cmap='bwr', fmt='.2f')
# plt.show()
# plt.close('all')

In [13]:
# Disabled exploratory cell: Spearman correlation heatmap of `data_anom`.
# NOTE(review): `data_anom` is not defined in the cells visible here — it must
# be created before this block can be re-enabled.
# data_corr = data_anom.corr(method='spearman')
#
# fig = plt.figure(figsize=(12, 12))
# ax = sns.heatmap(data_corr, annot=True, linewidths=.5, vmin=-1, vmax=1, cmap='bwr', fmt='.2f')
# plt.show()
# plt.close('all')

## Datos para entrenamiento, prueba y validación

In [14]:
# Predictors are every column except the target; the target is kept as a
# one-column DataFrame so the scaler below receives 2-D input.
X = data.drop(columns=['value'])
y = data[['value']]

# Chronological split by calendar period (label slices include both endpoints).
train_period = slice('2012-01-01', '2016-12-31')
val_period = slice('2017-01-01', '2017-12-31')
test_period = slice('2018-01-01', '2018-06-30')

X_train, y_train = X.loc[train_period], y.loc[train_period]
X_val, y_val = X.loc[val_period], y.loc[val_period]
X_test = X.loc[test_period]  # only predictors are taken for the test period

# Target scaler fitted on the training period only: no centering, scale taken
# from the 0-99 quantile range.
scaler_y = preprocessing.RobustScaler(
    with_centering=False,
    quantile_range=(0, 99),
).fit(y_train)
print(scaler_y.center_)
print(scaler_y.scale_)


def _scale_target(target):
    """Apply the fitted `scaler_y` to `target`, keeping its index and column labels."""
    return pd.DataFrame(
        data=scaler_y.transform(target),
        index=target.index,
        columns=target.columns,
    )


y_train_scaled = _scale_target(y_train)
y_val_scaled = _scale_target(y_val)

# Displayed as the cell output: shapes of every split.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

None
[2094.1]


((1827, 16), (365, 16), (1827, 1), (365, 1), (181, 16))

# Entrenamiento

In [15]:
def _search_spec(estimator, n_cv, grid):
    """Bundle one candidate model with its fold count and hyper-parameter grid.

    Returns a dict with the keys 'estimator', 'n_cv' and 'parameters_model'.
    Grid keys are prefixed with 'estimator__' so they address the pipeline
    step named "estimator".
    """
    return {'estimator': estimator, 'n_cv': n_cv, 'parameters_model': grid}


# Candidate regressors and the grids searched for each of them.
# NOTE(review): 'n_cv' is stored here, but the training loop in this notebook
# passes a fixed TimeSeriesSplit as `cv` (the per-model lookup is commented
# out there) — confirm whether 'n_cv' is still needed.
models_dict = {
    'Ridge': _search_spec(
        Ridge(),
        10,
        {'estimator__alpha': np.logspace(-4, 6, 100)},
    ),
    'Lasso': _search_spec(
        Lasso(),
        10,
        {'estimator__alpha': np.logspace(-6, 8, 100)},
    ),
    'ElasticNet': _search_spec(
        ElasticNet(),
        10,
        {
            'estimator__alpha': np.logspace(-4, 2, 100),
            'estimator__l1_ratio': [0.05, 0.1, 0.5, 0.7, 0.9, 0.95, 0.99],
        },
    ),
    'KNeighborsRegressor': _search_spec(
        KNeighborsRegressor(),
        10,
        {
            'estimator__n_neighbors': range(3, 30),
            'estimator__weights': ['uniform', 'distance'],
            'estimator__p': [1, 2, 3],
        },
    ),
    'RandomForestRegressor': _search_spec(
        RandomForestRegressor(oob_score=False, random_state=123, warm_start=True),
        8,
        {
            'estimator__max_features': [1, 'sqrt', 'log2'],
            'estimator__n_estimators': [400],
            'estimator__max_depth': [3, 4, 5],
            # 'estimator__ccp_alpha': np.logspace(-4, 4, 16)
        },
    ),
    'GradientBoostingRegressor': _search_spec(
        GradientBoostingRegressor(
            n_estimators=500,
            random_state=123,
            # Enable early stopping
            validation_fraction=0.1,
            n_iter_no_change=5,
            tol=0.0001,
        ),
        8,
        {
            'estimator__max_features': [1, 'sqrt', 'log2'],
            'estimator__max_depth': [1, 3, 5],
            'estimator__subsample': [0.5, 1],
            'estimator__learning_rate': [0.001, 0.01, 0.1],
        },
    ),
    'SVR': _search_spec(
        SVR(cache_size=1000),
        8,
        {
            'estimator__C': [10, 50, 75],
            'estimator__gamma': [0.1, 0.01, 0.001, 0.0001],
            'estimator__kernel': ['rbf'],  # other kernels considered: 'poly', 'sigmoid'
        },
    ),
}
# {'estimator__C': 100, 'estimator__gamma': 0.01, 'estimator__kernel': 'rbf', 'pca__n_components': 10}

In [16]:
# Cross-validation splitter shared by every grid search below.
tscv = TimeSeriesSplit(n_splits=10)

# PCA sizes searched jointly with each model's own hyper-parameters.
parameters_preprocess = {
    'pca__n_components': np.arange(7, 14, 1) # 13, len(X_train.columns)+1
}

# True  -> always rerun the grid search and overwrite the saved model.
# False -> reuse ./models/<name>.joblib when it exists; train only if it is missing.
refit_model = True

for model_str in models_dict.keys():

    time = dt.datetime.now()

    ruta_modelo = pathlib.Path(f'./models/{model_str}.joblib')

    if refit_model or not ruta_modelo.is_file():

        print(f'Entrenando modelo {model_str} ...')

        pca = PCA(svd_solver='full')
        # scaler = preprocessing.StandardScaler()
        scaler = preprocessing.RobustScaler(with_centering=False, quantile_range=(0, 99))

        # Merge the shared PCA grid with this model's own grid (dict union).
        parameters_model = models_dict[model_str]['parameters_model']
        parameters = parameters_preprocess | parameters_model
        model = models_dict[model_str]['estimator']

        # Scale -> reduce dimensionality -> regress; grid keys address the
        # steps by these names ("pca__...", "estimator__...").
        model_estimator = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("estimator", model)])

        clf = GridSearchCV(
            estimator=model_estimator,
            param_grid=parameters,
            scoring='neg_mean_squared_error',
            n_jobs=7,
            refit=True,
            cv=tscv, #models_dict[model_str]['n_cv'],
            verbose=1
        )
        # The estimators expect a 1-D target, hence the ravel().
        clf.fit(X_train, y_train_scaled.to_numpy().ravel())

        # Make sure the output directory exists before persisting the search.
        ruta_modelo.parent.mkdir(parents=True, exist_ok=True)
        dump(clf, str(ruta_modelo))

    else:

        # Reuse the previously saved search instead of leaving `clf` undefined
        # (or pointing at another model's search) for the report below.
        # joblib.load unpickles the file: only load files written by this notebook.
        print(f'Cargando modelo {model_str} ...')
        clf = load(str(ruta_modelo))

    # Report the best grid point, its CV score (negative MSE on the scaled
    # target) and the variance retained by the selected PCA step.
    print(clf.best_params_)
    print(clf.best_score_, clf.best_estimator_['pca'].explained_variance_ratio_.sum())
    print('time   ', (dt.datetime.now() - time).total_seconds())

    print('*************************************\n')


Entrenando modelo Ridge ...
Fitting 10 folds for each of 700 candidates, totalling 7000 fits
{'estimator__alpha': 1.747528400007683, 'pca__n_components': 13}
-0.030948357422297144 0.9975378474605173
time    13.357463
*************************************

Entrenando modelo Lasso ...
Fitting 10 folds for each of 700 candidates, totalling 7000 fits
{'estimator__alpha': 0.001788649529057435, 'pca__n_components': 13}
-0.030683876192272724 0.9975378474605173
time    12.366629
*************************************

Entrenando modelo ElasticNet ...
Fitting 10 folds for each of 4900 candidates, totalling 49000 fits
{'estimator__alpha': 0.01, 'estimator__l1_ratio': 0.05, 'pca__n_components': 13}
-0.030562835040945913 0.9975378474605173
time    83.53115
*************************************

Entrenando modelo KNeighborsRegressor ...
Fitting 10 folds for each of 1134 candidates, totalling 11340 fits
{'estimator__n_neighbors': 7, 'estimator__p': 1, 'estimator__weights': 'distance', 'pca__n_compone

In [31]:
# Sum of the explained-variance ratios of the PCA step inside the best pipeline
# held by `clf`. `clf` is whatever search object the training loop assigned
# last, so this value depends on which model was processed most recently.
# NOTE(review): this cell's execution count (31) is not contiguous with the
# training cell (16); the displayed value may not be reproduced by a fresh
# Restart & Run All — confirm.
clf.best_estimator_['pca'].explained_variance_ratio_.sum()

0.9427608960425135

In [None]:
# NOTE(review): bare dict literal — it is only displayed and assigned to nothing.
# Presumably a hyper-parameter combination kept as a reminder; C=100 is outside
# the SVR grid defined in `models_dict` ([10, 50, 75]) — confirm whether it is
# still relevant or remove the cell.
{'estimator__C': 100, 'estimator__gamma': 0.01, 'estimator__kernel': 'rbf', 'pca__n_components': 10}
