In [2]:
import numpy as np
import pandas as pd

import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

# from sklearnex import patch_sklearn
# patch_sklearn()

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from joblib import dump, load

# Correlaciones

In [3]:
data = pd.read_csv('./data/processed/dataset.csv', index_col=0)

data_anom = data.drop(
    columns=['value', 'cos_month', 'sin_month', 'cos_n_week', 'sin_n_week', 'cos_weekday', 'sin_weekday']
)
data = data.drop(columns=['value_no_cl', 'cos_month', 'sin_month', 'cos_n_week', 'sin_n_week'])

In [4]:
# data_corr = data.loc['2012-01-01':'2017-12-31'].corr(method='spearman')
#
# fig = plt.figure(figsize=(12, 12))
# ax = sns.heatmap(data_corr, annot=True, linewidths=.5, vmin=-1, vmax=1, cmap='bwr', fmt='.2f')
# plt.show()
# plt.close('all')

In [5]:
# data_corr = data_anom.corr(method='spearman')
#
# fig = plt.figure(figsize=(12, 12))
# ax = sns.heatmap(data_corr, annot=True, linewidths=.5, vmin=-1, vmax=1, cmap='bwr', fmt='.2f')
# plt.show()
# plt.close('all')

# Tratamiento de datos

## Estandarización

In [6]:
X = data.drop(columns='value')
y = data.loc[:, ['value']]

X_train = X.loc['2012-01-01':'2017-12-31']
X_test = X.loc['2018-01-01':'2018-06-30']

y_train = y.loc['2012-01-01':'2017-12-31']

In [7]:
# standard_scaler_X = preprocessing.StandardScaler().fit(X_train)
# print(standard_scaler_X.mean_)
# print(standard_scaler_X.scale_)
# X_train_standard_scaled = pd.DataFrame(
#     data=standard_scaler_X.transform(X_train),
#     index=X_train.index, columns=X_train.columns
# )
# X_test_standard_scaled = pd.DataFrame(
#     data=standard_scaler_X.transform(X_test),
#     index=X_test.index, columns=X_test.columns
# )

In [8]:
# standard_scaler_y = preprocessing.StandardScaler().fit(y_train)
# print(standard_scaler_y.mean_)
# print(standard_scaler_y.scale_)
# y_train_standard_scaled = pd.DataFrame(
#     data=standard_scaler_y.transform(y_train),
#     index=y_train.index, columns=y_train.columns
# )

# Pipeline

In [9]:
models_dict = {
    'Ridge': {
        'estimator': Ridge(),
        'best_estimator': None,
        'parameters_model' :{
            'estimator__alpha': np.logspace(-4, 6, 100)
        }
    },
    'Lasso': {
        'estimator': Lasso(),
        'best_estimator': None,
        'parameters_model' :{
            'estimator__alpha': np.logspace(-6, 6, 100)
        }
    },
    'ElasticNet': {
        'estimator': ElasticNet(),
        'best_estimator': None,
        'parameters_model' :{
            'estimator__alpha': np.logspace(-4, 2, 100),
            'estimator__l1_ratio': [0, 0.1, 0.5, 0.7, 0.9, 0.95, 0.99],
        }
    },
    'SVC': {
        'estimator': SVR(),
        'best_estimator': None,
        'parameters_model': {
            'estimator__C': [0.1, 1, 10, 100, 1000],
            'estimator__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'estimator__kernel': ['linear', 'rbf']
        }
    },
    'KNeighborsRegressor': {
        'estimator': KNeighborsRegressor(),
        'best_estimator': None,
        'parameters_model': {
            'estimator__n_neighbors': range(3, 30),
            'estimator__weights': ['uniform','distance']
        }
    },
    'RandomForestRegressor': {
        'estimator': RandomForestRegressor(),
        'best_estimator': None,
        'parameters_model': {
            'estimator__max_features' : [1, 'sqrt', 'log2'],
            'estimator__n_estimators': [400, 500, 600],
            'estimator__max_depth'   : [8, 9, 10, 11]
        }
    },
    'GradientBoostingRegressor': {
        'estimator': GradientBoostingRegressor(),
        'best_estimator': None,
        'parameters_model': {
            'estimator__max_features' : [1, 'sqrt', 'log2'],
            'estimator__max_depth' : [None, 1, 3, 5, 10, 20],
            'estimator__subsample' : [0.5, 1],
            'estimator__learning_rate' : [0.001, 0.01, 0.1],
            'estimator__n_estimators': [50, 100, 200, ]
        }
    }
}


In [10]:
parameters_preprocess = {
    'pca__n_components': range(6, len(X_train.columns)+1)
}

for model_str in models_dict.keys():

    time = dt.datetime.now()

    pca = PCA()
    scaler = preprocessing.StandardScaler()

    parameters_model = models_dict[model_str]['parameters_model']
    parameters = parameters_preprocess | parameters_model
    model = models_dict[model_str]['estimator']

    model_estimator =  Pipeline(steps=[("scaler", scaler), ("pca", pca), ("estimator", model)])

    clf = GridSearchCV(
        estimator=model_estimator,
        param_grid=parameters,
        scoring=['neg_mean_squared_error', 'r2'],
        n_jobs=-1,
        refit='neg_mean_squared_error',
        cv=8,
        verbose=1
    )
    clf.fit(X_train, y_train.to_numpy().ravel())

    dump(clf, f'./models/{model_str}.joblib')

    print(model)
    print(clf.best_params_, clf.best_score_,
          clf.cv_results_['mean_test_r2'][clf.best_index_],
          clf.cv_results_['mean_test_neg_mean_squared_error'][clf.best_index_]
    )
    print('time   ', (dt.datetime.now() - time).total_seconds())

Fitting 8 folds for each of 700 candidates, totalling 5600 fits
Ridge()
{'estimator__alpha': 14.174741629268048, 'pca__n_components': 11} -127660.60580521282 0.5753904687658107 -127660.60580521282
time    10.397408
Fitting 8 folds for each of 700 candidates, totalling 5600 fits
Lasso()
{'estimator__alpha': 1e-06, 'pca__n_components': 11} -127680.99222056005 0.575257928995521 -127680.99222056005
time    8.773612
Fitting 8 folds for each of 4900 candidates, totalling 39200 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

ElasticNet()
{'estimator__alpha': 0.008697490026177835, 'estimator__l1_ratio': 0, 'pca__n_components': 11} -127660.44099060187 0.5754020140870193 -127660.44099060187
time    81.545032
Fitting 8 folds for each of 350 candidates, totalling 2800 fits
SVR()
{'estimator__C': 1000, 'estimator__gamma': 0.1, 'estimator__kernel': 'rbf', 'pca__n_components': 10} -84463.69371427069 0.721517056232378 -84463.69371427069
time    138.546616
Fitting 8 folds for each of 378 candidates, totalling 3024 fits
KNeighborsRegressor()
{'estimator__n_neighbors': 18, 'estimator__weights': 'distance', 'pca__n_components': 8} -91560.17694069655 0.6940479930346026 -91560.17694069655
time    7.314539
Fitting 8 folds for each of 252 candidates, totalling 2016 fits
RandomForestRegressor()
{'estimator__max_depth': 9, 'estimator__max_features': 'log2', 'estimator__n_estimators': 600, 'pca__n_components': 9} -92345.91437429389 0.6932497377362644 -92345.91437429389
time    487.265989
Fitting 8 folds for each of 2268 candi