# __Excursion: Grid Search of time series__

In [None]:
import csv
import datetime as dt
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

### __K-fold cross-validation__

<img src="../images/Kfold_CV.png"> <br/>
source: scikit-learn.org

### __Time series split__

<img src="../images/time_series_split.png"> <br/>
source: datascience.stackexchange.com
<br/>
<br/>
__Note: scikit-learn's TimeSeriesSplit (tscv) does not by itself guard against data leakage between the training and validation folds.__
<br/>
<br/>

In [None]:
# Candidate values per hyperparameter; the grid search tries every combination.
learning_rate = [0.01, 0.1, 0.3]  # learning rate
max_depth = [10, 25, 50]  # depth of trees
n_estimators = [5, 10, 100]  # number of base learners

# Parameter grid keyed by the estimator's keyword-argument names.
params = dict(
    learning_rate=learning_rate,
    max_depth=max_depth,
    n_estimators=n_estimators,
)

# Bare expression so the notebook cell displays the grid.
params

In [None]:
def XGB_GS_ts(X_train, y_train, params, run, n_splits=2, n_jobs=7, verbose=5):
    '''
    Grid-search XGBoost hyperparameters using time-series cross-validation.

    Fits a GridSearchCV over an xgb.XGBRegressor with a TimeSeriesSplit,
    stores the CV results (CSV) and the fitted search object (pickle) under
    $HOME/nem-data/trainings/grid_searches/, evaluates the best estimator on
    the training and test data and appends the errors to the run's log file
    via log_test_results.

    X_train, y_train = training features and target passed to the search
    params = parameter grid (dict: parameter name -> list of candidates)
    run = run identifier, used as prefix of all output file names
    n_splits = number of splits in TimeSeriesSplit; default: 2
    n_jobs = number of parallel jobs for GridSearchCV; default: 7
    verbose = verbosity of GridSearchCV; default: 5

    Returns the fitted GridSearchCV object.

    NOTE: X_test and y_test are not parameters; they are read from the
    enclosing module (notebook) scope and must be defined before the call.
    Raises KeyError if the HOME environment variable is not set.
    '''
    # Resolve and create the output directory *before* the (expensive) fit,
    # so a missing directory cannot cost us a finished grid search.
    out_dir = '{}/nem-data/trainings/grid_searches'.format(os.environ['HOME'])
    os.makedirs(out_dir, exist_ok=True)

    model = xgb.XGBRegressor()

    tscv = TimeSeriesSplit(n_splits=n_splits)
    gsearch = GridSearchCV(estimator=model, cv=tscv,
                           param_grid=params, n_jobs=n_jobs, verbose=verbose)

    gsearch.fit(X_train, y_train)

    print("Best params were: {}".format(gsearch.best_params_))

    # CV results and the pickled search object go to the same directory as
    # the run log (the pickle previously targeted 'gridsearches/', which did
    # not match the other two outputs).
    pd.DataFrame(gsearch.cv_results_).to_csv('{}/{}_GS.csv'.format(out_dir, run))
    joblib.dump(gsearch, '{}/{}_GS_object.pkl'.format(out_dir, run))

    best_model = gsearch.best_estimator_

    # NOTE(review): this computes sqrt(MSE / mean(y)), which is neither
    # RMSE / mean(y) nor the textbook RMSPE; it is left unchanged so that new
    # log entries stay comparable with existing ones -- confirm the intent.
    error_test = np.sqrt(mse(y_test, best_model.predict(X_test))/y_test.mean())
    error_train = np.sqrt(mse(y_train, best_model.predict(X_train))/y_train.mean())
    compare_train_test_error = abs(error_test - error_train)

    settings = {
        "Model": "XGBoost",
        "Feature Description": "sine_cosine, lag_12, horizon=0, demand, capacity, interconnectors",
        "Model Description": gsearch.best_params_
    }

    print(f"Root mean squared percentage error: {error_train, error_test}")
    log_test_results(
        settings, error_train, error_test,
        compare_train_test_error, run
    )

    return gsearch

In [None]:
def log_test_results(settings, error_train, error_test, train_test_error_difference, file_name):
    '''
    Append one result row to the CSV log of a grid-search run.

    The log is written to
    $HOME/nem-data/trainings/grid_searches/<file_name>_GS_log.csv.
    The directory is created if it does not exist, and a header row is
    written first when the log file does not exist yet.

    settings = dict with the keys "Model", "Feature Description" and
               "Model Description" (the latter is logged as str(...))
    error_train = value for the "Training error" column
    error_test = value for the "Test error" column
    train_test_error_difference = value for the
               "Difference_train_test_error" column
    file_name = run identifier used as prefix of the log file name

    Raises KeyError if HOME is not set or a settings key is missing.
    '''
    log_dir = '{}/nem-data/trainings/grid_searches'.format(os.environ['HOME'])
    csv_path = '{}/{}_GS_log.csv'.format(log_dir, file_name)

    os.makedirs(log_dir, exist_ok=True)
    must_add_headers = not os.path.isfile(csv_path)

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that translate '\n'.
    with open(csv_path, mode='a', newline='') as test_results:
        writer = csv.writer(test_results,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)

        if must_add_headers:
            writer.writerow([
                'Model', 'Feature Description', "Model Description", "Training error", "Test error",
                "Difference_train_test_error"
            ])
        writer.writerow([
            settings["Model"], settings["Feature Description"],
            str(settings["Model Description"]), error_train, error_test,
            train_test_error_difference
        ])

In [None]:

#show log file!
