In [1]:
#histgradboost
import pandas as pd
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

#__file__ = Path('submissions') /  'my_submission1' /  'estimator.py'

def _encode(X):
    #cyclical encoding of dates
    X = X.copy()
    year_norm = 2 * np.pi * X['date'].dt.year / X['date'].dt.year.max()
    month_norm = 2 * np.pi * X['date'].dt.month / X['date'].dt.month.max()
    day_norm = 2 * np.pi * X['date'].dt.day / X['date'].dt.day.max()
    weekday_norm = 2 * np.pi * X['date'].dt.weekday / X['date'].dt.weekday.max()
    hour_norm = 2 * np.pi * X['date'].dt.hour / X['date'].dt.hour.max()
    X.loc[:, 'year_sin'] = np.sin(year_norm)
    X.loc[:, 'year_cos'] = np.cos(year_norm)
    X.loc[:, 'month_sin'] = np.sin(month_norm)
    X.loc[:, 'month_cos'] = np.cos(month_norm)
    X.loc[:, 'day_sin'] = np.sin(day_norm)
    X.loc[:, 'day_cos'] = np.cos(day_norm)
    X.loc[:, 'weekday_sin'] = np.sin(weekday_norm)
    X.loc[:, 'weekday_cos'] = np.cos(weekday_norm)
    X.loc[:, 'hour_sin'] = np.sin(hour_norm)
    X.loc[:, 'hour_cos'] = np.cos(hour_norm)
    #encode dates
    X.loc[:, 'year'] = X['date'].dt.year
    X.loc[:, 'month'] = X['date'].dt.month
    X.loc[:, 'day'] = X['date'].dt.day
    X.loc[:, 'weekday'] = X['date'].dt.weekday
    X.loc[:, 'hour'] = X['date'].dt.hour
    return X.drop(columns=["date"]) 

def _merge_external_data(X):
    file_path = Path(__file__).parent / 'external_data.csv'
    df_ext = pd.read_csv(file_path, parse_dates=['date'])
    X = X.copy()
    # When using merge_asof left frame need to be sorted
    X['orig_index'] = np.arange(X.shape[0])
    X = pd.merge_asof(X.sort_values('date'), df_ext[['date', 't', 'ff', 'u', 'brent', 'holidays', 'curfew', 'rush hour', 'Taux', 'bike']].sort_values('date'), on='date')
    # Sort back to the original order
    X = X.sort_values('orig_index')
    del X['orig_index']
    return X

def get_estimator():
    date_encoder = FunctionTransformer(_encode)
    cycl_cols = ['month_sin', 'month_cos','day_sin', 'day_cos', 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos']
    date_cols = ['year', 'day']

    categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    categorical_cols = ["site_name", "counter_name"]
    binary_cols =  ['curfew']
    numeric_cols = ['Taux', 'bike', 't', 'brent', 'ff']

    preprocessor = ColumnTransformer(
        [
            ('date', 'passthrough', date_cols),
            ('cycl', 'passthrough', cycl_cols),
            ('holiday', 'passthrough', binary_cols),
            ('cat', categorical_encoder, categorical_cols),
            ('numeric', 'passthrough', numeric_cols)
        ]
    )
    regressor = HistGradientBoostingRegressor(random_state=0, max_leaf_nodes=300, max_iter=150)

    pipe = make_pipeline(
        FunctionTransformer(_merge_external_data, validate=False), date_encoder, preprocessor, regressor)

    return pipe


ImportError: cannot import name 'HistGradientBoostingRegressor' from 'sklearn.ensemble' (C:\Users\ckunt\anaconda3\lib\site-packages\sklearn\ensemble\__init__.py)

In [None]:
import problem

X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

In [None]:
for param_name in get_estimator().get_params().keys():
    print(param_name)

In [48]:
from sklearn.model_selection import GridSearchCV
import time

param_grid = {'histgradientboostingregressor__max_leaf_nodes': (10, 100, 300),
             'histgradientboostingregressor__max_iter': (3, 30, 60, 100)}

model_grid = GridSearchCV(get_estimator(), param_grid=param_grid, n_jobs=7)
start_time = time.time()
model_grid.fit(X_train, y_train)
print(time.time() - start_time)

print(f"The best set of parameters is: {model_grid.best_params_}")

165.1913046836853
The best set of parameters is: {'histgradientboostingregressor__max_depth': 3, 'histgradientboostingregressor__max_iter': 100}


In [None]:
from sklearn.model_selection import TimeSeriesSplit

param_search = {'histgradientboostingregressor__max_leaf_nodes': (10, 100, 300, 600),
             'histgradientboostingregressor__max_depth': (3, 30, 60, 100, 200, 300)}

tscv = TimeSeriesSplit(n_splits=6)
gsearch = GridSearchCV(estimator=get_estimator(), cv=tscv,
                        param_grid=param_search)
gsearch.fit(X_train, y_train)
print(f"The best set of parameters is: {gsearch.best_params_}")

In [29]:
from sklearn.model_selection import cross_val_score

train_scores = cross_val_score(model_grid, X_train, y_train)
test_scores = cross_val_score(model_grid, X_test, y_test)

print(f"The train accuracy score is: {train_scores.mean():.2f} +- {train_scores.std():.2f}")
print(f"The different scores obtained are: \n{train_scores}")

print(f"The test accuracy score is: {test_scores.mean():.2f} +- {test_scores.std():.2f}")
print(f"The different scores obtained are: \n{test_scores}")

print(f"The best set of parameters is: {model_grid.best_params_}")

The train accuracy score is: 0.56 +- 0.11
The different scores obtained are: 
[0.56062665 0.70574582 0.58571728 0.58393854 0.35100386]
The validation accuracy score is: 0.52 +- 0.07
The different scores obtained are: 
[0.55676324 0.63239756 0.4270575  0.50191016 0.49208823]
The best set of parameters is: {'histgradientboostingregressor__learning_rate': 0.1, 'histgradientboostingregressor__max_leaf_nodes': 3}
