# Imports

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from skopt import BayesSearchCV
from skopt.space import *
from skopt.plots import plot_convergence

# One seeded NumPy RandomState for the whole notebook; it is handed as
# `random_state` to the estimators and to the Bayesian search further down.
SEED = 42
rng = np.random.RandomState(seed=SEED)

# Data

In [8]:
# Engineered feature table, fetched from the project's GitHub repository.
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features.csv'

# The file's first column has no header, so pandas names it 'Unnamed: 0'; in
# the preview it simply counts rows (0, 1, 2, ...). The modelling cells use
# every column except DATE and TARGET as a feature, so leaving it in place
# would feed a meaningless row counter to the models. Drop it right after
# loading; `errors='ignore'` keeps the cell working if the file is ever
# re-exported without that column.
data = pd.read_csv(data_path).drop(columns=['Unnamed: 0'], errors='ignore')

# DATE is read as text; convert it so the `.dt` accessor can be used later.
data['DATE'] = data['DATE'].astype('datetime64[ns]')

print(data.shape)
data.head()

(40778, 54)


Unnamed: 0,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_3day_mean,TMAX_3day_std,TMAX_3day_min,TMAX_3day_max,TMAX_3day_median,TMAX_5day_mean,...,MONTH_11,MONTH_12,STATION_USW00003967,STATION_USW00012916,STATION_USW00013960,STATION_USW00024025,STATION_USW00024233,STATION_USW00093067,STATION_USW00093225,STATION_USW00094728
0,2010-01-16,15.6,15.0,-1.1,13.7,3.251154,10.0,16.1,15.0,12.98,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-01-17,14.4,15.6,0.6,15.566667,0.550757,15.0,16.1,15.6,14.22,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-01-18,17.2,14.4,-1.2,15.0,0.6,14.4,15.6,15.0,14.22,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2010-01-19,21.1,17.2,2.8,15.733333,1.404754,14.4,17.2,15.6,15.66,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-01-20,23.9,21.1,3.9,17.566667,3.365016,14.4,21.1,17.2,16.66,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Chronological hold-out split on the calendar year of DATE:
#   train -> years up to and including 2021
#   valid -> 2022
#   test  -> 2023
# DATE and TARGET are removed from the feature matrices; TARGET is the label.
obs_year = data['DATE'].dt.year
all_features = data.drop(columns=['TARGET', 'DATE'])
all_targets = data['TARGET']

train_rows = obs_year <= 2021
valid_rows = obs_year == 2022
test_rows = obs_year == 2023

X_train, y_train = all_features.loc[train_rows].copy(), all_targets.loc[train_rows].copy()
X_valid, y_valid = all_features.loc[valid_rows].copy(), all_targets.loc[valid_rows].copy()
X_test, y_test = all_features.loc[test_rows].copy(), all_targets.loc[test_rows].copy()

# Sanity check: features and labels of each split must have the same row count.
print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 52), (34938,)
Valid: (2920, 52), (2920,)
Test: (2920, 52), (2920,)


# Baseline

In [10]:
# Reference point: a random forest with library-default hyperparameters
# (only the seed and the worker count are set), fitted on the training years.
model = RandomForestRegressor(n_jobs=-1, random_state=rng).fit(X_train, y_train)

# Report the error on the data the model has seen and on the 2022 hold-out.
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
print(f'Train MAE: {mean_absolute_error(y_train, train_pred)}')
print(f'Valid MAE: {mean_absolute_error(y_valid, valid_pred)}')

Train MAE: 1.112367422290915
Valid MAE: 3.200565410958904


# Hyperparameter tuning

In [11]:
# Search space for the forest. The two regularisation strengths
# (min_impurity_decrease, ccp_alpha) are sampled on a log scale.
hyperparameter_space = {
    'n_estimators': Integer(50, 500),
    'max_depth': Integer(1, 15),
    'min_samples_leaf': Integer(1, 500),
    'max_features': Real(0.1, 1.0),
    'min_impurity_decrease': Real(1e-6, 1.0, prior='log-uniform'),
    'ccp_alpha': Real(1e-6, 0.1, prior='log-uniform'),
}

# Settings forwarded to the underlying skopt optimizer.
gp_optimizer_settings = dict(
    base_estimator='GP',
    n_initial_points=20,
    initial_point_generator='grid',
    # gp_hedge: probabilistically choose one of the 'EI', 'LCB', 'PI'
    # acquisition functions at every iteration.
    acq_func='gp_hedge',
    acq_func_kwargs={'xi': 0.1},
    n_jobs=-1,  # default: 1
    random_state=rng,
)

# The estimator itself is left single-threaded; parallelism is requested on
# the search (n_jobs=-1) instead.
model = RandomForestRegressor(random_state=rng)

# 100 candidate evaluations, each scored by 3-fold CV on negative MAE
# (higher is better, so the best score is the one closest to zero).
clf = BayesSearchCV(
    estimator=model,
    search_spaces=hyperparameter_space,
    n_iter=100,
    optimizer_kwargs=gp_optimizer_settings,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=3,
    random_state=rng,
)
clf.fit(X_train, y_train)

print('Best score: ', clf.best_score_)
print('Best params: ', clf.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [12]:
# Show the outcome of the finished search: best mean CV score (negative MAE)
# and the hyperparameter combination that achieved it.
for summary_label, summary_value in (
    ('Best score: ', clf.best_score_),
    ('Best params: ', clf.best_params_),
):
    print(summary_label, summary_value)

Best score:  -3.0063831507521623
Best params:  OrderedDict([('ccp_alpha', 1e-06), ('max_depth', 11), ('max_features', 0.5313900896382238), ('min_impurity_decrease', 0.0023339126274021294), ('min_samples_leaf', 1), ('n_estimators', 500)])


In [13]:
# Refit on the training years with hand-picked settings: 500 trees, depth 11
# and half of the features considered per split. All other hyperparameters
# stay at their library defaults.
forest_settings = dict(n_estimators=500, max_depth=11, max_features=0.5)
model = RandomForestRegressor(**forest_settings, n_jobs=-1, random_state=rng).fit(X_train, y_train)

# Same train / validation report as for the baseline.
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
print(f'Train MAE: {mean_absolute_error(y_train, train_pred)}')
print(f'Valid MAE: {mean_absolute_error(y_valid, valid_pred)}')

Train MAE: 2.464822758909998
Valid MAE: 3.161022151074089


In [15]:
# Larger variant of the tuned forest: 1000 trees and depth 12, still with half
# of the features considered per split and defaults for everything else.
forest_settings = dict(n_estimators=1000, max_depth=12, max_features=0.5)
model = RandomForestRegressor(**forest_settings, n_jobs=-1, random_state=rng).fit(X_train, y_train)

# Same train / validation report as for the baseline.
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
print(f'Train MAE: {mean_absolute_error(y_train, train_pred)}')
print(f'Valid MAE: {mean_absolute_error(y_valid, valid_pred)}')

Train MAE: 2.3154421926663393
Valid MAE: 3.1622074774325526
