# Imports

In [1]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.6.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

In [3]:
import sklearn
sklearn.__version__

'1.6.0'

# Data

In [4]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [6]:
model = HistGradientBoostingRegressor(
    max_iter=10000, 
    early_stopping=True, validation_fraction=0.1, n_iter_no_change=25,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Binning 0.017 GB of training data: 0.171 s
Binning 0.002 GB of validation data: 0.008 s
Fitting gradient boosted rounds:
Fit 84 trees in 1.305 s, (2604 total leaves)
Time spent computing histograms: 0.630s
Time spent finding best splits:  0.175s
Time spent applying splits:      0.105s
Time spent predicting:           0.012s
Train MAE: 2.7128854203032637
Valid MAE: 3.13856222322185


# Hyperparameter tuning

In [7]:
def load_data():
    data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
    data = pd.read_csv(data_path)
    data['DATE'] = data['DATE'].astype('datetime64[ns]')
    X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
    y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
    X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
    y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
    return X_train, X_valid, y_train, y_valid

# max_iter is determined by early stopping
def objective(trial):
    PARAMS = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 64),
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 1000),
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 100),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }
    X_train, _, y_train, _ = load_data()
    model = HistGradientBoostingRegressor(
        max_iter=10000, 
        early_stopping=True, validation_fraction=0.1, n_iter_no_change=25,
        verbose=0, random_state=rng
    )
    model.set_params(**PARAMS)
    score = np.mean(cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'))
    return score

In [8]:
sampler = optuna.samplers.TPESampler(
    n_startup_trials = 50, # the random sampling is used instead of the TPE algorithm until the given number of trials finish in the same study
    n_ei_candidates = 24, # number of candidate samples used to calculate the expected improvement.
    multivariate = True, # multivariate TPE when suggesting candidates; default: False
    seed = 42
)
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3'
)
study.optimize(
    objective, n_trials = 1000,
    timeout = 3600*10, # in seconds
    n_jobs = 2,
    show_progress_bar = True
)

[I 2024-12-23 20:59:18,982] A new study created in RDB with name: no-name-a8ea1cad-784a-4193-8af9-74625a804603


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-12-23 20:59:22,671] Trial 0 finished with value: -3.112473825343957 and parameters: {'learning_rate': 0.4739992417883072, 'max_leaf_nodes': 48, 'max_depth': 9, 'min_samples_leaf': 304, 'l2_regularization': 6.386042408116543, 'max_features': 0.425878418449255}. Best is trial 0 with value: -3.112473825343957.
[I 2024-12-23 20:59:23,837] Trial 1 finished with value: -3.0606111102828586 and parameters: {'learning_rate': 0.22115066161961355, 'max_leaf_nodes': 30, 'max_depth': 4, 'min_samples_leaf': 439, 'l2_regularization': 15.095809583635644, 'max_features': 0.19648096396565395}. Best is trial 1 with value: -3.0606111102828586.
[I 2024-12-23 21:00:10,711] Trial 2 finished with value: -3.0399688759969767 and parameters: {'learning_rate': 0.0067154464453057835, 'max_leaf_nodes': 24, 'max_depth': 10, 'min_samples_leaf': 563, 'l2_regularization': 24.481152794599105, 'max_features': 0.8899013726815374}. Best is trial 2 with value: -3.0399688759969767.
[I 2024-12-23 21:00:17,941] Trial 4

In [9]:
print(f"Best score: {study.best_trial.value}")
print(f"Best params: {study.best_trial.params}")

Best score: -2.98030777634189
Best params: {'learning_rate': 0.02272113590315003, 'max_leaf_nodes': 17, 'max_depth': 8, 'min_samples_leaf': 2, 'l2_regularization': 15.513355726003248, 'max_features': 0.23800041394683397}


In [6]:
model = HistGradientBoostingRegressor(
    max_iter=10000, 
    learning_rate=0.02, max_leaf_nodes=17, max_depth=8,
    min_samples_leaf=2, l2_regularization=15.51, max_features=0.238,
    early_stopping=True, validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Binning 0.017 GB of training data: 0.195 s
Binning 0.002 GB of validation data: 0.008 s
Fitting gradient boosted rounds:
Fit 1418 trees in 12.022 s, (24106 total leaves)
Time spent computing histograms: 7.254s
Time spent finding best splits:  1.502s
Time spent applying splits:      0.901s
Time spent predicting:           0.174s
Train MAE: 2.666724509093296
Valid MAE: 3.1360646898332805


In [7]:
model = HistGradientBoostingRegressor(
    max_iter=10000, 
    learning_rate=0.02, max_leaf_nodes=17, max_depth=8,
    min_samples_leaf=2, l2_regularization=15.5, max_features=0.25,
    early_stopping=True, validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Binning 0.017 GB of training data: 0.172 s
Binning 0.002 GB of validation data: 0.008 s
Fitting gradient boosted rounds:
Fit 1724 trees in 14.393 s, (29308 total leaves)
Time spent computing histograms: 8.694s
Time spent finding best splits:  1.829s
Time spent applying splits:      1.075s
Time spent predicting:           0.200s
Train MAE: 2.6299867352729365
Valid MAE: 3.1399849982538584


In [8]:
model = HistGradientBoostingRegressor(
    max_iter=10000, 
    learning_rate=0.03, max_leaf_nodes=14, max_depth=1,
    min_samples_leaf=13, l2_regularization=4.53, max_features=0.964,
    early_stopping=True, validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Binning 0.017 GB of training data: 0.185 s
Binning 0.002 GB of validation data: 0.008 s
Fitting gradient boosted rounds:
Fit 8352 trees in 30.463 s, (16704 total leaves)
Time spent computing histograms: 21.630s
Time spent finding best splits:  1.136s
Time spent applying splits:      0.799s
Time spent predicting:           0.612s
Train MAE: 2.87554442456212
Valid MAE: 3.1470062376205035
