In [1]:
from itertools import combinations
import re
import optuna
from sklearn.base import BaseEstimator, clone, RegressorMixin
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, learning_curve
from tqdm import tqdm
from data_manipulations import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the four pre-split parquet tables; paths are relative to the
# notebook's working directory.
train_features, train_targets, test_features, test_targets = map(
    pl.read_parquet,
    (
        'train_features.parquet',
        'train_targets.parquet',
        'test_features.parquet',
        'test_targets.parquet',
    ),
)

# `prepare_data_for_ml` is a project helper (presumably from
# data_manipulations). Later cells read the keys 'train_features_rect',
# 'train_targets_rect' and 'test_days' from its result.
data = prepare_data_for_ml(train_features, train_targets, test_features, test_targets)

In [3]:
class SparseHGBR(BaseEstimator, RegressorMixin):
    """Multi-output regressor: one feature-pruned gradient-boosting model per target column.

    For every column of ``targets`` a ``HistGradientBoostingRegressor`` is
    fitted on all features, permutation importances are computed on the same
    training data, and a second model is refitted using only the features
    whose mean importance reaches ``importance_threshold``.

    Parameters
    ----------
    learning_rate, max_leaf_nodes, min_samples_leaf
        Forwarded unchanged to ``HistGradientBoostingRegressor``.
    importance_threshold : float
        Minimum mean permutation importance a feature needs in order to be
        kept for the final model of a target. If no feature qualifies, the
        single most important one is kept.

    Attributes
    ----------
    models : list of (estimator, boolean mask) or None
        One fitted model and its feature mask per target column; ``None``
        until ``fit`` has been called.

    Notes
    -----
    The scorer is read from the module-level name ``scoring`` (not defined in
    this class), so that name must exist when ``fit`` runs.
    """

    def __init__(
        self,
        learning_rate=0.25,
        max_leaf_nodes=8,
        min_samples_leaf=1,
        importance_threshold=0.1,
    ):
        # Only store hyper-parameters here. The base regressor is derived
        # lazily (see ``model``) so that ``set_params`` calls made after
        # construction are honoured by ``fit``.
        self.learning_rate = learning_rate
        self.max_leaf_nodes = max_leaf_nodes
        self.min_samples_leaf = min_samples_leaf
        self.importance_threshold = importance_threshold
        self.models = None

    @property
    def model(self):
        """Unfitted base regressor reflecting the current hyper-parameters."""
        # NOTE(review): with max_depth=2 a tree has at most 4 leaves, so
        # max_leaf_nodes values above 4 are presumably never binding — confirm.
        return HistGradientBoostingRegressor(
            learning_rate=self.learning_rate,
            max_leaf_nodes=self.max_leaf_nodes,
            max_depth=2,  # fast!
            min_samples_leaf=self.min_samples_leaf,
            early_stopping=True,  # 'auto',
            scoring=scoring,  # 'loss',
            random_state=0, # None,
        )

    def fit(self, features, targets):
        """Fit one pruned model per target column.

        Parameters
        ----------
        features : 2-D array indexed as ``features[:, mask]``
        targets : 2-D array; column ``i`` is the i-th regression target

        Returns
        -------
        self
        """
        template = self.model  # built once from the current hyper-parameters
        self.models = []
        for ind in range(targets.shape[1]):
            target = targets[:, ind]

            # First pass: all features, used only to rank them.
            probe = clone(template)
            probe.fit(features, target)
            result = permutation_importance(
                estimator=probe,
                X=features,
                y=target,
                scoring=scoring,
                n_repeats=3,
                n_jobs=-1,
                random_state=0,
            )
            importances = result.importances_mean
            support = importances >= self.importance_threshold
            if not np.any(support):
                # Never train on zero features: fall back to the top-ranked one.
                support[np.argmax(importances)] = True

            # Second pass: refit from scratch on the surviving features.
            model = clone(template)
            model.fit(features[:, support], target)
            self.models.append((model, support))
        return self

    def predict(self, features):
        """Return predictions of shape ``(n_samples, n_targets)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if not self.models:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This SparseHGBR instance is not fitted yet; call 'fit' before 'predict'."
            )
        predictions = [model.predict(features[:, support]) for model, support in self.models]
        return np.stack(predictions, axis=1)

def get_cv_score(kwargs):
    """Return the mean 5-fold cross-validation score of ``SparseHGBR(**kwargs)``.

    The training arrays are taken from the notebook-level ``data`` dict and
    the scorer from the notebook-level ``scoring`` name.
    """
    estimator = SparseHGBR(**kwargs)
    features = data['train_features_rect'].to_numpy()
    targets = data['train_targets_rect'].to_numpy()
    fold_scores = cross_val_score(
        estimator, features, targets, cv=5, scoring=scoring, n_jobs=-1,
    )
    return fold_scores.mean()

In [4]:
# def restrict(frame, heads=(25, 20)):
#     keys = frame.columns[:2]
#     temp = join_many((
#         frame.select(keys[0]).unique().sort(by=keys[0]).head(heads[0]),
#         frame.select(keys[1]).unique().sort(by=keys[1]).head(heads[1]),
#     ), how='cross')
#     return frame.join(temp, on=keys)

# train_features = restrict(train_features)
# train_targets = restrict(train_targets)
# test_features = restrict(test_features)
# test_targets = restrict(test_targets)

# data = prepare_data_for_ml(train_features, train_targets, test_features, test_targets)
# data['train_features_rect'].shape, data['train_targets_rect'].shape

# Exploration

In [5]:
%%time

# Search space consumed by `objective`. Each entry is
# [optuna suggest type, lower bound, upper bound, extra keyword arguments],
# i.e. it is expanded to trial.suggest_<type>(name, low, high, **extra).
# The trailing numbers look like reference/default values kept by the
# author — TODO confirm.
param_grid = {
    'learning_rate': ['float', 1e-4, 0.25, {}],  # 0.1
    'max_leaf_nodes': ['int', 2, 31, {}],  # 31
    'min_samples_leaf': ['int', 200, 1000, {}],  # 20
    'importance_threshold': ['float', 1e-3, 1, {}],  # 1.0
}

# Trial budget and a manual progress bar that `objective` advances by one
# after every finished trial.
n_trials = 50
progress_bar = tqdm(total=n_trials)

def objective(trial):
    """Optuna objective: sample one configuration and return its CV score.

    Every ``param_grid`` entry ``[type, *bounds, options]`` is turned into a
    ``trial.suggest_<type>(name, *bounds, **options)`` call. The shared
    progress bar is advanced once per evaluated trial.
    """
    kwargs = {}
    for name, spec in param_grid.items():
        kind, *bounds, options = spec
        suggest = getattr(trial, f'suggest_{kind}')
        kwargs[name] = suggest(name, *bounds, **options)

    cv_score = get_cv_score(kwargs)
    progress_bar.update(1)
    return cv_score

# Seed the sampler: without it every run of this cell explores a different
# sequence of configurations, so the reported best trial cannot be reproduced
# after a kernel restart.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
try:
    study.optimize(objective, n_trials=n_trials)
finally:
    # Release the manual progress bar even if a trial raises.
    progress_bar.close()

  0%|          | 0/50 [00:00<?, ?it/s][I 2025-04-23 10:48:11,960] A new study created in memory with name: no-name-b34a6df9-c9f3-491e-b935-e4c704559e23
  2%|▏         | 1/50 [00:32<26:11, 32.08s/it][I 2025-04-23 10:48:44,045] Trial 0 finished with value: 0.4993550502529196 and parameters: {'learning_rate': 0.2284355339945733, 'max_leaf_nodes': 8, 'min_samples_leaf': 930, 'importance_threshold': 0.6484439824012482}. Best is trial 0 with value: 0.4993550502529196.
  4%|▍         | 2/50 [01:05<26:24, 33.01s/it][I 2025-04-23 10:49:17,695] Trial 1 finished with value: 0.4993550502529196 and parameters: {'learning_rate': 0.19307190604405436, 'max_leaf_nodes': 3, 'min_samples_leaf': 775, 'importance_threshold': 0.6687052110710826}. Best is trial 0 with value: 0.4993550502529196.
  6%|▌         | 3/50 [01:37<25:22, 32.39s/it][I 2025-04-23 10:49:49,349] Trial 2 finished with value: 0.4993550502529196 and parameters: {'learning_rate': 0.05722413406516057, 'max_leaf_nodes': 25, 'min_samples_leaf'

CPU times: user 9.66 s, sys: 2.77 s, total: 12.4 s
Wall time: 59min 32s


In [6]:
# The trials table carries hyper-parameter columns with a 'params_' prefix;
# strip it so the columns can be selected by the plain names used in
# `param_grid`.
prefix = r'^params_'
cv_results = pl.DataFrame(study.trials_dataframe())

column_renames = {}
for col in cv_results.columns:
    column_renames[col] = re.sub(prefix, '', col)

cv_results = cv_results.rename(column_renames)

In [7]:
# Keep the trial(s) whose score equals the maximum and take the first one as
# a plain dict of hyper-parameters plus its 'value'.
top_value = pl.col('value').max()
best_rows = cv_results.filter(pl.col('value') == top_value)
best = best_rows.select(*param_grid.keys(), 'value').to_dicts()[0]
best

{'learning_rate': 0.04802194459487866,
 'max_leaf_nodes': 9,
 'min_samples_leaf': 567,
 'importance_threshold': 0.004278476100861628,
 'value': 0.7164945979559654}

In [9]:
# Hand-rounded copy of the best trial from the exploration step. It replaces
# the raw search result and has no 'value' entry, so the dict can be unpacked
# directly into SparseHGBR(**best) and used as the centre of the local grid.
best = {
    'learning_rate': 0.05,
    'max_leaf_nodes': 9,
    'min_samples_leaf': 550,
    'importance_threshold': 0.004,
}

# Stability, local grid search

In [10]:
# Centre of the local search: the hand-picked best configuration.
lr_best = best['learning_rate']
leaves_best = best['max_leaf_nodes']
min_leaf_best = best['min_samples_leaf']
threshold_best = best['importance_threshold']

# Float parameters: 5 evenly spaced points within +/-20 % of the centre.
# Integer parameters: a small window around the centre, clipped to a valid
# lower bound.
# NOTE(review): np.arange excludes its stop value, so the integer windows are
# not symmetric around the centre (and the step of 4 can skip the centre
# itself) — confirm this is intended.
param_grid = {
    'learning_rate': np.linspace(lr_best * 0.8, lr_best * 1.2, 5),
    'max_leaf_nodes': np.arange(max(leaves_best - 2, 2), leaves_best + 2),
    'min_samples_leaf': np.arange(max(min_leaf_best - 10, 1), min_leaf_best + 10, 4),
    'importance_threshold': np.linspace(threshold_best * 0.8, threshold_best * 1.2, 5),
}

In [11]:
# For every unordered pair of hyper-parameters build the full 2-D grid of
# their candidate values while all remaining parameters stay at `best`.
# `group` tags the rows of each pair so they can be plotted separately.
param_pairs = list(combinations(param_grid.keys(), 2))

pair_frames = []
for ind, (k_l, k_r) in enumerate(param_pairs):
    crossed = join_many([
        pl.DataFrame(param_grid[k_l], schema=[k_l]),
        pl.DataFrame(param_grid[k_r], schema=[k_r]),
        pl.DataFrame(best).drop(k_l, k_r),
    ], how='cross')
    pair_frames.append(
        crossed.select(best.keys()).with_columns(group=pl.lit(ind))
    )

param_grid_local = pl.concat(pair_frames)

# group index -> (first parameter name, second parameter name)
group_mapping = dict(enumerate(param_pairs))

In [None]:
%%time

# Score every row of the local grid; the 'group' tag is not a model
# hyper-parameter, so it is dropped before the rows are used as kwargs.
iterator = param_grid_local.drop('group').iter_rows(named=True)

scores = []
for kwargs in tqdm(iterator, total=param_grid_local.height):
    scores.append(get_cv_score(kwargs))

cv_results = param_grid_local.with_columns(pl.Series('value', scores))


  0%|          | 0/135 [00:00<?, ?it/s][A

In [None]:
# One heatmap per parameter pair. The partition keys are indexed with [0] to
# recover the integer group id used in `group_mapping`.
partitions = cv_results.partition_by('group', include_key=False, as_dict=True)
for key, part in partitions.items():
    heatmap_part(part, group_mapping[key[0]], 'value')

In [None]:
# Best configuration found by the local grid search (first row on ties),
# shown as the cell output.
is_best = pl.col('value') == pl.col('value').max()
cv_results.filter(is_best).select(*param_grid.keys(), 'value').to_dicts()[0]

# Learning curve

In [None]:
# Unfitted estimator configured with the chosen hyper-parameters; `best` must
# contain only SparseHGBR constructor arguments at this point.
model = SparseHGBR(**best)

In [None]:
%%time

# Learning curve of the chosen model on the rectangular training arrays,
# using the same 5-fold CV and scorer as the hyper-parameter search.
X_train = data['train_features_rect'].to_numpy()
y_train = data['train_targets_rect'].to_numpy()

train_sizes, train_scores, test_scores = learning_curve(
    model, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1,
)
plot_learning_curve(train_sizes, train_scores, test_scores)

# Score on the test set

In [None]:
# Fit `model` on the training tables and predict for the test tables.
# fit_predict_sklearn_model is a project helper whose internals are not
# visible in this notebook (presumably from data_manipulations).
fitted_model, preds = fit_predict_sklearn_model(
    train_features, train_targets, test_features, test_targets,
    model,
)
# Last expression of the cell, so its result is displayed. Presumably the
# predictions aligned with the ground truth plus a score — TODO confirm
# against join_truth_and_score.
join_truth_and_score(preds, data['test_days'], test_targets)