# RF Hyperparameter Optimisation using Dask - Comparing Different Approaches

## Initialisation

In [None]:
from itertools import product
from time import time

import numpy as np
from dask.distributed import Client, as_completed
from joblib import parallel_backend
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from tqdm import tqdm


class Time:
    """Context manager that prints the wall-clock time spent in its body.

    Usage::

        with Time("my step") as timer:
            ...
        timer.elapsed  # seconds spent inside the `with` body

    Args:
        name: Label inserted into the printed message.

    Attributes:
        start: Value of `time()` when the block was entered (None before).
        elapsed: Seconds between entering and leaving the block (None until
            the block has been left).
    """

    def __init__(self, name=""):
        self.name = name
        self.start = None
        self.elapsed = None

    def __enter__(self):
        self.start = time()
        # Return the instance so that `with Time(...) as timer:` binds the
        # timer itself rather than None.
        return self

    def __exit__(self, type, value, traceback):
        # Record the duration before printing so that it stays available to
        # the caller after the block. Nothing truthy is returned, so any
        # exception raised inside the block still propagates.
        self.elapsed = time() - self.start
        print("Time taken for {}: {}".format(self.name, self.elapsed))

## Hyperparameter Optimisation Using Dask on CX1

In [None]:
# Number of local jobs used for `score()` on the current node, i.e. for the
# work that is not handed to Dask.
local_n_jobs = 5

# Create a LocalCluster for demonstration purposes and connect to it.
cluster_options = {"n_workers": 5, "threads_per_worker": 5}
client = Client(**cluster_options)
# Bare expression: shows the client summary as the notebook cell output.
client

### Common Parameters and Data

In [None]:
# Common synthetic data used by every approach: 2000 samples with 40 uniform
# random features. The target is the sum of the first two features plus
# uniform noise. The global seed is fixed so that the data are reproducible.
np.random.seed(1)
X = np.random.random((2000, 40))
y = X[:, :2].sum(axis=1) + np.random.random(len(X))

# Define the number of splits.
# `n_splits` is reused further down, both to build `kf` here and as the
# `cv` / split-count argument of the grid-search calls, so that every
# approach is given the same number of folds.
n_splits = 5
# Only `n_splits` is passed; all other KFold options are left at their
# defaults.
kf = KFold(n_splits=n_splits)

# Define the parameter space.
# Every combination of the listed values is one grid point, giving
# 1 * 3 * 1 * 3 = 9 parameter sets in total.
parameters_RF = {
    "n_estimators": [50],
    "max_depth": [6, 9, 12],
    "min_samples_split": [2],
    "min_samples_leaf": [1, 5, 10],
}

# Estimator settings that are held fixed for every grid point.
default_param_dict = {
    "random_state": 1,
    "bootstrap": True,
    # Consider all features at each split. For regressors this is what the
    # former "auto" value meant; "auto" was deprecated in scikit-learn 1.1 and
    # removed in 1.3, whereas the float 1.0 is accepted by old and new
    # versions alike.
    "max_features": 1.0,
}

# Expand the grid into a list of dicts, one per parameter combination.
rf_params_list = [
    dict(zip(parameters_RF, param_values))
    for param_values in product(*parameters_RF.values())
]

# Working copy that is updated in place with each grid point, leaving
# `default_param_dict` itself untouched.
rf_params = default_param_dict.copy()

### Define our own RF implementation that submits individual trees as Dask tasks.

In [None]:
from wildfires.dask_cx1 import DaskRandomForestRegressor, fit_dask_rf_grid_search_cv

# Keyword options forwarded to the custom grid search. `local_n_jobs` is the
# job count defined alongside the Dask client.
grid_search_kwargs = dict(
    verbose=True,
    return_train_score=True,
    refit=False,
    local_n_jobs=local_n_jobs,
)

# Time the whole grid search, including construction of the base estimator
# from the shared default parameters.
with Time("Custom Dask Gridsearch"):
    base_estimator = DaskRandomForestRegressor(**default_param_dict)
    results = fit_dask_rf_grid_search_cv(
        base_estimator, X, y, n_splits, parameters_RF, client, **grid_search_kwargs
    )

### Perform individual fits in series

Wait for each RF fit (using the Dask backend) and its scoring (using the local threading backend, since `predict()`, which is used by `score()`, requires 'sharedmem'!) to complete before starting the next one.

In [None]:
def fit_and_score(X, y, train_index, test_index, rf_params):
    """Fit a random forest on one train/test split and score both parts.

    The fit runs under the joblib "dask" backend, while both `score()` calls
    run under the "threading" backend with the module-level `local_n_jobs`.

    Args:
        X: Feature array, indexed with `train_index` / `test_index`.
        y: Target array, indexed the same way as `X`.
        train_index: Indices selecting the training samples.
        test_index: Indices selecting the test samples.
        rf_params: Keyword arguments for `RandomForestRegressor`.

    Returns:
        Tuple `(test_score, train_score)` holding the values returned by the
        fitted forest's `score()` on the test and training samples.
    """
    forest = RandomForestRegressor(**rf_params)

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fitting is handed to Dask.
    with parallel_backend("dask"):
        forest.fit(X_train, y_train)

    # Scoring is kept on the local node using threads.
    with parallel_backend("threading", n_jobs=local_n_jobs):
        held_out_score = forest.score(X_test, y_test)
        in_sample_score = forest.score(X_train, y_train)

    return held_out_score, in_sample_score


# One entry per parameter combination; each entry is the list of per-fold
# scores for that combination.
test_scores_list = []
train_scores_list = []

with Time("In Series"):
    for rf_grid_params in tqdm(rf_params_list, desc="Params"):
        # Overlay this grid point onto the shared parameter dict. Every grid
        # point sets the same keys, so nothing is left over from the previous
        # iteration.
        rf_params.update(rf_grid_params)
        test_scores = []
        train_scores = []
        # Iterate over the folds directly: the fold number is not needed and
        # the splits do not have to be materialised as a list first.
        for train_index, test_index in kf.split(X):
            test_score, train_score = fit_and_score(
                X, y, train_index, test_index, rf_params
            )
            test_scores.append(test_score)
            train_scores.append(train_score)

        test_scores_list.append(test_scores)
        train_scores_list.append(train_scores)

### Dask-ML GridSearchCV

This works, but only allocates one thread per **forest fit**, _not per tree_, making for very slow training when `n_fits < n_workers`.

Only use this when `n_fits >> n_workers`, where `n_fits = n_parameters * n_splits`, or when individual model `fit()` calls are only single threaded (**unlike** `RandomForestRegressor.fit()`, which releases the GIL).

In [None]:
from dask_ml.model_selection import GridSearchCV

# Dask-ML grid search over the same parameter grid and number of folds as the
# other approaches.
gs = GridSearchCV(
    # `max_features=1.0` uses all features at each split. It replaces the
    # former "auto" value, which meant the same for regressors but was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    RandomForestRegressor(random_state=1, bootstrap=True, max_features=1.0),
    parameters_RF,
    cv=n_splits,
    return_train_score=True,
    refit=False,
)
with Time("Dask-ML GridSearchCV"):
    gs = gs.fit(X, y)

### Native sklearn GridSearchCV fails with CancelledError

It is apparent (prior to failing) that this does spread out the training of individual trees, which should have led to the expected speedups when `n_fits < n_workers` (or when the two are of about the same magnitude).

The CancelledError occurrence has already been reported:
 - https://github.com/scikit-learn/scikit-learn/issues/12315
 - https://github.com/scikit-learn/scikit-learn/issues/15383
 - https://github.com/joblib/joblib/issues/959
 - https://github.com/joblib/joblib/issues/1021

In [None]:
# NOTE: this rebinds the name `GridSearchCV` to the scikit-learn
# implementation from here on.
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(
    # `max_features=1.0` uses all features at each split. It replaces the
    # former "auto" value, which meant the same for regressors but was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    RandomForestRegressor(random_state=1, bootstrap=True, max_features=1.0),
    parameters_RF,
    cv=n_splits,
    return_train_score=True,
    refit=False,
)
# The timer is entered first and exited last, so it covers the entire fit
# carried out under the joblib "dask" backend.
with Time("Scikit-learn GridSearchCV with Dask"), parallel_backend("dask"):
    gs = gs.fit(X, y)