# Hyperparameter RF Optimisation for Lagged Variables

## Initialisation

In [None]:
import logging
import os
import re
import sys
import warnings
from collections import namedtuple
from functools import reduce
from itertools import combinations
from operator import mul

import cloudpickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import shap
from joblib import Memory, Parallel, delayed
from loguru import logger as loguru_logger
from matplotlib.patches import Rectangle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import wildfires.analysis
from alepython import ale_plot
from alepython.ale import _second_order_ale_quant
from wildfires.analysis import *
from wildfires.dask_cx1 import *
from wildfires.data import *
from wildfires.logging_config import enable_logging
from wildfires.qstat import get_ncpus
from wildfires.utils import *

# Configure loguru: enable records coming from "alepython", remove the
# handlers that are currently registered, then emit to stderr at WARNING level
# and above only.
loguru_logger.enable("alepython")
loguru_logger.remove()
loguru_logger.add(sys.stderr, level="WARNING")

# Standard-library logger for this notebook.
logger = logging.getLogger(__name__)

# NOTE(review): presumably applies the project's logging configuration meant
# for Jupyter sessions - confirm in wildfires.logging_config.
enable_logging("jupyter")

# Silence warnings whose message matches one of these regular expressions.
warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds*")

# Plotting defaults: coastline width plus matplotlib figure size and font size.
normal_coast_linewidth = 0.5
mpl.rc("figure", figsize=(14, 6))
mpl.rc("font", size=9.0)

# Experiment name; reused for the figure directory, the memory cache and the
# pickle cache directory below.
save_name = "analysis_lags_rf_cross_val"

# NOTE(review): the "~" is passed on verbatim; whether it gets expanded to the
# home directory depends on FigureSaver - confirm.
figure_saver = FigureSaver(directories=os.path.join("~", "tmp", save_name), debug=True,)
memory = get_memory(save_name, verbose=100)
CACHE_DIR = os.path.join(DATA_DIR, ".pickle", save_name)

### Load the customized `get_data()` function for this experiment.

In [None]:
from get_lags_rf_cross_val_data import get_data

## Creating the Data Structures used for Fitting

In [None]:
# Lags (in months) handed to `get_data()` through its `shift_months` argument.
shift_months = [1, 3, 6, 9, 12, 18, 24]

# Later cells use `e_s_exog_data` as the predictors and `e_s_endog_data` as the
# target of the random forest.
# NOTE(review): the meaning of the `e_s_` prefix and the exact contents of the
# remaining return values are defined by `get_data()` - confirm there.
(
    e_s_endog_data,
    e_s_exog_data,
    e_s_master_mask,
    e_s_filled_datasets,
    e_s_masked_datasets,
    e_s_land_mask,
) = get_data(shift_months=shift_months, selection_variables=None)

### Offset data from 12 or more months before the current month in order to ease analysis (interpretability).

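We are interested in the trends in these properties, not their absolute values. Therefore, from every variable lagged by 12 or more months we subtract a more recent 'seasonal cycle' analogue: the same variable at the same point of the seasonal cycle within the last 12 months (e.g. the -18 month value has the -6 month value subtracted, while the -12 and -24 month values have the current month's value subtracted).
This hopefully avoids capturing the same relationships for a variable and its 12 month counterpart due to their high correlation.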

In [None]:
# Replace every column lagged by 12 or more months with the difference between
# that column and its counterpart at the same point of the seasonal cycle within
# the last 12 months. The original lagged columns are removed after the loop.
to_delete = []
lag_pattern = re.compile(r"-\d{1,2}")
# Iterate over a snapshot of the column names, since columns are added below.
for column in tuple(e_s_exog_data):
    match = lag_pattern.search(column)
    if match is None:
        # No lag in the column name.
        continue

    start, end = match.span()
    original_offset = int(column[start:end])
    if original_offset > -12:
        # Only shift months that are 12 or more months before the current month.
        continue

    # Lag within the last 12 months sharing the same seasonal phase, e.g.
    # -18 -> -6, while -12 and -24 -> 0 (the current month).
    comp = -(-original_offset % 12)

    # The lag is separated from the rest of the name by a single character on
    # either side, which is skipped here.
    prefix = column[: start - 1]
    suffix = column[end + 1 :]

    # Change the string to reflect the shift.
    new_column = " ".join((prefix, f"{original_offset} - {comp}", suffix))

    if comp == 0:
        # The current month's column carries no lag in its name.
        comp_column = prefix
    else:
        comp_column = " ".join((prefix, f"{comp}", suffix))

    print(column, comp_column)
    e_s_exog_data[new_column] = e_s_exog_data[column] - e_s_exog_data[comp_column]
    to_delete.append(column)

for column in to_delete:
    del e_s_exog_data[column]

## Hyperparameter Optimisation Using Dask on CX1

In [None]:
# NOTE(review): presumably a dask.distributed client connected to the CX1
# cluster, which the "dask" joblib backend used in later cells relies on -
# confirm in wildfires.dask_cx1.
client = get_client()
# Bare expression so that the notebook displays the client.
client

In [None]:
from itertools import product

from dask.distributed import worker_client
from joblib import parallel_backend
from sklearn.ensemble import RandomForestRegressor

#  Does spread out training of individual trees, but then fails with CancelledError...
# from sklearn.model_selection import GridSearchCV
# This works but only allocates one thread per Forest.
# from dask_ml.model_selection import GridSearchCV, cross_val_score

In [None]:
from concurrent.futures import ProcessPoolExecutor, wait

from sklearn.model_selection import KFold, cross_val_score


def fit_and_score(X, y, train_index, test_index, rf_params):
    """Fit a random forest on one fold and score it on both parts of the split.

    Parameters
    ----------
    X, y : indexable
        Predictors and target; both are indexed with `train_index` and
        `test_index`.
    train_index, test_index : indexers
        Indices selecting the training and the held-out samples.
    rf_params : dict
        Keyword arguments passed to `RandomForestRegressor`.

    Returns
    -------
    tuple
        `(test_score, train_score)` as returned by `RandomForestRegressor.score`.

    """
    X_fold_train = X[train_index]
    y_fold_train = y[train_index]

    # Fitting and scoring both run under the "dask" joblib backend.
    with parallel_backend("dask"):
        forest = RandomForestRegressor(**rf_params)
        forest.fit(X_fold_train, y_fold_train)

        held_out_score = forest.score(X[test_index], y[test_index])
        in_sample_score = forest.score(X_fold_train, y_fold_train)

    return held_out_score, in_sample_score


# Define the training and test data: 30% of the samples are held out as the
# test set, after shuffling with a fixed seed so that the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    e_s_exog_data, e_s_endog_data, random_state=1, shuffle=True, test_size=0.3
)

# Define the parameter space.

# A larger grid, left disabled:
# parameters_RF = {
#     "n_estimators": [100, 500, 1000],
#     "max_depth": [5, 9, 12],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 4],
#     "max_leaf_nodes": [500, 1500, None],
# }

# The grid that is actually searched.
parameters_RF = {
    "n_estimators": [500],
    "max_depth": [9, 12],
    "min_samples_split": [2],
    "min_samples_leaf": [1],
    "max_leaf_nodes": [500, 2000],
}

# Parameters shared by every forest in the grid.
default_param_dict = {
    "random_state": 1,
    "bootstrap": True,
    # Consider all features at every split. This is what "auto" meant for
    # RandomForestRegressor, but "auto" was deprecated in scikit-learn 1.1 and
    # removed in 1.3. It must be the float 1.0 (a fraction of the features);
    # the integer 1 would mean a single feature.
    "max_features": 1.0,
}

# One dict per grid point, i.e. per element of the Cartesian product of the
# value lists above.
rf_params_list = [
    dict(zip(parameters_RF, combination))
    for combination in product(*parameters_RF.values())
]

# 3-fold cross-validation. No shuffling is requested here; the samples were
# already shuffled by `train_test_split` above.
kf = KFold(n_splits=3)

# Cache wrapped around the cross-validation run below.
# NOTE(review): presumably stores the decorated function's result under
# CACHE_DIR - confirm in SimpleCache.
cross_val_cache2 = SimpleCache("rf_cross_val2", cache_dir=CACHE_DIR)

# XXX: The cache is cleared unconditionally every time this cell runs, so a
# previously stored result is presumably never reused. Remove this call once
# cached results should be kept.
cross_val_cache2.clear()


@cross_val_cache2
def run_cross_val2():
    """Grid-search the random forest hyperparameters using K-fold cross-validation.

    Every parameter combination in `rf_params_list` (combined with
    `default_param_dict`) is fitted and scored on each fold produced by `kf`
    from the training data. The combination with the highest mean test score is
    then used to fit a final forest on all of the training data.

    Returns
    -------
    est : RandomForestRegressor
        Forest fitted on `X_train` / `y_train` using the best parameters.
    test_scores_list : list of list
        Test scores; one inner list (one score per fold) per entry of
        `rf_params_list`, in the same order.
    train_scores_list : list of list
        Training scores, laid out like `test_scores_list`.

    """
    # To experiment on a subset, slice here, e.g. `X_train[:int(5e4)].values`.
    X = X_train.values
    y = y_train.values

    # The folds do not depend on the forest parameters, so compute them once and
    # evaluate every parameter combination on exactly the same folds.
    splits = list(kf.split(X))

    test_scores_list = []
    train_scores_list = []

    for rf_grid_params in tqdm(rf_params_list, desc="Grid"):
        # Build a fresh dict for every grid point so that no state leaks from
        # one parameter combination to the next.
        rf_params = {**default_param_dict, **rf_grid_params}

        test_scores = []
        train_scores = []
        for train_index, test_index in tqdm(splits, desc="Splits"):
            test_score, train_score = fit_and_score(
                X, y, train_index, test_index, rf_params
            )
            test_scores.append(test_score)
            train_scores.append(train_score)

        test_scores_list.append(test_scores)
        train_scores_list.append(train_scores)

    # `RandomForestRegressor.score` returns R^2, where higher is better, so the
    # best combination is the one with the largest mean test score.
    mean_test_scores = [np.mean(scores) for scores in test_scores_list]
    best_params = rf_params_list[int(np.argmax(mean_test_scores))]
    print("Best parameters:", best_params)

    # Refit on all of the training data using the best parameters.
    est = RandomForestRegressor(**{**default_param_dict, **best_params})
    with parallel_backend("dask"):
        est.fit(X, y)

    return est, test_scores_list, train_scores_list


# Run the cross-validation (decorated with `cross_val_cache2`). `est` is the
# forest refitted on the training data; the score lists hold one list of
# per-fold scores for each entry of `rf_params_list`.
est, test_scores_list, train_scores_list = run_cross_val2()

#### Visualise the train and test scores of each parameter combination in order to identify the best model parameters.

In [None]:
# Gather all cross-validation scores in long form: one row per score, along with
# the index of the parameter combination and whether it is a test or train score.
vis_dict = {"score": [], "parameter_index": [], "type": []}
for i, (test_scores, train_scores) in enumerate(
    zip(test_scores_list, train_scores_list)
):
    n_split = len(train_scores)
    assert n_split == len(test_scores)

    # Test scores are added first, followed by the train scores.
    for label, scores in (("Test", test_scores), ("Train", train_scores)):
        vis_dict["score"] += scores
        vis_dict["parameter_index"] += [i] * n_split
        vis_dict["type"] += [label] * n_split

vis_df = pd.DataFrame(vis_dict)

import seaborn as sns

# Boxplot of the per-fold scores of every parameter combination, with test and
# train scores side by side.
mpl.rc("figure", figsize=(7, 5))
with figure_saver("hyperparam_opt_small"):
    ax = sns.boxplot(data=vis_df, x="parameter_index", y="score", hue="type")

In [None]:
# Legend for the boxplot's x axis: show which parameter combination belongs to
# which index.
for position, grid_point in enumerate(rf_params_list):
    print("Index:", position)
    print(grid_point)

In [None]:
# #  Does spread out training of individual trees, but then FAILS with CancelledError, or spends forever on the last step...
# from sklearn.model_selection import GridSearchCV

# # This works, but only seems to allocate one thread PER FOREST.
# # from dask_ml.model_selection import GridSearchCV

# # Define the training and test data.
# X_train, X_test, y_train, y_test = train_test_split(
#     e_s_exog_data, e_s_endog_data, random_state=1, shuffle=True, test_size=0.3
# )

# # Define the parameter space.

# # parameters_RF = {
# #     "n_estimators": [100, 500, 1000],
# #     "max_depth": [5, 9, 12],
# #     "min_samples_split": [2, 5, 10],
# #     "min_samples_leaf": [1, 4],
# #     "max_leaf_nodes": [500, 1500, None],
# # }

# parameters_RF = {
#     "n_estimators": [100, 500],
#     "max_depth": [9, 12],
#     "min_samples_split": [2, 4],
#     "min_samples_leaf": [1, 4],
#     "max_leaf_nodes": [500, 2000],
# }

# rf_params_list = [dict(zip(parameters_RF, param_values)) for param_values in product(*parameters_RF.values())]

# opt_cache = SimpleCache("rf_opt", cache_dir=CACHE_DIR)

# # XXX:
# opt_cache.clear()

# @opt_cache
# def grid_search():
# #     X = X_train[:int(5e4)].values
# #     y = y_train[:int(5e4)].values

#     X = X_train.values
#     y = y_train.values

#     gs = GridSearchCV(
#         RandomForestRegressor(
#             random_state=1,
#             bootstrap=True,
#             max_features='auto',
#         ),
#         parameters_RF,
# #         scheduler=client,
#         return_train_score=True,
#         refit=False,
#         cv=3,
#     )

#     with parallel_backend("dask", scatter=[X, y]):  # Is this needed?
#         gs.fit(X, y)
# #         gs.fit(X_train, y_train)

#     return gs

# gs = grid_search()

#### Best estimator

In [None]:
# Bare expression so that the notebook displays the fitted estimator.
est

#### Number of leaves per tree

In [None]:
from scipy.stats import describe

# Summary statistics of the number of leaves of the individual trees in the
# fitted forest.
leaf_counts = [fitted_tree.get_n_leaves() for fitted_tree in est.estimators_]
print(describe(leaf_counts))