## Setup

In [None]:
from specific import *

### Get shifted data

In [None]:
# Unpack everything returned by `get_offset_data()`. Of these, only
# `endog_data` and `exog_data` are referenced by the cells visible below
# (they are split into training and test sets).
(
    endog_data,
    exog_data,
    master_mask,
    filled_datasets,
    masked_datasets,
    land_mask,
) = get_offset_data()

In [None]:
# `client` is handed to the hyperparameter search further down.
client = get_client()
# Bare expression so the notebook renders the client's repr.
client

### Define the training and test data

In [None]:
@data_split_cache
def get_split_data():
    """Split the notebook-level `exog_data` / `endog_data` into train and test sets.

    A shuffled split with 30% of the samples held out for testing, using a
    fixed random state so that the split is repeatable.

    Returns:
        tuple: (X_train, X_test, y_train, y_test).

    """
    split_options = dict(random_state=1, shuffle=True, test_size=0.3)
    train_X, test_X, train_y, test_y = train_test_split(
        exog_data, endog_data, **split_options
    )
    return train_X, test_X, train_y, test_y


X_train, X_test, y_train, y_test = get_split_data()

### Specific model training without grid search

In [None]:
# Number of cross-validation splits.
n_splits = 5

# Hyperparameters of the single model that is trained without a search.
param_dict = dict(
    random_state=1,
    bootstrap=True,
    ccp_alpha=0.0,
    max_depth=18,
    max_features="auto",
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)

#### Cached results only

In [None]:
# Handle on the cached results for this estimator class / number of splits.
cached = CachedResults(
    estimator_class=DaskRandomForestRegressor, n_splits=n_splits, cache_dir=CACHE_DIR
)
# Collate the cached scores (train scores requested as well).
# NOTE: `results` is rebound by the hyperparameter search cell further down.
results = cached.collate_scores(train_scores=True)

In [None]:
# Retrieve the fitted model for `param_dict` from the cache, fitting and
# storing it on a cache miss.
model = DaskRandomForestRegressor(**param_dict)
# `n_jobs` is assigned *before* the key is computed, so it is presumably part
# of `get_params()` and hence of `model_key` - changing this value would then
# look up a different cache entry. TODO confirm.
model.n_jobs = 32
# The key is the sorted (name, value) parameter pairs as a hashable tuple.
model_key = tuple(sorted(model.get_params().items()))
try:
    model = cached.get_estimator(model_key)
except KeyError:
    # No cached estimator under this key: fit using the dask backend and
    # store the fitted estimator for subsequent runs.
    with parallel_backend("dask"):
        model.fit(X_train, y_train)
    cached.store_estimator(model_key, model)

#### Place into expected cache location.

In [None]:
@cross_val_cache
def dummy_f():
    # Returns an empty dict alongside the already-fitted `model`.
    # NOTE(review): presumably `cross_val_cache` persists this return value
    # so that other code can load the model from that cache - confirm
    # against the decorator's implementation.
    return {}, model


# Call the decorated function; the dict is discarded. `model1` is not
# referenced again in the cells visible below.
_, model1 = dummy_f()

### Grid search

In [None]:
import scipy.stats

n_splits = 5

# Define the parameter space.

# NOTE(review): the timing below does not match the grid defined in this
# cell (presumably it was recorded for an earlier grid):
# 1024 combinations ([100, 200] est., x 5 splits) takes ~ 20 hrs.
# The grid below contains 2 * 2 * 2 * 3 * 1 * 10 = 240 combinations.

parameters_RF = {
    "n_estimators": [300, 500],
    "max_depth": [14, 18],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto"],
    "ccp_alpha": np.linspace(0, 4e-9, 10),
}

# Fixed settings passed to the estimator constructor used for the search.
default_param_dict = {
    "random_state": 1,
    "bootstrap": True,
}

## Hyperparameter optimisation

#### Calculate results

In [None]:
# Run the hyperparameter search over `parameters_RF` on the training data.
# This rebinds `results` (previously the collated cached scores) and also
# returns `rf`.
# NOTE(review): `n_iter=None` presumably means that every combination is
# evaluated, subject to `max_time` - confirm against the function's docs.
results, rf = fit_dask_sub_est_random_search_cv(
    DaskRandomForestRegressor(**default_param_dict),
    X_train.values,
    y_train.values,
    parameters_RF,
    client,
    n_splits=n_splits,
    max_time="24h",
    n_iter=None,
    verbose=True,
    return_train_score=True,
    refit=True,
    local_n_jobs=30,
    random_state=0,
    cache_dir=CACHE_DIR,
)

## Hyperparameter Search Visualisation

In [None]:
# Collate the search results into equal-length column lists: for every
# parameter set with a complete set of scores, store the mean and standard
# deviation of each score category alongside the parameter values.
hyperparams = defaultdict(list)

for param_tuples, param_results in results.items():
    # Check every category *before* appending anything. Appending while
    # checking would leave the earlier categories one entry longer than the
    # later categories and the parameter columns whenever a later category
    # turned out to be incomplete, yielding columns of unequal length.
    incomplete = {
        category: len(scores)
        for category, scores in param_results.items()
        if len(scores) != n_splits
    }
    if incomplete:
        # Report the incomplete categories and skip this parameter set.
        for category, n_scores in incomplete.items():
            print(param_tuples, category, n_scores)
        continue

    for category, scores in param_results.items():
        hyperparams[category].append(np.mean(scores))
        hyperparams[category + "_std"].append(np.std(scores))

    for param, param_value in param_tuples:
        hyperparams[param].append(param_value)

In [None]:
# Turn the collated column lists into a DataFrame and work out which columns
# hold scores, score standard deviations and parameter values.
hyperparams = pd.DataFrame(hyperparams)
# `param_results` is the last value bound by the collation loop in the
# previous cell; its keys are the score categories.
score_keys = list(param_results)
score_std_keys = [score_key + "_std" for score_key in score_keys]
# Keep the parameter names in column order. A set difference would yield an
# arbitrary order that can vary between kernel sessions, changing the order
# of the melted columns and of the figures produced below.
non_param_columns = set(score_keys) | set(score_std_keys)
param_keys = [
    column for column in hyperparams.columns if column not in non_param_columns
]
# Replace missing values with -1.
hyperparams = hyperparams.fillna(-1)

In [None]:
# Train - test score gap for the parameter sets with a test score above 0.64.
above_threshold = hyperparams["test_score"] > 0.64
hyperparams_gap = hyperparams.loc[above_threshold].assign(
    gap=lambda frame: frame["train_score"] - frame["test_score"]
)
print(len(hyperparams_gap))
hyperparams_gap.sort_values(by="gap")

In [None]:
# The 20 parameter sets with the highest test score.
hyperparams.sort_values(by="test_score", ascending=False).head(20)

In [None]:
# Box plots of each score column, grouped by the
# (min_samples_split, n_estimators) combination.
hyperparams.boxplot(column=score_keys, by=["min_samples_split", "n_estimators"])

In [None]:
# Long-format scores (one row per parameter set and score category) for the
# parameter sets with a test score above 0.65.
high_test_score_rows = hyperparams[hyperparams["test_score"] > 0.65]
melted = high_test_score_rows.drop(columns=score_std_keys).melt(
    id_vars=param_keys,
    value_vars=score_keys,
    var_name="category",
    value_name="score",
)
melted

### Visualise the effect of individual parameters

In [None]:
from alepython.ale import _sci_format

# One box plot of the scores per hyperparameter, split by score category.
for param_key in param_keys:
    is_ccp_alpha = param_key == "ccp_alpha"

    # `ccp_alpha` gets a wider figure than the other parameters.
    fig = plt.figure(figsize=(25, 6) if is_ccp_alpha else (9, 6))

    ax = sns.boxplot(x=param_key, y="score", hue="category", data=melted)
    ax.set(ylabel="R2 Score")
    ax.grid(which="both", alpha=0.4, linestyle="--")

    if is_ccp_alpha:
        # Re-format the tick labels using `_sci_format` and rotate them.
        tick_values = np.array(
            [float(label.get_text()) for label in ax.xaxis.get_ticklabels()]
        )
        ax.xaxis.set_ticklabels(_sci_format(tick_values))
        ax.xaxis.set_tick_params(rotation=45)

    figure_saver.save_figure(fig, param_key, sub_directory="hyperparameters")

### Repeat for the standard deviations

In [None]:
# Long-format score standard deviations for the parameter sets with a test
# score above 0.65.
high_test_score_rows = hyperparams[hyperparams["test_score"] > 0.65]
melted_std = high_test_score_rows.drop(columns=score_keys).melt(
    id_vars=param_keys,
    value_vars=score_std_keys,
    var_name="category",
    value_name="score_std",
)
melted_std

### Visualise the effect of individual parameters on the score standard deviations

In [None]:
from alepython.ale import _sci_format

# One box plot of the score standard deviations per hyperparameter, split by
# score category.
for param_key in param_keys:
    is_ccp_alpha = param_key == "ccp_alpha"

    # `ccp_alpha` gets a wider figure than the other parameters.
    fig = plt.figure(figsize=(25, 6) if is_ccp_alpha else (9, 6))

    ax = sns.boxplot(x=param_key, y="score_std", hue="category", data=melted_std)
    # The plotted quantity is the standard deviation of the score, not the
    # score itself.
    ax.set(ylabel="R2 Score Std")
    ax.grid(which="both", alpha=0.4, linestyle="--")

    if is_ccp_alpha:
        # Re-format the tick labels using `_sci_format` and rotate them.
        tick_values = np.array(
            [float(label.get_text()) for label in ax.xaxis.get_ticklabels()]
        )
        ax.xaxis.set_ticklabels(_sci_format(tick_values))
        ax.xaxis.set_tick_params(rotation=45)

    # Use a distinct name: the score figures are saved as `param_key` in the
    # same sub-directory, so re-using that name here would replace them.
    figure_saver.save_figure(
        fig, param_key + "_std", sub_directory="hyperparameters"
    )

### Dependence of R2 gap on performance

In [None]:
# Scatter the train - test gap against the score for the parameter sets with
# a test score above 0.66, coloured by each numeric hyperparameter in turn.
mask = hyperparams["test_score"] > 0.66
selected = hyperparams[mask]
gap = selected["train_score"] - selected["test_score"]

# The slice restricts the plots to the test score only.
score_columns = ("train_score", "test_score")[1:]

for colorby in param_keys:
    c = selected[colorby]
    try:
        np.asarray(c, dtype=np.float64)
    except ValueError:
        # Values that cannot be converted to floats cannot be used as colours.
        continue
    for key in score_columns:
        plt.figure()
        plt.scatter(selected[key], gap, marker="o", alpha=0.3, c=c)
        plt.ylabel("R2 train - test")
        plt.xlabel(key)
        plt.colorbar(label=colorby)
        plt.grid(alpha=0.4, linestyle="--")

### Scoring evaluation

In [None]:
%%time

# Predict on the test and training sets using all available CPUs, then
# compute the R2 and MSE scores for each.
model.n_jobs = get_ncpus()
with parallel_backend("threading", n_jobs=get_ncpus()):
    y_pred = model.predict(X_test)
    train_y_pred = model.predict(X_train)

scores = {
    "test_r2": r2_score(y_test, y_pred),
    "test_mse": mean_squared_error(y_test, y_pred),
    "train_r2": r2_score(y_train, train_y_pred),
    "train_mse": mean_squared_error(y_train, train_y_pred),
}

In [None]:
# Display the test / train R2 and MSE scores computed above.
scores

In [None]:
# 2D histogram of predictions (x) against observations (y) for the test set,
# with log-scaled bin counts. Label the axes and the colour scale so that the
# figure can be read on its own.
plt.hexbin(y_pred, y_test, bins="log")
plt.xlabel("Pred")
plt.ylabel("Obs")
plt.colorbar(label="Count (log scale)")

In [None]:
# Histogram of the test residuals (prediction - observation), with a
# log-scaled y axis.
plt.figure(figsize=(20, 10))
plt.hist(y_pred - y_test, bins=800)
plt.yscale("log")

In [None]:
# Convert the test targets to a NumPy array. `np.asarray` is used instead of
# `.values` so that this cell can be re-run: on a second run `y_test` is
# already an array, which has no `.values` attribute.
y_test = np.asarray(y_test)

In [None]:
# Test residuals: prediction - observation.
diffs = y_pred - y_test

In [None]:
# Sorted test residuals for the observations above 0.01, coloured by the log
# of the observed value.
mask = y_test > 0.01
masked_diffs = diffs[mask]
masked_obs = y_test[mask]

indices = np.argsort(masked_diffs)
plt.scatter(
    np.arange(len(indices)),
    masked_diffs[indices],
    marker="o",
    rasterized=True,
    alpha=0.1,
    c=np.log(masked_obs[indices]),
)
plt.colorbar(label="log(BA Test)")
plt.ylabel("Prediction - Observation (test)")
plt.yscale("symlog", linthreshy=0.01)

In [None]:
# Test residuals against log10 of the observations, coloured by log10 of the
# predictions; symlog y axis.
fig, ax = plt.subplots(figsize=(30, 15))
points = ax.scatter(
    np.log10(y_test), diffs, rasterized=True, marker="o", alpha=0.1, c=np.log10(y_pred)
)
fig.colorbar(points, ax=ax, label="log10(Pred)")
ax.set_yscale("symlog", linthreshy=0.00001)
ax.set_ylabel("Pred - Obs")
ax.set_xlabel("log10 Obs")
ax.set_title("Validation Data")

In [None]:
# Training residuals: prediction - observation.
train_diffs = train_y_pred - y_train

In [None]:
# Training residuals against log10 of the observations, coloured by log10 of
# the predictions; symlog y axis.
fig, ax = plt.subplots(figsize=(30, 15))
points = ax.scatter(
    np.log10(y_train),
    train_diffs,
    rasterized=True,
    marker="o",
    alpha=0.1,
    c=np.log10(train_y_pred),
)
fig.colorbar(points, ax=ax, label="log10(Pred)")
ax.set_yscale("symlog", linthreshy=0.00001)
ax.set_ylabel("Pred - Obs")
ax.set_xlabel("log10 Obs")
ax.set_title("Training Data")

In [None]:
# log10 predictions against log10 observations for the training data, with
# the 1:1 line drawn on top.
# NOTE(review): `mask` is computed here but never applied in this cell - all
# training samples are plotted. Confirm whether the plot was meant to be
# restricted to observations above 0.01.
mask = y_train > 0.01
plt.figure(figsize=(30, 15))
plt.scatter(
    np.log10(y_train),
    np.log10(train_y_pred),
    rasterized=True,
    marker="o",
    alpha=0.01,
    c=np.log10(train_y_pred),
)
plt.colorbar(label="log10(Pred)")
# 1:1 reference line (observation against itself).
plt.plot(np.log10(y_train), np.log10(y_train))
# plt.yscale('symlog', linthreshy=0.00001);
plt.ylabel("log10 Pred")
plt.xlabel("log10 Obs")
plt.title("Training Data")