## Initialisation

In [None]:
import logging
import os
import warnings
from itertools import product

import matplotlib as mpl
import numpy as np
import pandas as pd
from joblib import Memory
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split

import wildfires.analysis
from wildfires.analysis import *
from wildfires.dask_cx1 import get_client
from wildfires.data import *
from wildfires.logging_config import enable_logging

# Configure FigureSaver: switch its debug flag on and point its output directory at
# ~/tmp/time_lags, creating the directory if it does not exist yet.
FigureSaver.debug = True
FigureSaver.directory = os.path.expanduser(os.path.join("~", "tmp", "time_lags"))
os.makedirs(FigureSaver.directory, exist_ok=True)
logger = logging.getLogger(__name__)

enable_logging("jupyter")
# The `message` argument is a regular expression matched against the start of the
# warning text, hence the leading ".*" to allow a match anywhere in the message.
# The literal text is spelled out in full and followed by ".*"; a bare trailing "*"
# would only quantify the character immediately before it.
warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds.*")

# Plotting defaults.
normal_coast_linewidth = 0.5
mpl.rc("figure", figsize=(14, 6))
mpl.rc("font", size=9.0)

# Seed NumPy's global random number generator so that runs are repeatable.
np.random.seed(1)

# NOTE(review): `get_memory` is not imported explicitly here, so it presumably comes
# from one of the wildcard imports - confirm.
memory = get_memory("analysis_time_lags", verbose=100)

## Creating the Data Structures used for Fitting

In [None]:
# Temporal shifts (in months, going by the variable names below) requested for the
# lagged datasets. Defined once and reused so that all lagged datasets stay in sync.
shift_months = [1, 3, 6, 12, 24]

# Base set of predictor variables.
selection_variables = (
    "VOD Ku-band -3 Month",
    "SIF",
    "VOD Ku-band -1 Month",
    "Dry Day Period -3 Month",
    "FAPAR",
    "pftHerb",
    "LAI -1 Month",
    "popd",
    "Dry Day Period -24 Month",
    "pftCrop",
    "FAPAR -1 Month",
    "FAPAR -24 Month",
    "Max Temp",
    "Dry Day Period -6 Month",
    "VOD Ku-band -6 Month",
)

# Extended set: the base variables plus five additional ones.
ext_selection_variables = selection_variables + (
    "Dry Day Period -1 Month",
    "FAPAR -6 Month",
    "ShrubAll",
    "SWI(1)",
    "TreeAll",
)

# Data requested with the base variables and without an explicit `shift_months`.
(
    endog_data,
    exog_data,
    master_mask,
    filled_datasets,
    masked_datasets,
    land_mask,
) = wildfires.analysis.time_lags.get_data(selection_variables=selection_variables)

# "s_" prefix: base variables with the shifts in `shift_months`.
(
    s_endog_data,
    s_exog_data,
    s_master_mask,
    s_filled_datasets,
    s_masked_datasets,
    s_land_mask,
) = wildfires.analysis.time_lags.get_data(
    shift_months=shift_months, selection_variables=selection_variables
)

# "e_s_" prefix: extended variables with the shifts in `shift_months`.
(
    e_s_endog_data,
    e_s_exog_data,
    e_s_master_mask,
    e_s_filled_datasets,
    e_s_masked_datasets,
    e_s_land_mask,
) = wildfires.analysis.time_lags.get_data(
    shift_months=shift_months, selection_variables=ext_selection_variables
)

## Hyperparameter Optimisation Using CX1

In [None]:
# Define the training and test data (70% / 30% split, shuffled with a fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    exog_data, endog_data, random_state=1, shuffle=True, test_size=0.3
)

# Worker specifications.
specs = {"memory": "15GB", "walltime": "10:00:00", "cores": 8}
# Connect to an existing cluster with at least those specs.
client = get_client(**specs)

# Define the parameter space. Every key is a `RandomForestRegressor` keyword argument
# and every value is the list of candidate settings for it.
parameters_RF = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [3, 10, 20],
    # 1.0 means "consider all features", which is what "auto" meant for regressors.
    # The string "auto" was deprecated in scikit-learn 1.1 and later removed, whereas
    # the float form is accepted by old and new releases alike.
    "max_features": [1.0],
    "bootstrap": [False, True],
    "random_state": [1],
}


def fit_func(X, y, rf_params, cv=5, n_jobs=None):
    """Cross-validate a random forest regressor for one hyperparameter setting.

    Args:
        X: Feature data handed to `cross_val_score`.
        y: Target data handed to `cross_val_score`.
        rf_params (dict): Keyword arguments used to build the
            `RandomForestRegressor`.
        cv: Cross-validation strategy forwarded to `cross_val_score`. Defaults to
            5, the value that was previously hard-coded.
        n_jobs: Number of parallel jobs forwarded to `cross_val_score`. The
            default (None) matches the previous call, which did not pass it.
            Parallelism of the forest itself is controlled separately through
            an `n_jobs` entry in `rf_params`.

    Returns:
        The scores returned by `cross_val_score`.

    """
    rf = RandomForestRegressor(**rf_params)
    # Optionally fit model on all the data and store the fitted model using pickle.
    return cross_val_score(rf, X, y, cv=cv, n_jobs=n_jobs)


# fitting = CX1Fit(X_train, y_train, data_name="full_no_shift", param_grid=parameters_RF)
# output = fitting.get_best_model(timeout=60 * 60)
# The cluster fit above is disabled, so `output` is initialised explicitly. This keeps
# the evaluation cell further down from raising a NameError on a fresh kernel; it is
# simply skipped until a fitted model is available.
output = None

# Expand the grid into one keyword dictionary per hyperparameter combination. A list
# is built (rather than a generator) so that it can be inspected and reused.
rf_param_grid = [
    dict(zip(parameters_RF, params)) for params in product(*parameters_RF.values())
]

# Disabled sketch of the distributed evaluation (one `fit_func` call per combination):
# scores_list = client.gather(
#     client.map(lambda params: fit_func(X_train, y_train, params), rf_param_grid)
# )

# Summarise the grid instead of printing the data for every combination.
print(f"{len(rf_param_grid)} parameter combinations, starting with: {rf_param_grid[:3]}")

In [None]:
# `output` is expected to hold the result of the hyperparameter search (see the
# `get_best_model` call in the optimisation cell); nothing is evaluated if it is falsy.
if output:
    regr = output["model"]

    # Show the selected model. (This previously printed `estimator`, a name that is
    # not defined anywhere in the notebook.)
    print(regr)
    y_pred = regr.predict(X_test)

    # Carry out predictions on the training dataset to diagnose overfitting.
    y_pred_train = regr.predict(X_train)

    # Scores as returned by the model's `score` method on each split.
    results = {
        "R2_train": regr.score(X_train, y_train),
        "R2_test": regr.score(X_test, y_test),
    }

    model_name = "RF"
    print(f"{model_name} R2 train: {results['R2_train']}")
    print(f"{model_name} R2 test: {results['R2_test']}")

    # Forest-level importances, plus their spread across the individual trees.
    importances = regr.feature_importances_
    std = np.std([tree.feature_importances_ for tree in regr.estimators_], axis=0)

    importances_df = pd.DataFrame(
        {
            "Name": exog_data.columns.values,
            "Importance": importances,
            "Importance STD": std,
            # Relative spread; not finite where an importance is exactly zero.
            "Ratio": np.array(std) / np.array(importances),
        }
    )
    # Print the table sorted from most to least important feature.
    print(
        "\n"
        + str(
            importances_df.sort_values("Importance", ascending=False).to_string(
                index=False, float_format="{:0.3f}".format, line_width=200
            )
        )
    )