## Setup

In [None]:
from specific import *

### Get shifted data

In [None]:
# Unpack the six objects returned by get_offset_data().
# endog_data and exog_data are later passed to train_test_split as the
# targets (y) and predictors (X) respectively.
# NOTE(review): the remaining four values are not used in the visible part
# of this notebook; their exact contents are defined by get_offset_data().
(
    endog_data,
    exog_data,
    master_mask,
    filled_datasets,
    masked_datasets,
    land_mask,
) = get_offset_data()

In [None]:
# Create the client that is handed to dask_fit_combinations further down.
# NOTE(review): presumably a Dask distributed client - confirm in get_client().
client = get_client()
# Bare expression: shows the client's representation as the cell output.
client

### Define the training and test data

In [None]:
@data_split_cache
def get_split_data():
    """Return the shuffled 70/30 train/test partition of the notebook data.

    The predictors (``exog_data``) and targets (``endog_data``) are read from
    the notebook namespace and split with a fixed ``random_state=1``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {"random_state": 1, "shuffle": True, "test_size": 0.3}
    train_X, test_X, train_y, test_y = train_test_split(
        exog_data, endog_data, **split_options
    )
    return train_X, test_X, train_y, test_y


X_train, X_test, y_train, y_test = get_split_data()

## Fit combinations

In [None]:
# Show the available predictor column names as the cell output.
exog_data.columns

In [None]:
# Base names of the vegetation features, as returned by get_filled_names().
veg_features = get_filled_names(["VOD Ku-band", "LAI", "SIF", "FAPAR"])

# Column-name suffixes: no shift first, then the 1, 3, 6 and 9 month shifts.
shifts = [""] + [f" -{months} Month" for months in (1, 3, 6, 9)]

# One inner list per shift, holding every vegetation feature at that shift.
veg_lags = [[name + suffix for name in veg_features] for suffix in shifts]

# Every generated name must be present in exog_data.
assert all(feature in exog_data for lag_group in veg_lags for feature in lag_group)
veg_lags

In [None]:
# Features shared by every candidate combination.
base_features = (
    "Dry Day Period",
    "Max Temp",
    "pftCrop",
    "Dry Day Period -3 Month",
    "popd",
    "Dry Day Period -1 Month",
    "Dry Day Period -9 Month",
    "Dry Day Period -6 Month",
    "ShrubAll",
    "AGB Tree",
)

# product(*veg_lags) picks exactly one vegetation feature from each shift
# group; each pick is appended to the shared base features.
combinations = [
    base_features + tuple(veg_choice) for veg_choice in product(*veg_lags)
]

# 10 base features + one vegetation feature per shift group.
assert not any(len(combination) != 15 for combination in combinations)

len(combinations)

In [None]:
# Fit the regressor for every feature combination on the training data.
# The result is a mapping keyed by combination (it is iterated with .items()
# below); each value holds per-split "test_score" entries with "r2" and "mse".
# NOTE(review): param_dict, n_splits and CACHE_DIR are not defined in the
# visible cells - presumably provided by `from specific import *`; confirm.
scores = dask_fit_combinations(
    DaskRandomForestRegressor(**param_dict),
    X_train,
    y_train,
    client,
    combinations,
    n_splits=n_splits,
    # Use all but one of the reported CPUs, but never fewer than one.
    local_n_jobs=max(get_ncpus() - 1, 1),
    verbose=True,
    cache_dir=CACHE_DIR,
)

In [None]:
# Number of entries (one per fitted combination key) in the results mapping.
len(scores)

In [None]:
def _test_metric_per_split(all_scores, metric):
    """Map each combination to the list of its per-split test `metric` values."""
    return {
        combination: [
            result["test_score"][split][metric] for split in result["test_score"]
        ]
        for combination, result in all_scores.items()
    }


# Per-split test scores for every combination, one dict per metric.
r2_test_scores = _test_metric_per_split(scores, "r2")
mse_test_scores = _test_metric_per_split(scores, "mse")

In [None]:
# Combinations as an array, in the same order as the per-metric dicts.
keys = np.array(list(r2_test_scores.keys()))

# Average each combination's per-split scores (same ordering as `keys`).
mean_r2_test_scores = np.array(list(map(np.mean, r2_test_scores.values())))
mean_mse_test_scores = np.array(list(map(np.mean, mse_test_scores.values())))

In [None]:
# Order everything by mean R2, best (largest) first.
sort_indices = np.argsort(mean_r2_test_scores)[::-1]

# Apply the same permutation to all three parallel arrays.
keys, mean_r2_test_scores, mean_mse_test_scores = (
    values[sort_indices]
    for values in (keys, mean_r2_test_scores, mean_mse_test_scores)
)

In [None]:
# Plot the mean R2 scores (left axis) and the mean MSE scores (right axis,
# colour "C1") against the position in the sorted arrays.
fig, ax = plt.subplots()
ax.plot(mean_r2_test_scores)
# Second y-axis sharing the same x-axis, since the two metrics differ in scale.
ax2 = ax.twinx()
# Assign to `_` so the returned line list is not echoed as cell output.
_ = ax2.plot(mean_mse_test_scores, c="C1")

In [None]:
# Same plot as before, restricted to the first N entries of the sorted arrays
# (the N combinations with the highest mean R2).
N = 20
fig, ax = plt.subplots()
ax.plot(mean_r2_test_scores[:N])
# Second y-axis for the MSE curve.
ax2 = ax.twinx()
# Assign to `_` so the returned line list is not echoed as cell output.
_ = ax2.plot(mean_mse_test_scores[:N], c="C1")

In [None]:
# Largest mean R2 over all combinations.
np.max(mean_r2_test_scores)

In [None]:
# First entry of the array; after the descending sort this should match the
# maximum shown above (NOTE(review): assumes the scores contain no NaN).
mean_r2_test_scores[0]

In [None]:
# Print the features of the first (top-ranked) combination, one per line,
# in the order produced by sort_features().
print("\n".join(sort_features(list(keys[0]))))

In [None]:
# Per-split R2 scores of the top-ranked combination, together with their mean.
# `keys` rows are arrays, so convert back to a tuple for the dict lookup.
best_r2_scores = r2_test_scores[tuple(keys[0])]
best_r2_scores, np.mean(best_r2_scores)

### Impact of single vegetation variable inclusion on mean scores

In [None]:
# For each vegetation variable, group the mean R2 scores of all combinations
# by how many of the combination's features contain that variable's name,
# then draw one boxplot per variable (one box per count).
all_var_means = {}
for var in ["VOD", "LAI", "SIF", "FAPAR"]:
    # count -> mean R2 scores of the combinations with that many `var` features.
    # Each combination is visited exactly once; the previous version wrapped
    # this in an unused `for i in range(6)` loop, which appended every score
    # six times and thereby inflated the stored data and overplotted fliers.
    grouped_means = defaultdict(list)
    for key, mean_r2 in zip(keys, mean_r2_test_scores):
        count = sum(var in feature for feature in key)
        grouped_means[count].append(mean_r2)

    # The groups differ in size: building the frame from Series lets pandas
    # align them on their integer index and pad the shorter ones with NaN.
    # Columns are then put in ascending count order, 0 through 5 (one
    # vegetation feature is chosen per shift, so a variable can occur 0-5
    # times); a count that never occurs raises KeyError, as before.
    var_means = pd.DataFrame(
        {count: pd.Series(values) for count, values in grouped_means.items()}
    )[list(range(6))]
    all_var_means[var] = var_means

    # New figure per variable; DataFrame.boxplot draws on the current axes.
    plt.figure(figsize=(15, 7))
    var_means.boxplot()
    plt.title(var)