## Setup

In [None]:
# NOTE(review): every helper used below (np, plt, ticker, get_offset_data, ...)
# is presumably supplied by this star import — confirm against `specific`.
from specific import *

# Experiment label looked up via `PROJECT_DIR.name`; it is interpolated into
# the y-axis label of the observation/prediction histogram further down.
exp_name = experiment_name_dict[PROJECT_DIR.name]

### Get shifted data

In [None]:
# Unpack the six objects returned by `get_offset_data()`. Of these, only
# `master_mask` is referenced again in the cells shown here.
endog_data, exog_data, master_mask, filled_datasets, masked_datasets, land_mask = (
    get_offset_data()
)

### Retrieve previous results from the 'model' notebook

In [None]:
# Load the train/validation split from `data_split_cache` and obtain the model
# via `get_model()`.
X_train, X_val, y_train, y_val = data_split_cache.load()
rf = get_model()

# Pass the target values through `get_mm_data` together with `master_mask`
# (train first, then validation).
masked_train_data, masked_val_data = [
    get_mm_data(targets.values, master_mask, split_name)
    for targets, split_name in ((y_train, "train"), (y_val, "val"))
]

### Predict out-of-sample BA

In [None]:
# Use the same worker count (`get_ncpus()`) for the model's `n_jobs` and for
# the "threading" backend selected through `parallel_backend`.
n_threads = get_ncpus()
rf.n_jobs = n_threads
with parallel_backend("threading", n_jobs=n_threads):
    raw_val_predictions = rf.predict(X_val)
    predicted_ba = get_mm_data(raw_val_predictions, master_mask, "val")

### Histogram of Predictions vs. Observations

In [None]:
# Extract the samples of predictions and observations via `get_unmasked`;
# both must have the same shape.
f_pred = get_unmasked(predicted_ba)
f_obs = get_unmasked(masked_val_data)
assert f_pred.shape == f_obs.shape
print("N samples:", f_pred.shape)

# Print mean / min / max, observations first and predictions second.
for stat_label, stat_func in (("Mean", np.mean), ("Min", np.min), ("Max", np.max)):
    for series_label, series in (("obs.", f_obs), ("pred.", f_pred)):
        print(f"{stat_label} out-of-sample {series_label}: {stat_func(series):0.2e}")

In [None]:
# Distinct observed values lying below the smallest prediction, paired with
# the number of times each one occurs.
obs_below_min_pred = f_obs[f_obs < np.min(f_pred)]
low_values, low_counts = np.unique(obs_below_min_pred, return_counts=True)
dict(zip(low_values, low_counts)).items()

In [None]:
# Same value -> count mapping as above, with each count rendered in
# exponent notation through `format(count, "e")`.
low_values, low_counts = np.unique(
    f_obs[f_obs < np.min(f_pred)], return_counts=True
)
{value: format(count, "e") for value, count in zip(low_values, low_counts)}

In [None]:
# Fraction of all out-of-sample samples whose observation is exactly 0.0 and
# lies below the smallest prediction.
low_values, low_counts = np.unique(
    f_obs[f_obs < np.min(f_pred)], return_counts=True
)
# `.get(0.0, 0)` yields a fraction of 0 instead of raising KeyError when no
# zero-valued observation falls below the minimum prediction.
dict(zip(low_values, low_counts)).get(0.0, 0) / f_pred.shape[0]

In [None]:
# Two panels sharing the y-axis: a narrow left panel holding a single bin and
# a wide right panel that is filled in further down in this cell.
fig, axes = plt.subplots(
    nrows=1,
    ncols=2,
    sharey=True,
    figsize=(7, 4),
    gridspec_kw={"width_ratios": [0.07, 1]},
)

# Upper edge of the left panel's only bin; also reused below as the first
# edge of the right panel's bins.
lower_range_lim = 2.2e-5

# One column of observations and one of predictions, so `hist` draws the two
# series side by side.
obs_column = f_obs.reshape(-1, 1)
pred_column = f_pred.reshape(-1, 1)
axes[0].hist(
    np.hstack((obs_column, pred_column)),
    bins=[0, lower_range_lim],
    histtype="bar",
    label=["Obs.", "Pred."],
)
axes[0].set_xlim(0, 2.1e-5)


@ticker.FuncFormatter
def major_formatter(x, pos):
    """Format a tick value as a compact power-of-ten label.

    Parameters
    ----------
    x : float
        Tick value.
    pos : int
        Tick position (unused, required by the `FuncFormatter` signature).

    Returns
    -------
    str
        "0" for zero; a mathtext label such as "$10^{-5}$" or
        "$2 \\times 10^{-5}$" when the coefficient is (numerically) an
        integer; otherwise the value in "0.1e" notation.
    """
    if x == 0:
        # Formatters are expected to return strings, not numbers.
        return "0"
    if not np.isfinite(x) or x < 0:
        # log10 is undefined here; fall back to plain formatting rather than
        # failing in math.floor(nan).
        return f"{x:0.1e}"

    exp = math.floor(np.log10(x))
    coeff = float(x / 10 ** exp)
    nearest = int(round(coeff))
    # Compare with a tolerance so that round-off in the division (e.g. a
    # coefficient of 3.0000000000000004) does not defeat the integer test.
    if math.isclose(coeff, nearest, rel_tol=1e-9):
        if nearest == 10:
            # log10 landed just below the next decade; renormalise.
            nearest, exp = 1, exp + 1
        if nearest == 1:
            return fr"$10^{{{exp}}}$"
        return fr"${nearest} \times 10^{{{exp}}}$"
    return f"{x:0.1e}"


left_ax, right_ax = axes

left_ax.xaxis.set_major_formatter(major_formatter)

# Right panel: 15 geometrically spaced bin edges running from
# `lower_range_lim` up to the largest observed or predicted value.
largest_value = max(np.max(f_pred), np.max(f_obs))
log_bin_edges = np.geomspace(lower_range_lim, largest_value, num=15)
right_ax.hist(
    np.hstack((f_obs.reshape(-1, 1), f_pred.reshape(-1, 1))),
    bins=log_bin_edges,
    histtype="bar",
    label=["Obs.", "Pred."],
)
right_ax.set_xscale("log")
right_ax.set_xlim(2e-5, 1)

# Identical logarithmic count axis and grid styling for both panels.
for ax in axes:
    ax.set_yscale("log")
    ax.grid(alpha=0.4, linestyle="--")
    ax.set_ylim(3e1, 6e5)

right_ax.legend(loc="best")

left_ax.set_ylabel(f"counts (out-of-sample, {exp_name} model)")
right_ax.set_xlabel("BA")

# Negative horizontal padding pulls the two panels together.
fig.tight_layout(w_pad=-1.6)
figure_saver.save_figure(fig, "obs_pred_hist", sub_directory="predictions")

In [None]:
with figure_saver("obs_pred_comp", sub_directory="predictions"):
    # Mean over axis 0 of (observed - predicted).
    mean_difference = np.mean(masked_val_data - predicted_ba, axis=0)
    comparison_fig = plt.figure(figsize=(5.1, 2.3))
    cube_plotting(
        mean_difference,
        fig=comparison_fig,
        cmap="BrBG",
        cmap_midpoint=0,
        cmap_symmetric=False,
        boundaries=[-0.01, -0.001, -1e-4, 0, 0.001, 0.01, 0.02],
        colorbar_kwargs={"label": "Ob. - Pr."},
        title="",
    )

In [None]:
# Mean (observed - predicted) over axis 0, divided by the mean observation
# over the same axis.
mean_difference = np.mean(masked_val_data - predicted_ba, axis=0)
mean_observed = np.mean(masked_val_data, axis=0)
_ = cube_plotting(
    mean_difference / mean_observed,
    cmap="brewer_RdYlBu_11",
    cmap_symmetric=False,
    boundaries=[-5, -4, -3, -2, 0, 1e-2, 1e-1],
    cmap_midpoint=0,
)

In [None]:
# Hand the predictions, the validation observations and the figure saver to
# `ba_plotting`.
# NOTE(review): presumably this draws and saves BA comparison figures —
# confirm in the definition of `ba_plotting`.
ba_plotting(predicted_ba, masked_val_data, figure_saver)

In [None]:
u_pre = get_unmasked(predicted_ba)
u_val = get_unmasked(masked_val_data)

min_non_zero_val = u_val[u_val > 0].min()

# x edges: one bin from 0 to the smallest non-zero observation, followed by
# 99 geometrically spaced bins up to 1 (101 edges in total).
x_edges = np.append(0, np.geomspace(min_non_zero_val, 1, 100))
# y edges: 100 geometrically spaced bins spanning the predictions. Uses the
# `u_pre` computed in this cell rather than `f_pred` from an earlier cell, so
# the cell does not depend on state left behind elsewhere.
y_edges = np.geomspace(np.min(u_pre), np.max(u_pre), 100 + 1)

h = np.histogram2d(u_val, u_pre, bins=[x_edges, y_edges])[0]

fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
img = ax.pcolor(
    x_edges,
    y_edges,
    h.T,  # histogram2d returns (x, y); pcolor expects rows along y.
    norm=LogNorm(),
)

# Plot diagonal 1:1 line over the range covered by both series. NumPy
# reductions avoid iterating over the full arrays in Python.
diag_start = max(np.min(u_val), np.min(u_pre))
diag_stop = min(np.max(u_val), np.max(u_pre))
diagonal = np.linspace(diag_start, diag_stop, 50)
ax.plot(diagonal, diagonal, linestyle="--", c="C3", lw=2)

# Linear below the smallest non-zero observation so that 0 can be shown.
ax.set_xscale("symlog", linthresh=min_non_zero_val, linscale=2e-1, subs=range(2, 10))
ax.set_yscale("log")

# Number of spaces used by `offset_simple_sci_format` to shift tick labels.
spacing = 5


def offset_simple_sci_format(x, *args, **kwargs):
    """Format `x` with `simple_sci_format`, padding two specific labels.

    The label for a value close to 1e-5 gets `spacing` spaces prepended and
    the label for a value close to 0 gets `spacing` spaces appended; every
    other label is returned unchanged. Extra positional and keyword
    arguments are forwarded to `simple_sci_format`.

    NOTE(review): presumably the padding keeps the adjacent "0" and "1e-5"
    tick labels from overlapping — confirm against the rendered figure.
    """
    label = simple_sci_format(x, *args, **kwargs)
    padding = " " * spacing
    if np.isclose(x, 1e-5):
        return padding + label
    if np.isclose(x, 0):
        return label + padding
    return label


# The x-axis uses the padded formatter; the y-axis and the colorbar use
# `simple_sci_format` directly. Each gets its own FuncFormatter instance.
ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: offset_simple_sci_format(x))
)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: simple_sci_format(x)))

ax.set_xlabel("Observed (BA)")
ax.set_ylabel("Predicted (BA)")

ax.grid()
ax.set_axisbelow(True)

fig.colorbar(
    img,
    shrink=0.7,
    aspect=30,
    format=ticker.FuncFormatter(lambda x, pos: simple_sci_format(x)),
    pad=0.02,
    label="samples",
)
# Save the explicit `fig` created above instead of whatever pyplot currently
# considers the active figure.
figure_saver.save_figure(fig, "obs_pred_bin", sub_directory="predictions")

In [None]:
u_val = get_unmasked(masked_val_data)
u_pre = get_unmasked(predicted_ba)

# Drop every sample where either value is numerically zero; both axes of the
# hexbin plot are logarithmic.
mask = np.isclose(u_val, 0) | np.isclose(u_pre, 0)
u_val = u_val[~mask]
u_pre = u_pre[~mask]

# Explicit figure/axes handles instead of the implicit pyplot "current"
# figure, axes and image.
hex_fig, hex_ax = plt.subplots(figsize=(6, 4), dpi=200)
hex_img = hex_ax.hexbin(u_val, u_pre, bins="log", xscale="log", yscale="log")

# Plot diagonal 1:1 line over the range covered by both series. NumPy
# reductions avoid iterating over the full arrays in Python.
diag_start = max(np.min(u_val), np.min(u_pre))
diag_stop = min(np.max(u_val), np.max(u_pre))
diagonal = np.linspace(diag_start, diag_stop, 50)
hex_ax.plot(diagonal, diagonal, linestyle="--", c="C3", lw=2)

hex_ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: simple_sci_format(x))
)
hex_ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: simple_sci_format(x))
)

hex_ax.set_xlabel("Observed (BA)")
hex_ax.set_ylabel("Predicted (BA)")
hex_fig.colorbar(
    hex_img,
    shrink=0.7,
    aspect=30,
    format=ticker.FuncFormatter(lambda x, pos: simple_sci_format(x)),
    pad=0.02,
    label="samples",
)
# Saving is intentionally disabled: the name "obs_pred_bin" is already used
# for the 2D-histogram figure saved by the previous cell.
# figure_saver.save_figure(plt.gcf(), "obs_pred_bin", sub_directory="predictions")

### Example timeseries

In [None]:
# Alternative selection kept for reference:
# valid_indices = list(zip(*np.where(np.sum(~predicted_ba.mask, axis=0) > 5)))

# Keep locations whose largest predicted value along axis 0 exceeds 0.1 and
# that have more than 4 unmasked entries along axis 0.
# NOTE(review): `.data` ignores the mask, so masked entries also take part in
# the maximum — confirm this is intended.
has_large_prediction = np.max(predicted_ba.data, axis=0) > 0.1
has_enough_samples = np.sum(~predicted_ba.mask, axis=0) > 4
valid_indices = list(zip(*np.where(has_large_prediction & has_enough_samples)))
print("Nr. valid:", len(valid_indices))

# Reproducibly draw at most 100 distinct locations (fixed seed 0).
n_examples = min(100, len(valid_indices))
chosen_positions = np.random.RandomState(0).choice(
    len(valid_indices), n_examples, replace=False
)
for i in chosen_positions:
    # Full slice along axis 0 at the chosen location.
    location = (slice(None), *valid_indices[i])
    plt.figure(figsize=(7, 3))
    plt.plot(masked_val_data[location], label="obs", marker="o")
    plt.plot(predicted_ba[location], label="pred", marker="x")
    plt.legend(loc="best")