In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import parallel_backend
from wildfires.analysis import cube_plotting
from wildfires.dask_cx1 import DaskRandomForestRegressor
from wildfires.data import regions_GFED
from wildfires.utils import match_shape

from empirical_fire_modelling.configuration import Experiment
from empirical_fire_modelling.data import get_data
from empirical_fire_modelling.utils import get_client, tqdm

In [None]:
# Load the GFED regions object. Later cells read its `.data` (compared against
# region codes) and `.attributes["region_codes"]` (a name -> code mapping).
gfed_regions = regions_GFED()
# Bare expression so that the notebook displays the object's repr.
gfed_regions

In [None]:
# Plot the GFED regions on a wide (14 x 6) figure. Assigning to `_` keeps the
# return value of `cube_plotting` out of the cell output.
_ = cube_plotting(gfed_regions, fig=plt.figure(figsize=(14, 6)))

In [None]:
# NOTE(review): presumably the Dask client used by the `parallel_backend("dask")`
# fits further down, with the `fallback*` flags selecting a local (threaded)
# client when no cluster is available - confirm against `get_client`.
client = get_client(fallback=True, fallback_threaded=True)

In [None]:
# Keep only the first three items returned by `get_data` for the ALL experiment:
# the target (used as `y` below), the predictors (used as `X` below) and the
# mask that places the 1D data on the grid (see `get_map_data`).
endog_data, exog_data, master_mask = get_data(Experiment.ALL)[:3]

In [None]:
# Sanity check: the number of unmasked grid elements next to the shapes of the
# target and predictor data. `get_map_data` writes the 1D data into exactly the
# unmasked elements, so these need to be consistent.
np.sum(~master_mask), endog_data.shape, exog_data.shape

In [None]:
# Shape of the full mask; its first axis is the one reduced or sliced by the
# consistency checks and by `master_mask[0]` later in the notebook.
master_mask.shape

### Values are either all masked or all unmasked at each location, which is good

In [None]:
# Count the masked entries along the first axis for every location. If each
# location is either always or never masked, only 0 and `master_mask.shape[0]`
# can appear in the result.
np.unique(np.sum(master_mask, axis=0))

In [None]:
# Same property checked differently: every slice along the first axis must equal
# the first slice (`master_mask[:1]` keeps the leading axis so that it broadcasts).
np.all(np.all(master_mask[:1] == master_mask, axis=0))

In [None]:
def get_map_data(data_1d, master_mask):
    """Place flat data onto the grid described by `master_mask`.

    Args:
        data_1d: Values for the unmasked elements of `master_mask`, given in the
            order in which boolean indexing visits those elements.
        master_mask: Boolean array, True where no data is available.

    Returns:
        numpy.ma.MaskedArray: float64 array shaped like `master_mask` which
            holds `data_1d` at the unmasked elements. All other elements are
            masked (with 0 as the underlying value).

    """
    blank_values = np.zeros_like(master_mask, dtype=np.float64)
    fully_masked = np.ones_like(master_mask)
    gridded = np.ma.MaskedArray(blank_values, mask=fully_masked)

    # Assigning through the boolean index fills in the values and, at the same
    # time, unmasks precisely those elements.
    valid_elements = ~master_mask
    gridded[valid_elements] = data_1d
    return gridded

In [None]:
def get_gfed_region_cv_splits(X, y):
    """Yield leave-one-region-out splits of X and y based on the GFED regions.

    Relies on the notebook-level `gfed_regions` and `master_mask`. Every region
    listed in `gfed_regions.attributes["region_codes"]` except "Ocean" serves as
    the hold-out set once, with all remaining unmasked data used for training.

    Args:
        X (pandas.DataFrame): Predictors, with one row per unmasked element of
            `master_mask` (in the order visited by boolean indexing).
        y (pandas.Series): Target values, ordered like the rows of `X`.

    Yields:
        tuple: `(region_name, train_X, hold_out_X, train_y, hold_out_y)`. The X
            entries are float64 DataFrames with a fresh default index and the y
            entries are float64 arrays.

    """
    ignore_regions = ["Ocean"]

    # Loop-invariant quantities.
    valid = ~master_mask
    y_values = np.asarray(y.values, dtype=np.float64)

    for region_name, region_code in gfed_regions.attributes["region_codes"].items():
        if region_name in ignore_regions:
            continue

        # Select the region as the hold-out data.
        hold_out_selection = (
            match_shape(gfed_regions.data == region_code, master_mask.shape) & valid
        )
        train_selection = (
            match_shape(gfed_regions.data != region_code, master_mask.shape) & valid
        )

        # X and y only contain the unmasked elements (in boolean-indexing
        # order). Reducing the gridded selections to those elements therefore
        # yields 1D masks which pick the same values, in the same order, as
        # mapping the data onto the grid and selecting there - without building
        # a full gridded array for every column of every region.
        hold_out_1d = hold_out_selection[valid]
        train_1d = train_selection[valid]

        hold_out_y = y_values[hold_out_1d]
        train_y = y_values[train_1d]

        # Repeat for all columns in X.
        hold_out_X_data = {}
        train_X_data = {}
        for col in X:
            col_values = np.asarray(X[col].values, dtype=np.float64)

            hold_out_X_data[col] = col_values[hold_out_1d]
            train_X_data[col] = col_values[train_1d]

        hold_out_X = pd.DataFrame(hold_out_X_data)
        train_X = pd.DataFrame(train_X_data)

        yield region_name, train_X, hold_out_X, train_y, hold_out_y

In [None]:
# Leave-one-region-out evaluation using fairly deep trees (max_depth=15).
# NOTE: the loop variables are notebook globals, so once the loop has finished
# `train_X`, `train_y`, `hold_out_X` and `hold_out_y` still hold the split for
# the last region that was yielded.
for region_name, train_X, hold_out_X, train_y, hold_out_y in get_gfed_region_cv_splits(
    exog_data, endog_data
):
    print(region_name, train_X.shape, hold_out_X.shape, train_y.shape, hold_out_y.shape)
    # A new forest for every region, so nothing carries over between splits.
    rf = DaskRandomForestRegressor(n_estimators=32, max_depth=15)
    # Select joblib's "dask" backend for the duration of the fit.
    with parallel_backend("dask"):
        rf.fit(train_X, train_y)
    # Hold-out score first, then the training score for comparison.
    # NOTE(review): presumably R2, as for scikit-learn regressors - confirm.
    print(rf.score(hold_out_X, hold_out_y))
    print(rf.score(train_X, train_y))

In [None]:
# Same leave-one-region-out evaluation, but using shallower trees (max_depth=8)
# to compare against the max_depth=15 run.
# NOTE: the loop variables are notebook globals, so once the loop has finished
# `train_y` and `hold_out_y` (plotted in the following cells) hold the split for
# the last region that was yielded.
for region_name, train_X, hold_out_X, train_y, hold_out_y in get_gfed_region_cv_splits(
    exog_data, endog_data
):
    print(region_name, train_X.shape, hold_out_X.shape, train_y.shape, hold_out_y.shape)
    # A new forest for every region, so nothing carries over between splits.
    rf = DaskRandomForestRegressor(n_estimators=32, max_depth=8)
    # Select joblib's "dask" backend for the duration of the fit.
    with parallel_backend("dask"):
        rf.fit(train_X, train_y)
    # Hold-out score first, then the training score for comparison.
    # NOTE(review): presumably R2, as for scikit-learn regressors - confirm.
    print(rf.score(hold_out_X, hold_out_y))
    print(rf.score(train_X, train_y))

In [None]:
# Distribution of the training targets, with log-scaled counts.
# NOTE: `train_y` is whatever the most recently executed cell left behind. When
# running top to bottom, that is the split for the last region of the loop above.
plt.hist(train_y, bins=30)
plt.yscale("log")

In [None]:
# Distribution of the hold-out targets, with log-scaled counts.
# NOTE: `hold_out_y` is left over from the last iteration of the most recently
# executed region loop, i.e. it belongs to the last region that was yielded.
plt.hist(hold_out_y, bins=30)
plt.yscale("log")

In [None]:
# Random train/test split by location: every valid location is assigned (with
# all of its entries along the first axis of `master_mask`) to either the
# training or the test set.
TRAIN_FRACTION = 0.7  # Share of the valid locations used for training.
SPLIT_SEED = 0  # Fixed seed, so that the split is reproducible.

# One slice along the first axis is used to describe the valid locations; this
# relies on the mask being identical along that axis (checked earlier on).
master_mask_single = master_mask[0]
valid_indices = np.where(~master_mask_single.ravel())[0]
n_valid_indices = len(valid_indices)
n_train = int(n_valid_indices * TRAIN_FRACTION)
n_test = n_valid_indices - n_train
print(n_train, n_test, n_valid_indices)

shuffled_indices = valid_indices.copy()
np.random.default_rng(SPLIT_SEED).shuffle(shuffled_indices)

train_indices = shuffled_indices[:n_train]
test_indices = shuffled_indices[n_train:]

# In these masks, False marks the locations that belong to the respective set.
# `.flat` always writes into the array itself, whereas `.ravel()` returns a copy
# for non-contiguous arrays, in which case the assignment would be lost silently.
train_mask_single = np.ones_like(master_mask_single)
train_mask_single.flat[train_indices] = False

test_mask_single = np.ones_like(master_mask_single)
test_mask_single.flat[test_indices] = False

# Expand the per-location selections to the full mask shape once, instead of
# once per column.
train_selection_3d = match_shape(~train_mask_single, master_mask.shape)
test_selection_3d = match_shape(~test_mask_single, master_mask.shape)

# Map the target onto the grid once and select from it twice.
endog_map = get_map_data(endog_data.values, master_mask)
train_y = endog_map[train_selection_3d]
test_y = endog_map[test_selection_3d]

train_X_data = {}
test_X_data = {}

for col in tqdm(exog_data.columns):
    # One gridded array per column, shared by the train and test selections.
    col_map = get_map_data(exog_data[col].values, master_mask)
    train_X_data[col] = col_map[train_selection_3d]
    test_X_data[col] = col_map[test_selection_3d]

train_X = pd.DataFrame(train_X_data)
test_X = pd.DataFrame(test_X_data)

In [None]:
# Shapes of the random train/test split; the number of rows in each X should
# match the length of the corresponding y.
train_X.shape, train_y.shape, test_X.shape, test_y.shape

In [None]:
# Map of the training mask (False marks the locations assigned to training).
_ = cube_plotting(train_mask_single)

In [None]:
# Map of the test mask (False marks the locations assigned to testing).
_ = cube_plotting(test_mask_single)

In [None]:
# Map of the first slice of the overall mask, for comparison with the train and
# test masks derived from it.
_ = cube_plotting(master_mask_single)

In [None]:
# Fit on the random, location-based training split (fewer trees than in the
# region-based runs: 10 instead of 32).
rf = DaskRandomForestRegressor(n_estimators=10, max_depth=15)
# Select joblib's "dask" backend for the duration of the fit.
with parallel_backend("dask"):
    rf.fit(train_X, train_y)
# Test score first, then the training score for comparison.
# NOTE(review): presumably R2, as for scikit-learn regressors - confirm.
print(rf.score(test_X, test_y))
print(rf.score(train_X, train_y))