In [None]:
import gc
import logging
import sys
import warnings
from functools import partial

import matplotlib as mpl
import numpy as np
import pandas as pd
from joblib.parallel import parallel_backend
from loguru import logger as loguru_logger
from wildfires.qstat import get_ncpus

from empirical_fire_modelling.configuration import Experiment
from empirical_fire_modelling.data import (
    get_data,
    get_endog_exog_mask,
    get_experiment_split_data,
    get_first_cube_datetimes,
)
from empirical_fire_modelling.logging_config import enable_logging
from empirical_fire_modelling.model import get_model, get_model_scores
from empirical_fire_modelling.utils import tqdm

# Plot styling is read from the matplotlibrc file one directory up.
mpl.rc_file("../matplotlibrc")

# loguru: enable the "alepython" logger, drop the existing handlers and send
# WARNING-and-above records to stderr.
loguru_logger.enable("alepython")
loguru_logger.remove()
loguru_logger.add(sys.stderr, level="WARNING")

# Standard-library logging for this notebook, also at WARNING level.
logger = logging.getLogger(__name__)
enable_logging(level="WARNING")

# Warning messages to silence. Each entry is a regular expression that is
# matched against the start of the warning text.
for ignored_message in (
    ".*Collapsing a non-contiguous coordinate.*",
    ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*",
    ".*guessing contiguous bounds.*",
    'Setting feature_perturbation = "tree_path_dependent".*',
):
    warnings.filterwarnings("ignore", ignored_message)

In [None]:
# Experiment whose cached data split and fitted model are loaded below.
experiment = Experiment["15VEG_FAPAR_MON"]

In [None]:
# Cache-only policy for the train/test split: check the store first, then load.
# NOTE(review): presumably the check raises when nothing is cached - confirm.
get_experiment_split_data.check_in_store(experiment)
X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

# Same cache-only policy for the underlying data.
get_data(experiment, cache_check=True)
endog_data, exog_data, master_mask = get_endog_exog_mask(experiment)

# Same cache-only policy for the fitted model.
get_model(X_train, y_train, cache_check=True)
model = get_model(X_train, y_train)

# The datetimes are derived from the element at index 3 of the get_data() result.
datetimes = get_first_cube_datetimes(get_data(experiment)[3])

# Summary of the temporal extent; one entry of `datetimes` is treated as one
# month (hence the division by 12 for years).
n_months = len(datetimes)
first_time, last_time = datetimes[0], datetimes[-1]

print("Nr. of months:", n_months)
print("Nr. of years:", n_months / 12)
print("30% of years:", 0.3 * n_months / 12)
print("First time:", first_time.year, first_time.month)
print("Last time:", last_time.year, last_time.month)

In [None]:
# Scores of the cached model on the train/test split loaded above; the bare
# expression on the last line displays them as the cell output.
normal_scores = get_model_scores(model, X_test, X_train, y_test, y_train)
normal_scores

#### Go from DataFrame and Series back to MaskedArrays

In [None]:
# Positions, in the raveled (flattened) array, of the entries where
# `master_mask` is False.
mm_valid_indices = np.flatnonzero(~master_mask.ravel())

In [None]:
def _to_masked_grid(valid_values):
    """Scatter flat values into a fully masked array shaped like `master_mask`.

    The values are written at the raveled positions in `mm_valid_indices`;
    every other element stays masked.
    """
    grid = np.ma.MaskedArray(np.zeros_like(master_mask, dtype=np.float64), mask=True)
    # NOTE(review): relies on MaskedArray.ravel() returning a view that shares
    # data and mask with `grid`, so the assignment writes through and unmasks
    # the assigned positions - confirm for the NumPy version in use.
    grid.ravel()[mm_valid_indices] = valid_values
    return grid


# Target variable, then one masked array per predictor column.
mm_y = _to_masked_grid(endog_data.values)

mm_X_data = {
    column: _to_masked_grid(exog_data[column].values)
    for column in tqdm(exog_data.columns)
}

#### Hold out selected years as a test set and refit the model on the remaining years

In [None]:
def temporal_fitting(test_years):
    """Refit the model with whole years held out and print the resulting scores.

    Every entry of the module-level `datetimes` whose year is in `test_years`
    is assigned to the test set; all other entries form the training set. The
    split is applied along the first axis of the masked arrays in `mm_X_data`
    and `mm_y`, masked elements are dropped, and the resulting frames are
    handed to `get_model` and `get_model_scores`.

    Args:
        test_years: Container of years to hold out; membership is tested
            against the `.year` attribute of each entry of `datetimes`.

    Returns:
        None. A summary of the split and the scores are printed.

    """

    def _select_valid(masked, time_inds):
        """Index `masked` with `time_inds` and return its unmasked values."""
        subset = masked[time_inds]
        return subset.data[~subset.mask]

    # One flag per time step: True if that step belongs to a held-out year.
    held_out = [dt.year in test_years for dt in datetimes]
    temporal_test_inds = [i for i, is_test in enumerate(held_out) if is_test]
    temporal_train_inds = [i for i, is_test in enumerate(held_out) if not is_test]
    train_years = tuple(sorted({datetimes[i].year for i in temporal_train_inds}))

    n_test = len(temporal_test_inds)
    n_train = len(temporal_train_inds)
    n_total = len(datetimes)

    print("test years:", test_years)
    print("train years:", train_years)
    print("Nr. test inds:", n_test)
    print("Nr. train inds:", n_train)
    assert n_test + n_train == n_total
    print(f"Test % of total: {100 * n_test / n_total:0.1f}")

    # Predictor frames: one column per variable, in the order of `mm_X_data`.
    temporal_X_train = pd.DataFrame(
        {
            variable: _select_valid(mm_variable, temporal_train_inds)
            for variable, mm_variable in mm_X_data.items()
        }
    )
    temporal_X_test = pd.DataFrame(
        {
            variable: _select_valid(mm_variable, temporal_test_inds)
            for variable, mm_variable in mm_X_data.items()
        }
    )

    # The intermediate per-variable arrays are no longer referenced here.
    gc.collect()

    # Target series, named like the original endogenous data.
    target_name = endog_data.name
    temporal_y_train = pd.Series(
        _select_valid(mm_y, temporal_train_inds), name=target_name
    )
    temporal_y_test = pd.Series(
        _select_valid(mm_y, temporal_test_inds), name=target_name
    )

    # Fit on the training years, using a threading backend sized by get_ncpus().
    threaded_backend = partial(parallel_backend, "threading", n_jobs=get_ncpus())
    temporal_model = get_model(
        temporal_X_train,
        temporal_y_train,
        parallel_backend_call=threaded_backend,
    )

    temporal_scores = get_model_scores(
        temporal_model,
        temporal_X_test,
        temporal_X_train,
        temporal_y_test,
        temporal_y_train,
    )
    print(temporal_scores)

In [None]:
# Hold out the years 2009-2012 (the range end, 2013, is exclusive).
test_years = tuple(range(2009, 2013))
temporal_fitting(test_years)

In [None]:
# Hold out the years 2016-2019 (the range end, 2020, is exclusive).
test_years = tuple(range(2016, 2020))
temporal_fitting(test_years)