In [None]:
# Load IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the carbonplan_trace code under development) are re-imported before
# each cell runs and edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS
from carbonplan_trace.v1 import load
import carbonplan_trace.v1.model as m
import pandas as pd
from carbonplan_trace.v1.landsat_preprocess import access_credentials
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Fetch the credential pair through the project helper; both values are
# forwarded to load.training() when the training data is read.
# NOTE(review): presumably AWS credentials for the S3 bucket used later — confirm.
access_key_id, secret_access_key = access_credentials()

In [None]:
# We train one model per realm.

# realms = list(REALM_GROUPINGS.keys())
# Only Australia is used here as an example; switch to the full list above when rerunning for all realms.
realms = ["australia"]

In [None]:
# This block of code is used for generating different parameter sets for hyperparameter optimization (HPO) of the model.
# The params here are for the xgboost model.

# import itertools

# def product_dict(**kwargs):
#     keys = kwargs.keys()
#     vals = kwargs.values()
#     for instance in itertools.product(*vals):
#         yield dict(zip(keys, instance))


# param_set = {
#     "learning_rate": [0.07, 0.05, 0.03],
#     "max_depth": [10, 12, 14],
#     "colsample_bytree": [0.5, 0.7, 0.9],
#     "subsample": [0.5, 0.7, 0.9],
#     "min_child_weight": [2, 4, 6],
#     "lambda": [1, 1.5, 2],
#     "alpha": [0, 0.5, 1],
#     "gamma": [0, 0.5, 1],
# }

# groupings = [
#     ["learning_rate"],
#     ["max_depth"],
#     ["colsample_bytree", "subsample", "min_child_weight"],
#     ["lambda", "alpha", "gamma"],
# ]

# dims = [list(range(len(param_set[g[0]]))) for g in groupings]
# param_set_list = []
# for orders in list(itertools.product(*dims)):
#     d = {}
#     for o, g in zip(orders, groupings):
#         for k in g:
#             d[k] = param_set[k][o]
#     param_set_list.append(d)

In [None]:
# helper functions for assessing model performances


def get_all_prediction_result(model, df_train, df_test, df_val):
    """Attach model predictions to the train, test and validation frames.

    Every frame gets a "biomass_pred" column (created or overwritten) holding
    the output of ``model._predict`` for that frame. The frames are modified in
    place and returned as a tuple in the order (train, test, val).

    NOTE(review): the training loop in this notebook calls ``model.predict``
    rather than ``model._predict`` — confirm which one is intended here.
    """
    frames = (df_train, df_test, df_val)
    for frame in frames:
        frame["biomass_pred"] = model._predict(frame)

    return frames


def calculate_temporal_variability(df, y1=2007, y2=2008, precision=3):
    """Summarise how much biomass differs between two years at matching locations.

    Rows of ``df`` belonging to year ``y1`` and year ``y2`` are paired with an
    inner join on latitude/longitude rounded to ``precision`` decimals, and the
    difference ``biomass(y2) - biomass(y1)`` is aggregated over all pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "year", "lat", "lon" and "biomass". It is not modified.
    y1, y2 : int
        The two years to compare (values of the "year" column).
    precision : int
        Number of decimals lat/lon are rounded to before matching.

    Returns
    -------
    dict
        ``{"mae": mean absolute difference, "me": mean signed difference}``.
        Both are NaN when no location is present in both years.
    """

    def _rows_for_year(year):
        # Select one year and add the rounded coordinates used as join keys.
        # .assign() builds a new frame, so no columns are written onto a slice
        # of `df` (which would trigger pandas' SettingWithCopyWarning).
        rows = df.loc[df.year == year, ["lat", "lon", "biomass"]]
        return rows.assign(
            lat_round=rows.lat.round(precision),
            lon_round=rows.lon.round(precision),
        )

    # Inner join: only locations seen in both years survive. If several rows of
    # one year round to the same location, each is paired with every matching
    # row of the other year.
    merged = _rows_for_year(y1).merge(
        _rows_for_year(y2),
        on=["lat_round", "lon_round"],
        suffixes=("_year1", "_year2"),
    )

    diff = merged.biomass_year2 - merged.biomass_year1

    return {"mae": diff.abs().mean(), "me": diff.mean()}


def plot_scatter(sub, title, n=500000):
    """Scatter predicted against true biomass using the pyplot interface.

    At most ``n`` randomly sampled rows of ``sub`` are drawn, reading the
    "biomass" and "biomass_pred" columns. Both axes run from -10 up to the 95th
    percentile of the sampled true biomass, and a black 1:1 line is added as a
    reference. ``title`` is used as the plot title.
    """
    lower = -10
    points = sub.sample(n=min(len(sub), n))
    upper = points.biomass.quantile(0.95)
    one_to_one = [lower, upper]

    plt.scatter(points.biomass, points.biomass_pred, s=1, alpha=0.03)
    # reference line where prediction equals truth
    plt.plot(one_to_one, one_to_one, "k")

    plt.xlabel("True Biomass (Mg/ha)")
    plt.ylabel("Predicted Biomass (Mg/ha)")
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.title(title)

In [None]:
# Train (or load) one model per model type and realm, evaluate it on every
# data split, and save diagnostic figures. The per-split scores are collected
# as dicts and turned into a DataFrame at the end of the cell.
scores = []
# Whether to randomly split the train/test data or to split train/test based on year.
# Doesn't seem to make too big of a difference on validation performance.
random_split = True
# Whether to reload the training data from individual years, or use the compiled data directly.
# Only needs to be True when the training data is re-generated.
reload = False
# Whether to overwrite the models already trained.
overwrite = False

for model_class in [m.random_forest_model, m.xgb_model]:
    # Tag used in figure file names so the plots of one model type do not
    # overwrite those of another model type for the same realm.
    model_tag = model_class.__name__

    for realm in realms:
        print(f"Building model for {realm} realm")

        # load data, add year information
        df = load.training(
            realm=realm,
            reload=reload,
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
        )
        # df.size is an element count, not a byte count; memory_usage() reports
        # the actual in-memory footprint (assumes df is a pandas DataFrame).
        print(f"    size of entire df is {round(df.memory_usage().sum() / 1e9, 2)}GB")

        for strategy in ["none"]:  # ["first", "last", "none"]:
            # strategy = "first" means that the first year is used for validation, and "last" means
            # the last year is used for validation.
            # strategy = "none" means that no data is reserved for validation => used for training the
            # final production model, whereas first/last allow us to assess model performance during
            # the model design and tuning phases.
            df_train, df_test, df_val = m.train_test_split_based_on_year(
                df, val_strategy=strategy, random_train_test=random_split
            )
            print(f"    training sample size = {len(df_train)}")
            print(f"    testing sample size = {len(df_test)}")
            print(f"    eval sample size = {len(df_val)}")

            # this for loop is for running different parameter sets in HPO
            for params in [{}]:

                # Instantiating the model also does .fit: this will load the model if it already
                # exists and overwrite=False, and fit the model if overwrite=True or the model
                # does not exist.
                model = model_class(
                    realm=realm,
                    df_train=df_train,
                    df_test=df_test,
                    output_folder="s3://carbonplan-climatetrace/v2.1/models/",  # v1 or v2
                    overwrite=overwrite,
                    validation_year=strategy,
                    params=params,
                )

                # do model evaluation on each split of the data: train, test, and validation
                for split, sub in zip(("train", "test", "val"), (df_train, df_test, df_val)):
                    # validation data can be empty if val strategy = 'none'
                    if len(sub) > 0:
                        model_score = model.evaluate(sub)
                        model_score["model_name"] = model.name
                        model_score["split"] = split
                        model_score["realm"] = realm
                        model_score["validation_year"] = strategy
                        model_score["random_split"] = random_split
                        model_score["sample_size"] = len(sub)
                        model_score.update(params)
                        scores.append(model_score)

                # predictions are stored on the frames for the scatter plots below
                df_train["biomass_pred"] = model.predict(df_train)
                df_test["biomass_pred"] = model.predict(df_test)

            # plot the prediction result
            plt.figure(figsize=(10, 4.5))
            plt.subplot(1, 2, 1)
            plot_scatter(df_train, title=f"{realm} train samples")
            plt.subplot(1, 2, 2)
            plot_scatter(df_test, title=f"{realm} test samples")
            # model type and validation strategy are part of the file name so that
            # every (realm, model, strategy) combination keeps its own figure
            plt.savefig(f"{realm}_{model_tag}_{strategy}_model_scatter.png")
            plt.show()
            plt.close()

            # plotting feature importance if the model being trained is random forest
            if "rf" in model.name:
                plt.figure(figsize=(10, 4))
                plt.title(f"{realm} feature importance")
                # bars are placed at every other integer position to leave room for the labels
                xticks = np.arange(len(m.features)) * 2
                plt.bar(xticks, model.model.feature_importances_)
                plt.xticks(ticks=xticks, labels=m.features, rotation="vertical")
                plt.savefig(f"{realm}_{strategy}_feature_imp.png")
                plt.show()
                plt.close()
            # TODO: plot something else if we're training the xgboost model

# one row per evaluated (model, split) combination
scores = pd.DataFrame(scores)

In [None]:
# Display the table of evaluation scores collected in the training loop.
scores

# selecting only the rows of a single split (here: the validation split)
# scores.loc[scores.split == 'val']

# doing weighted average of the scores
# (scores.loc[scores.split == 'test'].r2 * scores.loc[scores.split == 'test'].sample_size).sum() / scores.loc[scores.split == 'test'].sample_size.sum()

In [None]:
# Sample-size-weighted mean r2 per data split, reported separately for runs
# that used a random train/test split and runs that split by year.
# The loop variable is deliberately not named `random_split`, so the
# configuration flag of that name set in the training cell is not clobbered.
for use_random_split in [True, False]:
    print(use_random_split)
    for split_name, label in (
        ("val", "validation score = "),
        ("test", "testing score    = "),
        ("train", "training score   = "),
    ):
        sub = scores.loc[
            (scores.split == split_name) & (scores.random_split == use_random_split)
        ]
        total_samples = sub.sample_size.sum()
        if total_samples == 0:
            # No rows for this combination (e.g. no validation data when the
            # validation strategy is "none"); avoid computing 0 / 0.
            print(f"{label}n/a (no samples)")
            continue
        print(f"{label}{(sub.r2 * sub.sample_size).sum() / total_samples}")

In [None]:
# Sample-size-weighted mean r2 per data split, reported separately for each
# validation strategy ("first" or "last" year held out for validation).
for validation_year in ["first", "last"]:
    print(validation_year)
    for split_name, label in (
        ("val", "validation score = "),
        ("test", "testing score    = "),
        ("train", "training score   = "),
    ):
        sub = scores.loc[
            (scores.split == split_name) & (scores.validation_year == validation_year)
        ]
        total_samples = sub.sample_size.sum()
        if total_samples == 0:
            # No rows for this combination (e.g. the strategy was not run);
            # avoid computing 0 / 0.
            print(f"{label}n/a (no samples)")
            continue
        print(f"{label}{(sub.r2 * sub.sample_size).sum() / total_samples}")