In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS
from carbonplan_trace.v1 import load
import carbonplan_trace.v1.model as m
import pandas as pd
from carbonplan_trace.v1.landsat_preprocess import access_credentials
import numpy as np

In [None]:
# Obtain the access-key pair from the project helper; both values are passed
# to load.training() in the model-building loop further down.
access_key_id, secret_access_key = access_credentials()

In [None]:
# Realm names are the keys of REALM_GROUPINGS; the training loop builds one
# model per entry of this list.
realms = list(REALM_GROUPINGS.keys())

In [None]:
# HPO
import itertools


def product_dict(**kwargs):
    """Yield one dict per element of the Cartesian product of the keyword values.

    Each keyword argument maps a name to an iterable of candidate values. Every
    yielded dict has the same keys as ``kwargs`` and exactly one value picked
    from each iterable. With no keyword arguments a single empty dict is
    yielded.
    """
    names = list(kwargs)
    for combo in itertools.product(*kwargs.values()):
        yield dict(zip(names, combo))


# Candidate values per hyperparameter. Every list has three entries, so index i
# can be read as "setting i" of that parameter.
param_set = {
    "learning_rate": [0.07, 0.05, 0.03],
    "max_depth": [10, 12, 14],
    "colsample_bytree": [0.5, 0.7, 0.9],
    "subsample": [0.5, 0.7, 0.9],
    "min_child_weight": [2, 4, 6],
    "lambda": [1, 1.5, 2],
    "alpha": [0, 0.5, 1],
    "gamma": [0, 0.5, 1],
}

# Parameters inside one group are varied together (all take the same index),
# which shrinks the grid from a full product over 8 parameters to a product
# over 4 groups.
groupings = [
    ["learning_rate"],
    ["max_depth"],
    ["colsample_bytree", "subsample", "min_child_weight"],
    ["lambda", "alpha", "gamma"],
]

# One index axis per group, sized by the group's first parameter.
dims = [list(range(len(param_set[group[0]]))) for group in groupings]

# Expand every combination of group indices into a concrete parameter dict.
param_set_list = [
    {
        name: param_set[name][idx]
        for idx, group in zip(index_combo, groupings)
        for name in group
    }
    for index_combo in itertools.product(*dims)
]

In [None]:
def get_all_prediction_result(model, df_train, df_test, df_val):
    """Predict biomass for the train, test and validation frames and stack the results.

    Parameters
    ----------
    model
        Object exposing ``_predict(df)``; it is called once per frame, in the
        order train, test, val.
    df_train, df_test, df_val : pandas.DataFrame
        Frames containing at least the ``lat``, ``lon`` and ``year`` columns
        plus whatever ``model._predict`` reads.

    Returns
    -------
    pandas.DataFrame
        Rows of the three frames concatenated in the order train, test, val,
        with columns ``lat``, ``lon``, ``year`` and ``biomass`` (the
        prediction). Original index labels are kept.

    Notes
    -----
    The input frames are not modified: predictions are attached to a column
    subset via ``assign`` instead of being written back into the caller's
    frames as a ``biomass_pred`` column.
    """
    id_columns = ["lat", "lon", "year"]
    predicted_parts = [
        frame[id_columns].assign(biomass=model._predict(frame))
        for frame in (df_train, df_test, df_val)
    ]
    return pd.concat(predicted_parts)


def calculate_temporal_variability(df, y1=2007, y2=2008, precision=3):
    """Compare biomass at matching locations between two years.

    Rows of ``df`` for year ``y1`` and year ``y2`` are paired by an inner join
    on ``lat`` and ``lon`` rounded to ``precision`` decimals. ``df`` must carry
    the columns ``lat``, ``lon``, ``year`` and ``biomass``.

    Returns a dict with
    ``"mae"``: mean absolute difference (year 2 minus year 1) over the pairs,
    ``"me"``: mean signed difference over the same pairs.
    """

    def _rows_for(target_year):
        # Select one year and attach the rounded coordinates used as join keys.
        rows = df.loc[df.year == target_year, ["lat", "lon", "biomass"]]
        return rows.assign(
            lat_round=rows.lat.round(precision),
            lon_round=rows.lon.round(precision),
        )

    paired = _rows_for(y1).merge(
        _rows_for(y2), on=["lat_round", "lon_round"], suffixes=["_year1", "_year2"]
    )
    delta = paired.biomass_year2 - paired.biomass_year1

    return {"mae": delta.abs().mean(), "me": delta.mean()}

In [None]:
# Build one model per (model class, realm, validation strategy, params) and
# record its evaluation on every non-empty split. The records become the
# `scores` DataFrame read by the summary cells.
score_records = []  # kept separate so `scores` is only ever a DataFrame
random_split = True
reload = False
overwrite = False

for model_class in [m.random_forest_model]:  # m.xgb_model
    for realm in realms:
        print(f"Building model for {realm} realm")

        # load the training data for this realm
        df = load.training(
            realm=realm,
            reload=reload,
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
        )
        # DataFrame.size is an element count, not bytes; report the in-memory
        # footprint so that the "Gb" label is accurate.
        df_gb = df.memory_usage(deep=True).sum() / 1e9
        print(f"    size of entire df is {round(df_gb, 2)}Gb")

        for strategy in ["first", "last", "no"]:
            # split into train/test/val; `strategy` selects the validation
            # year handling and `random_split` the train/test assignment
            df_train, df_test, df_val = m.train_test_split_based_on_year(
                df, val_strategy=strategy, random_train_test=random_split
            )
            print(f"    training sample size = {len(df_train)}")
            print(f"    testing sample size = {len(df_test)}")
            print(f"    eval sample size = {len(df_val)}")

            # One model per parameter dict. `[{}]` passes an empty params dict;
            # presumably this is swapped for a list of parameter dicts when
            # running a hyperparameter search.
            # TODO: build linear model as another baseline model
            # other model classes: m.baseline_model, m.gradient_boost_model
            for params in [{}]:

                # NOTE(review): the constructor is assumed to fit (or load) the
                # model, since evaluate() is called right after — confirm in
                # carbonplan_trace.v1.model.
                model = model_class(
                    realm=realm,
                    df_train=df_train,
                    df_test=df_test,
                    output_folder="s3://carbonplan-climatetrace/v1/models/",
                    overwrite=overwrite,
                    validation_year=strategy,
                    params=params,
                )

                for split, sub in zip(("train", "test", "val"), (df_train, df_test, df_val)):
                    # a split can be empty (e.g. no validation rows); skip it
                    if len(sub) > 0:
                        model_score = model.evaluate(sub)
                        model_score["model_name"] = model.name
                        model_score["split"] = split
                        model_score["realm"] = realm
                        model_score["validation_year"] = strategy
                        model_score["random_split"] = random_split
                        model_score["sample_size"] = len(sub)
                        # record the hyperparameters alongside the metrics
                        model_score.update(params)
                        score_records.append(model_score)

scores = pd.DataFrame(score_records)

In [None]:
# Sample-size-weighted mean r2 per split, reported for each random_split value.
# NOTE: the loop variable reuses the name `random_split` that the training
# cell sets as a config flag, so it is left as False after this cell runs.
for random_split in [True, False]:
    print(random_split)
    for split_name, prefix in (
        ("val", "validation score = "),
        ("test", "testing score    = "),
        ("train", "training score   = "),
    ):
        sub = scores.loc[(scores.split == split_name) & (scores.random_split == random_split)]
        weighted_r2 = (sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()
        print(f"{prefix}{weighted_r2}")

In [None]:
# Sample-size-weighted mean r2 per split, reported for the "first" and "last"
# validation-year strategies (the "no" strategy is not summarised here).
for validation_year in ["first", "last"]:
    print(validation_year)
    for split_name, prefix in (
        ("val", "validation score = "),
        ("test", "testing score    = "),
        ("train", "training score   = "),
    ):
        sub = scores.loc[
            (scores.split == split_name) & (scores.validation_year == validation_year)
        ]
        weighted_r2 = (sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()
        print(f"{prefix}{weighted_r2}")

In [None]:
# Load saved temporal-variability results from the working directory. The
# cells that follow read its `model_name`, `random_split` and `mae` columns.
# NOTE(review): no cell shown here writes this CSV — presumably it was produced
# from calculate_temporal_variability() in an earlier run; confirm.
temporal_variability = pd.read_csv("temporal_variability.csv")

In [None]:
# Derive model type and realm from the model name: the text before the first
# "_" is stored as model_type and the token after it as realm.
name_tokens = [name.split("_") for name in temporal_variability.model_name]
temporal_variability["realm"] = [tokens[1] for tokens in name_tokens]
temporal_variability["model_type"] = [tokens[0] for tokens in name_tokens]

# Per-realm total sample size, taken from the random-split rows of models
# whose name starts with "xgb"; used as weights when averaging across realms.
sample_size = (
    scores.loc[scores.random_split.eq(True) & scores.model_name.str.startswith("xgb")]
    .groupby("realm")["sample_size"]
    .sum()
)

In [None]:
# weighted average
# Keep the rows whose random_split is not True and attach the per-realm sample
# size (left join, so realms without a weight get NaN and drop out of the sums).
merged = temporal_variability.loc[temporal_variability.random_split != True].merge(
    sample_size, how="left", on="realm"
)
# short model-type codes -> display names
name_dict = {
    "gb": "gradient boosting",
    "ground": "lidar derived",
    "rf": "random forest",
    "xgb": "xgboost",
}
# explicit lookup so that an unknown code raises KeyError instead of silently becoming NaN
merged["model_type"] = merged.model_type.apply(lambda x: name_dict[x])

print(
    "Biomass MAE between years 2007 and 2008 of the same location using different model architecture"
)
print("")
# The loop variables are deliberately not called `model`: that name holds the
# trained model object created by the training cell and must not be replaced by
# a string.
for model_type, group in merged.groupby("model_type"):
    weighted_mae = (group.mae * group.sample_size).sum() / group.sample_size.sum()
    print(
        model_type.ljust(20),
        np.round(weighted_mae, 4),
    )

In [None]:
# simple average
# Unweighted mean MAE per model type over the rows whose random_split is not
# True; the sample-size join mirrors the weighted-average cell.
(
    temporal_variability.loc[temporal_variability.random_split.ne(True)]
    .merge(sample_size, how="left", on="realm")
    .groupby("model_type")["mae"]
    .mean()
)

In [None]:
# Load a saved scores table (presumably hyperparameter-search results, judging
# by the file name) from the working directory.
# NOTE(review): this rebinds `scores`, so any earlier summary cell re-run after
# this one will read the CSV contents instead of the scores built in memory.
scores = pd.read_csv("HPO_1.csv")

In [None]:
# Mean test-split metrics per hyperparameter combination, sorted ascending by
# r2 so that the best combination is the last row. The four group keys are the
# first parameter of each entry in `groupings`, which identifies a combination
# because the other parameters in a group vary together with it.
# Reads `scores` (the HPO table loaded from HPO_1.csv), not `df`, which is the
# raw training frame left over from the training loop. numeric_only=True keeps
# string columns such as model_name from breaking the mean on pandas >= 2.0.
scores.loc[scores.split == "test"].groupby(
    ["learning_rate", "max_depth", "colsample_bytree", "lambda"]
).mean(numeric_only=True).sort_values(by="r2")

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# igbp_encoder = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore').fit(df_train[['igbp']])
#     # one hot encoding for igbp
#     encoded_igbp = igbp_encoder.transform(X[['igbp']])
#     X = X.drop(['igbp'], axis=1)
#     for i in range(encoded_igbp.shape[1]):
#         X[f'igbp_cat_{str(i+1)}'] = encoded_igbp[:, i]