In [None]:
%load_ext autoreload
%autoreload 2

# Hyperparameter tuning

## Compute

In [None]:
from omegaconf import OmegaConf
import pandas as pd

from src.constants import AOIS_TEST
from src.data import UNOSAT_S1TS_Dataset
from src.classification.model_factory import load_model
from src.classification.trainer import S1TSDD_Trainer

In [None]:
def extract_features(df, start, end, prefix=""):
    """Summarise each time series over a date window, one column set per band.

    Args:
        df: DataFrame whose columns can be label-sliced with ``start:end``
            (the original comment states they are datetimes) and whose index
            has a level named ``"band"`` containing ``"VV"`` and ``"VH"``.
        start: First column label of the window (inclusive label slice).
        end: Last column label of the window (inclusive label slice).
        prefix: Tag inserted in every output column name
            (e.g. ``pre``, ``post``, ``pre_3x3``).

    Returns:
        DataFrame indexed by the remaining index level(s) once ``band`` is
        dropped, with columns ``VV_{prefix}_{stat}`` followed by
        ``VH_{prefix}_{stat}`` for stat in mean, std, median, min, max,
        skew, kurt.
    """
    # Restrict the series to the requested window.
    window = df.loc[:, start:end]

    # Row-wise summary statistics; the tuple order fixes the column order.
    stat_names = ("mean", "std", "median", "min", "max", "skew", "kurt")
    stats = pd.DataFrame(index=window.index)
    for stat in stat_names:
        stats[stat] = getattr(window, stat)(axis=1)

    # One sub-frame per polarisation band, renamed as <band>_<prefix>_<stat>.
    per_band = []
    for band in ("VV", "VH"):
        band_stats = stats.xs(band, level="band")
        band_stats.columns = [f"{band}_{prefix}_{col}" for col in band_stats.columns]
        per_band.append(band_stats)

    return pd.concat(per_band, axis=1)


# Experiment configuration: model defaults plus the options handed to the dataset.
cfg = OmegaConf.create(
    {
        "aggregation_method": "mean",
        "model_name": "random_forest",
        "model_kwargs": {
            "n_estimators": 100,
            "n_jobs": 12,
        },
        "data": {
            # UKR5 .. UKR18 (UKR1-UKR4 are left out of the test AOIs);
            # alternative subset: ["UKR6", "UKR8", "UKR12", "UKR15"]
            "aois_test": [f"UKR{i}" for i in range(5, 19)],
            "damages_to_keep": [1, 2, 3],
            # other window sizes: ['1x1', '3x3', '5x5']
            "extract_winds": ["3x3"],
            # percentage of negative labels to add in training set (eg 0.1 for 10%)
            "random_neg_labels": 0.0,
            # NOTE(review): the sample tags below are carried over as written;
            # confirm against UNOSAT_S1TS_Dataset which period is treated as
            # positive and which as negative.
            "time_periods": [
                # negative samples
                {"pre": ("2021-02-24", "2022-02-23"), "post": ("2022-02-24", "2023-02-23")},
                # positive samples
                {"pre": ("2020-02-24", "2021-02-23"), "post": ("2021-02-24", "2022-02-23")},
            ],
        },
        "seed": 123,
        "run_name": None,
    }
)

# Build the dataset with the notebook-local feature extractor and fetch the
# two frames returned for the "test" split.
ds = UNOSAT_S1TS_Dataset(cfg.data, extract_features=extract_features)
df, df_test = ds.get_datasets("test")

# Feature columns are those whose name starts with a band prefix.
train_feature_cols = [name for name in df.columns if name.startswith(("VV", "VH"))]
test_feature_cols = [name for name in df_test.columns if name.startswith(("VV", "VH"))]

X = df.loc[:, train_feature_cols].values
y = df["label"].values
X_test = df_test.loc[:, test_feature_cols].values
y_test = df_test["label"].values

# Quick sanity check of the array shapes.
X.shape, y.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier

# Random-forest search space: 5 x 3 x 4 x 2 = 120 combinations.
param_grid = ParameterGrid(
    dict(
        n_estimators=[10, 20, 50, 100, 200],
        max_depth=[20, 40, None],
        # min_samples_split=[2, 4, 8, 16],  (currently not searched)
        min_samples_leaf=[1, 2, 4, 8],
        max_samples=[0.5, 1.0],
    )
)
len(param_grid)

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from src.classification.utils import aggregate_predictions
from tqdm import tqdm

# Grid search: fit one random forest per parameter combination, aggregate the
# per-row probabilities to one score per (aoi, unosat_id, label) group, and
# score the thresholded predictions.
#
# NOTE(review): the metrics are computed on the frame returned by
# ds.get_datasets("test"); confirm that selecting hyperparameters on this split
# is intended.
col_to_keep = ["aoi", "unosat_id", "orbit", "label"]
thresholds = [0.5, 0.6, 0.75]

d_results = []
for param in tqdm(param_grid):

    # Seed and parallelism are read from cfg (instead of repeating the literals)
    # so that the search stays in sync with the configuration cell.
    clf = RandomForestClassifier(
        **param,
        n_jobs=cfg.model_kwargs.n_jobs,
        random_state=cfg.seed,
    )
    clf.fit(X, y)

    df_preds = df_test[col_to_keep].copy()
    df_preds["preds_proba"] = clf.predict_proba(X_test)[:, 1]

    # Aggregate predictions with the method named in cfg.
    df_agg = (
        df_preds.groupby(["aoi", "unosat_id", "label"])
        .agg({"preds_proba": cfg.aggregation_method})
        .reset_index()
    )
    y_true = df_agg["label"]

    # Copy the parameters so the grid's dict is not mutated; parameter columns
    # come first in the output, followed by the metrics for each threshold.
    result = dict(param)
    for thresh in thresholds:
        y_pred_agg_thresh = (df_agg["preds_proba"] > thresh).astype(int)
        result[f"acc_{thresh:.2f}"] = accuracy_score(y_true, y_pred_agg_thresh)
        # zero_division=0 keeps the value the default ("warn") would return
        # but avoids a warning for every threshold with no positive prediction.
        result[f"precision_{thresh:.2f}"] = precision_score(y_true, y_pred_agg_thresh, zero_division=0)
        result[f"recall_{thresh:.2f}"] = recall_score(y_true, y_pred_agg_thresh)
        result[f"f1_{thresh:.2f}"] = f1_score(y_true, y_pred_agg_thresh, zero_division=0)
    d_results.append(result)

# One row per parameter combination; written relative to the working directory.
df_results = pd.DataFrame(d_results)
df_results.to_csv("results_rf.csv", index=False)

## Analyze

In [None]:
import pandas as pd
from src.constants import PROJECT_PATH
# Load cross-validated grid-search results.
# NOTE(review): this file is not the "results_rf.csv" written by the Compute
# section; presumably it comes from a separate CV run — confirm its origin.
df_results_cv = pd.read_csv(PROJECT_PATH / "results_rf_cv_all_data.csv")
# Optional filter to a single forest size:
#   df_results_cv = df_results_cv[df_results_cv["n_estimators"] == 100]

# Hyperparameter columns, then F1 and F0.5 (mean followed by std) per threshold.
cols = ['max_depth', 'max_samples', 'min_samples_leaf', 'n_estimators'] + [
    f"{metric}_{thresh:.2f}{suffix}"
    for metric in ("f1", "f0.5")
    for suffix in ("", "_std")
    for thresh in (0.5, 0.6, 0.75)
]

# Top 15 configurations ranked by F0.5 at threshold 0.50.
(
    df_results_cv
    .sort_values(by="f0.5_0.50", ascending=False)
    .loc[:, cols]
    .head(15)
)

In [None]:
# Load the results file for the number-of-trees sweep.
# NOTE(review): not produced by the Compute section of this notebook —
# confirm where "results_rf_cv_all_data_ntrees.csv" is generated.
df_results_cv = pd.read_csv(PROJECT_PATH / "results_rf_cv_all_data_ntrees.csv")
# Optional filter to a single forest size:
#   df_results_cv = df_results_cv[df_results_cv["n_estimators"] == 100]

# Forest size, then F1 and F0.5 (mean followed by std) per threshold.
cols = ['n_estimators'] + [
    f"{metric}_{thresh:.2f}{suffix}"
    for metric in ("f1", "f0.5")
    for suffix in ("", "_std")
    for thresh in (0.5, 0.6, 0.75)
]

# Top 15 rows ranked by F0.5 at threshold 0.50.
(
    df_results_cv
    .sort_values(by="f0.5_0.50", ascending=False)
    .loc[:, cols]
    .head(15)
)