# Set up for model experiments

We'll do the following here:

- Create distinct catalogs
- Document the parameter changes that will accompany each
- Create yamls for each experiment

## Initial experiments - FTW baseline model, FTW dataset

The first tests will be on a few different parameters/settings on the existing FTW baseline model, on just the FTW dataset (the full one). 

### Catalog

In [90]:
import pandas as pd
from pathlib import Path
import yaml

In [84]:
# Split the FTW rows out of the combined catalog and save them on their own.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")
catalog.loc[catalog["dataset"] == "ftw"].to_csv(
    "../data/ftw-catalog.csv", index=False
)

Update: the catalog will be adjusted to drop Portugal and presence-only data (`null_prop > 0`) from the validation and test sets.

In [None]:
# Rebuild the catalogs with Portugal and presence-only rows (null_prop > 0)
# removed from the validate/test splits, then draw a small FTW sample.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")

# Normalise the text columns before any comparison so that stray whitespace
# or capitalisation cannot defeat the exact-match tests below.
catalog['split'] = catalog['split'].astype(str).str.strip().str.lower()
catalog['country'] = catalog['country'].astype(str).str.strip().str.lower()
# Non-numeric null_prop values become NaN, which never satisfies `> 0`.
catalog['null_prop'] = pd.to_numeric(catalog['null_prop'], errors='coerce')

# 22 records in India had no split assigned--placed them in train.
# Done after normalisation so variants such as "None " are caught as well.
catalog["split"] = catalog["split"].replace("none", "train")

# Keep everything except validate/test rows that are either presence-only
# (null_prop > 0) or located in Portugal.
mask = ~(
    catalog['split'].isin(['validate', 'test'])
    & (
        (catalog['null_prop'] > 0) | (catalog['country'] == 'portugal')
    )
)
catalogr = catalog[mask].copy().reset_index(drop=True)

# FTW and full catalogs dropping Portugal and presence-only data from val/test
(catalogr.query("dataset == 'ftw'")
 .to_csv("../data/ftw-catalog2.csv", index=False))
(catalogr
 .to_csv("../data/ftw-mappingafrica-combined-catalog2.csv", index=False))

# small FTW for rapid model testing (fixed seed keeps the sample reproducible)
ftw_small = (
    catalogr
     .query("dataset == 'ftw'")
     .sample(10000, random_state=42)
).reset_index(drop=True)
ftw_small.to_csv("../data/ftw-catalog-small.csv", index=False)


Make another small catalog for debugging, including null prop > 0 in validation data

In [89]:
# Debug-sized catalog sampled from the unfiltered FTW catalog, so validation
# rows with null_prop > 0 can still be present.
ftw_catalog = pd.read_csv("../data/ftw-catalog.csv")
ftw_small_debug = ftw_catalog.sample(n=1000, random_state=42)
ftw_small_debug = ftw_small_debug.reset_index(drop=True)
ftw_small_debug.to_csv("../data/ftw-catalog-small-debug.csv", index=False)


### Config adjustor

In [91]:
def write_yaml(template_path: str, output_path: str, updates: dict = None):
    """
    Write a YAML file from a template file, with optional updates.

    The template is parsed with ``yaml.safe_load`` and ``updates`` is merged
    into it: where both sides hold a dict under the same key the merge
    recurses, otherwise the value from ``updates`` replaces the template's
    value. The merged config is then dumped to ``output_path`` with keys in
    their existing order, dicts in block style, and lists made up only of
    scalars in flow style (``[a, b, c]``).

    Args:
        template_path (str): Path to the base YAML template file.
        output_path (str): Path to the output YAML file.
        updates (dict, optional): Dictionary of keys/values to update.
    """

    def recursive_update(d, u):
        # Merge u into d in place, descending only where both hold dicts.
        for k, v in u.items():
            if isinstance(v, dict) and isinstance(d.get(k), dict):
                recursive_update(d[k], v)
            else:
                d[k] = v

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(template_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # An empty template parses to None; start from an empty mapping so the
    # updates can still be applied.
    if config is None:
        config = {}

    if updates:
        recursive_update(config, updates)

    class IndentDumper(yaml.SafeDumper):
        # Always pass indentless=False so block sequences are indented
        # beneath their parent key.
        def increase_indent(self, flow=False, indentless=False):
            return super().increase_indent(flow, False)

    # custom representer for lists
    def represent_list(dumper, data):
        # flow style only if all elements are scalars
        all_scalars = all(
            isinstance(x, (str, int, float, bool, type(None))) for x in data
        )
        return dumper.represent_sequence("tag:yaml.org,2002:seq", data,
                                         flow_style=all_scalars)

    IndentDumper.add_representer(list, represent_list)

    # allow_unicode=True writes non-ASCII characters unescaped, so the output
    # file must be opened as UTF-8 regardless of the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            config,
            f,
            Dumper=IndentDumper,
            default_flow_style=False,  # keep dicts block-style
            sort_keys=False,
            indent=2,
            allow_unicode=True
        )


### Experiments

All experiments here are FTW baseline model, window B only, on the FTW dataset.

Single parameter or no change:

1. FTW defaults (for comparison with FTW's results)
2. Locally-weighted tversky focal loss
3. min-max normalization, lab
4. min-max normalization, gab
5. photometric augmentation package
6. satslidemix
7. rescale

#### Setup

Below we set up a yaml for each experiment. Provide the following:

- `cfg_name`: name of the config/experiment file (without .yaml)
- `update`: dictionary of changes to make to the base config

Also define a global `home_dir` for the path to the directory containing the repo that holds the catalog. That's done once in the first cell.

#### # 1

In [92]:
# Experiment 1: FTW defaults. Only the catalog path and the output directory
# are overridden in the template.
home_dir = "~/projects"
cfg_name = "ftwbaseline-exp1"

# Catalog override that the following experiment cells start from.
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/"
                "ftw-mappingafrica-integration/data/ftw-catalog2.csv"
            ),
        }
    }
}

update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml(
    "../configs/template-hpc-config.yaml",
    f"../configs/{cfg_name}.yaml",
    updates=update,
)

#### # 2


In [93]:
# Experiment 2: swap the loss for the locally-weighted Tversky focal loss.
cfg_name = "ftwbaseline-localtversky-exp2"
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml(
    "../configs/template-hpc-config.yaml",
    f"../configs/{cfg_name}.yaml",
    updates=update,
)

#### # 3


In [94]:
# Experiment 3: min-max normalization with the "lab" statistics procedure.
cfg_name = "ftwbaseline-minmax_lab-exp3"
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
# base_update.copy() is shallow, so update["data"] is the very dict held by
# base_update. Assigning init_args inside it would rewrite base_update for
# every later cell and drop the catalog override from this config. Give the
# experiment its own "data" section that names the FTW catalog explicitly.
update["data"] = dict(
    init_args=dict(
        catalog=f"{home_dir}/"
            "ftw-mappingafrica-integration/data/ftw-catalog2.csv",
        normalization_strategy="min_max",
        normalization_stat_procedure="lab",
        global_stats=None,
    )
)

write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### # 4

Not yet run (need to calculate global stats)


#### # 5

In [95]:
# Experiment 5: add the photometric augmentations to the base list.
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-photometric-exp5"
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
# base_update.copy() is shallow, so writing into update["data"] would also
# rewrite base_update and lose the catalog override. Build a separate "data"
# section that names the FTW catalog explicitly instead.
update["data"] = dict(
    init_args=dict(
        catalog=f"{home_dir}/"
            "ftw-mappingafrica-integration/data/ftw-catalog2.csv",
        aug_list=augs + ["brightness", "contrast", "gaussian_noise"],
    )
)
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### # 6

In [96]:
# Experiment 6: add the satslidemix augmentation to the base list.
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-satslide-exp6"
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
# base_update.copy() is shallow, so writing into update["data"] would also
# rewrite base_update and lose the catalog override. Build a separate "data"
# section that names the FTW catalog explicitly instead.
update["data"] = dict(
    init_args=dict(
        catalog=f"{home_dir}/"
            "ftw-mappingafrica-integration/data/ftw-catalog2.csv",
        aug_list=augs + ["satslidemix"],
    )
)
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### # 7

## Full catalog

Setting up runs that will pull from the full catalog, starting with the FTW baseline model.

We'll also specify some validation sets to evaluate results. To start we will use the separated FTW and Mapping Africa "global" validation samples, without confining them to specific countries. 

In [98]:
# Full-catalog run 1: FTW baseline model trained on the combined catalog.
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "fullcat-ftwbaseline-exp1"
# The filtered full (FTW + Mapping Africa) catalog written earlier in this
# notebook is ftw-mappingafrica-combined-catalog2.csv; no
# mappingafrica-catalog2.csv is produced here, so point at the combined file.
base_update = dict(
    data=dict(
        init_args=dict(
            catalog=f"{home_dir}/"
                "ftw-mappingafrica-integration/data/"
                "ftw-mappingafrica-combined-catalog2.csv"
        )
    )
)
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

In [97]:
# Experiment 7 (FTW dataset): add the rescale augmentation to the base list.
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-rescale-exp7"
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
# Name the FTW catalog explicitly. base_update.copy() is shallow, so writing
# into update["data"] would rewrite base_update and lose the catalog, and
# when the notebook is run top to bottom base_update has already been
# redefined for the full-catalog runs by the time this cell executes.
update["data"] = dict(
    init_args=dict(
        catalog=f"{home_dir}/"
            "ftw-mappingafrica-integration/data/ftw-catalog2.csv",
        aug_list=augs + ["rescale"],
    )
)
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

## Mapping Africa equivalent

We will train a model close to the variant we used on Planet imagery for mapping countries in Africa, initially trained on just the Mapping Africa labels.

In [58]:
# Split the Mapping Africa rows out of the combined catalog and save them.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")
catalog.loc[catalog["dataset"] == "mappingafrica"].to_csv(
    "../data/mappingafrica-catalog.csv", index=False
)

In [79]:
# Parse the experiment-1 config into `config` so it can be inspected.
template_path = "../configs/ftwbaseline-exp1.yaml"
with Path(template_path).open('r') as f:
    config = yaml.safe_load(f)

In [82]:
# Mapping Africa approximate baseline: VGG19-BN backbone, local Tversky loss,
# extended augmentation list and min-max ("lab") normalization, trained on
# the Mapping Africa catalog only.
home_dir = "~/projects"
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ma-approximate-baseline"
base_update = dict(
    data=dict(
        init_args=dict(
            catalog=f"{home_dir}/"\
                "ftw-mappingafrica-integration/data/mappingafrica-catalog.csv"
        )
    )
)
update = base_update.copy()
update["trainer"] = dict(default_root_dir=f"~/working/models/{cfg_name}")
update["model"] = dict(
    init_args=dict(
        class_weights=None,
        loss="localtversky",
        backbone="tu-vgg19_bn.tv_in1k",
        patch_weights=True,
        model_kwargs=dict(drop_rate=0.1)
    )
)
# base_update.copy() is shallow, so update["data"] is the same dict object as
# base_update["data"]. Build a new "data" section that starts from the base
# init_args (keeping the catalog) rather than updating the shared dict in
# place, so base_update itself stays untouched.
update["data"] = dict(
    init_args=dict(
        base_update["data"]["init_args"],
        aug_list=augs + \
            ["brightness", "contrast", "gaussian_noise", "rescale", "gamma"],
        normalization_strategy="min_max",
        normalization_stat_procedure="lab",
        global_stats=None,
    )
)
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)