# Set up for model experiments

We'll do the following here:

- Create distinct catalogs
- Document the parameter changes that will accompany each
- Create yamls for each experiment

### Catalogs

In [10]:
import pandas as pd
from pathlib import Path
import yaml
import os
import re
import glob 
import geopandas as gpd

In [11]:
# Load the combined catalog and save its FTW-only rows as a separate file.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")
(
    catalog
    .loc[lambda d: d["dataset"] == "ftw"]
    .to_csv("../data/ftw-catalog.csv", index=False)
)

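Update: the catalog is adjusted to drop Portugal and presence-only data (`null_prop > 0`) from the validation and test splits. Both the FTW-only and the combined catalogs are re-written with this filter applied.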

In [None]:
# Reload the combined catalog and normalise the columns the filter relies on.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")

# 22 records in India had no split assigned--placed them in train
catalog["split"] = catalog["split"].replace("none", "train")

# Lower-case and strip the text columns so the comparisons below are exact.
for col in ("split", "country"):
    catalog[col] = catalog[col].astype(str).str.strip().str.lower()
# Anything non-numeric in null_prop becomes NaN (and so never satisfies "> 0").
catalog["null_prop"] = pd.to_numeric(catalog["null_prop"], errors="coerce")

# Keep everything except validate/test rows that have null_prop > 0
# or come from Portugal.
in_eval_split = catalog["split"].isin(["validate", "test"])
has_null_prop = catalog["null_prop"] > 0
from_portugal = catalog["country"] == "portugal"
mask = ~(in_eval_split & (has_null_prop | from_portugal))
catalogr = catalog[mask].copy().reset_index(drop=True)

# FTW and full catalogs dropping Portugal and presence-only data from val/test
catalogr.query("dataset == 'ftw'").to_csv("../data/ftw-catalog2.csv", index=False)
catalogr.to_csv("../data/ftw-mappingafrica-combined-catalog2.csv", index=False)

# Small FTW catalog (fixed-seed sample of 10000 rows) for rapid model testing
ftw_small = (
    catalogr
    .query("dataset == 'ftw'")
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)
ftw_small.to_csv("../data/ftw-catalog-small.csv", index=False)


Make another small catalog for debugging, including null prop > 0 in validation data

In [None]:
# Draw a fixed-seed sample of 1000 rows from the unfiltered FTW catalog
# and save it as the debugging catalog.
ftw_catalog = pd.read_csv("../data/ftw-catalog.csv")
ftw_small_debug = ftw_catalog.sample(n=1000, random_state=42)
ftw_small_debug = ftw_small_debug.reset_index(drop=True)
ftw_small_debug.to_csv("../data/ftw-catalog-small-debug.csv", index=False)

Mapping Africa only

In [None]:
# Reload the combined catalog and save only the Mapping Africa rows.
catalog = pd.read_csv("../data/ftw-mappingafrica-combined-catalog.csv")
is_mappingafrica = catalog["dataset"] == "mappingafrica"
catalog[is_mappingafrica].to_csv("../data/mappingafrica-catalog.csv", index=False)

Catalogs for inference (testing), chips and tiles

In [7]:

# Root folder holding the tile COGs; the selection below expects each image
# to sit in a folder named after its year (e.g. 2024/tile910217_..._cog.tif).
tiles_root = "/Users/lestes/images/tiles"

# Recursively collect every *cog.tif under the root, as paths relative to it.
tiles_list = [
    os.path.relpath(f, tiles_root)
    for f in glob.glob(os.path.join(tiles_root, "**", "*"), recursive=True)
    if os.path.isfile(f) and f.endswith("cog.tif")
]

# tile numbers of interest
tile_ids = ["910217", "910218", "513706", "513726", "765503"]

# For each tile id keep only the image from the most recent year folder.
tile_dict = {}
for f in tiles_list:
    basename = os.path.basename(f)
    for tid in tile_ids:
        if not basename.startswith(f"tile{tid}"):
            continue
        try:
            year_int = int(os.path.dirname(f))
        except ValueError:
            continue  # skip if the containing folder is not a year
        # Keep the file if it's the first seen or has a greater year
        if tid not in tile_dict or year_int > int(os.path.dirname(tile_dict[tid])):
            tile_dict[tid] = f
tiles_selected = list(tile_dict.values())

# File names look like tile<ID>_<YYYY-MM>_..._cog.tif: the first "_" field
# gives the tile name, the second the year-month, to which day 15 is appended.
# Single quotes inside the f-string keep this valid on Python < 3.12, and the
# explicit columns keep sort_values from failing when no tiles are found.
tile_catalog = (
    pd.DataFrame(
        [
            {
                "name": re.sub("tile", "", os.path.basename(f).split("_")[0]),
                "date": f"{os.path.basename(f).split('_')[1]}-15",
                "window_b": f,
            }
            for f in tiles_selected
        ],
        columns=["name", "date", "window_b"],
    )
    .sort_values(by=["name", "date"])
    .reset_index(drop=True)
)

tile_catalog


Unnamed: 0,name,date,window_b
0,513706,2023-11-15,2023/tile513706_2023-11_buf179_cog.tif
1,513726,2023-11-15,2023/tile513726_2023-11_buf179_cog.tif
2,765503,2022-03-15,2022/tile765503_2022-03_buf179_cog.tif
3,910217,2024-06-15,2024/tile910217_2024-06_buf179_cog.tif
4,910218,2024-06-15,2024/tile910218_2024-06_buf179_cog.tif


In [8]:
# Read the tile grids for the three countries, keeping only the tile id and
# geometry columns, and stack them into one frame.
pth = "~/Dropbox/projects/activelearning/mappingafrica/campaigns/data/grids"
country_grids = []
for cntry in ["ghana", "congo", "zambia"]:
    grid = gpd.read_file(f"{pth}/{cntry}_tiles.geojson")
    country_grids.append(grid[["tile", "geometry"]])
tiles = pd.concat(country_grids, axis=0)

# Tile ids are compared as strings (via int, which drops any decimal part).
tiles["tile"] = tiles["tile"].astype(int).astype(str)
tiles = tiles[tiles["tile"].isin(tile_ids)].reset_index(drop=True)

# Attach geometries to the tile catalog and write the result as GeoJSON.
tiles_merged = tile_catalog.merge(
    tiles, left_on="name", right_on="tile", how="inner"
).drop(columns=["tile"])
tiles_gdf = gpd.GeoDataFrame(tiles_merged, geometry="geometry", crs="EPSG:4326")
tiles_gdf.to_file(
    "../data/mappingafrica-tile-catalog-small.geojson", driver="GeoJSON"
)


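Make a "long" FTW catalog in which each record appears twice: once with its original `window_b`, and once with its `window_a` entry placed under `window_b`, so the model can learn from both time points. Then make a new combined catalog from it.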



In [18]:
# NOTE(review): `catalog` is whatever an earlier cell left bound (the most
# recent read is the unmodified combined CSV) -- confirm that is intended.
ftw_catalog = catalog[catalog["dataset"] == "ftw"]

# Second copy of the FTW rows with the window_a entry moved under window_b.
ftw_catalog2 = ftw_catalog.assign(window_b=ftw_catalog["window_a"])

# Stack both copies, then blank window_a so only window_b carries a value.
ftw_catalog_long = pd.concat([ftw_catalog, ftw_catalog2], axis=0)
ftw_catalog_long = ftw_catalog_long.reset_index(drop=True)
ftw_catalog_long["window_a"] = ""

# Long FTW catalog
ftw_catalog_long.to_csv("../data/ftw-catalog-long.csv", index=False)

# Long combined: long FTW rows followed by the Mapping Africa rows
mappingafrica_rows = catalog[catalog["dataset"] == "mappingafrica"]
catalog_long = pd.concat([ftw_catalog_long, mappingafrica_rows], axis=0)
catalog_long.to_csv(
    "../data/ftw-mappingafrica-combined-catalog-long.csv", index=False
)

Next, two filtered versions of the long catalog. `long2` drops all presence-only data from the validation and test splits but keeps Portugal; `long3` drops both presence-only data and Portugal from those splits, for consistency with the earlier `catalog2` files.

In [None]:
# NOTE(review): split/country values are compared as-is here (no strip/lower
# normalisation in this cell) -- confirm catalog_long already holds clean values.
eval_rows = catalog_long["split"].isin(["validate", "test"])
null_prop_rows = catalog_long["null_prop"] > 0

# long2: drop validate/test rows with null_prop > 0, keep Portugal.
mask = ~(eval_rows & null_prop_rows)
catalogr = catalog_long[mask].copy().reset_index(drop=True)
print(len(catalogr), len(catalog_long))

catalogr.to_csv(
    "../data/ftw-mappingafrica-combined-catalog-long2.csv", index=False
)
catalogr.query("dataset == 'ftw'").to_csv(
    "../data/ftw-catalog-long2.csv", index=False
)

# long3: additionally drop Portugal from validate/test.
portugal_rows = catalog_long["country"] == "portugal"
mask = ~(eval_rows & (null_prop_rows | portugal_rows))
catalogr = catalog_long[mask].copy().reset_index(drop=True)
print(len(catalogr), len(catalog_long))

catalogr.to_csv(
    "../data/ftw-mappingafrica-combined-catalog-long3.csv", index=False
)
catalogr.query("dataset == 'ftw'").to_csv(
    "../data/ftw-catalog-long3.csv", index=False
)

catalogr

## Experiments

### Functions

In [22]:
def write_yaml(template_path: str, output_path: str, updates: dict = None):
    """
    Write a YAML file from a template file, with optional updates.

    The template is loaded, `updates` is merged into it key by key (nested
    dicts are merged recursively; any other value replaces the template's
    value), and the result is dumped in block style with key order preserved.
    Lists made up only of scalars are written inline (``[a, b]``); other
    lists are written one item per line.

    Args:
        template_path (str): Path to the base YAML template file.
        output_path (str): Path to the output YAML file.
        updates (dict, optional): Dictionary of keys/values to update.
            The dict itself is not modified, but its values are placed into
            the merged config by reference.
    """

    def recursive_update(d, u):
        # Merge u into d in place, descending only where both sides are dicts.
        for k, v in u.items():
            if isinstance(v, dict) and isinstance(d.get(k), dict):
                recursive_update(d[k], v)
            else:
                d[k] = v

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(template_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty template; start from {} then.
        config = yaml.safe_load(f) or {}
    if updates:
        recursive_update(config, updates)

    class IndentDumper(yaml.SafeDumper):
        # Never emit "indentless" sequences, so list items nested under a
        # mapping key are indented relative to that key.
        def increase_indent(self, flow=False, indentless=False):
            return super().increase_indent(flow, False)

    # custom representer for lists
    def represent_list(dumper, data):
        # flow style only if all elements are scalars
        all_scalars = all(
            isinstance(x, (str, int, float, bool, type(None))) for x in data
        )
        return dumper.represent_sequence("tag:yaml.org,2002:seq", data,
                                         flow_style=all_scalars)

    IndentDumper.add_representer(list, represent_list)

    # allow_unicode=True writes non-ASCII characters unescaped, so the file
    # must be opened as UTF-8 to avoid encoding errors on other defaults.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            config,
            f,
            Dumper=IndentDumper,
            default_flow_style=False,  # keep dicts block-style
            sort_keys=False,
            indent=2,
            allow_unicode=True
        )


### Single time point

#### FTW baseline, FTW catalog

All experiments here are FTW baseline model, window B only

##### Setup

Below we set up a yaml for each experiment. Provide the following:

- `cfg_name`: name of the config/experiment file (without .yaml)
- `update`: dictionary of changes to make to the base config

Also define a global `home_dir` for the path to the repo containing the catalog. That's done once in the first cell. 

In [12]:
# Root of the checkout that holds the catalogs, and the settings every
# experiment in this section starts from (the filtered FTW catalog).
home_dir = "~/projects"
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/ftw-catalog2.csv"
            ),
        }
    }
}

##### # 1 Default

With updated optimal weights added

In [None]:
cfg_name = "ftwbaseline-exp1"
# Base settings plus an experiment-specific output directory.
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml(
    template_path="../configs/template-hpc-config.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

##### # 2 Locally-weighted tversky focal loss


In [93]:
cfg_name = "ftwbaseline-localtversky-exp2"
# Base settings, an experiment-specific output directory, and the
# "localtversky" loss.
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml(
    template_path="../configs/template-hpc-config.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

##### # 3 Local min-max normalization, across bands


In [94]:
cfg_name = "ftwbaseline-minmax_lab-exp3"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "lab",
            "global_stats": None,
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}

write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 3a Local min-max normalization, across bands, 1% clip

In [19]:
cfg_name = "ftwbaseline-minmax_lab-exp3a"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "lab",
            "img_clip_val": 1,
            "global_stats": None,
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}

write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 4 Global min-max normalization, across bands

Using the 1st and 99th percentiles as min-max. 


In [18]:
cfg_name = "ftwbaseline-minmax_gab-exp4"
# The overrides are merged into a fresh nested dict so base_update is left
# untouched (base_update.copy() is shallow). global_stats is given in full
# here, with mean/std explicitly set to None, so no prior lookup or popping
# of an existing global_stats entry is needed -- that lookup also failed with
# AttributeError whenever an earlier cell had left global_stats=None behind.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "gab",
            "global_stats": {"min": [68.438525], "max": [5772.288821],
                             "mean": None, "std": None},
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)


##### # 4a Per band Z-value, global stats

In [20]:
cfg_name = "ftwbaseline-zvalue_gpb-exp4a"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_stat_procedure": "gpb",
            # One mean/std value per band (four values each).
            "global_stats": {
                "mean": [874.538, 876.9152, 641.7087, 2925.554],
                "std": [759.7574, 648.3951, 619.3338, 1083.3489],
            },
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}

write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 5 - photometric augmentations

In [95]:
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-photometric-exp5"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "aug_list": augs + ["brightness", "contrast", "gaussian_noise"],
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 6 - satslidemix

In [96]:
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-satslide-exp6"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "aug_list": augs + ["satslidemix"],
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 7 - rescale

In [97]:
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-rescale-exp7"
# base_update.copy() is shallow: writing into update["data"] would also change
# base_update for every later cell and discard its catalog entry, so the
# overrides are merged into a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "aug_list": augs + ["rescale"],
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 8 - Tversky and min-max GAB

In [13]:
cfg_name = "ftwbaseline-localtversky-minmax_gab-exp8"
# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these normalization settings into
# base_update and leak them into later experiments; build a fresh nested dict.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "gab",
            "global_stats": {"min": [68.438525], "max": [5772.288821],
                             "mean": None, "std": None},
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### FTW Baseline, full catalog

Setting up runs that will pull from the full catalog, starting with the FTW baseline model.

We'll also specify some validation sets to evaluate results. To start we will use the separated FTW and Mapping Africa "global" validation samples, without confining them to specific countries. 

##### # 1 Standard settings

In [116]:
cfg_name = "fullcat-ftwbaseline-exp1"
# From here on the shared base settings point at the filtered combined catalog.
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/"
                "ftw-mappingafrica-combined-catalog2.csv"
            )
        }
    }
}
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml(
    template_path="../configs/template-hpc-config.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

##### # 2 LAB

In [120]:
cfg_name = "fullcat-ftwbaseline-exp2"
# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these normalization settings into
# base_update and leak them into later experiments; build a fresh nested dict.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "lab",
            "global_stats": None,
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### # 3 Locally-weighted tversky focal loss



In [14]:
cfg_name = "fullcat-ftwbaseline-exp3"
# Base settings, an experiment-specific output directory, and the
# "localtversky" loss.
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml(
    template_path="../configs/template-hpc-config.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

### FTW baseline, Mapping Africa

#### # 1 Baseline standard

In [122]:
cfg_name = "ma-ftwbaseline-exp1"
# Switch the shared base settings to the Mapping Africa catalog. Rebinding
# base_update (rather than assigning through a shallow copy of it) keeps the
# switch explicit for the Mapping Africa experiment that follows.
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/"
                "mappingafrica-catalog.csv"
            )
        }
    }
}
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### # 2 Min-max, lab

In [124]:
cfg_name = "ma-ftwbaseline-exp2"
# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these normalization settings into
# base_update and leak them into later experiments; build a fresh nested dict.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "lab",
            "global_stats": None,
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

### Mapping Africa Approximate U-Net

We will train a model close to the variant we used on Planet imagery for mapping countries in Africa, initially trained on just the Mapping Africa labels.

In [None]:
# Base settings for this section: the Mapping Africa catalog.
home_dir = "~/projects"
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/"
                "mappingafrica-catalog.csv"
            )
        }
    }
}

#### MA Approximate, MA catalog

In [20]:
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ma-approximate-baseline"

# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these settings into base_update
# itself; build a fresh nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "aug_list": augs + [
                "brightness", "contrast", "gaussian_noise", "rescale", "gamma"
            ],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "lab",
            "global_stats": None,
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {
        "init_args": {
            "class_weights": None,
            "loss": "localtversky",
            "backbone": "tu-vgg19_bn.tv_in1k",
            "patch_weights": True,
            "model_kwargs": {"drop_rate": 0.1},
        }
    },
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

#### MA Approximate, FTW catalog

In [13]:
cfg_name = "ftw-ma-approximate-exp1"
# Start from the ma-approximate-baseline config and change only the catalog
# (filtered FTW catalog) and the output directory.
update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/ftw-catalog2.csv"
            )
        }
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml(
    template_path="../configs/ma-approximate-baseline.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

### MA Approximate, Fullcat

In [15]:
cfg_name = "fullcat-ma-approximate-baseline-exp1"
# Start from the ma-approximate-baseline config and change only the catalog
# (filtered combined catalog) and the output directory.
update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/"
                "ftw-mappingafrica-combined-catalog2.csv"
            )
        }
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
}
write_yaml(
    template_path="../configs/ma-approximate-baseline.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

### Single time point, round 2

Re-running training for the full catalog and FTW only versions using the new long catalog.



#### FTW baseline, FTW catalog

In [23]:
# Base settings for round 2: the long FTW catalog (long2 variant).
home_dir = "~/projects"
base_update = {
    "data": {
        "init_args": {
            "catalog": (
                f"{home_dir}/ftw-mappingafrica-integration/data/"
                "ftw-catalog-long2.csv"
            ),
        }
    }
}

##### Locally-weighted tversky focal loss, redo

In [16]:
cfg_name = "ftwbaseline-localtversky-longcat"
# Base settings, an experiment-specific output directory, and the
# "localtversky" loss.
update = {
    **base_update,
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml(
    template_path="../configs/template-hpc-config.yaml",
    output_path=f"../configs/{cfg_name}.yaml",
    updates=update,
)

##### Tversky and min-max GAB, redo

In [21]:
cfg_name = "ftwbaseline-localtversky-minmax_gab-longcat"
# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these normalization settings into
# base_update and leak them into later experiments; build a fresh nested dict.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "gab",
            "global_stats": {"min": [68.438525], "max": [5772.288821],
                             "mean": None, "std": None},
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)

##### Tversky, min-max GAB, photometric augmentations

In [26]:
augs = ["rotation", "hflip", "vflip", "sharpness"]
cfg_name = "ftwbaseline-localtversky-minmax_gab-photometric-longcat"
# base_update.copy() is shallow, so calling .update() on
# update["data"]["init_args"] would write these settings (including aug_list)
# into base_update and leak them into any cell run afterwards; build a fresh
# nested dict instead.
update = {
    **base_update,
    "data": {
        **base_update["data"],
        "init_args": {
            **base_update["data"]["init_args"],
            "aug_list": augs + [
                "brightness", "contrast", "gaussian_noise", "gamma"
            ],
            "normalization_strategy": "min_max",
            "normalization_stat_procedure": "gab",
            "global_stats": {"min": [68.438525], "max": [5772.288821],
                             "mean": None, "std": None},
        },
    },
    "trainer": {"default_root_dir": f"~/working/models/{cfg_name}"},
    "model": {"init_args": {"loss": "localtversky"}},
}
write_yaml("../configs/template-hpc-config.yaml",
           f"../configs/{cfg_name}.yaml",
           updates=update)