In [None]:
from collections import defaultdict
from itertools import chain
import json
import os
from time import time

from dask.distributed import Client
import fsspec
import joblib
import numpy as np
import pandas as pd
from shapely.ops import cascaded_union
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import RadiusNeighborsClassifier

## And a bunch of carbonplan dependencies
from carbonplan_data import cat as core_cat

from carbonplan_forest_offsets.utils import aa_code_to_ss_code
from carbonplan_forest_offsets.data import cat, get_retro_bucket
from carbonplan_forest_offsets.analysis.assign_project_fldtypcd import load_classification_data
from carbonplan_forest_offsets.load.geometry import (
    get_overlapping_states,
    load_supersections,
)
from carbonplan_forest_offsets.load.project_db import load_project_db

In [None]:
def get_aoi(ss_ids):
    """Return a single buffered area-of-interest geometry for the given supersections.

    The supersections whose ``ss_id`` is in ``ss_ids`` are reprojected to the CRS
    of the CONUS NLCD raster, dissolved into one feature, buffered by 150_000
    (in that CRS's units -- presumably metres, TODO confirm), and converted to
    EPSG:4326.

    Parameters
    ----------
    ss_ids : iterable
        Supersection ids to include in the area of interest.

    Returns
    -------
    The one geometry left after dissolving, in EPSG:4326.
    """
    # The NLCD raster is opened only to read the CRS stored in its attributes.
    nlcd = core_cat.nlcd.raster(region="conus").to_dask()
    target_crs = nlcd.attrs["crs"]

    projected = load_supersections().to_crs(target_crs)

    selected = projected[projected["ss_id"].isin(ss_ids)].copy()
    # A constant column lets `dissolve` merge every selected row into one feature.
    selected.loc[:, "dissolve_all"] = 1

    merged = selected.dissolve(by="dissolve_all")
    buffered = merged.buffer(150_000)
    return buffered.to_crs("epsg:4326").item()

In [None]:
def species_array_to_d(species_array):
    """Map each species code (as a string) to its fraction rounded to 4 decimals.

    Every entry of ``species_array`` must provide ``"code"`` and ``"fraction"``
    keys. When a code appears more than once, the last entry wins.
    """
    fractions_by_code = {}
    for entry in species_array:
        # Resolve the key first so a malformed entry fails on "code" before "fraction".
        code_key = str(entry["code"])
        fractions_by_code[code_key] = round(entry["fraction"], 4)
    return fractions_by_code


def load_data(ss_ids):
    """Load the classification data used to train a classifier for ``ss_ids``.

    Parameters
    ----------
    ss_ids : sequence
        Supersection ids to load data for.

    Returns
    -------
    Whatever ``load_classification_data`` returns. Callers in this notebook read
    the ``"features"``, ``"targets"`` and ``"dictvectorizer"`` entries from it.

    Notes
    -----
    A single id greater than 200 loads the ``"ak"`` data with no spatial filter
    (presumably ids above 200 are the Alaska supersections -- TODO confirm).
    Otherwise data is loaded for every state overlapping the buffered area of
    interest built by ``get_aoi``.
    """
    # `and` short-circuits, so `ss_ids[0]` is only read when there is exactly one id;
    # the previous bitwise `&` evaluated it even for an empty `ss_ids`.
    if len(ss_ids) == 1 and ss_ids[0] > 200:
        return load_classification_data(["ak"])

    # Reuse the shared AOI helper instead of repeating its reproject/dissolve/buffer steps.
    aoi = get_aoi(ss_ids)

    postal_codes = get_overlapping_states(aoi)
    print(f"preparing to load: {[x for x in postal_codes]}")
    return load_classification_data(postal_codes, aoi=aoi)


def prepare_regional_classifier(data):
    """Grid-search a radius-neighbors classifier on ``data``.

    Parameters
    ----------
    data : mapping
        Must provide ``"features"`` and ``"targets"`` for fitting, and the
        ``"dictvectorizer"`` that is handed back to the caller.

    Returns
    -------
    tuple
        ``(search, dictvectorizer)``, where ``search`` is the fitted
        ``GridSearchCV`` (refit on the best radius) and ``dictvectorizer`` is
        ``data["dictvectorizer"]`` unchanged.
    """
    # initial testing never yielded a case where we went above 0.5
    candidate_radii = np.arange(0.15, 0.651, 0.025)
    estimator = RadiusNeighborsClassifier(weights="distance", algorithm="brute", outlier_label=-999)

    print(f"doing GridSearch ")

    search = GridSearchCV(estimator, [{"radius": candidate_radii}], cv=5, refit=True, verbose=10)

    t0 = time()
    # Run the cross-validation fits through joblib's "dask" backend.
    with joblib.parallel_backend("dask"):
        search.fit(data["features"], data["targets"])
    elapsed = time() - t0

    print("search took %.2f seconds" % elapsed)
    return search, data["dictvectorizer"]

In [None]:
# Start a dask.distributed Client with one thread per worker.
# NOTE(review): presumably this is the cluster that `joblib.parallel_backend("dask")`
# dispatches the grid search to -- confirm.
client = Client(threads_per_worker=1)
client  # bare last expression so the notebook displays the client

In [None]:
# Read the project database from the carbonplan_forest_offsets catalog. The rest of the notebook
# iterates it as a collection of project dicts ("opr_id", "supersection_ids", "assessment_areas").
project_db = cat.project_db_json.read()

In [None]:
# Maps supersection id -> (fitted grid search, data encoder).
clf_cache = {}  # kept in its own cell, outside the training cell, so re-running training doesn't reset it

In [None]:
# Train one classifier per supersection, then classify every eligible assessment area of every
# project that touches that supersection. Results are collected as
# classifications[opr_id]["(ssid, aa_code)"] -> {class label: probability}.

# Every supersection id referenced by at least one project.
supersection_ids = list(
    set(chain.from_iterable(project["supersection_ids"] for project in project_db))
)

classifications = defaultdict(dict)

# excluded projects; see Extended Methods.
excluded_opr_ids = {"CAR1094", "CAR1032", "ACR360", "CAR1102"}

# Built once up front: the lookup does not depend on the loop variables, so there is no need to
# rebuild it for every assessment area.
aa_to_ss = aa_code_to_ss_code()

for ssid in supersection_ids:
    print(ssid)

    # train the classifier
    data = load_data([ssid])
    clf, data_encoder = prepare_regional_classifier(data)
    clf_cache[ssid] = clf, data_encoder

    for project in project_db:
        if project["opr_id"] in excluded_opr_ids:
            continue

        if ssid not in project["supersection_ids"]:
            continue

        for aa in project["assessment_areas"]:
            # Classify when the assessment area has code 999 (classified under every supersection
            # of its project), or when it maps to this supersection and lists species.
            if (aa["code"] == 999) or (
                aa_to_ss.get(aa["code"], False) == ssid and aa["species"]
            ):
                feat_dict = species_array_to_d(aa["species"])
                feats = data_encoder.transform(feat_dict)
                classification = pd.Series(clf.predict_proba(feats).flatten(), index=clf.classes_)
                # Keep only classes with non-zero probability, ordered from least to most likely.
                classifications[project["opr_id"]][str((ssid, aa["code"]))] = (
                    classification[classification > 0].sort_values().to_dict()
                )

## Store some outputs

Store the `radius` hyperparameter selected by 5-fold cross-validation for each supersection's classifier.


In [None]:
# Best radius chosen by each cached grid search, keyed by supersection id.
fit_radii = {
    cached_ssid: search.best_params_["radius"]
    for cached_ssid, (search, _encoder) in clf_cache.items()
}

# Write the radii as JSON under the prefix returned by get_retro_bucket().
fs_prefix, fs_kwargs = get_retro_bucket()
fn = f"{fs_prefix}/results/radius_neighbor_params.json"
with fsspec.open(fn, mode="w", **fs_kwargs) as f:
    json.dump(fit_radii, f, indent=2)

In [None]:
# Write the collected classifications as JSON under the prefix returned by get_retro_bucket().
# `fs_kwargs` is forwarded to fsspec.open unchanged.
fs_prefix, fs_kwargs = get_retro_bucket()
fn = f"{fs_prefix}/results/classifications.json"
with fsspec.open(fn, mode="w", **fs_kwargs) as f:
    json.dump(classifications, f, indent=2)