In [None]:
!pip install tenacity

In [None]:
from collections import defaultdict
from itertools import chain
import json
import os
from time import time

from dask.distributed import Client
import fsspec
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from shapely.ops import cascaded_union
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import RadiusNeighborsClassifier

## And a bunch of carbonplan dependencies
from carbonplan_data import cat as core_cat

from carbonplan_forest_offsets.analysis.assign_project_fldtypcd import load_classification_data
from carbonplan_forest_offsets.data import cat, get_retro_bucket, get_temp_bucket
from carbonplan_forest_offsets.load.geometry import (
    get_overlapping_states,
    load_supersections,
)
from carbonplan_forest_offsets.load.project_db import load_project_db
from carbonplan_forest_offsets.utils import aa_code_to_ss_code


# Notebook-wide matplotlib defaults: larger base font, and keep text as text
# (not paths) when figures are saved as SVG.
plt.rcParams["font.size"] = 14
plt.rcParams["svg.fonttype"] = "none"

In [None]:
def get_aoi(ss_ids):
    """Build a single buffered area-of-interest geometry for a set of supersections.

    The supersections whose ``ss_id`` is in ``ss_ids`` are reprojected to the
    CRS of the CONUS NLCD raster, dissolved into one geometry, buffered by
    150_000 CRS units (presumably metres -- TODO confirm against the NLCD CRS)
    and returned in EPSG:4326.

    Parameters
    ----------
    ss_ids : iterable
        Supersection ids to select (matched against the ``ss_id`` column).

    Returns
    -------
    The single dissolved-and-buffered geometry, in EPSG:4326.
    """
    # Only the CRS of the NLCD raster is needed; the raster itself is not read here.
    nlcd = core_cat.nlcd.raster(region="conus").to_dask()
    target_crs = nlcd.attrs["crs"]

    all_supersections = load_supersections().to_crs(target_crs)
    is_selected = all_supersections["ss_id"].isin(ss_ids)
    selected = all_supersections[is_selected].copy()

    # A constant column lets `dissolve` merge every selected row into one geometry.
    selected.loc[:, "dissolve_all"] = 1
    merged = selected.dissolve(by="dissolve_all")

    buffered = merged.buffer(150_000)
    # `.item()` unwraps the one remaining geometry (it raises if there is not exactly one).
    return buffered.to_crs("epsg:4326").item()

In [None]:
def species_array_to_d(species_array):
    """Convert a list of species records into a ``{code: fraction}`` mapping.

    Each record must provide ``"code"`` and ``"fraction"`` entries. Codes are
    stringified and fractions rounded to 4 decimal places; if a code appears
    more than once, the last record wins.
    """
    fractions_by_code = {}
    for record in species_array:
        key = str(record["code"])
        fractions_by_code[key] = round(record["fraction"], 4)
    return fractions_by_code


def load_data(ss_ids):
    """Load the classification data covering the given supersections.

    Parameters
    ----------
    ss_ids : sequence of int
        Supersection ids. A single id greater than 200 is routed to the
        ``"ak"`` data set (NOTE(review): presumably ids above 200 are the
        Alaska supersections -- confirm). Any other input is resolved to the
        states overlapping the buffered area of interest built by ``get_aoi``.

    Returns
    -------
    Whatever ``load_classification_data`` returns. The rest of this notebook
    reads its ``"features"``, ``"targets"`` and ``"dictvectorizer"`` entries.

    Raises
    ------
    ValueError
        If ``ss_ids`` is empty.
    """
    # Fail early and clearly instead of with an IndexError on ``ss_ids[0]``.
    if len(ss_ids) == 0:
        raise ValueError("ss_ids must contain at least one supersection id")

    # Logical `and` (not bitwise `&`) so the index is only read when it is safe.
    if len(ss_ids) == 1 and ss_ids[0] > 200:
        return load_classification_data(["ak"])

    # Reuse the notebook's `get_aoi` helper rather than duplicating its body.
    aoi = get_aoi(ss_ids)
    postal_codes = get_overlapping_states(aoi)
    print(f"preparing to load: {[x for x in postal_codes]}")
    return load_classification_data(postal_codes, aoi=aoi)


def prepare_regional_classifier(data):
    """Tune a radius-neighbors classifier on ``data`` and return it with the vectorizer.

    A 5-fold grid search over the neighbor radius is run on
    ``data["features"]`` / ``data["targets"]`` using joblib's "dask" backend,
    and the best configuration is refit on the full data.

    Parameters
    ----------
    data : mapping
        Must provide ``"features"``, ``"targets"`` and ``"dictvectorizer"``.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, data["dictvectorizer"])`` so the pair can be
        applied to multiple opr_ids.
    """
    # initial testing never yielded a case where we went above 0.5
    radius_grid = [{"radius": np.arange(0.15, 0.651, 0.025)}]
    estimator = RadiusNeighborsClassifier(
        weights="distance", algorithm="brute", outlier_label=-999
    )

    print("doing GridSearch ")

    search = GridSearchCV(estimator, radius_grid, cv=5, refit=True, verbose=10)
    t0 = time()
    with joblib.parallel_backend("dask"):
        search.fit(data["features"], data["targets"])
    elapsed = time() - t0
    print("search took %.2f seconds" % elapsed)

    return search, data["dictvectorizer"]

In [None]:
# Start a dask.distributed client with one thread per worker. The joblib
# "dask" backend used in `prepare_regional_classifier` needs a client to exist.
client = Client(threads_per_worker=1)
# Bare expression so the notebook displays the client's repr.
client

In [None]:
# Read the per-supersection neighbor radii from the temp bucket.
# `radii` maps a supersection id (string JSON key) to the radius used below.
fs_prefix, fs_kwargs = get_temp_bucket()
fn = f"{fs_prefix}/radius_neighbor_params.json"
with fsspec.open(fn, mode="r", **fs_kwargs) as f:
    radii = json.loads(f.read())

In [None]:
# Accumulates the F-scores per supersection id. It lives in its own cell so
# the scoring loop can be re-run and skip ids that are already in `store`;
# re-running this cell discards every score collected so far.
store = {}

In [None]:
# Fixed seed for the train/test split so the persisted F-scores are
# reproducible across kernel restarts (the split was previously unseeded).
SPLIT_SEED = 0

for ss_id, radius in radii.items():
    # `store` acts as a resume cache: ids scored in a previous run of this
    # cell are not recomputed.
    if ss_id in store:
        continue

    print(f"scoring {ss_id}...")

    # `radii` has string keys (it was read from JSON); load_data takes ints.
    data = load_data([int(ss_id)])

    # Stratified 80/20 hold-out split on the target labels.
    X_train, X_test, y_train, y_test = train_test_split(
        data["features"],
        data["targets"],
        stratify=data["targets"],
        test_size=0.2,
        random_state=SPLIT_SEED,
    )

    # Same classifier settings as the grid search in
    # `prepare_regional_classifier`, with the radius fixed to the stored value.
    clf = RadiusNeighborsClassifier(
        weights="distance", algorithm="brute", outlier_label=-999, radius=radius
    )
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    # Stored order is (weighted, micro, macro) F1.
    scores = (
        f1_score(y_test, preds, average="weighted"),
        f1_score(y_test, preds, average="micro"),
        f1_score(y_test, preds, average="macro"),
    )

    store[ss_id] = scores

In [None]:
# Refuse to write partial results. A bare `raise` here had no active exception
# and only produced "No active exception to reraise"; say what is wrong instead.
if len(radii) != len(store):
    raise RuntimeError(
        f"only {len(store)} of {len(radii)} supersections have been scored; "
        "re-run the scoring cell before writing results"
    )

# Map supersection id -> name. Ids are stringified to match the JSON-derived
# keys of `store`.
ss_names = load_supersections(include_ak=False).set_index("ss_id")["SSection"].to_dict()
ss_names = {str(k): v for k, v in ss_names.items()}

# Ids without a name (the table is loaded with include_ak=False) keep their
# numeric string key.
renamed_store = {ss_names.get(k, k): v for k, v in store.items()}

# NOTE(review): 285-287 are presumably the Alaska supersections; 285 and 286
# are dropped and 287 is reported under a combined name -- confirm intent.
del renamed_store["285"]
del renamed_store["286"]
renamed_store["Southeast and South Central Alaska"] = renamed_store.pop("287")

# `get_retro_bucket` comes from carbonplan_forest_offsets.data (see imports).
fs_prefix, fs_kwargs = get_retro_bucket()
fn = f"{fs_prefix}/reclassification/classifier_fscores.json"

with fsspec.open(fn, mode="w", **fs_kwargs) as f:
    json.dump(renamed_store, f, indent=2)