In [None]:
%load_ext autoreload
%autoreload 2
import fsspec
import numpy as np
import pandas as pd
import xarray as xr
from carbonplan.data import cat
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import matplotlib
from carbonplan_data import utils
from matplotlib import cm

from carbonplan_trace.v1 import utils as trace_utils
import os
from carbonplan_trace.v0.data import cat as trace_cat
from carbonplan_trace.v1 import glas_height_metrics as ht

# Plot styling. The font family is "sans-serif", so the preferred typeface has to be
# placed in the sans-serif list (setting `font.serif` has no effect for a sans-serif
# family). Helvetica Neue is put first and the existing fallbacks are kept so that
# machines without the font still render.
matplotlib.rc('font', family='sans-serif')
matplotlib.rcParams['font.sans-serif'] = ['Helvetica Neue'] + [
    name for name in matplotlib.rcParams['font.sans-serif'] if name != 'Helvetica Neue'
]
# use a real boolean rather than the string 'false'
matplotlib.rc('text', usetex=False)
matplotlib.rcParams.update({'font.size': 14, "svg.fonttype": "none"})

# import altair as alt
# alt.data_transformers.disable_max_rows()
# alt.themes.enable("carbonplan_light")

from gcsfs import GCSFileSystem
# Shared GCS filesystem handle used by the data loaders in this notebook (fs.ls / fs.open).
# NOTE(review): cache_timeout=0 presumably disables gcsfs listing caching -- confirm.
fs = GCSFileSystem(cache_timeout=0)


In [None]:
from datetime import datetime, timezone

# Open data

- from this study
- from lidar validation data sets (Margolis et al. 2015 and Neigh et al. 2015)


In [None]:
def open_study_data(tiles=None, min_lat=-90, max_lat=90, min_lon=-180, max_lon=180):
    """
    Open the preprocessed lidar zarr stores for the given tiles and concatenate them.

    Parameters
    ----------
    tiles : list of tile names (without the ``.zarr`` suffix). When empty or None,
        every non-directory-like entry listed under the preprocessed folder is used.
    min_lat, max_lat, min_lon, max_lon : bounding box handed to
        ``trace_utils.subset_data_for_bounding_box`` for each tile.

    Returns
    -------
    A dataset concatenated along ``unique_index`` (the stacked
    ``record_index`` x ``shot_number`` dimension), rechunked to 2000 entries.
    Tiles whose store raises ``KeyError`` when opened are reported and skipped.
    """
    folder = "gs://carbonplan-climatetrace/intermediates/preprocessed_lidar/"
    if not tiles:
        tiles = []
        for path in fs.ls(folder):
            if path.endswith("/"):
                continue
            basename = os.path.split(path)[-1]
            tiles.append(os.path.splitext(basename)[0])

    opened = []
    for uri in [f"{folder}{tile}.zarr" for tile in tiles]:
        try:
            tile_ds = xr.open_zarr(fsspec.get_mapper(uri), consolidated=True)
            # one entry per (record, shot); entries without a position are dropped
            tile_ds = tile_ds.stack(unique_index=("record_index", "shot_number"))
            tile_ds = tile_ds.dropna(dim="unique_index", how="any", subset=["lat", "lon"])
            tile_ds.attrs["crs"] = "EPSG:4326"
            tile_ds = trace_utils.subset_data_for_bounding_box(
                tile_ds, min_lat, max_lat, min_lon, max_lon
            )
            opened.append(tile_ds)
        except KeyError:
            print(f"did not find {uri}")

    combined = xr.concat(opened, dim="unique_index", data_vars="minimal")
    combined = combined.chunk({"unique_index": 2000})
    # forget the chunk encoding inherited from the source stores
    for name in combined:
        combined[name].encoding.pop("chunks", None)

    return combined

In [None]:
import pandas as pd

# Maps column names found in the Margolis / Neigh source tables to the names used in
# this notebook. Columns that are not listed here keep their original name.
rename_dict = {
    "Glas record index": "record_index",
    "rec_ndx": "record_index",
    "Shotn": "shot_number",
    "shotn": "shot_number",
    "lngtd": "lon",
    "h14": "VH",
    "fslope": "f_slope",
    "Senergy": "senergy",
    "h25": "h25_Neigh",
    "h50": "h50_Neigh",
    "h75": "h75_Neigh",
    "h90": "h90_Neigh",
    "glas_biom_str": "biomass",
}


def convert_df_to_xr(df):
    """
    Convert a table of lidar shots into an xarray Dataset indexed by ``unique_index``.

    Parameters
    ----------
    df : pandas.DataFrame with at least ``record_index`` and ``shot_number`` columns.
        The frame is not modified; all work happens on a derived copy.

    Returns
    -------
    xr.Dataset with one variable per remaining column along the ``unique_index``
    dimension. The coordinate is first built from a zero-padded
    ``"<record_index>_<shot_number>"`` string and finally replaced by the
    (``record_index``, ``shot_number``) MultiIndex.
    """
    shot_id = (
        df.record_index.astype(str).str.zfill(9) + "_" + df.shot_number.astype(str).str.zfill(2)
    )
    # assign/set_index return new frames, so the caller's DataFrame stays untouched
    indexed = df.assign(unique_index=shot_id).set_index(["record_index", "shot_number"])

    data_vars = {
        column: xr.DataArray(
            indexed[column].values,
            dims=["unique_index"],
            coords={"unique_index": indexed.unique_index.values},
        )
        for column in indexed.columns
    }
    ds = xr.Dataset(data_vars)
    ds.coords["unique_index"] = indexed.index
    return ds


def open_margolis_data(min_lat=-90, max_lat=90, min_lon=-180, max_lon=180):
    """
    Load the Margolis et al. (2015) boreal lidar tables and return them as a Dataset.

    Each text file is read once: line 8 (zero-based index 7) holds the
    whitespace-separated column names and the numeric table starts on the next line.
    Columns are renamed through ``rename_dict`` and the sentinels -9999 and 99999
    are replaced by NaN before conversion with ``convert_df_to_xr``.

    Parameters
    ----------
    min_lat, max_lat, min_lon, max_lon : accepted for symmetry with
        ``open_study_data`` but currently ignored (no bounding-box subsetting is applied).
    """
    files = [
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/Alaska_BBB3_hg_L3c_L3f_wPALSbiom.txt",
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/Canada_east_BBB3_hg_L3c_L3f_wPALSbiom.txt",
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/Canada_west_BBB3_hg_L3c_L3f_wPALSbiom.txt",
    ]
    header_row = 7  # zero-based index of the line holding the column names

    frames = []
    for file in files:
        # read the remote file a single time and reuse the lines for header and data
        with fs.open(file) as f:
            lines = f.readlines()
        headers = [c for c in lines[header_row].decode("utf-8").strip().split(" ") if c != ""]
        data = np.genfromtxt(lines[header_row + 1 :])
        frames.append(pd.DataFrame(data=data, columns=headers))

    # rename ignores keys that are not present, so one call covers every known column
    margolis = pd.concat(frames).rename(columns=rename_dict)
    margolis = margolis.replace(-9999, np.nan).replace(99999, np.nan)

    return convert_df_to_xr(margolis)


def open_neigh_data(min_lat=-90, max_lat=90, min_lon=-180, max_lon=180):
    """
    Load the Neigh et al. (2015) CSV tables and return them as a Dataset.

    The four CSV files are concatenated, columns are renamed through
    ``rename_dict``, the sentinels -9999 and 99999 become NaN, and the result is
    converted with ``convert_df_to_xr``. The bounding-box parameters are accepted
    but not used (no subsetting is applied).
    """
    files = [
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/EA_east_L3c_hg.csv",
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/EA_east_L3f_hg.csv",
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/EA_west_L2a_hg.csv",
        "gs://carbonplan-climatetrace/inputs/boreal_lidar_biomass/EA_west_L3a_hg.csv",
    ]

    def _read_csv(path):
        with fs.open(path) as handle:
            return pd.read_csv(handle)

    table = pd.concat([_read_csv(path) for path in files])
    # only the columns that appear in rename_dict are affected
    table = table.rename(columns=rename_dict)
    table = table.replace(-9999, np.nan).replace(99999, np.nan)

    return convert_df_to_xr(table)

In [None]:
# Load both lidar validation data sets (no bounding-box filtering is applied by the loaders).
margolis = open_margolis_data()
neigh = open_neigh_data()

In [None]:
# Bounding box of the Margolis shots, used to pick which study tiles to open.
min_lat = margolis.lat.min().values
max_lat = margolis.lat.max().values
min_lon = margolis.lon.min().values
max_lon = margolis.lon.max().values

tiles = trace_utils.find_tiles_for_bounding_box(
    min_lat=min_lat, max_lat=max_lat, min_lon=min_lon, max_lon=max_lon
)
# The bounds only select tiles: open_study_data is called with its default (global)
# bounding box, so the opened tiles are not clipped to the Margolis extent.
study1 = open_study_data(tiles=tiles)

In [None]:
def merge_ds(study, ref, var_name, precision=3):
    """
    Join study shots to reference shots on rounded position and shot number.

    Parameters
    ----------
    study, ref : dataset-like objects exposing ``.variables`` and
        ``obj[list_of_names].to_dataframe()``.
    var_name : variables whose name contains this string (case-insensitive) are
        carried into the merge, together with ``lat`` and ``lon``.
    precision : number of decimals ``lat`` / ``lon`` are rounded to before matching.

    Returns
    -------
    pandas.DataFrame from an inner merge on ``lat_round``, ``lon_round`` and
    ``shot_number``; clashing column names get the ``_study`` / ``_ref`` suffixes.
    """
    needle = var_name.lower()
    coords = ["lat", "lon"]

    def _as_frame(dataset):
        # keep the matching variables plus the coordinates, flattened to columns
        wanted = [name for name in list(dataset.variables.keys()) if needle in name.lower()]
        frame = dataset[wanted + coords].to_dataframe().reset_index()
        for coord in coords:
            frame[f"{coord}_round"] = frame[coord].round(precision)
        return frame

    left = _as_frame(study)
    right = _as_frame(ref)

    return pd.merge(
        left=left,
        right=right,
        on=["lat_round", "lon_round", "shot_number"],
        suffixes=["_study", "_ref"],
    )

In [None]:
# Record indices of study shots that match a Margolis shot
# (same rounded lat/lon and shot number); the count is displayed.
margolis_record_index = merge_ds(
    study=study1, ref=margolis, var_name="glas_elev"
).record_index_study.unique()
len(margolis_record_index)

In [None]:
# Keep only the matched records, then display the dataset size in GB.
study1 = study1.where(study1.record_index.isin(margolis_record_index), drop=True)
study1.nbytes / 1e9

In [None]:
# Same procedure for the Neigh data: bounding box -> tiles -> open -> matched record indices.
min_lat = neigh.lat.min().values
max_lat = neigh.lat.max().values
min_lon = neigh.lon.min().values
max_lon = neigh.lon.max().values

tiles = trace_utils.find_tiles_for_bounding_box(
    min_lat=min_lat, max_lat=max_lat, min_lon=min_lon, max_lon=max_lon
)
# as above, the tiles are opened with the default (global) bounding box
study2 = open_study_data(tiles=tiles)

neigh_record_index = merge_ds(
    study=study2, ref=neigh, var_name="glas_elev"
).record_index_study.unique()
len(neigh_record_index)

In [None]:
# Keep only the records matched to Neigh shots, then display the dataset size in GB.
study2 = study2.where(study2.record_index.isin(neigh_record_index), drop=True)
study2.nbytes / 1e9

In [None]:
# Combine the Margolis-matched and Neigh-matched subsets along unique_index.
study = xr.concat([study1, study2], dim="unique_index")

In [None]:
# Size of the combined study dataset in GB.
study.nbytes / 1e9

In [None]:
# Size of the Margolis reference dataset in GB.
margolis.nbytes / 1e9

In [None]:
# Size of the Neigh reference dataset in GB.
neigh.nbytes / 1e9

In [None]:
# Load the combined dataset into memory so the metric calculations below do not re-read it.
study.load()

## Plotting function


In [None]:
from sklearn.metrics import r2_score, mean_squared_error


def plot_scatter_comparison(ax, df, col_name, compared_col_name, params):
    """
    Scatter one study column against a reference column with a 1:1 line and fit statistics.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : DataFrame holding both columns (must not contain NaN in them).
    col_name : study column, drawn on the y axis.
    compared_col_name : reference column, drawn on the x axis and used as the
        "true" value for R squared and RMSE.
    params : dict with ``xmin``, ``xmax`` (shared axis limits), ``unit``,
        ``text_x``, ``text_y1``, ``text_y2`` (annotation anchors) and ``ticks``.
    """
    xmin, xmax = params["xmin"], params["xmax"]
    unit = params["unit"]

    x = df[compared_col_name].values
    y = df[col_name].values

    # 1:1 reference line
    ax.plot([xmin, xmax], [xmin, xmax], "r")
    r2 = r2_score(x, y)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
    # the square root explicitly gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(x, y))
    ax.scatter(x, y, c="k", s=0.01)
    ax.text(params["text_x"], params["text_y1"], f"R squared = {round(r2, 2)}")
    ax.text(params["text_x"], params["text_y2"], f"RMSE = {round(rmse, 2)} {unit}")
    unit_str = f"({unit})" if unit != "" else ""
    ax.set_xlabel(f"{compared_col_name} {unit_str}")
    ax.set_ylabel(f"{col_name} {unit_str}")
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(xmin, xmax)
    ax.set_xticks(params["ticks"])
    ax.set_yticks(params["ticks"])

# Debug specific height metrics


## QMCH and MeanH


In [None]:
# calculate QMCH in a few different ways
def quadratic_mean_to_ground_ht(ds):
    """
    QMCH variant measured relative to the ground peak.

    Delegates to ``ht.get_heights_from_distance`` with ``quadratic_mean_dist`` as the
    top metric and ``ground_peak_dist`` (rather than ``adj_ground_peak_dist``) as the
    bottom metric.
    NOTE(review): result is presumably in meters, like the other height metrics -- confirm.
    """
    return ht.get_heights_from_distance(
        ds, top_metric="quadratic_mean_dist", bottom_metric="ground_peak_dist"
    )


def quadratic_mean_to_sig_end_ht(ds):
    """
    QMCH variant measured relative to the signal end.

    Delegates to ``ht.get_heights_from_distance`` with ``quadratic_mean_dist`` as the
    top metric and ``sig_end_dist`` as the bottom metric.
    NOTE(review): result is presumably in meters, like the other height metrics -- confirm.
    """
    return ht.get_heights_from_distance(
        ds, top_metric="quadratic_mean_dist", bottom_metric="sig_end_dist"
    )


# QMCH computed three ways: the library's adj-ground version and the two local
# variants defined above (ground peak and signal end as the bottom metric).
study["QMCH_adj_ground"] = ht.quadratic_mean_to_adj_ground_ht(study)
study["QMCH_ground"] = quadratic_mean_to_ground_ht(study)
study["QMCH_sig_end"] = quadratic_mean_to_sig_end_ht(study)

In [None]:
# calculate MeanH in a few different ways
def mean_to_ground_ht(ds):
    """
    MeanH variant measured relative to the ground peak.

    Delegates to ``ht.get_heights_from_distance`` with ``mean_dist`` as the top metric
    and ``ground_peak_dist`` (rather than ``adj_ground_peak_dist``) as the bottom metric.
    NOTE(review): result is presumably in meters, like the other height metrics -- confirm.
    """
    return ht.get_heights_from_distance(
        ds, top_metric="mean_dist", bottom_metric="ground_peak_dist"
    )


def mean_to_sig_end_ht(ds):
    """
    MeanH variant measured relative to the signal end.

    Delegates to ``ht.get_heights_from_distance`` with ``mean_dist`` as the top metric
    and ``sig_end_dist`` as the bottom metric.
    NOTE(review): result is presumably in meters, like the other height metrics -- confirm.
    """
    return ht.get_heights_from_distance(ds, top_metric="mean_dist", bottom_metric="sig_end_dist")


# MeanH computed three ways: the library's adj-ground version and the two local
# variants defined above (ground peak and signal end as the bottom metric).
study["MeanH_adj_ground"] = ht.mean_to_adj_ground_ht(study)
study["MeanH_ground"] = mean_to_ground_ht(study)
study["MeanH_sig_end"] = mean_to_sig_end_ht(study)

In [None]:
# Every variable whose name contains "QMCH" or "MeanH", plus the coordinates
# needed by merge_ds; loaded into memory before merging.
all_vars = [var for var in list(study.variables.keys()) if "QMCH" in var] + [
    var for var in list(study.variables.keys()) if "MeanH" in var
]

sub = study[all_vars + ["lat", "lon"]]
sub.load()

In [None]:
# Axis limits, annotation anchors and ticks shared by the height scatter plots.
default_params = dict(
    xmin=-12,
    xmax=42,
    unit="m",
    text_x=-8,
    text_y1=36,
    text_y2=30,
    ticks=np.arange(-10, 42, 10),
)

# One figure per metric, one panel per way of computing it, each compared with
# the Margolis column of the same name.
for var in ["QMCH", "MeanH"]:

    joined = merge_ds(study=sub, ref=margolis, var_name=var, precision=3)

    plt.figure(figsize=(13, 4.5))
    plt.suptitle(var)
    variant_cols = [f"{var}_adj_ground", f"{var}_ground", f"{var}_sig_end"]
    for i, col in enumerate(variant_cols):
        ax = plt.subplot(1, 3, i + 1)
        plot_scatter_comparison(
            ax=ax,
            df=joined.dropna(),
            col_name=col,
            compared_col_name=var,
            params=default_params,
        )
    plt.tight_layout()
    plt.show()
    plt.close()

## f slope


In [None]:
# calculate fslope in a few different ways


def front_slope_to_surface_energy_ratio_old(ds):
    """
    Front slope to surface energy ratio, using the fitted Gaussian peak with the
    smallest distance as the canopy peak.

    fslope_WHRC is the change in amplitude per meter (volts/meter) between the
    signal-begin threshold and the canopy peak. It is then rescaled to the scale of
    the data published by Margolis et al. (2015):
    f_slope = 0.5744 + 19.7762 * fslope_WHRC, with the slope floored at 0 and the
    result capped at 15.
    """
    # highest peak in elevation = smallest distance; NaNs are replaced by a huge
    # value because argmin cannot handle all-NaN slices (the result is NaN there
    # anyway since the selected amplitude is NaN)
    peak_idx = ds.gaussian_fit_dist.fillna(1e10).argmin(dim="n_gaussian_peaks").compute()
    peak_amp = ds.gaussian_amp.isel(n_gaussian_peaks=peak_idx)
    peak_dist = ds.gaussian_fit_dist.isel(n_gaussian_peaks=peak_idx)

    # Signal begin is where the waveform first crosses noise mean + nsig * noise sd.
    # nsig follows the GLAS Algorithm Theoretical Basis Document
    # (https://www.csr.utexas.edu/glas/pdf/WFAtbd_v5_02011Sept.pdf, Appendix 3, pg 99):
    # 3.5 before the switch time, 7.5 afterwards.
    switch_epoch = datetime(2000, 1, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp() + 289742400
    nsig = xr.where(ds.time < switch_epoch, x=3.5, y=7.5)
    threshold_amp = ds.noise_mean + nsig * ds.noise_sd

    # slope = rise / run
    rise = peak_amp - threshold_amp
    run = peak_dist - ds.sig_begin_dist
    raw_slope = rise / run

    # bounds obtained from inspecting data in Margolis et al. (2015)
    rescaled = 0.5744 + 19.7762 * raw_slope.clip(min=0)
    return rescaled.clip(max=15)


def get_highest_peak_ind(ds, buffer=1):
    """
    Find, for every shot, the bin index of a local maximum of the smoothed waveform
    (adopted after Sun et al 2008).

    Bins are scanned from index ``n_bins - 2`` down to ``buffer + 1``. A bin is a
    peak when its ``processed_wf`` value is strictly greater than both neighbours,
    and the first peak met in that scan order is kept.

    Parameters
    ----------
    ds : dataset with ``processed_wf`` on the ``rec_bin`` and ``unique_index`` dims.
    buffer : int >= 1. Bins with index <= ``buffer`` are never tested.

    Returns
    -------
    xr.DataArray over ``unique_index`` holding bin indices. Shots without any peak
    keep the default ``n_bins - 2``.

    NOTE(review): ``default`` doubles as the "nothing found yet" marker, so a peak
    found exactly at bin ``default`` can still be replaced by a later peak.
    """
    assert buffer >= 1

    # fix the dimension order so positional reasoning below is unambiguous
    wf = ds.processed_wf.transpose("rec_bin", "unique_index")

    # start every shot at the default index
    default = wf.rec_bin.shape[0] - 2
    highest_ind = xr.DataArray(
        default,
        dims=["unique_index"],
        coords=[wf.coords["unique_index"]],
    )

    for i in np.arange(default, buffer, -1):
        mask = (
            # current bin is larger than both the previous and the next bin
            (wf.isel(rec_bin=i) > wf.isel(rec_bin=i - 1))
            & (wf.isel(rec_bin=i) > wf.isel(rec_bin=i + 1))
            & (highest_ind == default)  # and no peak has been recorded yet
        )

        # where mask is True record bin i, otherwise keep what is already stored
        highest_ind = xr.where(mask, x=i, y=highest_ind)

    return highest_ind


def front_slope_to_surface_energy_ratio_smooth(ds):
    """
    Front slope to surface energy ratio, using the peak of the smoothed waveform
    returned by ``get_highest_peak_ind`` as the canopy peak.

    fslope_WHRC is the change in amplitude per meter (volts/meter) between the
    signal-begin threshold and the canopy peak. It is then rescaled to the scale of
    the data published by Margolis et al. (2015):
    f_slope = 0.5744 + 19.7762 * fslope_WHRC, with the slope floored at 0 and the
    result capped at 15.
    """
    peak_bin = get_highest_peak_ind(ds).compute()
    peak_amp = ds.processed_wf.isel(rec_bin=peak_bin)
    peak_dist = ds.rec_wf_sample_dist.isel(rec_bin=peak_bin)

    # Signal begin is where the waveform first crosses noise mean + nsig * noise sd.
    # nsig follows the GLAS Algorithm Theoretical Basis Document
    # (https://www.csr.utexas.edu/glas/pdf/WFAtbd_v5_02011Sept.pdf, Appendix 3, pg 99):
    # 3.5 before the switch time, 7.5 afterwards.
    switch_epoch = datetime(2000, 1, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp() + 289742400
    nsig = xr.where(ds.time < switch_epoch, x=3.5, y=7.5)
    threshold_amp = ds.noise_mean + nsig * ds.noise_sd

    # slope = rise / run
    rise = peak_amp - threshold_amp
    run = peak_dist - ds.sig_begin_dist
    raw_slope = rise / run

    # bounds obtained from inspecting data in Margolis et al. (2015)
    rescaled = 0.5744 + 19.7762 * raw_slope.clip(min=0)
    return rescaled.clip(max=15)


def front_slope_to_surface_energy_ratio_old_max(ds):
    """
    Front slope to surface energy ratio, using the fitted Gaussian peak with the
    largest amplitude as the canopy peak.

    fslope_WHRC is the change in amplitude per meter (volts/meter) between the
    signal-begin threshold and the canopy peak. It is then rescaled to the scale of
    the data published by Margolis et al. (2015):
    f_slope = 0.5744 + 19.7762 * fslope_WHRC, with the slope floored at 0 and the
    result capped at 15.
    """
    # strongest fitted peak; NaN amplitudes are replaced by -99 because argmax
    # cannot handle all-NaN slices
    peak_idx = ds.gaussian_amp.fillna(-99).argmax(dim="n_gaussian_peaks").compute()
    peak_amp = ds.gaussian_amp.isel(n_gaussian_peaks=peak_idx)
    peak_dist = ds.gaussian_fit_dist.isel(n_gaussian_peaks=peak_idx)

    # Signal begin is where the waveform first crosses noise mean + nsig * noise sd.
    # nsig follows the GLAS Algorithm Theoretical Basis Document
    # (https://www.csr.utexas.edu/glas/pdf/WFAtbd_v5_02011Sept.pdf, Appendix 3, pg 99):
    # 3.5 before the switch time, 7.5 afterwards.
    switch_epoch = datetime(2000, 1, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp() + 289742400
    nsig = xr.where(ds.time < switch_epoch, x=3.5, y=7.5)
    threshold_amp = ds.noise_mean + nsig * ds.noise_sd

    # slope = rise / run
    rise = peak_amp - threshold_amp
    run = peak_dist - ds.sig_begin_dist
    raw_slope = rise / run

    # bounds obtained from inspecting data in Margolis et al. (2015)
    rescaled = 0.5744 + 19.7762 * raw_slope.clip(min=0)
    return rescaled.clip(max=15)


def front_slope_to_surface_energy_ratio_smooth_max(ds):
    """
    Front slope to surface energy ratio, using the bin with the largest value of the
    smoothed waveform (``processed_wf``) as the canopy peak.

    fslope_WHRC is the change in amplitude per meter (volts/meter) between the
    signal-begin threshold and the canopy peak. It is then rescaled to the scale of
    the data published by Margolis et al. (2015):
    f_slope = 0.5744 + 19.7762 * fslope_WHRC, with the slope floored at 0 and the
    result capped at 15.
    """
    # NaN samples are replaced by -99 because argmax cannot handle all-NaN slices
    peak_bin = ds.processed_wf.fillna(-99).argmax(dim="rec_bin").compute()
    peak_amp = ds.processed_wf.isel(rec_bin=peak_bin)
    peak_dist = ds.rec_wf_sample_dist.isel(rec_bin=peak_bin)

    # Signal begin is where the waveform first crosses noise mean + nsig * noise sd.
    # nsig follows the GLAS Algorithm Theoretical Basis Document
    # (https://www.csr.utexas.edu/glas/pdf/WFAtbd_v5_02011Sept.pdf, Appendix 3, pg 99):
    # 3.5 before the switch time, 7.5 afterwards.
    switch_epoch = datetime(2000, 1, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp() + 289742400
    nsig = xr.where(ds.time < switch_epoch, x=3.5, y=7.5)
    threshold_amp = ds.noise_mean + nsig * ds.noise_sd

    # slope = rise / run
    rise = peak_amp - threshold_amp
    run = peak_dist - ds.sig_begin_dist
    raw_slope = rise / run

    # bounds obtained from inspecting data in Margolis et al. (2015)
    rescaled = 0.5744 + 19.7762 * raw_slope.clip(min=0)
    return rescaled.clip(max=15)

In [None]:
# Four f_slope variants: canopy peak taken from the Gaussian fit ("old") or the
# smoothed waveform ("smooth"), chosen by position or by maximum amplitude ("_max").
study["f_slope_old"] = front_slope_to_surface_energy_ratio_old(study)
study["f_slope_smooth"] = front_slope_to_surface_energy_ratio_smooth(study)

study["f_slope_old_max"] = front_slope_to_surface_energy_ratio_old_max(study)
study["f_slope_smooth_max"] = front_slope_to_surface_energy_ratio_smooth_max(study)

# every variable whose name contains "f_slope"
all_vars = [var for var in list(study.variables.keys()) if "f_slope" in var]

sub_fslope = study[all_vars + ["lat", "lon"]]
sub_fslope.load()

In [None]:
# NOTE: `joined` is whatever the last cell that assigned it produced -- when the
# notebook is run top to bottom this is the MeanH merge from the loop above.
joined.describe()

In [None]:
def plot_scatter_comparison(ax, df, col_name, compared_col_name, params):
    """
    f_slope version of the scatter comparison: rows are filtered before plotting.

    Only rows with ``2.552 <= df[col_name] <= 10`` and ``df[compared_col_name] <= 10``
    are plotted and scored.
    NOTE(review): the 2.552 / 10 thresholds are not explained here -- confirm their origin.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : DataFrame holding both columns (must not contain NaN in them).
    col_name : study column, drawn on the y axis.
    compared_col_name : reference column, drawn on the x axis and used as the
        "true" value for R squared and RMSE.
    params : dict with ``xmin``, ``xmax`` (shared axis limits), ``unit``,
        ``text_x``, ``text_y1``, ``text_y2`` (annotation anchors) and ``ticks``.
    """
    xmin, xmax = params["xmin"], params["xmax"]
    unit = params["unit"]

    cond = (df[col_name] >= 2.552) & (df[col_name] <= 10) & (df[compared_col_name] <= 10)
    x = df.loc[cond, compared_col_name].values
    y = df.loc[cond, col_name].values

    # 1:1 reference line
    ax.plot([xmin, xmax], [xmin, xmax], "r")
    r2 = r2_score(x, y)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
    # the square root explicitly gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(x, y))
    ax.scatter(x, y, c="k", s=0.01)
    ax.text(params["text_x"], params["text_y1"], f"R squared = {round(r2, 2)}")
    ax.text(params["text_x"], params["text_y2"], f"RMSE = {round(rmse, 2)} {unit}")
    unit_str = f"({unit})" if unit != "" else ""
    ax.set_xlabel(f"{compared_col_name} {unit_str}")
    ax.set_ylabel(f"{col_name} {unit_str}")
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(xmin, xmax)
    ax.set_xticks(params["ticks"])
    ax.set_yticks(params["ticks"])


# Compare every f_slope variant with the Neigh reference column, one panel each.
var = "f_slope"
joined = merge_ds(study=sub_fslope, ref=neigh, var_name=var, precision=3)
plot_params = dict(
    xmin=-0.5,
    xmax=8.5,
    unit="",
    text_x=0.3,
    text_y1=7.5,
    text_y2=6.5,
    ticks=np.arange(0, 8.5, 2),
)

fslope_cols = ["f_slope_old", "f_slope_smooth", "f_slope_old_max", "f_slope_smooth_max"]

plt.figure(figsize=(8.5, 9))
plt.suptitle(var)
for i, col in enumerate(fslope_cols):
    ax = plt.subplot(2, 2, i + 1)
    plot_scatter_comparison(
        ax=ax,
        df=joined.dropna(),
        col_name=col,
        compared_col_name=var,
        params=plot_params,
    )
plt.tight_layout()
plt.show()
plt.close()

## senergy


In [None]:
def energy_adj_ground_to_sig_end_old(ds):
    """
    Waveform energy from the ground peak, computed on the raw received waveform.

    senergy_whrc is the energy of the waveform (in digital counts) from the adjusted
    ground peak to the signal end, multiplied by two. The adjusted ground peak is
    whichever of the two lowest peaks has greater amplitude. The result is rescaled
    to match the data published by Margolis et al. (2015):
    senergy = -4.397006 + 0.006208 * senergy_whrc
    """
    from carbonplan_trace.v1.glas_preprocess import (
        select_valid_area,
    )  # avoid circular import

    # lookup table: recorded volt value -> digital count
    lookup = pd.read_csv("gs://carbonplan-climatetrace/inputs/volt_table.csv")
    lookup = lookup.set_index("volt_value")["ind"].to_dict()

    # volts are rounded to 6 decimals so they hit the table keys exactly; missing
    # samples are filled with -0.195279 before the lookup
    volts = ds.rec_wf.astype(float).round(6).fillna(-0.195279)
    counts = xr.apply_ufunc(
        lookup.__getitem__,
        volts,
        vectorize=True,
        dask="parallelized",
    )

    with_ground = ht.get_dist_metric_value(ds, metric="adj_ground_peak_dist")
    # keep only the part of the waveform between the adjusted ground peak and signal end
    ground_part = select_valid_area(
        bins=with_ground.rec_wf_sample_dist,
        wf=counts,
        signal_begin_dist=with_ground.adj_ground_peak_dist,
        signal_end_dist=with_ground.sig_end_dist,
    )

    # use the same dimension order as processed_wf
    wf_dims = with_ground.processed_wf.dims
    ground_part = ground_part.transpose(wf_dims[0], wf_dims[1])

    doubled_energy = ground_part.sum(dim="rec_bin") * 2

    return -4.397006 + 0.006208 * doubled_energy


def energy_adj_ground_to_sig_end_smooth(ds):
    """
    Waveform energy from the ground peak, computed on a smoothed digital-count waveform.

    We calculated senergy_whrc as the energy of the waveform (in digital counts) from the ground peak
    to the signal end multiplied by two. Ground peak defined as whichever of the two lowest peaks has greater amplitude. We then applied the
    following linear transformation in order to calculate on the same scale as data published by Margolis et al. (2015)
    senergy = -4.397006 + 0.006208 * senergy_whrc

    Before the energy is summed, the digital-count waveform is passed through
    ``smooth_wf`` together with the transmitted waveform (``tx_wf``).
    """
    from carbonplan_trace.v1.glas_preprocess import (
        select_valid_area,
        smooth_wf,
    )  # avoid circular import

    # lookup table: recorded volt value -> digital count
    path = "gs://carbonplan-climatetrace/inputs/volt_table.csv"
    volt_table = pd.read_csv(path)
    volt_to_digital_count = volt_table.set_index("volt_value")["ind"].to_dict()

    # element-wise dictionary lookup; volts are rounded to 6 decimals to match the
    # table keys and missing samples are filled with -0.195279 first
    wf_in_digital_count = xr.apply_ufunc(
        volt_to_digital_count.__getitem__,
        ds.rec_wf.astype(float).round(6).fillna(-0.195279),
        vectorize=True,
        dask="parallelized",
    )

    # smooth each shot's received waveform (along rec_bin) using its transmitted
    # waveform (along tx_bin); the output keeps the rec_bin dimension
    smoothed_digital_cnt = xr.apply_ufunc(
        smooth_wf,
        wf_in_digital_count,
        ds.tx_wf.fillna(0),
        input_core_dims=[["rec_bin"], ["tx_bin"]],
        output_core_dims=[["rec_bin"]],
        vectorize=True,
        dask="parallelized",
        dask_gufunc_kwargs={"allow_rechunk": 1},
    )

    ds = ht.get_dist_metric_value(ds, metric="adj_ground_peak_dist")
    # the processed wf is from sig beg to sig end, select adj ground peak to sig end instead
    ground_energy = select_valid_area(
        bins=ds.rec_wf_sample_dist,
        wf=smoothed_digital_cnt,
        signal_begin_dist=ds.adj_ground_peak_dist,
        signal_end_dist=ds.sig_end_dist,
    )

    # make sure dimensions matches up
    dims = ds.processed_wf.dims
    ground_energy = ground_energy.transpose(dims[0], dims[1])

    senergy_whrc = ground_energy.sum(dim="rec_bin") * 2

    return -4.397006 + 0.006208 * senergy_whrc


def energy_adj_ground_to_sig_end_no_transform(ds):
    """
    Waveform energy from the ground peak, without any rescaling.

    Sums ``processed_wf`` (not converted to digital counts) from the adjusted ground
    peak to the signal end and multiplies by two. The adjusted ground peak is
    whichever of the two lowest peaks has greater amplitude. Unlike the other
    senergy variants, the linear transformation to the Margolis et al. (2015) scale
    is NOT applied: the raw ``senergy_whrc`` value is returned.
    """
    from carbonplan_trace.v1.glas_preprocess import (
        select_valid_area,
    )  # avoid circular import

    ds = ht.get_dist_metric_value(ds, metric="adj_ground_peak_dist")
    # the processed wf is from sig beg to sig end, select adj ground peak to sig end instead
    ground_energy = select_valid_area(
        bins=ds.rec_wf_sample_dist,
        wf=ds.processed_wf,
        signal_begin_dist=ds.adj_ground_peak_dist,
        signal_end_dist=ds.sig_end_dist,
    )

    # use the same dimension order as processed_wf
    dims = ds.processed_wf.dims
    ground_energy = ground_energy.transpose(dims[0], dims[1])

    senergy_whrc = ground_energy.sum(dim="rec_bin") * 2

    return senergy_whrc

In [None]:
def get_adj_ground_peak_dist(ds):
    """
    Distance of the adjusted ground peak: the centroid position of whichever of the
    two lowest fitted Gaussian peaks has greater amplitude, as defined by Rosette,
    North, and Suarez (2008).
    """
    first_two_amp = ds.gaussian_amp.isel(n_gaussian_peaks=slice(2))
    # Records are filtered to have at least 2 peaks, but argmax cannot deal with
    # all-NaN slices, so NaN amplitudes are treated as 0.
    stronger_idx = first_two_amp.fillna(0).argmax(dim="n_gaussian_peaks").compute()
    return ds.gaussian_fit_dist.isel(n_gaussian_peaks=stronger_idx)


def get_adj_ground_peak_dist_new(ds):
    """
    Adjusted ground peak distance that only adjusts shots with more than two peaks.

    For shots where ``num_gaussian_peaks > 2`` the stronger of the first two fitted
    Gaussian peaks is used; for all other shots the first peak (index 0) is used.

    Parameters
    ----------
    ds : dataset with ``num_gaussian_peaks``, ``gaussian_amp`` and
        ``gaussian_fit_dist``. All inputs are read from ``ds`` itself, not from the
        notebook-level ``study`` variable.

    Returns
    -------
    ``gaussian_fit_dist`` selected at the chosen peak index for every shot.
    """
    # NOTE(review): the comparison is strictly "> 2"; confirm that shots with exactly
    # two peaks are meant to fall back to the first peak.
    more_than_two_peaks = ds.num_gaussian_peaks > 2
    # NaN amplitudes are treated as 0 because argmax cannot handle all-NaN slices
    loc = (
        ds.gaussian_amp.isel(n_gaussian_peaks=slice(2))
        .fillna(0)
        .argmax(dim="n_gaussian_peaks")
        .compute()
    )
    loc = xr.where(more_than_two_peaks, loc, 0)
    return ds.gaussian_fit_dist.isel(n_gaussian_peaks=loc)

In [None]:
def energy_adj_ground_to_sig_end_new_adj(ds):
    """
    Waveform energy from the ground peak, using ``get_adj_ground_peak_dist_new`` to
    locate the adjusted ground peak.

    senergy_whrc is the energy of the raw received waveform (in digital counts) from
    the adjusted ground peak to the signal end, multiplied by two. The result is
    rescaled to match the data published by Margolis et al. (2015):
    senergy = -4.397006 + 0.006208 * senergy_whrc
    """
    from carbonplan_trace.v1.glas_preprocess import (
        select_valid_area,
    )  # avoid circular import

    # lookup table: recorded volt value -> digital count
    lookup = pd.read_csv("gs://carbonplan-climatetrace/inputs/volt_table.csv")
    lookup = lookup.set_index("volt_value")["ind"].to_dict()

    # volts are rounded to 6 decimals so they hit the table keys exactly; missing
    # samples are filled with -0.195279 before the lookup
    volts = ds.rec_wf.astype(float).round(6).fillna(-0.195279)
    counts = xr.apply_ufunc(
        lookup.__getitem__,
        volts,
        vectorize=True,
        dask="parallelized",
    )

    ground_dist = get_adj_ground_peak_dist_new(ds)
    # keep only the part of the waveform between the adjusted ground peak and signal end
    ground_part = select_valid_area(
        bins=ds.rec_wf_sample_dist,
        wf=counts,
        signal_begin_dist=ground_dist,
        signal_end_dist=ds.sig_end_dist,
    )

    # use the same dimension order as processed_wf
    wf_dims = ds.processed_wf.dims
    ground_part = ground_part.transpose(wf_dims[0], wf_dims[1])

    doubled_energy = ground_part.sum(dim="rec_bin") * 2

    return -4.397006 + 0.006208 * doubled_energy

In [None]:
# Only the "new_adj" variant is computed; the other three are disabled.
# NOTE(review): the senergy plotting cell lists senergy_old / senergy_no_transform /
# senergy_smooth, which only exist if the lines below are re-enabled.
# study['senergy_old'] = energy_adj_ground_to_sig_end_old(study)
# study['senergy_smooth'] = energy_adj_ground_to_sig_end_smooth(study)
# study['senergy_no_transform'] = energy_adj_ground_to_sig_end_no_transform(study)

study["senergy_new_adj"] = energy_adj_ground_to_sig_end_new_adj(study)

In [None]:
# Every variable whose name contains "senergy", plus the coordinates needed by merge_ds.
all_vars = [var for var in list(study.variables.keys()) if "senergy" in var] + [
    "lat",
    "lon",
]
# bare expression in the middle of a cell: evaluated but not displayed
all_vars

sub = study[all_vars]
sub.load()

In [None]:
def plot_scatter_comparison(ax, df, col_name, compared_col_name, params):
    """
    Unfiltered scatter comparison (semi-transparent markers) with a 1:1 line,
    R squared and RMSE.

    Every row of ``df`` is used; the previously tried value filter
    ``(col >= 26.643) & (col <= 150) & (compared <= 150)`` is disabled.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : DataFrame holding both columns (must not contain NaN in them).
    col_name : study column, drawn on the y axis.
    compared_col_name : reference column, drawn on the x axis and used as the
        "true" value for R squared and RMSE.
    params : dict with ``xmin``, ``xmax`` (shared axis limits), ``unit``,
        ``text_x``, ``text_y1``, ``text_y2`` (annotation anchors) and ``ticks``.
    """
    xmin, xmax = params["xmin"], params["xmax"]
    unit = params["unit"]

    # no row filter is applied, so the columns are read directly
    x = df[compared_col_name].values
    y = df[col_name].values

    # 1:1 reference line
    ax.plot([xmin, xmax], [xmin, xmax], "r")
    r2 = r2_score(x, y)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
    # the square root explicitly gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(x, y))
    ax.scatter(x, y, c="k", s=0.01, alpha=0.5)
    ax.text(params["text_x"], params["text_y1"], f"R squared = {round(r2, 2)}")
    ax.text(params["text_x"], params["text_y2"], f"RMSE = {round(rmse, 2)} {unit}")
    unit_str = f"({unit})" if unit != "" else ""
    ax.set_xlabel(f"{compared_col_name} {unit_str}")
    ax.set_ylabel(f"{col_name} {unit_str}")
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(xmin, xmax)
    ax.set_xticks(params["ticks"])
    ax.set_yticks(params["ticks"])


# Compare the senergy variants with the Margolis reference column, one panel each.
var = "senergy"
joined = merge_ds(study=sub, ref=margolis, var_name=var, precision=3)
plot_params = {
    "xmin": -10,
    "xmax": 210,
    "unit": "",
    "text_x": 10,
    "text_y1": 180,
    "text_y2": 155,
    "ticks": np.arange(0, 210, 50),
}

candidate_cols = ["senergy_old", "senergy_no_transform", "senergy_smooth", "senergy_new_adj"]
# Some variants may not have been computed (their assignments can be commented out);
# plotting a missing column would raise KeyError, so only existing columns are drawn.
cols_to_plot = [col for col in candidate_cols if col in joined.columns]
skipped_cols = [col for col in candidate_cols if col not in joined.columns]
if skipped_cols:
    print(f"skipping senergy variants that were not computed: {skipped_cols}")

# the NaN-free view is the same for every panel, so build it once
complete_rows = joined.dropna()

plt.figure(figsize=(13, 9))
plt.suptitle(var)
for i, col in enumerate(cols_to_plot):
    plt.subplot(2, 3, i + 1)
    ax = plt.gca()
    plot_scatter_comparison(
        ax=ax,
        df=complete_rows,
        col_name=col,
        compared_col_name=var,
        params=plot_params,
    )
plt.tight_layout()
plt.show()
plt.close()

In [None]:
# Library percentile heights (relative to the adjusted ground peak). They are stored
# under "neigh_*" names so that a case-insensitive "neigh" match in merge_ds selects
# them together with the reference h*_Neigh columns.
study["neigh_25_old"] = ht.pct_25_to_adj_ground_ht(study)
study["neigh_50_old"] = ht.pct_50_to_adj_ground_ht(study)
study["neigh_75_old"] = ht.pct_75_to_adj_ground_ht(study)
study["neigh_90_old"] = ht.pct_90_to_adj_ground_ht(study)

In [None]:
def pct_25_to_ground_ht(ds):
    """
    25th-percentile height variant measured relative to the ground peak: delegates to
    ``ht.get_heights_from_distance`` with ``pct_25_dist`` (top) and ``ground_peak_dist`` (bottom).
    """
    return ht.get_heights_from_distance(
        ds, top_metric="pct_25_dist", bottom_metric="ground_peak_dist"
    )


def pct_50_to_ground_ht(ds):
    """
    50th-percentile height variant measured relative to the ground peak: delegates to
    ``ht.get_heights_from_distance`` with ``pct_50_dist`` (top) and ``ground_peak_dist`` (bottom).
    """
    return ht.get_heights_from_distance(
        ds, top_metric="pct_50_dist", bottom_metric="ground_peak_dist"
    )


def pct_75_to_ground_ht(ds):
    """
    75th-percentile height variant measured relative to the ground peak: delegates to
    ``ht.get_heights_from_distance`` with ``pct_75_dist`` (top) and ``ground_peak_dist`` (bottom).
    """
    return ht.get_heights_from_distance(
        ds, top_metric="pct_75_dist", bottom_metric="ground_peak_dist"
    )


def pct_90_to_ground_ht(ds):
    """Return ``ht.get_heights_from_distance`` for ``ds`` between ``pct_90_dist`` and ``ground_peak_dist``.

    ``pct_90_dist`` is passed as the top metric and ``ground_peak_dist`` as the bottom metric.
    NOTE(review): presumably the 90th-percentile height above the ground peak; confirm
    against ``glas_height_metrics.get_heights_from_distance``.
    """
    metric_bounds = {"top_metric": "pct_90_dist", "bottom_metric": "ground_peak_dist"}
    return ht.get_heights_from_distance(ds, **metric_bounds)

In [None]:
# Store the ground-peak-referenced percentile heights on `study` under "*_ground"
# names. Each result is assigned before the next helper is called, in 25/50/75/90 order.
for column, to_height in (
    ("neigh_25_ground", pct_25_to_ground_ht),
    ("neigh_50_ground", pct_50_to_ground_ht),
    ("neigh_75_ground", pct_75_to_ground_ht),
    ("neigh_90_ground", pct_90_to_ground_ht),
):
    study[column] = to_height(study)

In [None]:
# Select every variable whose name contains "neigh", plus lat/lon.
all_vars = [name for name in study.variables if "neigh" in name] + ["lat", "lon"]

sub = study[all_vars]
# Last expression of the cell: load the subset's data and display the result.
sub.load()

In [None]:
# Join the "neigh" subset of the study data to the Margolis reference with merge_ds
# (defined outside this section). This rebinds `var` and `joined`.
# NOTE(review): precision=3 is presumably the number of decimals used when matching
# coordinates; confirm in merge_ds.
var = "neigh"
joined = merge_ds(study=sub, ref=margolis, var_name=var, precision=3)

In [None]:
# Display the column names of the merged frame.
joined.columns

In [None]:
# Axis limits, annotation positions and ticks shared by every percentile panel below.
default_params = {
    "xmin": -12,
    "xmax": 42,
    "unit": "m",
    "text_x": -8,
    "text_y1": 36,
    "text_y2": 30,
    "ticks": np.arange(-10, 42, 10),
}

# Drop incomplete rows once; the result is the same for every figure and panel.
joined_complete = joined.dropna()

# One figure per percentile, with the "_old" and "_ground" study variants side by
# side, each plotted against the matching h{pct}_Neigh column.
for pct in ["25", "50", "75", "90"]:
    fig = plt.figure(figsize=(9, 5))
    fig.suptitle(f"{pct}th percentile")
    for i, col in enumerate([f"neigh_{pct}_old", f"neigh_{pct}_ground"]):
        ax = fig.add_subplot(1, 2, i + 1)
        plot_scatter_comparison(
            ax=ax,
            df=joined_complete,
            col_name=col,
            compared_col_name=f"h{pct}_Neigh",
            params=default_params,
        )
    fig.tight_layout()
    plt.show()
    # Release the figure before building the next one.
    plt.close(fig)

# Review all height metrics


In [None]:
# Names of the height metrics passed to ht.get_all_height_metrics below.
height_metrics = [
    "VH",
    "h25_Neigh",
    "h50_Neigh",
    "h75_Neigh",
    "h90_Neigh",
    "QMCH",
    "MeanH",
    "f_slope",
    "senergy",
]
# Rebind `study` to the computed output of get_all_height_metrics for the list above.
# NOTE(review): recalc=True presumably forces metrics already present on `study` to be
# recalculated; confirm in glas_height_metrics.
study = ht.get_all_height_metrics(study, height_metrics, recalc=True).compute()

In [None]:
def plot_scatter_comparison(ax, df, col_name, params):
    """Scatter one metric from this study against the reference on ``ax``.

    Draws a red 1:1 line, the points, and three text annotations (R squared,
    RMSE, bias), then sets axis labels, limits and ticks from ``params``.

    NOTE: this redefines the ``plot_scatter_comparison`` used by earlier cells,
    which accepted a ``compared_col_name`` argument. Those earlier calls will not
    work once this cell has run.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    df : pandas.DataFrame
        Must contain ``f"{col_name}_ref"`` and ``f"{col_name}_study"`` columns.
    col_name : str
        Metric name. ``"f_slope"`` and ``"senergy"`` get an extra row filter on
        the study column (see below).
    params : dict
        Requires the keys ``xmin``, ``xmax``, ``unit``, ``text_x``, ``text_y1``,
        ``text_y2``, ``text_y3`` and ``ticks``. The same limits and ticks are
        applied to both axes.
    """
    xmin, xmax = params["xmin"], params["xmax"]
    unit = params["unit"]

    # Rows whose study value is below a hard-coded threshold are excluded for
    # these two metrics; every other metric uses all rows.
    if col_name == "f_slope":
        cond = (
            df[f"{col_name}_study"] >= 2.552
        )  # & (df[f'{col_name}_study'] <= 10) & (df[f'{col_name}_ref'] <= 10)
    elif col_name == "senergy":
        cond = df[f"{col_name}_study"] >= 26.643
    else:
        cond = np.ones(len(df), dtype=bool)

    x = df.loc[cond, f"{col_name}_ref"].values
    y = df.loc[cond, f"{col_name}_study"].values

    ax.plot([xmin, xmax], [xmin, xmax], "r")
    # The reference values are passed as the first (y_true) argument.
    r2 = r2_score(x, y)
    # Take the square root explicitly: the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(x, y))
    # Bias is reference minus study, so a positive value means the study is lower.
    bias = np.mean(x - y)
    ax.scatter(x, y, c="k", s=0.01)
    ax.text(params["text_x"], params["text_y1"], f"R squared = {round(r2, 2)}")
    ax.text(params["text_x"], params["text_y2"], f"RMSE = {round(rmse, 2)} {unit}")
    ax.text(params["text_x"], params["text_y3"], f"bias = {round(bias, 2)} {unit}")

    if unit != "":
        unit_str = f"({unit})"
    else:
        unit_str = ""
    ax.set_xlabel(f"{col_name} from reference {unit_str}")
    ax.set_ylabel(f"{col_name} from this study {unit_str}")
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(xmin, xmax)
    ax.set_xticks(params["ticks"])
    ax.set_yticks(params["ticks"])

In [None]:
# Per-metric panel settings: axis limits, unit label, annotation anchor (text_x) with
# three annotation heights (text_y1..3), and tick positions. Metrics without an entry
# here are looked up with `default_params` as the fallback by the plotting cells.
plot_params = dict(
    biomass=dict(
        xmin=-10,
        xmax=360,
        unit="Mg/ha",
        text_x=10,
        text_y1=300,
        text_y2=280,
        text_y3=260,
        ticks=np.arange(0, 360, 50),
    ),
    VH=dict(
        xmin=-2,
        xmax=55,
        unit="m",
        text_x=2,
        text_y1=48,
        text_y2=42,
        text_y3=36,
        ticks=np.arange(0, 55, 10),
    ),
    f_slope=dict(
        xmin=-0.5,
        xmax=8.5,
        unit="",
        text_x=0.3,
        text_y1=7.5,
        text_y2=6.5,
        text_y3=5.5,
        ticks=np.arange(0, 8.5, 2),
    ),
    senergy=dict(
        xmin=-10,
        xmax=210,
        unit="",
        text_x=10,
        text_y1=180,
        text_y2=155,
        text_y3=130,
        ticks=np.arange(0, 210, 50),
    ),
)

# Fallback settings, in metres.
default_params = dict(
    xmin=-12,
    xmax=42,
    unit="m",
    text_x=-8,
    text_y1=36,
    text_y2=30,
    text_y3=24,
    ticks=np.arange(-10, 42, 10),
)

In [None]:
# Metrics compared between this study and the reference data sets.
variables_to_compare = [
    "VH",
    "MeanH",
    "h25_Neigh",
    "h50_Neigh",
    "h75_Neigh",
    "h90_Neigh",
    "QMCH",
    "f_slope",
    "senergy",
]
coords = ["lat", "lon"]
all_vars = variables_to_compare + coords

# Number of decimals kept when rounding lat/lon to build the join keys.
precision = 4

# Flatten each data set to a table and append the rounded coordinates.
left = (
    study[all_vars]
    .to_dataframe()
    .reset_index()
    .assign(
        lat_round=lambda frame: frame["lat"].round(precision),
        lon_round=lambda frame: frame["lon"].round(precision),
    )
)
right = (
    margolis[all_vars]
    .to_dataframe()
    .reset_index()
    .assign(
        lat_round=lambda frame: frame["lat"].round(precision),
        lon_round=lambda frame: frame["lon"].round(precision),
    )
)

# Default (inner) merge on rounded position and shot number; overlapping column
# names get "_study" (left) and "_ref" (right) suffixes.
joined_margolis = left.merge(
    right,
    on=["lat_round", "lon_round", "shot_number"],
    suffixes=["_study", "_ref"],
)

In [None]:
# 3 x 3 grid of study-vs-reference scatter panels for the Margolis join.
# Drop incomplete rows once instead of once per panel.
margolis_complete = joined_margolis.dropna()

fig = plt.figure(figsize=(13, 12))
fig.suptitle("Comparison to Margolis et al 2015")
for i, var in enumerate(variables_to_compare):
    ax = fig.add_subplot(3, 3, i + 1)
    plot_scatter_comparison(
        ax=ax,
        df=margolis_complete,
        col_name=var,
        # Metrics without their own entry fall back to the default settings.
        params=plot_params.get(var, default_params),
    )
fig.tight_layout()
plt.show()
plt.close(fig)

In [None]:
# Number of decimals kept when rounding lat/lon to build the join keys.
precision = 4

# Flatten each data set to a table and append the rounded coordinates.
left = (
    study[all_vars]
    .to_dataframe()
    .reset_index()
    .assign(
        lat_round=lambda frame: frame["lat"].round(precision),
        lon_round=lambda frame: frame["lon"].round(precision),
    )
)
right = (
    neigh[all_vars]
    .to_dataframe()
    .reset_index()
    .assign(
        lat_round=lambda frame: frame["lat"].round(precision),
        lon_round=lambda frame: frame["lon"].round(precision),
    )
)

# Default (inner) merge on rounded position and shot number; overlapping column
# names get "_study" (left) and "_ref" (right) suffixes.
joined_neigh = left.merge(
    right,
    on=["lat_round", "lon_round", "shot_number"],
    suffixes=["_study", "_ref"],
)

In [None]:
# 3 x 3 grid of study-vs-reference scatter panels for the Neigh join.
# Drop incomplete rows once instead of once per panel.
neigh_complete = joined_neigh.dropna()

fig = plt.figure(figsize=(13, 12))
fig.suptitle("Comparison to Neigh et al 2015")
for i, var in enumerate(variables_to_compare):
    ax = fig.add_subplot(3, 3, i + 1)
    plot_scatter_comparison(
        ax=ax,
        df=neigh_complete,
        col_name=var,
        # Metrics without their own entry fall back to the default settings.
        params=plot_params.get(var, default_params),
    )
fig.tight_layout()
plt.show()
plt.close(fig)

In [None]:
# Stack both reference joins into one table. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise the index labels of the two inputs would repeat.
joined = pd.concat([joined_margolis, joined_neigh], ignore_index=True)

In [None]:
# 3 x 3 grid of study-vs-reference scatter panels for the combined joins.
# Drop incomplete rows once instead of once per panel.
joined_complete = joined.dropna()

fig = plt.figure(figsize=(13, 12))
fig.suptitle("Comparison to all data")
for i, var in enumerate(variables_to_compare):
    ax = fig.add_subplot(3, 3, i + 1)
    plot_scatter_comparison(
        ax=ax,
        df=joined_complete,
        col_name=var,
        # Metrics without their own entry fall back to the default settings.
        params=plot_params.get(var, default_params),
    )
fig.tight_layout()
plt.show()
plt.close(fig)

In [None]:
# Number of rows in the study/Margolis join.
len(joined_margolis)

In [None]:
# Number of rows in the study/Neigh join.
len(joined_neigh)