In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
import glob
import numpy as np

In [2]:
# Read in metadata (has all the vars we need for allom modeling!)
neon_meta_df = pd.read_csv("/data/shared/src/arojas/NEON/data/NEON_Field_Site_Metadata_20230309_Env_Vars.csv")
print(neon_meta_df.shape)
print(neon_meta_df.columns)

(81, 60)
Index(['field_domain_id', 'field_site_id', 'field_site_name',
       'field_site_type', 'field_site_subtype', 'field_colocated_site',
       'field_site_host', 'field_site_url', 'field_nonneon_research_allowed',
       'field_access_details', 'field_neon_field_operations_office',
       'field_latitude', 'field_longitude', 'field_geodetic_datum',
       'field_utm_northing', 'field_utm_easting', 'field_utm_zone',
       'field_site_county', 'field_site_state', 'field_site_country',
       'field_mean_elevation_m', 'field_minimum_elevation_m',
       'field_maximum_elevation_m', 'field_mean_annual_temperature_C',
       'field_mean_annual_precipitation_mm', 'field_dominant_wind_direction',
       'field_mean_canopy_height_m', 'field_dominant_nlcd_classes',
       'field_domint_plant_species', 'field_usgs_huc', 'field_watershed_name',
       'field_watershed_size_km2', 'field_lake_depth_mean_m',
       'field_lake_depth_max_m', 'field_tower_height_m',
       'field_usgs_geology_

In [3]:
# Lets read in the veg structure measurements with env vars
neon_veg = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv")
print(neon_veg.shape)
# print(neon_veg.columns)

  neon_veg = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv")


(369490, 45)


In [4]:
# Get hmax for each site from veg structure and add to df
for siteid, group in neon_veg.groupby("siteID"):
    print(siteid,end="\r")
    # get indices
    group_idx = neon_meta_df.loc[neon_meta_df['field_site_id']==siteid].index
    # Get 95 percentile height
    hmax = np.nanpercentile(group['height'].values,95)
    neon_meta_df.loc[group_idx, "site_hmax"] = hmax

ABBYBARTBLANBONACLBJCPERDCFSDEJUDELADSNYGRSMGUANHARVHEALJERCJORNKONZLAJALENOMLBSMOABNIWONOGPONAQORNLOSBSPUUMRMNPSCBISERCSJERSOAPSRERSTEITALLTEAKTREEUKFSUNDEWOODWREFYELL

In [8]:
# Lets add the scaling exponents for allometric relationships!
# Read allometry tables
h_d_df = pd.read_csv("../data/output/site-allometry/NEON_site_allom_H_D.csv")
rc_d_df = pd.read_csv("../data/output/site-allometry/NEON_site_allom_Rc_D.csv")
rc_h_df = pd.read_csv("../data/output/site-allometry/NEON_site_allom_Rc_H.csv")
vc_h_df = pd.read_csv("../data/output/site-allometry/NEON_site_allom_Vc_H.csv")

site_id_list = neon_meta_df['field_site_id'].unique()
for siteid in site_id_list:
    print(siteid,end="\r")
    # get indices
    group_idx = neon_meta_df.loc[neon_meta_df['field_site_id']==siteid].index
    if len(h_d_df.loc[h_d_df['siteID']==siteid])==0:
        continue
    # Get scaling exponents
    scaling_exp = h_d_df.loc[h_d_df['siteID']==siteid]['slope'].values[0]
    neon_meta_df.loc[group_idx, "H_D_pow"] = scaling_exp
    scaling_exp = rc_d_df.loc[rc_d_df['siteID']==siteid]['slope'].values[0]
    neon_meta_df.loc[group_idx, "Rc_D_pow"] = scaling_exp
    scaling_exp = rc_h_df.loc[rc_h_df['siteID']==siteid]['slope'].values[0]
    neon_meta_df.loc[group_idx, "Rc_H_pow"] = scaling_exp
    scaling_exp = vc_h_df.loc[vc_h_df['siteID']==siteid]['slope'].values[0]
    neon_meta_df.loc[group_idx, "Vc_H_pow"] = scaling_exp

ABBYARIKBARCBARRBARTBIGCBLANBLDEBLUEBLWABONACARICLBJCOMOCPERCRAMCUPEDCFSDEJUDELADSNYFLNTGRSMGUANGUILHARVHEALHOPBJERCJORNKINGKONAKONZLAJALECOLENOLEWILIROMARTMAYFMCDIMCRAMLBSMOABNIWONOGPOAESOKSRONAQORNLOSBSPOSEPRINPRLAPRPOPUUMREDBRMNPSCBISERCSJERSOAPSRERSTEISTERSUGGSYCATALLTEAKTECRTOMBTOOKTOOLTREEUKFSUNDEWALKWLOUWOODWREFYELL

In [10]:
neon_meta_df.tail()

Unnamed: 0,field_domain_id,field_site_id,field_site_name,field_site_type,field_site_subtype,field_colocated_site,field_site_host,field_site_url,field_nonneon_research_allowed,field_access_details,...,sg_cec_avg,sg_clay_avg,sg_phh2o_avg,sg_sand_avg,sg_silt_avg,site_hmax,H_D_pow,Rc_D_pow,Rc_H_pow,Vc_H_pow
76,D07,WALK,Walker Branch NEON,Core Aquatic,Wadeable Stream,ORNL,Department of Energy,https://www.ornl.gov/division/esd,Very Limited,There is currently no system in place to autho...,...,,,,,,,,,,
77,D13,WLOU,West St Louis Creek NEON,Gradient Aquatic,Wadeable Stream,,US Forest Service,https://www.fs.usda.gov/wps/portal/fsinternet/...,Yes,Researchers should coordinate directly with th...,...,,,,,,,,,,
78,D09,WOOD,Chase Lake National Wildlife Refuge NEON,Core Terrestrial,,PRPO,US Fish and Wildlife Service,https://www.fws.gov/refuge/chase_lake/,Yes,This site host welcomes and encourages additio...,...,177.378,327.601,67.612,250.309,327.917,1.5,,,,
79,D16,WREF,Wind River Experimental Forest NEON,Core Terrestrial,,,"Pacific Northwest Research Station, US Forest ...",https://www.fs.usda.gov/pnw/,Yes,Reseachers should coordinate with the site man...,...,268.746,160.735,54.171,433.739,405.527,34.3,0.873,0.477,0.48,1.959
80,D12,YELL,Yellowstone National Park NEON,Core Terrestrial,,BLDE|YELL,National Park Service,https://irma.nps.gov/rprs/,Limited,The National Park Service is open to additiona...,...,184.694,210.962,65.484,447.397,341.455,14.8,0.541,0.557,0.909,2.818


In [8]:
##############
## Allometric modeling
##############

In [10]:
# Now that we have all structural, plot level, and environmental variables, lets model!

In [11]:
neon_meta_df.columns

Index(['field_domain_id', 'field_site_id', 'field_site_name',
       'field_site_type', 'field_site_subtype', 'field_colocated_site',
       'field_site_host', 'field_site_url', 'field_nonneon_research_allowed',
       'field_access_details', 'field_neon_field_operations_office',
       'field_latitude', 'field_longitude', 'field_geodetic_datum',
       'field_utm_northing', 'field_utm_easting', 'field_utm_zone',
       'field_site_county', 'field_site_state', 'field_site_country',
       'field_mean_elevation_m', 'field_minimum_elevation_m',
       'field_maximum_elevation_m', 'field_mean_annual_temperature_C',
       'field_mean_annual_precipitation_mm', 'field_dominant_wind_direction',
       'field_mean_canopy_height_m', 'field_dominant_nlcd_classes',
       'field_domint_plant_species', 'field_usgs_huc', 'field_watershed_name',
       'field_watershed_size_km2', 'field_lake_depth_mean_m',
       'field_lake_depth_max_m', 'field_tower_height_m',
       'field_usgs_geology_unit', 'f

In [62]:
from sklearn.ensemble import RandomForestRegressor
from matplotlib.colors import to_rgba
from sklearn.metrics import mean_squared_error

# Getting input data
# lets extract the columns
data_cols = ['wc_srad_avg_yr',
       'wc_vapr_avg_yr', 'wc_wind_avg_yr', 'wc_bio_1', 'wc_bio_12',
       'wc_bio_15', 'pet_he_yr', 'ai_yr', 'E', 'CWD', 'sg_cec_avg',
       'sg_clay_avg', 'sg_phh2o_avg', 'sg_sand_avg', 'sg_silt_avg',
       'site_hmax',"field_mean_elevation_m"]
# independent and dependent vars
allom_substrs_list = ["H_D","Rc_D","Rc_H","Vc_H"]
for allom_substr in allom_substrs_list:
    # allom_substr = "H_D"
    xy_data_arr = np.hstack([neon_meta_df[data_cols].values,
                            neon_meta_df[f"{allom_substr}_pow"].values.reshape(-1,1)])
    nodata_bool = ~np.isnan(xy_data_arr).any(axis=1)
    xy_data_arr = xy_data_arr[nodata_bool]
    X_data_arr = xy_data_arr[:,:-1]
    y_data_arr = xy_data_arr[:,-1]
    siteids_filt = neon_meta_df["field_site_id"].values[nodata_bool]
    
    # Get feature importance
    forest_classifier = RandomForestRegressor(random_state=8)
    forest_classifier.fit(X_data_arr,y_data_arr)
    
    feature_imp_df = pd.DataFrame({"score":forest_classifier.feature_importances_}, index=data_cols)
    feature_imp_df = feature_imp_df.sort_values(by="score", ascending=False)
    sns.set_context("notebook", font_scale=1.2)
    fig,ax = plt.subplots(1,1,figsize=(10,6))
    ax.set_title(f"{allom_substr} Allometry Random Forest Relative Importance")
    sns.barplot(x=feature_imp_df['score'], y=feature_imp_df.index, ax=ax)
    
    ## Add inset plot with errors and eval metrics
    left, bottom, width, height = [0.6, 0.25, 0.35, 0.4] # 0,0 is bottom left)
    ax2 = fig.add_axes([left, bottom, width, height])
    # Plot residuals vs fitted
    y_pred = forest_classifier.predict(X_data_arr)
    residuals = y_data_arr - y_pred
    ax2.scatter(y_pred, residuals,
                 color=to_rgba("#1f77b4", .3),
                 ec="#1f77b4", lw=0.5)
    ax2.axhline(0, color="#7f7f7f", ls="--")
    ax2.set_xlabel("Predicted")
    ax2.set_ylabel("Residual")
    # Fix yrange
    yrange_max = np.max(np.absolute(ax2.get_ylim()))
    ax2.set_ylim(yrange_max*-1,yrange_max)
    # Add metrics text
    r2 = forest_classifier.score(X_data_arr, y_data_arr)
    rmse = mean_squared_error(y_true=y_data_arr.reshape(-1,1),
                              y_pred=y_pred.reshape(-1,1), squared=False)
    txt = f"R2={np.round(r2,2)}\nRMSE={np.round(rmse,2)}"
    ax2.text(0.98, 0.0125, txt, ha='right', va='bottom',
             transform=ax2.transAxes)
    
    fig.tight_layout()
    fig.savefig(f"../figs/veg-struct/allom/NEON_site_allom_{allom_substr}_FeatImportance.png", dpi=300)
    plt.close()
    break
    
    
    

  fig.tight_layout()


In [66]:
stacked_arr = np.hstack([siteids_filt.reshape(-1,1),residuals.reshape(-1,1)])

In [67]:
stacked_arr

array([['ABBY', -0.041720000000000534],
       ['BART', 0.018020000000000147],
       ['BLAN', 0.002709999999999879],
       ['BONA', 0.053169999999999495],
       ['CLBJ', 0.03572999999999965],
       ['DEJU', 0.05645999999999929],
       ['DELA', -0.01389999999999969],
       ['DSNY', 0.03300000000000047],
       ['GRSM', -0.002709999999999768],
       ['GUAN', -0.10594999999999938],
       ['HARV', -0.00840000000000063],
       ['HEAL', 0.04793000000000036],
       ['JERC', 0.012040000000000495],
       ['KONZ', -0.004950000000000621],
       ['LAJA', -0.017649999999999666],
       ['LENO', 0.005260000000001153],
       ['MLBS', 0.0012799999999995038],
       ['MOAB', 0.01744000000000001],
       ['NIWO', 0.024929999999999675],
       ['NOGP', -0.10400999999999982],
       ['ONAQ', -0.005499999999999949],
       ['ORNL', 0.01013999999999926],
       ['OSBS', 0.010120000000000573],
       ['RMNP', 0.007379999999999831],
       ['SCBI', 0.0058600000000006425],
       ['SERC', 0.005789