In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

import matplotlib.patheffects as pe
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn aesthetics.
# A single set_theme call configures context, style, font scale and rc overrides;
# a separate set_context("notebook") before it is redundant because set_theme
# applies the plotting context itself.
sns.set_theme(context="notebook", style="whitegrid", font_scale=1.4,
              rc={'grid.color': '#ededed'})

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn import linear_model
import statsmodels.api as sm
import scipy
from scipy.stats import gaussian_kde
from matplotlib.colors import to_rgba

In [2]:
# Read the saved first-measurements table (if necessary)
appidv_all = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv")

# Use only live trees: drop every row whose plantStatus mentions "dead".
#   case=False -> statuses that capitalise the word ("Dead ...") are matched too,
#                 not only lower-case "... dead".
#   na=False   -> a missing plantStatus gives False instead of NaN; `~` cannot
#                 invert a mask that contains NaN.
is_dead = appidv_all['plantStatus'].str.contains("dead", case=False, na=False)
appidv_all = appidv_all[~is_dead].reset_index(drop=True)

# Average crown diameter: mean of maxCrownDiameter and ninetyCrownDiameter
appidv_all["avgCrownDiameter"] = (appidv_all["maxCrownDiameter"] + appidv_all["ninetyCrownDiameter"])/2
# Vertical crown diameter (crown depth): total height minus base-of-crown height
appidv_all["verticalDiameter"] = appidv_all["height"] - appidv_all["baseCrownHeight"]

# Crown volume, ellipsoid form (4/3) * pi * a * b * c with a = b = hrad.
hrad = appidv_all["avgCrownDiameter"]/2
vrad = appidv_all["verticalDiameter"]/2  # measured vertical radius; not used by the active formula
# Alternative using the measured vertical radius:
# appidv_all['crown_vol'] =(4/3) * np.pi * (hrad**2) * (vrad)
# Active formula: the vertical radius is taken as one quarter of tree height.
appidv_all['crown_vol'] = (4/3) * np.pi * (hrad**2) * ((1/4)*appidv_all["height"])

  appidv_all = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv")


In [3]:
## Clean data
# Derive helper columns on a fresh frame so `appidv_all` itself is left untouched.
# stemDiameter is divided by 100 to give stemDiameter_m (stem diameter in meters).
appidv_all_filt = appidv_all.assign(
    stemDiameter_m=appidv_all['stemDiameter']/100,
    crownRadius=appidv_all['avgCrownDiameter']/2,
    verticalCrownRadius=appidv_all['verticalDiameter']/2,
)

# Individuals known to carry erroneous records (large DBH, height, etc.)
treeid_list = ['NEON.PLA.D01.BART.05414', 'NEON.PLA.D01.HARV.05718',
               'NEON.PLA.D01.HARV.05764', 'NEON.PLA.D03.JERC.00993',
               'NEON.PLA.D08.TALL.01932', 'NEON.PLA.D05.TREE.00161',
               'NEON.PLA.D17.SOAP.05687', 'NEON.PLA.D12.YELL.01123']

# Growth forms that count as trees for this analysis
growthForm_list = ['single bole tree', 'multi-bole tree', 'small tree']

# One combined mask: not a flagged individual, a tree growth form,
# and a plantStatus of exactly "Live".
keep_rows = (
    ~appidv_all_filt['individualID'].isin(treeid_list)
    & appidv_all_filt['growthForm'].isin(growthForm_list)
    & (appidv_all_filt['plantStatus'] == "Live")
)
appidv_all_clean = appidv_all_filt[keep_rows].copy().reset_index(drop=True)

In [None]:
#########

In [30]:
## Linear regression function
from scipy import stats
import numpy as np

# NOTE: this definition rebinds the name `LinearRegression` that the first cell
# imports from sklearn.linear_model; after this cell runs the name refers to
# the function below.
def LinearRegression(x, y, label=None):
    """
    Ordinary least-squares fit of y = slope * x + intercept via scipy.stats.linregress.

    Pairs in which either value is NaN or infinite are dropped before fitting
    (np.log of a zero measurement yields -inf, which would otherwise turn the
    whole fit into NaN).

    Parameters
    ----------
    x, y : array-like of numbers, same length
        Predictor and response values.
    label : optional
        Identifier printed when the fit cannot be computed. When omitted, the
        notebook-level variable `siteid` is used if it exists, so existing
        calls of the form LinearRegression(x, y) inside the per-site loop keep
        printing the site name.

    Returns
    -------
    (slope, intercept, r_value, p_value, std_err)
        Five floats. All five are np.nan when fewer than two usable pairs
        remain, when every usable x is identical, or when linregress rejects
        the input.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    usable = np.isfinite(x) & np.isfinite(y)
    x = x[usable]
    y = y[usable]

    def _give_up():
        # Report which dataset failed without requiring a global to exist.
        where = label if label is not None else globals().get("siteid", "unknown site")
        print(f"Not enough data at {where}. Length: ", len(x))
        return np.nan, np.nan, np.nan, np.nan, np.nan

    # A line needs at least two points with distinct x values; check this
    # explicitly instead of relying on whichever exception SciPy raises.
    if x.size < 2 or np.all(x == x[0]):
        return _give_up()

    # Simple OLS linear regression using scipy stats
    try:
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    except ValueError:
        # linregress signals unusable input with ValueError; anything else is
        # a genuine bug and is allowed to propagate.
        return _give_up()
    return slope, intercept, r_value, p_value, std_err


In [31]:
# init results df: one frame per allometric relationship, one row per site
results_df_h_d, results_df_rc_d, results_df_rc_h, results_df_vc_h = (
    pd.DataFrame({"siteID": appidv_all_clean['siteID'].unique()}) for _ in range(4)
)

# (results frame, predictor column, response column) for each log-log fit
allometry_specs = [
    (results_df_h_d,  'stemDiameter', 'height'),       # H  ~ D
    (results_df_rc_d, 'stemDiameter', 'crownRadius'),  # Rc ~ D
    (results_df_rc_h, 'height',       'crownRadius'),  # Rc ~ H
    (results_df_vc_h, 'height',       'crown_vol'),    # Vc ~ H
]


def _safe_log(values):
    """Natural log of `values`; non-positive or missing entries become NaN.

    np.log(0) would give -inf, which is not NaN and so would not be removed
    by a NaN filter; mapping such entries to NaN lets them be dropped as
    missing data before the regression.
    """
    arr = np.asarray(values, dtype=float)
    return np.log(np.where(arr > 0, arr, np.nan))


# Loop through each site, get regression, save scaling exponent.
# The loop variable must stay named `siteid`: the regression helper's
# "Not enough data" message reads it.
for siteid, site_df in appidv_all_clean.groupby("siteID"):
    print(siteid, end="\r")
    for results_df, x_col, y_col in allometry_specs:
        x = _safe_log(site_df[x_col].values)
        y = _safe_log(site_df[y_col].values)
        slope, intercept, r_value, p_value, std_err = LinearRegression(x,y)

        # Column order matches the saved CSV layout.
        fit_summary = {
            "slope": np.round(slope,3),
            "intercept": np.round(intercept,3),
            "R2": np.round(r_value**2,3),
            "p_value": np.round(p_value,4),
            "std_err": np.round(std_err,3),
        }
        site_row = results_df['siteID']==siteid
        for column, value in fit_summary.items():
            results_df.loc[site_row, column] = value
    
    

Not enough data at JERC. Length:  0
Not enough data at JERC. Length:  0
Not enough data at JERC. Length:  0
Not enough data at SRER. Length:  0
Not enough data at SRER. Length:  2
Not enough data at SRER. Length:  2
YELL

In [33]:
# Lets save outputs!
# Create the output folder first so to_csv does not fail on a fresh checkout.
site_allom_dir = "../data/output/site-allometry"
os.makedirs(site_allom_dir, exist_ok=True)

# File name -> per-site results frame
site_allom_outputs = {
    "NEON_site_allom_H_D.csv": results_df_h_d,
    "NEON_site_allom_Rc_D.csv": results_df_rc_d,
    "NEON_site_allom_Rc_H.csv": results_df_rc_h,
    "NEON_site_allom_Vc_H.csv": results_df_vc_h,
}
for csv_name, site_results in site_allom_outputs.items():
    site_results.to_csv(f"{site_allom_dir}/{csv_name}", index=False)