In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pyproj import CRS, Transformer
import re
import rioxarray as rioxr
import xarray as xr
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn aesthetics
# NOTE(review): set_theme() below also configures a plotting context, so this
# explicit set_context() call is presumably redundant — confirm before removing.
sns.set_context("notebook")
# White-grid style, enlarged fonts (x1.4) and light-grey grid lines for all figures
sns.set_theme(style="whitegrid", font_scale=1.4,
              rc={'grid.color': '#ededed'})

In [3]:
# Get tree data
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded run emitted a warning on this read,
# presumably the mixed-dtype warning that this option avoids.
appidv_all = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv", low_memory=False)
# Use only live trees: drop every record whose status mentions "dead".
#  - case=False also catches capitalised spellings such as "Dead".
#  - na=False treats a missing status as "not dead", so the mask is purely
#    boolean (a NaN inside the mask makes the `~` negation fail).
is_dead = appidv_all['plantStatus'].str.contains("dead", case=False, na=False)
appidv_all = appidv_all[~is_dead].reset_index(drop=True)

# Get average crown diameter (mean of the max and the ninety-degree diameters)
appidv_all["avgCrownDiameter"] = (appidv_all["maxCrownDiameter"] + appidv_all["ninetyCrownDiameter"])/2
# Get vertical crown diameter (tree height minus height of the crown base)
appidv_all["verticalDiameter"] = appidv_all["height"] - appidv_all["baseCrownHeight"]
# Calculate crown volume as an ellipsoid: (4/3) * pi * hrad^2 * vertical radius
hrad = appidv_all["avgCrownDiameter"]/2
vrad = appidv_all["verticalDiameter"]/2
# Alternative kept for reference: use the measured vertical radius instead
# appidv_all['crown_vol'] =(4/3) * np.pi * (hrad**2) * (vrad)
# Active formula: the vertical radius is taken as one quarter of tree height
appidv_all['crown_vol'] =(4/3) * np.pi * (hrad**2) * ((1/4)*appidv_all["height"])

  appidv_all = pd.read_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS.csv")


In [4]:
# Filter out small veg
# appidv_all_filt = appidv_all.loc[appidv_all['stemDiameter']>5].copy()
# appidv_all_filt = appidv_all_filt.loc[appidv_all_filt['height']>2].copy()
# appidv_all_copy_filt = appidv_all_copy_filt.loc[appidv_all_copy_filt['maxCrownDiameter']>1].copy()

In [5]:
## Clean data
# stemDiameter is divided by 100 to express it in meters
appidv_all['stemDiameter_m'] = appidv_all['stemDiameter'] / 100
# Crown radii are half of the matching crown diameters
appidv_all['crownRadius'] = appidv_all['avgCrownDiameter'] / 2
appidv_all['verticalCrownRadius'] = appidv_all['verticalDiameter'] / 2

# Individuals whose records contain errors (large DBH, height, etc.)
treeid_list = [
    'NEON.PLA.D01.BART.05414', 'NEON.PLA.D01.HARV.05718',
    'NEON.PLA.D01.HARV.05764', 'NEON.PLA.D03.JERC.00993',
    'NEON.PLA.D08.TALL.01932', 'NEON.PLA.D05.TREE.00161',
    'NEON.PLA.D17.SOAP.05687', 'NEON.PLA.D12.YELL.01123',
]
# Growth forms that count as trees
growthForm_list = ['single bole tree', 'multi-bole tree', 'small tree']

# Keep a row only if it passes all three filters: not a flagged individual,
# a tree growth form, and a plant status of exactly "Live"
keep_rows = (
    ~appidv_all['individualID'].isin(treeid_list)
    & appidv_all['growthForm'].isin(growthForm_list)
    & (appidv_all['plantStatus'] == "Live")
)
appidv_all_clean = appidv_all.loc[keep_rows].reset_index(drop=True)

In [6]:
# Display (rows, columns) of the cleaned tree table as a quick sanity check
appidv_all_clean.shape

(104396, 51)

In [7]:
## Let's get lat/lon for each tree

# Read in the site metadata, which holds the UTM zone of every site
neon_meta = pd.read_csv("../data/NEON_Field_Site_Metadata_20230309.csv")
# Keep only the digits of the zone label (e.g. "UTM18N" -> "18").
# The raw-string pattern avoids the invalid "\d" escape, and str.extract yields
# NaN instead of raising IndexError when a label is missing or has no digits.
neon_meta['utm_num'] = neon_meta['field_utm_zone'].astype(str).str.extract(r'(\d+)', expand=False)

# Look the zone up per SITE. Matching on the domain and taking the first
# metadata row of that domain gives every site in the domain the same zone,
# but sites of one domain need not share a UTM zone.
utm_zone_by_site = dict(zip(neon_meta['field_site_id'], neon_meta['utm_num']))

# The target CRS (WGS84 lon/lat) is identical for every site
to_crs = CRS.from_epsg(4326)

# Loop through each site and reproject from its UTM zone to lon/lat
df_list = []
for site_id, group in appidv_all_clean.groupby("siteID"):

    utm_num = utm_zone_by_site.get(site_id)
    if pd.isna(utm_num):
        raise KeyError(f"No UTM zone found in the NEON metadata for site {site_id!r}")

    # NOTE(review): "+north" is hard-coded, i.e. every site is assumed to lie
    # in the northern hemisphere — confirm if southern sites are ever added.
    from_crs = CRS.from_proj4(f"+proj=utm +zone={utm_num} +north +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
    # always_xy=True fixes the axis order to (easting, northing) -> (lon, lat)
    proj = Transformer.from_crs(from_crs, to_crs, always_xy=True)
    lon, lat = proj.transform(group['easting'].values, group['northing'].values)

    # assign() returns a new frame, so the groupby slice is never written to
    df_list.append(group.assign(lon=lon, lat=lat))

# Combine and replace inf (returned for points that cannot be projected) with nan
appidv_all_latlon = pd.concat(df_list)
appidv_all_latlon.loc[~np.isfinite(appidv_all_latlon["lon"].values), "lon"] = np.nan
appidv_all_latlon.loc[~np.isfinite(appidv_all_latlon["lat"].values), "lat"] = np.nan

In [8]:
# Compare the table before and after adding lon/lat, then list the columns
for frame in (appidv_all_clean, appidv_all_latlon):
    print(frame.shape)
print(appidv_all_latlon.columns)

(104396, 51)
(104396, 53)
Index(['uid', 'namedLocation', 'date', 'eventID', 'domainID', 'siteID',
       'plotID', 'individualID', 'tempStemID', 'tagStatus', 'growthForm',
       'plantStatus', 'stemDiameter', 'measurementHeight',
       'changedMeasurementLocation', 'height', 'baseCrownHeight',
       'breakHeight', 'breakDiameter', 'maxCrownDiameter',
       'ninetyCrownDiameter', 'canopyPosition', 'shape', 'basalStemDiameter',
       'basalStemDiameterMsrmntHeight', 'maxBaseCrownDiameter',
       'ninetyBaseCrownDiameter', 'dendrometerInstallationDate',
       'initialGapMeasurementDate', 'initialBandStemDiameter',
       'initialDendrometerGap', 'dendrometerHeight', 'dendrometerGap',
       'dendrometerCondition', 'bandStemDiameter', 'remarks', 'recordedBy',
       'measuredBy', 'dataEntryRecordID', 'dataQF', 'subplotID', 'taxonID',
       'scientificName', 'easting', 'northing', 'avgCrownDiameter',
       'verticalDiameter', 'crown_vol', 'stemDiameter_m', 'crownRadius',
       've

In [8]:
####################
## SAMPLE RASTERS

In [9]:
## Worldclim sampling
# Sample every raster found in each site's folder at that site's tree locations
raster_input_dir = "/data/shared/src/arojas/NEON/data/raster/worldclim"
site_id_list = appidv_all_latlon['siteID'].unique()

# Rows whose lon/lat are NaN cannot be matched to a raster cell, so they are
# left out of the lookup and keep NaN in the sampled columns.
has_coords = appidv_all_latlon['lon'].notna() & appidv_all_latlon['lat'].notna()

for site_id in site_id_list:
    # Get indices for veg point locations of this site
    df_filt = appidv_all_latlon.loc[(appidv_all_latlon['siteID'] == site_id) & has_coords]
    if df_filt.empty:
        continue
    df_indices = df_filt.index

    # Sharing the "point" dimension makes .sel() pair x[i] with y[i]
    # (one value per tree) instead of building an x-by-y grid
    x_indexer = xr.DataArray(df_filt["lon"].values, dims=["point"])
    y_indexer = xr.DataArray(df_filt["lat"].values, dims=["point"])

    # Get input rasters; sorted so the new columns appear in a stable order
    site_indir = os.path.join(raster_input_dir, site_id)
    raster_fpaths = sorted(glob.glob(os.path.join(site_indir, "*.tif")))
    for raster_fp in raster_fpaths:
        # Column name = file name without its first 15 characters and the extension
        var_name = os.path.basename(raster_fp)[15:-4]
        # Open inside `with` so the dataset is closed again; the values are
        # pulled into memory before the block exits
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            # [0] picks the first band of the (band, point) result
            sampled_raster_vals = rarr.sel(x=x_indexer, y=y_indexer, method="nearest")[0].values
        appidv_all_latlon.loc[df_indices, var_name] = sampled_raster_vals
        
        
    

In [10]:
# List the columns to check which variables the sampling loop above added
appidv_all_latlon.columns

Index(['uid', 'namedLocation', 'date', 'eventID', 'domainID', 'siteID',
       'plotID', 'individualID', 'tempStemID', 'tagStatus', 'growthForm',
       'plantStatus', 'stemDiameter', 'measurementHeight',
       'changedMeasurementLocation', 'height', 'baseCrownHeight',
       'breakHeight', 'breakDiameter', 'maxCrownDiameter',
       'ninetyCrownDiameter', 'canopyPosition', 'shape', 'basalStemDiameter',
       'basalStemDiameterMsrmntHeight', 'maxBaseCrownDiameter',
       'ninetyBaseCrownDiameter', 'dendrometerInstallationDate',
       'initialGapMeasurementDate', 'initialBandStemDiameter',
       'initialDendrometerGap', 'dendrometerHeight', 'dendrometerGap',
       'dendrometerCondition', 'bandStemDiameter', 'remarks', 'recordedBy',
       'measuredBy', 'dataEntryRecordID', 'dataQF', 'subplotID', 'taxonID',
       'scientificName', 'easting', 'northing', 'avgCrownDiameter',
       'verticalDiameter', 'crown_vol', 'stemDiameter_m', 'crownRadius',
       'verticalCrownRadius', 'lon',

In [12]:
## PET and AI
# Sample both rasters in one pass. With a single active directory, the other
# variable was only added if the path was edited by hand and the cell re-run.
# AI stays last so `raster_input_dir` ends with the value it had before.
raster_input_dirs = [
    "/data/shared/src/arojas/NEON/data/raster/PET",
    "/data/shared/src/arojas/NEON/data/raster/AI",
]
site_id_list = appidv_all_latlon['siteID'].unique()

# Rows whose lon/lat are NaN cannot be matched to a raster cell, so they are
# left out of the lookup and keep NaN in the sampled columns.
has_coords = appidv_all_latlon['lon'].notna() & appidv_all_latlon['lat'].notna()

for raster_input_dir in raster_input_dirs:
    for site_id in site_id_list:

        # Get indices for veg point locations of this site
        df_filt = appidv_all_latlon.loc[(appidv_all_latlon['siteID'] == site_id) & has_coords]
        if df_filt.empty:
            continue
        df_indices = df_filt.index

        # Sharing the "point" dimension makes .sel() pair x[i] with y[i]
        x_indexer = xr.DataArray(df_filt["lon"].values, dims=["point"])
        y_indexer = xr.DataArray(df_filt["lat"].values, dims=["point"])

        # Get the input raster; a site without one is skipped, not an IndexError
        matches = sorted(glob.glob(os.path.join(raster_input_dir, f"{site_id}*")))
        if not matches:
            print(f"No raster found for site {site_id} in {raster_input_dir}; skipping")
            continue
        raster_fp = matches[0]
        # Column name = file name without its first 5 characters and the extension
        var_name = os.path.basename(raster_fp)[5:-4]

        # Open inside `with` so the dataset is closed again; the values are
        # pulled into memory before the block exits
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            # [0] picks the first band of the (band, point) result
            sampled_raster_vals = rarr.sel(x=x_indexer, y=y_indexer, method="nearest")[0].values
        appidv_all_latlon.loc[df_indices, var_name] = sampled_raster_vals
        
        
    

In [13]:
# List the columns to check which variables the PET/AI sampling added
appidv_all_latlon.columns

Index(['uid', 'namedLocation', 'date', 'eventID', 'domainID', 'siteID',
       'plotID', 'individualID', 'tempStemID', 'tagStatus', 'growthForm',
       'plantStatus', 'stemDiameter', 'measurementHeight',
       'changedMeasurementLocation', 'height', 'baseCrownHeight',
       'breakHeight', 'breakDiameter', 'maxCrownDiameter',
       'ninetyCrownDiameter', 'canopyPosition', 'shape', 'basalStemDiameter',
       'basalStemDiameterMsrmntHeight', 'maxBaseCrownDiameter',
       'ninetyBaseCrownDiameter', 'dendrometerInstallationDate',
       'initialGapMeasurementDate', 'initialBandStemDiameter',
       'initialDendrometerGap', 'dendrometerHeight', 'dendrometerGap',
       'dendrometerCondition', 'bandStemDiameter', 'remarks', 'recordedBy',
       'measuredBy', 'dataEntryRecordID', 'dataQF', 'subplotID', 'taxonID',
       'scientificName', 'easting', 'northing', 'avgCrownDiameter',
       'verticalDiameter', 'crown_vol', 'stemDiameter_m', 'crownRadius',
       'verticalCrownRadius', 'lon',

In [14]:
#####################
#####################

In [16]:
## Chave environmental vars (CWD and E)
# Sample both rasters in one pass. With a single active directory, the other
# variable was only added if the path was edited by hand and the cell re-run.
# CWD stays last so `raster_input_dir` ends with the value it had before.
raster_input_dirs = [
    "/data/shared/src/arojas/NEON/data/raster/chave/E",
    "/data/shared/src/arojas/NEON/data/raster/chave/CWD",
]
site_id_list = appidv_all_latlon['siteID'].unique()

# Rows whose lon/lat are NaN cannot be matched to a raster cell, so they are
# left out of the lookup and keep NaN in the sampled columns.
has_coords = appidv_all_latlon['lon'].notna() & appidv_all_latlon['lat'].notna()

for raster_input_dir in raster_input_dirs:
    for site_id in site_id_list:

        # Get indices for veg point locations of this site
        df_filt = appidv_all_latlon.loc[(appidv_all_latlon['siteID'] == site_id) & has_coords]
        if df_filt.empty:
            continue
        df_indices = df_filt.index

        # Sharing the "point" dimension makes .sel() pair x[i] with y[i]
        x_indexer = xr.DataArray(df_filt["lon"].values, dims=["point"])
        y_indexer = xr.DataArray(df_filt["lat"].values, dims=["point"])

        # Get the input raster; a site without one is skipped, not an IndexError
        matches = sorted(glob.glob(os.path.join(raster_input_dir, f"{site_id}*")))
        if not matches:
            print(f"No raster found for site {site_id} in {raster_input_dir}; skipping")
            continue
        raster_fp = matches[0]
        # Column name = file name without its first 5 characters and the extension
        var_name = os.path.basename(raster_fp)[5:-4]

        # Open inside `with` so the dataset is closed again; the values are
        # pulled into memory before the block exits
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            # [0] picks the first band of the (band, point) result
            sampled_raster_vals = rarr.sel(x=x_indexer, y=y_indexer, method="nearest")[0].values
        appidv_all_latlon.loc[df_indices, var_name] = sampled_raster_vals

In [17]:
# List the columns to check which variables the Chave sampling added
appidv_all_latlon.columns

Index(['uid', 'namedLocation', 'date', 'eventID', 'domainID', 'siteID',
       'plotID', 'individualID', 'tempStemID', 'tagStatus', 'growthForm',
       'plantStatus', 'stemDiameter', 'measurementHeight',
       'changedMeasurementLocation', 'height', 'baseCrownHeight',
       'breakHeight', 'breakDiameter', 'maxCrownDiameter',
       'ninetyCrownDiameter', 'canopyPosition', 'shape', 'basalStemDiameter',
       'basalStemDiameterMsrmntHeight', 'maxBaseCrownDiameter',
       'ninetyBaseCrownDiameter', 'dendrometerInstallationDate',
       'initialGapMeasurementDate', 'initialBandStemDiameter',
       'initialDendrometerGap', 'dendrometerHeight', 'dendrometerGap',
       'dendrometerCondition', 'bandStemDiameter', 'remarks', 'recordedBy',
       'measuredBy', 'dataEntryRecordID', 'dataQF', 'subplotID', 'taxonID',
       'scientificName', 'easting', 'northing', 'avgCrownDiameter',
       'verticalDiameter', 'crown_vol', 'stemDiameter_m', 'crownRadius',
       'verticalCrownRadius', 'lon',

In [18]:
#####################
#####################

In [19]:
## Soilgrids sampling
# Rasters live in <raster_input_dir>/<soil variable>/<site>/*.tif
raster_input_dir = "/data/shared/src/arojas/NEON/data/raster/SOILGRIDS"
raster_vars = ["cec",  "clay",  "phh2o",  "sand",  "silt"]
site_id_list = appidv_all_latlon['siteID'].unique()

# Rows whose lon/lat are NaN cannot be matched to a raster cell, so they are
# left out of the lookup and keep NaN in the sampled columns.
has_coords = appidv_all_latlon['lon'].notna() & appidv_all_latlon['lat'].notna()

for raster_var in raster_vars:

    rastervar_indir = os.path.join(raster_input_dir, raster_var)

    for site_id in site_id_list:
        # Get indices for veg point locations of this site
        df_filt = appidv_all_latlon.loc[(appidv_all_latlon['siteID'] == site_id) & has_coords]
        if df_filt.empty:
            continue
        df_indices = df_filt.index

        # Sharing the "point" dimension makes .sel() pair x[i] with y[i]
        x_indexer = xr.DataArray(df_filt["lon"].values, dims=["point"])
        y_indexer = xr.DataArray(df_filt["lat"].values, dims=["point"])

        # Get input rasters; sorted so the new columns appear in a stable order
        site_indir = os.path.join(rastervar_indir, site_id)
        raster_fpaths = sorted(glob.glob(os.path.join(site_indir, "*.tif")))
        for raster_fp in raster_fpaths:
            # Column name = file name without the extension
            var_name = os.path.basename(raster_fp)[:-4]
            # Open inside `with` so the dataset is closed again; the values
            # are pulled into memory before the block exits
            with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
                # [0] picks the first band of the (band, point) result
                sampled_raster_vals = rarr.sel(x=x_indexer, y=y_indexer, method="nearest")[0].values
            appidv_all_latlon.loc[df_indices, var_name] = sampled_raster_vals




In [20]:
# List the columns to check which variables the Soilgrids sampling added
appidv_all_latlon.columns

Index(['uid', 'namedLocation', 'date', 'eventID', 'domainID', 'siteID',
       'plotID', 'individualID', 'tempStemID', 'tagStatus', 'growthForm',
       'plantStatus', 'stemDiameter', 'measurementHeight',
       'changedMeasurementLocation', 'height', 'baseCrownHeight',
       'breakHeight', 'breakDiameter', 'maxCrownDiameter',
       'ninetyCrownDiameter', 'canopyPosition', 'shape', 'basalStemDiameter',
       'basalStemDiameterMsrmntHeight', 'maxBaseCrownDiameter',
       'ninetyBaseCrownDiameter', 'dendrometerInstallationDate',
       'initialGapMeasurementDate', 'initialBandStemDiameter',
       'initialDendrometerGap', 'dendrometerHeight', 'dendrometerGap',
       'dendrometerCondition', 'bandStemDiameter', 'remarks', 'recordedBy',
       'measuredBy', 'dataEntryRecordID', 'dataQF', 'subplotID', 'taxonID',
       'scientificName', 'easting', 'northing', 'avgCrownDiameter',
       'verticalDiameter', 'crown_vol', 'stemDiameter_m', 'crownRadius',
       'verticalCrownRadius', 'lon',

In [24]:
########
## SAVE DF with sampled rasters

In [22]:
# Write the tree table, including the sampled raster columns, to CSV.
# index=False leaves the DataFrame index out of the file.
appidv_all_latlon.to_csv("../data/output/NEON_VEGSTRUCTURE_FIRSTMSMTS_ENVVARS.csv",index=False)

In [None]:
########################
########################
# Get averages for each site
########################
########################

In [2]:
# Get averages for each site, then append to NEON metadata file as new vars!
# The metadata is read again here, so the site-level columns are added to a
# freshly loaded table. Note that this read uses an absolute path.
neon_meta = pd.read_csv("/data/shared/src/arojas/NEON/data/NEON_Field_Site_Metadata_20230309.csv")

In [3]:
## Worldclim site averages
# For every site, average each raster in the site's folder and store the value
# in the metadata table under "wc_<variable>"
raster_input_dir = "/data/shared/src/arojas/NEON/data/raster/worldclim"
site_id_list = neon_meta['field_site_id'].unique()

for site_id in site_id_list:
    # Get index for NEON metadata!
    neonmeta_index = neon_meta.loc[neon_meta['field_site_id']==site_id].index
    # Get input rasters; sorted so the new columns appear in a stable order.
    # A site without a folder yields an empty list and is skipped by the loop.
    site_indir = os.path.join(raster_input_dir, site_id)
    raster_fpaths = sorted(glob.glob(os.path.join(site_indir, "*.tif")))
    for raster_fp in raster_fpaths:
        # Column name = file name without its first 15 characters and the extension
        var_name = os.path.basename(raster_fp)[15:-4]
        # Open inside `with` so the dataset is closed once its values are read
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            raster_vals = rarr.values
        # np.nanmean warns on input without any valid value; store NaN directly then
        site_mean = np.nanmean(raster_vals) if np.any(~np.isnan(raster_vals)) else np.nan
        neon_meta.loc[neonmeta_index, f"wc_{var_name}"] = site_mean

        
        
    

In [5]:
## PET and AI site averages
# Process both directories in one pass: the rounding step further down expects
# an 'ai_yr' and a 'pet_he_yr' column, but with a single active directory a
# top-to-bottom run only produces one of them.
# AI stays last so `raster_input_dir` ends with the value it had before.
raster_input_dirs = [
    "/data/shared/src/arojas/NEON/data/raster/PET",
    "/data/shared/src/arojas/NEON/data/raster/AI",
]
site_id_list = neon_meta['field_site_id'].unique()

for raster_input_dir in raster_input_dirs:
    for site_id in site_id_list:

        # Get index for NEON metadata!
        neonmeta_index = neon_meta.loc[neon_meta['field_site_id']==site_id].index
        # Get the input raster; a site without one is skipped, not an IndexError
        matches = sorted(glob.glob(os.path.join(raster_input_dir, f"{site_id}*")))
        if not matches:
            continue
        raster_fp = matches[0]
        # Column name = file name without its first 5 characters and the extension
        var_name = os.path.basename(raster_fp)[5:-4]
        # Open inside `with` so the dataset is closed once its values are read
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            raster_vals = rarr.values
        # np.nanmean warns on input without any valid value; store NaN directly then
        site_mean = np.nanmean(raster_vals) if np.any(~np.isnan(raster_vals)) else np.nan
        neon_meta.loc[neonmeta_index, var_name] = site_mean
        
        
    

In [7]:
## Chave environmental vars (CWD and E) site averages
# Process both directories in one pass: the rounding step further down expects
# a 'CWD' and an 'E' column, but with a single active directory a
# top-to-bottom run only produces one of them.
# CWD stays last so `raster_input_dir` ends with the value it had before.
raster_input_dirs = [
    "/data/shared/src/arojas/NEON/data/raster/chave/E",
    "/data/shared/src/arojas/NEON/data/raster/chave/CWD",
]
site_id_list = neon_meta['field_site_id'].unique()

for raster_input_dir in raster_input_dirs:
    for site_id in site_id_list:

        # Get index for NEON metadata!
        neonmeta_index = neon_meta.loc[neon_meta['field_site_id']==site_id].index
        # Get the input raster; a site without one is skipped, not an IndexError
        matches = sorted(glob.glob(os.path.join(raster_input_dir, f"{site_id}*")))
        if not matches:
            continue
        raster_fp = matches[0]
        # Column name = file name without its first 5 characters and the extension
        var_name = os.path.basename(raster_fp)[5:-4]
        # Open inside `with` so the dataset is closed once its values are read
        with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
            raster_vals = rarr.values
        # np.nanmean warns on input without any valid value (the recorded run
        # shows a warning on that call); store NaN directly in that case
        site_mean = np.nanmean(raster_vals) if np.any(~np.isnan(raster_vals)) else np.nan
        neon_meta.loc[neonmeta_index, var_name] = site_mean

  neon_meta.loc[neonmeta_index,var_name] = np.nanmean(rarr)


In [8]:
## Soilgrids site averages
# For every soil variable and site: average the site's rasters per pixel, then
# average those pixel values into one number stored as "sg_<variable>_avg"
raster_input_dir = "/data/shared/src/arojas/NEON/data/raster/SOILGRIDS"
raster_vars = ["cec",  "clay",  "phh2o",  "sand",  "silt"]
site_id_list = neon_meta['field_site_id'].unique()

for raster_var in raster_vars:

    # get input dir of soil var
    rastervar_indir = os.path.join(raster_input_dir, raster_var)

    for site_id in site_id_list:

        # Get index for NEON metadata!
        neonmeta_index = neon_meta.loc[neon_meta['field_site_id']==site_id].index

        # Get input rasters from site
        # (presumably one file per soil layer — TODO confirm)
        site_indir = os.path.join(rastervar_indir, site_id)
        raster_fpaths = sorted(glob.glob(os.path.join(site_indir, "*.tif")))
        if len(raster_fpaths)==0:
            continue

        arr_list = []
        for raster_fp in raster_fpaths:
            # Open inside `with` so the dataset is closed once its values are read
            with rioxr.open_rasterio(raster_fp, mask_and_scale=True) as rarr:
                # Keep the first band as a 2-D (y, x) array. np.dstack only adds
                # a new third axis for 2-D inputs; the full (band, y, x) arrays
                # would be concatenated along x, and axis 2 would then mix
                # pixels of a row instead of indexing the files.
                arr_list.append(rarr.values[0])

        arr_stack = np.dstack(arr_list)  # shape (y, x, number of files)

        # Per-pixel mean over the files, ignoring NaN, computed from sums and
        # counts so that pixels without any valid value raise no warning
        valid = ~np.isnan(arr_stack)
        n_valid = valid.sum(axis=2)
        layer_sum = np.where(valid, arr_stack, 0).sum(axis=2)
        pixel_mean = np.full(n_valid.shape, np.nan)
        np.divide(layer_sum, n_valid, out=pixel_mean, where=n_valid > 0)

        # Site value = mean over the pixels that have data (NaN if none do)
        has_data = n_valid > 0
        site_mean = pixel_mean[has_data].mean() if has_data.any() else np.nan
        neon_meta.loc[neonmeta_index, f"sg_{raster_var}_avg"] = site_mean


In [9]:
# List the metadata columns to check which site-level variables were added
neon_meta.columns

Index(['field_domain_id', 'field_site_id', 'field_site_name',
       'field_site_type', 'field_site_subtype', 'field_colocated_site',
       'field_site_host', 'field_site_url', 'field_nonneon_research_allowed',
       'field_access_details', 'field_neon_field_operations_office',
       'field_latitude', 'field_longitude', 'field_geodetic_datum',
       'field_utm_northing', 'field_utm_easting', 'field_utm_zone',
       'field_site_county', 'field_site_state', 'field_site_country',
       'field_mean_elevation_m', 'field_minimum_elevation_m',
       'field_maximum_elevation_m', 'field_mean_annual_temperature_C',
       'field_mean_annual_precipitation_mm', 'field_dominant_wind_direction',
       'field_mean_canopy_height_m', 'field_dominant_nlcd_classes',
       'field_domint_plant_species', 'field_usgs_huc', 'field_watershed_name',
       'field_watershed_size_km2', 'field_lake_depth_mean_m',
       'field_lake_depth_max_m', 'field_tower_height_m',
       'field_usgs_geology_unit', 'f

In [10]:
# Round the derived site-level columns to three decimals, grouped by source
worldclim_cols = ['wc_srad_avg_yr', 'wc_vapr_avg_yr', 'wc_wind_avg_yr',
                  'wc_bio_1', 'wc_bio_12', 'wc_bio_15']
aridity_cols = ['ai_yr', 'pet_he_yr']
chave_cols = ['CWD', 'E']
soilgrids_cols = ['sg_cec_avg', 'sg_clay_avg', 'sg_phh2o_avg',
                  'sg_sand_avg', 'sg_silt_avg']
cols = worldclim_cols + aridity_cols + chave_cols + soilgrids_cols
neon_meta[cols] = neon_meta[cols].round(decimals=3)

In [11]:
# Write the metadata table with the added site-level columns to a new CSV.
# index=False leaves the DataFrame index out of the file.
neon_meta.to_csv("/data/shared/src/arojas/NEON/data/NEON_Field_Site_Metadata_20230309_Env_Vars.csv",index=False)