In [1]:
import math
import shutil
import pandas as pd
import glob
import os
import netCDF4 as nc4
import sys
sys.path.append('/glade/u/home/adamhb/Earth-System-Model-Tools')
import esm_tools
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Raise pandas' display limits so frames of up to 1000 rows / 1000 columns
# are rendered in full instead of being truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# Arrays with more than 1000 elements are summarized when printed.
np.set_printoptions(threshold=1000)

## Cases since bug fix

Large ensembles 
* afterBugFix_1280_041923 (to 150 years)
* afterOakFix_2560_042324 (to 200 years)

Equilibrium ensembles
* equilibrium_700yrs_042524 (actually ran for 450 years)

In [6]:
# Settings for every ensemble that has been processed with this notebook,
# keyed by a short label. Each entry provides the arguments that are passed
# to aggregate_metrics:
#   metrics_path           - directory holding the processed metrics csv files
#   param_ranges_path      - csv listing the (param, pft) pairs to extract
#   param_subdir_base_name - base name of the per-case parameter-file subdirs
#   eco_criteria           - "200yrs" or "eq" (selects the large-tree benchmark)
#   inst_per_case          - number of ensemble members (instances) per case
ENSEMBLE_CONFIGS = {
    'afterBugFix_1280_041923': {
        'metrics_path': '/glade/work/adamhb/processed_output/CZ2_equilibrium_041924_XX_-17e2acb6a_FATES-1449c787',
        'param_ranges_path': '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_040524.csv',
        'param_subdir_base_name': 'afterBugFix_1280_041923',
        'eco_criteria': "200yrs",
        'inst_per_case': 128,
    },
    'afterOakFix': {
        'metrics_path': '/glade/work/adamhb/processed_output/CZ2_equilibrium_042324_XX_-17e2acb6a_FATES-1449c787',
        'param_ranges_path': '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_042324.csv',
        'param_subdir_base_name': 'afterOakFix_2560_042323',
        'eco_criteria': "200yrs",
        'inst_per_case': 128,
    },
    'equilibrium_700yrs_042524': {
        'metrics_path': '/glade/work/adamhb/processed_output/CZ2_equilibrium_700yrs_042524_01_-17e2acb6a_FATES-1449c787',
        'param_ranges_path': '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_042324.csv',
        'param_subdir_base_name': 'equilibrium_700yrs_042524',
        'eco_criteria': "eq",
        'inst_per_case': 99,
    },
    'ml_supported_large_ensemble': {
        'metrics_path': '/glade/work/adamhb/processed_output/ml_supported_ensemble_050224_XX_-17e2acb6a_FATES-1449c787',
        'param_ranges_path': '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_050124.csv',
        'param_subdir_base_name': 'ml_supported_ensemble_2560_050224',
        'eco_criteria': "200yrs",
        'inst_per_case': 128,
    },
    'equilibrium700_050924': {
        'metrics_path': '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01',
        'param_ranges_path': '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_050124.csv',
        'param_subdir_base_name': 'equilibrium_700yrs_050924',
        'eco_criteria': "eq",
        'inst_per_case': 82,
    },
}

# Change only this label to process a different ensemble.
ACTIVE_ENSEMBLE = 'equilibrium700_050924'

# Unpack the selected configuration into the names used by the cells below.
_active_config = ENSEMBLE_CONFIGS[ACTIVE_ENSEMBLE]
metrics_path = _active_config['metrics_path']
param_ranges_path = _active_config['param_ranges_path']
param_subdir_base_name = _active_config['param_subdir_base_name']
eco_criteria = _active_config['eco_criteria']
inst_per_case = _active_config['inst_per_case']

## Constants

In [3]:
# Root directory that contains one parameter-file subdirectory per case
# (named '<param_subdir_base_name>_<case_number>'); passed to aggregate_metrics.
param_subdir_root = '/glade/u/home/adamhb/ahb_params/fates_api_25/ensembles'

In [5]:
# Names of the basic per-member columns ('inst_tag' plus the metric names).
# NOTE(review): presumably these match the columns written by
# get_case_metrics.py - confirm before using this list to select columns.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate two adjacent string literals into one bogus name.
basic_metrics = [
    'inst_tag',
    'AGCD',
    'BA_conifer',
    'BA_trees',
    'BA_pine',
    'BA_cedar',
    'BA_fir',
    'BA_shrub',
    'BA_oak',
    'TreeStemD_40',
    'ShannonE',
    'Pct_shrub_cover_canopy',
    'Pct_shrub_cover',
    'NPP',
    'Burned_area',
    'AWFI',
    'Pct_high_severity_1025',
    'Pct_high_severity_1700',
    'Pct_high_severity_3500',
]

In [38]:
def extract_case_number(filename):
    """
    Pull the two-digit case number out of a metrics file name.

    The case number is the pair of digits that sits directly between an
    underscore and ".csv" (e.g. "..._07.csv" gives "07").

    Parameters:
    - filename: The file name (or full path) to inspect.

    Returns:
    - The two digits as a string, or None when the name contains no such pair.
    """
    found = re.search(r'_(\d{2})\.csv', filename)
    return found.group(1) if found else None

def extract_variable_from_netcdf(file_path, variable_name,pf):
    """
    Extract a single value of a variable from a NetCDF file.

    Parameters:
    - file_path: The path to the NetCDF file.
    - variable_name: The name of the variable to extract.
    - pf: Number used to pick the element of the variable. It is converted to
      a zero-based index as max(0, pf - 1), so both 0 and 1 select the first
      element. (Presumably a 1-based PFT number, with 0 for parameters that
      are not PFT-specific - confirm against the parameter ranges csv.)

    Returns:
    - For a scalar variable: its value.
    - For a 1-D variable: the element at the index described above.
    - For a 2-D variable: the element at [0, index].

    Raises:
    - ValueError: if the variable is not in the file, or if it has more than
      two dimensions (previously this case silently returned None).
    """
    index = max(0,pf-1)

    with nc4.Dataset(file_path, 'r') as dataset:
        if variable_name not in dataset.variables:
            raise ValueError(f"'{variable_name}' not found in the NetCDF file.")

        variable = dataset.variables[variable_name]
        n_dims = len(variable.shape)
        if n_dims > 2:
            # Fail loudly rather than handing None back to the caller.
            raise ValueError(
                f"'{variable_name}' has {n_dims} dimensions; only variables "
                "with 0, 1 or 2 dimensions are supported."
            )

        # Read the values while the dataset is still open.
        values = variable[:].data
        if n_dims == 0:
            return values.item()
        if n_dims == 1:
            return values[index]
        return values[0,index]

def get_param_file_path(param_subdir_base_name,
                        case_number,inst_tag,
                        param_sudir_root = '/glade/u/home/adamhb/ahb_params/fates_api_25/ensembles'):
    """
    Build the path of the parameter file of one ensemble member.

    Parameters:
    - param_subdir_base_name: Base name of the per-case parameter subdirectories.
    - case_number: Case number appended (after an underscore) to the base name.
    - inst_tag: Instance tag appended to the parameter file name.
    - param_sudir_root: Root directory that holds the per-case subdirectories.

    Returns:
    - '<root>/<base name>_<case number>/ca_5pfts_100523_<inst tag>.nc'
    """
    case_dir = f'{param_sudir_root}/{param_subdir_base_name}_{case_number}'
    file_name = f'ca_5pfts_100523_{inst_tag}.nc'
    return case_dir + '/' + file_name


# Column holding the large-tree stem density used for each eco_criteria setting.
_LARGE_TREE_COLUMN = {"200yrs": 'TreeStemD_40', "eq": 'TreeStemD_80'}


def _read_metrics_files(metrics_path, one_case_multi_tag, case_number):
    """Read every metrics csv in metrics_path; return (metrics_df, case_numbers)."""
    if one_case_multi_tag:
        if case_number is None:
            raise ValueError("case_number must be given when one_case_multi_tag is True.")
        pattern = '*metrics_[0-9][0-9][0-9][0-9]*'
    else:
        pattern = '*metrics_[0-9][0-9]*'

    files = sorted(glob.glob(os.path.join(metrics_path,pattern)))
    print(files)
    if not files:
        raise FileNotFoundError(f"No files matching '{pattern}' found in '{metrics_path}'.")

    frames = []
    case_numbers = []
    for file in files:
        tmp = pd.read_csv(file, dtype={'inst_tag':str})
        # The first column is discarded (presumably the index column written
        # by DataFrame.to_csv - confirm against get_case_metrics.py).
        tmp = tmp.drop(tmp.columns[0],axis=1)

        if not one_case_multi_tag:
            case_number = extract_case_number(file)
            if case_number is None:
                raise ValueError(f"Could not parse a two-digit case number from '{file}'.")

        case_numbers.append(case_number)
        tmp['case_number'] = case_number
        frames.append(tmp)

    # Concatenate once instead of growing the frame inside the loop.
    metrics_df = pd.concat(frames,axis = 0)
    print(case_numbers)
    print(len(metrics_df), "ensemble members analyzed.")
    metrics_df['case_num_inst_id'] =  metrics_df['case_number'] + "_" + metrics_df['inst_tag']
    return metrics_df, case_numbers


def _read_param_values(param_ranges_path, param_subdir_root, param_subdir_base_name,
                       unique_case_numbers, inst_per_case):
    """Return a frame with one row of parameter values per ensemble member, keyed by 'case_num_inst_id'."""
    param_ranges = pd.read_csv(param_ranges_path)
    param_ranges = param_ranges.loc[param_ranges['param'] != 'fates_leaf_slamax']
    param_ranges = param_ranges.drop_duplicates(subset = ['param','pft'])
    params = np.array(param_ranges['param'])
    pfts = np.array(param_ranges['pft'])
    # One column per (param, pft) pair, named '<param>_<pft>'.
    param_index_names = [param + "_" + str(pf) for param,pf in zip(params,pfts)]

    param_subdirs = [f'{param_subdir_root}/{param_subdir_base_name}_{case_number}' for case_number in unique_case_numbers]
    inst_tags = esm_tools.inst_to_tag(list(range(1,inst_per_case+1)))
    param_file_names = [f'ca_5pfts_100523_{tag}.nc' for tag in inst_tags]

    param_data = np.zeros((inst_per_case * len(unique_case_numbers),len(params)))
    print("param_subdirs",param_subdirs)
    row = 0
    for param_subdir in param_subdirs:
        print("Getting parameter values for",param_subdir)
        for file in param_file_names:
            file_path = os.path.join(param_subdir,file)
            for col,(param,pf) in enumerate(zip(params,pfts)):
                param_data[row,col] = extract_variable_from_netcdf(file_path,param,pf)
            row += 1

    # Row labels follow the same case-major / tag-minor order as the loops above.
    case_num_inst_id = [case_number + "_" + inst_tag for case_number in unique_case_numbers for inst_tag in inst_tags]
    param_df = pd.DataFrame(param_data,index = case_num_inst_id, columns = param_index_names)
    return param_df.reset_index().rename(columns={'index': 'case_num_inst_id'})


def _flag_ecological_criteria(df, eco_criteria):
    """Add one 0/1 column per benchmark plus 'promising' to df; return the boolean all-criteria mask."""
    obs_PEAS = {'Burned_area':[0.03,0.11], #Range for burned area was calculated from the FRI estimates given in Table 3 of Williams et al., 2023. BurnFrac = 1/ FRI. The ranges given in Safford and Stevens, 2017 are larger.
       'Pct_high_severity_3500':[1,6], # Williams+ 2023; Safford and Stevens, 2017, Stephens+ 2015
       'Pct_high_severity_1700':[1,6],
       'BA_conifer':[10.6,29.5], # 10 cm dbh
       'Pct_shrub_cover_canopy':[0.09,0.54], # DRY and WET
      }
    # NOTE: the Pct_high_severity ranges above are recorded but not applied below.

    # Total shrub cover must exceed half of the lower observed canopy shrub
    # cover (0.045), and canopy shrub cover must stay below the upper bound.
    shrub_cover = (df["Pct_shrub_cover"] > obs_PEAS["Pct_shrub_cover_canopy"][0]/2) & (df["Pct_shrub_cover_canopy"] < obs_PEAS["Pct_shrub_cover_canopy"][1])

    # original
    #burned_area = (df['Burned_area'] > obs_PEAS["Burned_area"][0] / 2) & (df['Burned_area'] < obs_PEAS["Burned_area"][1] * 2)
    # burned area can be 2 to 12 reasonably
    burned_area = (df['Burned_area'] > obs_PEAS["Burned_area"][0] - 0.01) & (df['Burned_area'] < obs_PEAS["Burned_area"][1] + 0.01)
    # AWFI is low (< 350 kW m-1; based on USDA FLI categorization)
    fire_intensity = df['AWFI'] < 350

    enough_conifer_basal_area = (df['BA_conifer'] > obs_PEAS["BA_conifer"][0] / 2) # I divided this by 2 because it was only 150 years
    some_large_trees = df[_LARGE_TREE_COLUMN[eco_criteria]] > 5
    enough_pine = df["BA_pine"] > 1
    npp_filter = (df['NPP'] < 0.9) & (df['NPP'] > 0.54)
    some_oak = df["BA_oak"] > 0.1
    coexistence_filter = df["FailedPFTs"] < 3
    density_filter = df['TreeStemD'] < 400

    # Output column name -> boolean mask, in the order the columns are written.
    # fire_intensity is part of the overall criteria but has no column of its own.
    flags = {
        'density_filter': density_filter,
        'good_shrub_cover': shrub_cover,
        'good_burned_area': burned_area,
        'enough_conifer_basal_area': enough_conifer_basal_area,
        'some_large_trees': some_large_trees,
        'enough_pine': enough_pine,
        'good_npp': npp_filter,
        'some_oak': some_oak,
        'some_coexistence': coexistence_filter,
    }

    criteria = fire_intensity
    for column_name, mask in flags.items():
        criteria = criteria & mask
        df[column_name] = mask.astype(int)
    df['promising'] = criteria.astype(int)
    return criteria


def aggregate_metrics(metrics_path = '/glade/work/adamhb/processed_output/CZ2_equilibrium_041924_XX_-17e2acb6a_FATES-1449c787',
                      param_ranges_path = '/glade/u/home/adamhb/california-fates/parameter_ranges/param_range_archive/param_ranges_040524.csv',
                      param_subdir_base_name = 'afterBugFix_1280_041923',
                      eco_criteria = "200yrs",
                      inst_per_case = 128,
                      param_subdir_root = '/glade/u/home/adamhb/ahb_params/fates_api_25/ensembles',
                      one_case_multi_tag = False,
                      case_number = None):
    '''
    Converts a group of metrics csvs to one metrics csv with the parameter values for each ensemble member.
    The combined table is written to '<metrics_path>/<param_subdir_base_name>_metrics_and_params.csv'.

    Parameters:
    - metrics_path: Directory holding the metrics csv files; also where the combined csv is written.
    - param_ranges_path: csv with 'param' and 'pft' columns listing the parameters to extract.
    - param_subdir_base_name: Base name of the per-case parameter subdirectories ('<base>_<case number>').
    - eco_criteria: "200yrs" (large trees from 'TreeStemD_40') or "eq" (large trees from 'TreeStemD_80').
    - inst_per_case: Number of ensemble members (parameter files) per case.
    - param_subdir_root: Root directory holding the per-case parameter subdirectories.
    - one_case_multi_tag: If False, files matching '*metrics_NN*' are read and the case number is parsed
      from each file name. If True, files matching '*metrics_NNNN*' are read and all are assigned case_number.
    - case_number: Case number (string) to use when one_case_multi_tag is True.

    Returns:
    - The combined DataFrame: metrics, parameter values, one 0/1 column per ecological benchmark,
      a 'promising' flag for members that pass all benchmarks, and the path of each parameter file.
      Members without a matching row of parameter values are dropped by the (inner) merge.

    Raises:
    - ValueError: unknown eco_criteria, missing case_number, or an unparsable case number in a file name.
    - FileNotFoundError: no metrics files found in metrics_path.
    '''
    # Validate up front so a typo does not surface only after all files are read.
    if eco_criteria not in _LARGE_TREE_COLUMN:
        raise ValueError(f"eco_criteria must be one of {sorted(_LARGE_TREE_COLUMN)}, got {eco_criteria!r}.")

    ##########################
    # Aggregate metrics csvs #
    ##########################
    metrics_df, case_numbers = _read_metrics_files(metrics_path, one_case_multi_tag, case_number)

    ####################################################
    # Get the parameter values of each ensemble member #
    ####################################################
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_case_numbers = list(dict.fromkeys(case_numbers))
    print("unique_case_numbers",unique_case_numbers)
    param_df = _read_param_values(param_ranges_path, param_subdir_root, param_subdir_base_name,
                                  unique_case_numbers, inst_per_case)

    # Merge the two datasets and return to group case output
    df = pd.merge(metrics_df,param_df,on = 'case_num_inst_id')

    # Ecological criteria
    criteria = _flag_ecological_criteria(df, eco_criteria)
    # Identify that case/tag/param files that pass the ecological criteria
    print(len(df[criteria]),"members meet all criteria")

    # Add paths to param files (built from the same root the values were read from)
    df['case_group'] = param_subdir_base_name
    df['param_file_path'] = [get_param_file_path(param_subdir_base_name,member_case,member_tag,
                                                 param_sudir_root = param_subdir_root)
                             for member_case,member_tag in zip(df['case_number'],df['inst_tag'])]

    df_file_name = param_subdir_base_name + "_" + "metrics_and_params.csv"
    df.to_csv(os.path.join(metrics_path,df_file_name))
    print("Made",os.path.join(metrics_path,df_file_name))
    return df

## Aggregate and filter ensemble output

This step takes as input a group of csv files that contain the processed metrics of each ensemble member (processed using get_case_metrics.py). It returns a dataframe / csv that contains the metrics (aggregated across cases), parameter values for each ensemble member, which benchmarks each ensemble member met, and a flag for ensemble members that met all benchmarks. It does not check the size class distributions of each ensemble member.

In [39]:
# Aggregate the ensemble selected in the configuration cell.
# one_case_multi_tag=True: every metrics file is attributed to case '01'
# instead of parsing a case number from the file name.
aggregation_kwargs = {
    'metrics_path': metrics_path,
    'param_ranges_path': param_ranges_path,
    'param_subdir_base_name': param_subdir_base_name,
    'eco_criteria': eco_criteria,
    'inst_per_case': inst_per_case,
    'param_subdir_root': param_subdir_root,
    'one_case_multi_tag': True,
    'case_number': '01',
}
aggregate_metrics(**aggregation_kwargs)

['/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0001.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0002.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0003.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0004.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0005.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0006.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0007.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0008.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0009.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/peas700_metrics_0010.csv', '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/pea

## Make one aggregated file for training

In [36]:
# Kept for reference: reading the two earlier large-ensemble outputs and
# stacking them into one frame.
# df1 = pd.read_csv('/glade/work/adamhb/processed_output/CZ2_equilibrium_041924_XX_-17e2acb6a_FATES-1449c787/afterBugFix_1280_041923_metrics_and_params.csv')
# df2 = pd.read_csv('/glade/work/adamhb/processed_output/CZ2_equilibrium_042324_XX_-17e2acb6a_FATES-1449c787/afterOakFix_2560_042323_metrics_and_params.csv')
# df3 = pd.concat([df1,df2],axis = 0)

# Load the combined metrics-and-parameters csv of the 700-year equilibrium ensemble.
training_csv_path = '/glade/work/adamhb/processed_output/equilibrium_700yrs_050924_01/equilibrium_700yrs_050924_metrics_and_params.csv'
df = pd.read_csv(training_csv_path)

## Scratch

In [36]:
#df3.columns[df3.columns.str.contains('fates')]
#df3.to_csv('/glade/work/adamhb/processed_output/CZ2_equilibrium_041924_and_042324_XX.csv')