In [6]:
import os
import gc
import sys
import glob
import copy
import numpy as np
import pandas as pd
import netCDF4 as nc
from datetime import datetime, timedelta
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import colors
import matplotlib.ticker as mticker
# from PLUMBER2_VPD_common_utils import *
import resource

In [7]:
def load_default_list(PLUMBER2_path="/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/nc_files/"):
    '''
    Build the default site, IGBP-type and climate-type lists.

    Parameters
    ----------
    PLUMBER2_path : str, optional
        Directory that is searched (non-recursively) for ``*.nc`` files.
        The default is the directory that used to be hard-coded here, so
        calling the function without arguments behaves as before.

    Returns
    -------
    site_names : list of str
        For every ``*.nc`` file found, the part of its file name before the
        first '.', ordered by the sorted file paths. Empty when the
        directory holds no ``*.nc`` file.
    IGBP_types : list of str
        The fixed list of IGBP type codes defined below.
    clim_types : list of str
        The fixed list of climate type codes defined below.
    '''

    # The site names: one entry per netCDF file in the data directory
    all_site_path  = sorted(glob.glob(os.path.join(PLUMBER2_path, "*.nc")))
    site_names     = [os.path.basename(site_path).split(".")[0] for site_path in all_site_path]

    IGBP_types     = ['GRA','OSH', 'SAV', 'WSA', 'CSH', 'DBF', 'ENF', 'EBF', 'MF', 'WET', 'CRO']
    clim_types     = ['Af', 'Am', 'Aw', 'BSh', 'BSk', 'BWh', 'BWk', 'Cfa', 'Cfb', 'Csa', 'Csb', 'Cwa',
                      'Dfa', 'Dfb', 'Dfc', 'Dsb', 'Dsc', 'Dwa', 'Dwb', 'ET']

    return site_names, IGBP_types, clim_types


In [8]:
def bin_VPD(var_plot, model_out_list):
    '''
    Summarise the requested columns of var_plot inside fixed-width VPD bins.

    Parameters
    ----------
    var_plot : pandas.DataFrame
        Must hold a 'VPD' column plus one column per entry of
        model_out_list: the entry itself when its name contains 'obs',
        otherwise 'model_' + entry.
    model_out_list : list of str
        Names of the series to summarise.

    Returns
    -------
    vpd_series : numpy.ndarray
        Bin centres, np.arange(0.1, 7.1, 0.2).
    vpd_num, var_vals, var_vals_top, var_vals_bot : numpy.ndarray
        Arrays of shape (len(model_out_list), len(vpd_series)) holding, per
        series and bin, the count of non-NaN values, their mean, their 75th
        percentile and their 25th percentile. A bin that no row falls into
        is NaN in all four arrays (a message is printed for it).
    '''

    # Bin centres and half width; a row belongs to a bin when its VPD lies
    # strictly between centre - half_width and centre + half_width
    bin_width    = 0.2
    half_width   = bin_width/2
    vpd_series   = np.arange(0.1, 7.1, bin_width)

    # One row per series, one column per VPD bin
    out_shape    = (len(model_out_list), len(vpd_series))
    vpd_num      = np.zeros(out_shape)
    var_vals     = np.zeros(out_shape)
    var_vals_top = np.zeros(out_shape)
    var_vals_bot = np.zeros(out_shape)

    # Observation columns carry no prefix, model columns are 'model_<name>'
    column_names = [('' if 'obs' in name else 'model_') + name for name in model_out_list]

    for bin_idx, centre in enumerate(vpd_series):

        in_bin = (var_plot['VPD'] > centre-half_width) & (var_plot['VPD'] < centre+half_width)

        # Nothing falls into this bin: flag it and blank every output column
        if not np.any(in_bin):
            print('In bin_VPD, binned by VPD, var_masked = np.nan. Please check why the code goes here')
            print('j=',bin_idx, ' vpd_val=',centre)

            for out_array in (var_vals, vpd_num, var_vals_top, var_vals_bot):
                out_array[:,bin_idx] = np.nan
            continue

        rows_in_bin = var_plot[in_bin]

        for series_idx, column in enumerate(column_names):

            values    = rows_in_bin[column]
            has_value = ~ np.isnan(values)

            # mean and sample size of the non-NaN entries
            var_vals[series_idx,bin_idx] = values.mean(skipna=True)
            vpd_num[series_idx,bin_idx]  = np.sum(has_value)

            # interquartile range as the uncertainty band
            if np.any(has_value):
                kept = values[has_value]
                var_vals_top[series_idx,bin_idx] = np.percentile(kept, 75)
                var_vals_bot[series_idx,bin_idx] = np.percentile(kept, 25)
            else:
                var_vals_top[series_idx,bin_idx] = np.nan
                var_vals_bot[series_idx,bin_idx] = np.nan

    return vpd_series, vpd_num, var_vals, var_vals_top, var_vals_bot


In [9]:
def write_var_VPD(var_name, site_names, PLUMBER2_path, bin_by=None, low_bound=30,
                  high_bound=70, day_time=False, summer_time=False, IGBP_type=None,
                  clim_type=None, energy_cor=False,VPD_num_threshold=None,
                  hours_precip_free=None, method='GAM'):

    '''
    Intended workflow (see the disabled section further down):
    1. bin the dataframe by percentile of obs_EF
    2. calculate var series against VPD changes
    3. write out the var series

    Current state: only the steps that are still active run, i.e.
      * read '<var_name>_all_sites.csv' into a dataframe,
      * read the model name list from '<PLUMBER2_path>/AR-SLu.nc' and keep
        the models whose '<model>_time' length equals that of 'CABLE_time',
      * optionally (energy_cor, Qle/Qh only) dump 'obs_cor' to
        'check_obs_cor.csv' and drop the rows where 'obs_cor' is NaN,
    with progress/check messages printed along the way. Everything after
    that is commented out, so no '<var_name>_VPD...csv' file is written.

    Parameters
    ----------
    var_name : str
        Variable to process; selects the input csv and the
        '<var_name>_models' variable in the netCDF file.
    site_names : list of str
        Only referenced by the disabled 'EF_obs' branch.
    PLUMBER2_path : str
        Directory that contains 'AR-SLu.nc'.
    energy_cor : bool
        When True and var_name is 'Qle' or 'Qh', keep only rows with a
        non-NaN 'obs_cor'.
    bin_by, low_bound, high_bound, day_time, summer_time, IGBP_type,
    clim_type, VPD_num_threshold, hours_precip_free, method
        Only referenced by the disabled section; they have no effect at
        the moment.

    Returns
    -------
    None
    '''

    # ========== read the data ==========
    var_output    = pd.read_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_all_sites.csv',na_values=[''])
    print( 'Check point 1, np.any(~np.isnan(var_output["model_CABLE"]))=',
           np.any(~np.isnan(var_output["model_CABLE"])) )

    # Using AR-SLu.nc file to get the model namelist.
    # The dataset is opened as a context manager so the file handle is
    # released as soon as the model list has been built.
    with nc.Dataset(PLUMBER2_path+"/AR-SLu.nc", mode='r') as f:
        model_in_list = f.variables[var_name + '_models']
        ntime         = len(f.variables['CABLE_time'])
        model_out_list= []

        # Compare each model's output time interval with CABLE's time axis
        # If the model has the same number of time steps then use the model simulation
        for model_in in model_in_list:
            if len(f.variables[f"{model_in}_time"]) == ntime:
                model_out_list.append(model_in)

    # add obs to draw-out namelist
    if var_name in ['Qle','Qh']:
        model_out_list.append('obs')
        # model_out_list.append('obs_cor')

    if var_name in ['NEE']:
        model_out_list.append('obs')

    # total site number
    site_num    = len(np.unique(var_output["site_name"]))
    print('Point 1, site_num=',site_num)
    print('Finish reading csv file')
    print('Check 1 np.unique(var_output["site_name"])', np.unique(var_output["site_name"]))

    # ========== select data ==========

    # whether only considers the sites with energy budget corrected fluxs
    if var_name in ['Qle','Qh'] and energy_cor:
        # debugging dump of the corrected-flux column
        check_obs_cor = var_output['obs_cor']
        check_obs_cor.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/check_obs_cor.csv')
        print( 'Check point 2, np.any(~np.isnan(var_output["model_CABLE"]))=',
                np.any(~np.isnan(var_output["model_CABLE"])) )
        cor_notNan_mask = ~ np.isnan(var_output['obs_cor'])
        var_output      = var_output[cor_notNan_mask]
        print( 'Check point 3, np.any(~np.isnan(var_output["model_CABLE"]))=',
                np.any(~np.isnan(var_output["model_CABLE"])) )
        # print('var_output["obs_EF"][:100] point 2', var_output["obs_EF"][:100])
        site_num    = len(np.unique(var_output["site_name"]))
        print('Point 2, site_num=',site_num)

    # NOTE(review): everything from here to the final `return` is commented
    # out, so the function stops after the selection step above.
    # The `var_output[i]` debug loop just below looks up a *column* label on
    # a DataFrame, which appears to be what produced the `KeyError: 3261024`
    # recorded under the driver cell; use `var_output.iloc[i]` (or
    # `.loc[i]`) if that check is ever revived.

#     # whether only considers day time
#     if day_time:
#         print('Check 1 var_output["hour"]', var_output['hour'])

#         for i in np.arange(3261024,3261124):
#             print('i=',i, var_output[i])
# #         check_site = var_output[ var_output['site_name']=='CA-NS1']
#         # print("np.any((check_site['hour'] >= 9) & (check_site['hour'] <= 16))",np.any((check_site['hour'] >= 9) & (check_site['hour'] <= 16)))
#         print("np.unique(check_site['hour'])",np.unique(check_site['hour']))
#         print("np.unique(check_site['time'])",np.unique(check_site['time']))

#         day_mask    = (var_output['hour'] >= 9) & (var_output['hour'] <= 16)
#         # print('np.any(day_mask)', np.any(day_mask))
#         # print('Check 2 var_output["hour"]', var_output[day_mask]['hour'])
#         # print('Check 3 var_output[~day_mask]', var_output[~day_mask]['hour'])
#         print('Check 3 np.unique(var_output[~day_mask]["site_name"])', np.unique(var_output[~day_mask]["site_name"]))
#         # print('Check 4 np.unique(var_output[~day_mask])', np.unique(var_output[~day_mask]['hour']))
#         print('Check 3 np.unique(var_output[day_mask]["site_name"])', np.unique(var_output[day_mask]["site_name"]))

#         var_output  = var_output[day_mask]
#         site_num    = len(np.unique(var_output["site_name"]))
#         print('Point 2, site_num=',site_num)


#     # whether only considers summers
#     if summer_time:
#         summer_mask = (var_output['month'] > 11) | (var_output['month']< 3)
#         # print('np.any(summer_mask)', np.any(summer_mask))
#         var_output  = var_output[summer_mask]
#         site_num    = len(np.unique(var_output["site_name"]))
#         print('Point 3, site_num=',site_num)

#     # whether only considers one type of IGBP
#     if IGBP_type!=None:
#         IGBP_mask   = (var_output['IGBP_type'] == IGBP_type)
#         # print('np.any(IGBP_mask)', np.any(IGBP_mask))
#         var_output  = var_output[IGBP_mask]
#         site_num    = len(np.unique(var_output["site_name"]))
#         print('Point 4, site_num=',site_num)

#     # whether only considers one type of climate type
#     if clim_type!=None:
#         clim_mask   = (var_output['climate_type'] == clim_type)
#         # print('np.any(clim_mask)', np.any(clim_mask))
#         var_output  = var_output[clim_mask]
#         site_num    = len(np.unique(var_output["site_name"]))
#         print('Point 5, site_num=',site_num)

#     # whether only considers observation without precipitation in hours_precip_free hours
#     if hours_precip_free!=None:
#         rain_mask   = (var_output['hrs_after_precip'] > hours_precip_free)
#         var_output  = var_output[rain_mask]
#         site_num    = len(np.unique(var_output["site_name"]))
#         print('Point 6, site_num=',site_num)

#     print('Point 7, site_num=',site_num)

#     print( 'Check point 4, np.any(~np.isnan(var_output["model_CABLE"]))=',
#            np.any(~np.isnan(var_output["model_CABLE"])) )

#     print('Finish selecting data')

#     # ========== Divide dry and wet periods ==========

#     # Calculate EF thresholds
#     if bin_by == 'EF_obs':

#         # select time step where obs_EF isn't NaN (when Qh<0 or Qle+Qh<10)
#         EF_notNan_mask = ~ np.isnan(var_output['obs_EF'])
#         var_output     = var_output[EF_notNan_mask]

#         # print('np.any(EF_notNan_mask)', np.any(EF_notNan_mask))

#         # Select EF<low_bound and EF>high_bound for each site to make sure
#         # that every site can contribute to the final VPD lines
#         for site_name in site_names:

#             # select data for this site
#             site_mask       = (var_output['site_name'] == site_name)

#             print('In bin by EF, site_name=', site_name, 'np.any(site_mask)',np.any(site_mask))

#             # calculate EF thresholds for this site
#             if len(low_bound)>1 and len(high_bound)>1:
#                 try:
#                     bin_dry_low  = np.percentile(var_output[site_mask]['obs_EF'], low_bound[0])
#                     bin_dry_high = np.percentile(var_output[site_mask]['obs_EF'], low_bound[1])
#                     bin_wet_low  = np.percentile(var_output[site_mask]['obs_EF'], high_bound[0])
#                     bin_wet_high = np.percentile(var_output[site_mask]['obs_EF'], high_bound[1])
#                 except:
#                     bin_dry_low  = np.nan
#                     bin_dry_high = np.nan
#                     bin_wet_low  = np.nan
#                     bin_wet_high = np.nan
#                 # make the mask based on EF thresholds and append it to a full-site long logic array
#                 try:
#                     dry_mask = dry_mask.append((var_output[site_mask]['obs_EF'] > bin_dry_low)
#                                              & (var_output[site_mask]['obs_EF'] < bin_dry_high))
#                     wet_mask = wet_mask.append((var_output[site_mask]['obs_EF'] > bin_wet_low)
#                                              & (var_output[site_mask]['obs_EF'] < bin_wet_high))
#                 except:
#                     dry_mask = (var_output[site_mask]['obs_EF'] > bin_dry_low) & (var_output[site_mask]['obs_EF'] < bin_dry_high)
#                     wet_mask = (var_output[site_mask]['obs_EF'] > bin_wet_low) & (var_output[site_mask]['obs_EF'] < bin_wet_high)
#             elif len(low_bound)==1 and len(high_bound)==1:
#                 try:
#                     bin_dry     = np.percentile(var_output[site_mask]['obs_EF'], low_bound)
#                     bin_wet     = np.percentile(var_output[site_mask]['obs_EF'], high_bound)
#                 except:
#                     bin_dry     = np.nan
#                     bin_wet     = np.nan

#                 # make the mask based on EF thresholds and append it to a full-site long logic array
#                 try:
#                     dry_mask = dry_mask.append(var_output[site_mask]['obs_EF'] < bin_dry)
#                     wet_mask = wet_mask.append(var_output[site_mask]['obs_EF'] > bin_wet)
#                 except:
#                     dry_mask = (var_output[site_mask]['obs_EF'] < bin_dry)
#                     wet_mask = (var_output[site_mask]['obs_EF'] > bin_wet)
#             else:
#                 sys.exit('len(low_bound)=',len(low_bound),'len(high_bound)=',len(high_bound))

#         # Mask out the time steps beyond the EF thresholds
#         var_output_dry = var_output[dry_mask]
#         var_output_wet = var_output[wet_mask]

#         # free memory
#         EF_notNan_mask=None

#     elif bin_by == 'EF_model':

#         var_output_dry = copy.deepcopy(var_output)
#         var_output_wet = copy.deepcopy(var_output)

#         print( 'Check point 6, np.any(~np.isnan(var_output_dry["model_CABLE"]))=',
#                np.any(~np.isnan(var_output_dry["model_CABLE"])) )

#         # select time step where obs_EF isn't NaN (when Qh<0 or Qle+Qh<10)
#         for i, model_out_name in enumerate(model_out_list):
#             if 'obs' in model_out_name:
#                 head = ''
#             else:
#                 head = 'model_'

#             if model_out_name == 'obs_cor':
#                 # Use Qle_obs and Qh_obs calculated EF to bin obs_cor, this method may introduce
#                 # some bias, but keep it for now
#                 EF_var_name = 'obs_EF'
#             else:
#                 EF_var_name = model_out_name+'_EF'

#             if len(low_bound)>1 and len(high_bound)>1:
#                 dry_mask  = (var_output[EF_var_name] > low_bound[0]) & (var_output[EF_var_name] < low_bound[1])
#                 wet_mask  = (var_output[EF_var_name] > high_bound[0]) & (var_output[EF_var_name] < high_bound[1])
#             elif len(low_bound)==1 and len(high_bound)==1:
#                 dry_mask  = (var_output[EF_var_name] < low_bound)
#                 wet_mask  = (var_output[EF_var_name] > high_bound)
#             else:
#                 sys.exit('len(low_bound)=',len(low_bound),'len(high_bound)=',len(high_bound))

#             var_output_dry[head+model_out_name] = np.where(dry_mask, var_output[head+model_out_name], np.nan)
#             var_output_wet[head+model_out_name] = np.where(wet_mask, var_output[head+model_out_name], np.nan)


#         print( 'Check point 7, np.any(~np.isnan(var_output_dry["model_CABLE"]))=',
#                np.any(~np.isnan(var_output_dry["model_CABLE"])) )

#     print('Finish dividing dry and wet periods')

#     # ============ Choosing fitting or binning ============

#     if method == 'bin_by_vpd':
#         # ============ Bin by VPD ============
#         # vpd_series[vpd_tot]
#         # var_vals[model_tot, vpd_tot]
#         # var_vals_top[model_tot, vpd_tot]
#         # var_vals_bot[model_tot, vpd_tot]

#         vpd_series_dry, vpd_num_dry, var_vals_dry, var_vals_top_dry, var_vals_bot_dry = bin_VPD(var_output_dry, model_out_list)
#         vpd_series_wet, vpd_num_wet, var_vals_wet, var_vals_top_wet, var_vals_bot_wet = bin_VPD(var_output_wet, model_out_list)

#         # ============ Creat the output dataframe ============
#         var_dry = pd.DataFrame(vpd_series_dry, columns=['vpd_series'])
#         var_wet = pd.DataFrame(vpd_series_wet, columns=['vpd_series'])

#         for i, model_out_name in enumerate(model_out_list):

#             var_dry[model_out_name+'_vpd_num'] = vpd_num_dry[i,:]
#             var_wet[model_out_name+'_vpd_num'] = vpd_num_wet[i,:]

#             if VPD_num_threshold == None:
#                 var_dry[model_out_name+'_vals'] = var_vals_dry[i,:]
#                 var_dry[model_out_name+'_top']  = var_vals_top_dry[i,:]
#                 var_dry[model_out_name+'_bot']  = var_vals_bot_dry[i,:]
#                 var_wet[model_out_name+'_vals'] = var_vals_wet[i,:]
#                 var_wet[model_out_name+'_top']  = var_vals_top_wet[i,:]
#                 var_wet[model_out_name+'_bot']  = var_vals_bot_wet[i,:]
#             else:
#                 var_dry[model_out_name+'_vals'] = np.where(var_dry[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_dry[i,:], np.nan)
#                 var_dry[model_out_name+'_top']  = np.where(var_dry[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_top_dry[i,:], np.nan)
#                 var_dry[model_out_name+'_bot']  = np.where(var_dry[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_bot_dry[i,:], np.nan)
#                 var_wet[model_out_name+'_vals'] = np.where(var_wet[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_wet[i,:], np.nan)
#                 var_wet[model_out_name+'_top']  = np.where(var_wet[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_top_wet[i,:], np.nan)
#                 var_wet[model_out_name+'_bot']  = np.where(var_wet[model_out_name+'vpd_num'] >= VPD_num_threshold,
#                                                   var_vals_bot_wet[i,:], np.nan)

#         var_dry['site_num']    = site_num
#         var_wet['site_num']    = site_num

#     elif method == 'GAM':
#         '''
#         fitting GAM curve
#         '''

#         # ============ Creat the output dataframe ============

#         x_top      = 7.04
#         x_bot      = 0.02
#         x_interval = 0.04

#         #reshape for gam
#         for i, model_out_name in enumerate(model_out_list):
#             print('In GAM fitting for model:', model_out_name)
#             if 'obs' in model_out_name:
#                 head = ''
#             else:
#                 head = 'model_'

#             dry_x_values = var_output_dry['VPD']
#             dry_y_values = var_output_dry[head+model_out_name]
#             dry_vpd_pred, dry_y_pred, dry_y_int = fit_GAM(x_top,x_bot,x_interval,dry_x_values,dry_y_values,n_splines=7,spline_order=3)
#             gc.collect()

#             wet_x_values = var_output_wet['VPD']
#             wet_y_values = var_output_wet[head+model_out_name]
#             wet_vpd_pred, wet_y_pred, wet_y_int = fit_GAM(x_top,x_bot,x_interval,wet_x_values,wet_y_values,n_splines=7,spline_order=3)
#             gc.collect()
#             if i == 0:
#                 var_dry      = pd.DataFrame(dry_vpd_pred, columns=['vpd_series'])
#                 var_wet      = pd.DataFrame(wet_vpd_pred, columns=['vpd_series'])

#             var_dry[model_out_name+'_vals'] = dry_y_pred
#             var_dry[model_out_name+'_top']  = dry_y_int[:,0]
#             var_dry[model_out_name+'_bot']  = dry_y_int[:,1]
#             var_wet[model_out_name+'_vals'] = wet_y_pred
#             var_wet[model_out_name+'_top']  = wet_y_int[:,0]
#             var_wet[model_out_name+'_bot']  = wet_y_int[:,1]
#         var_dry['site_num']    = site_num
#         var_wet['site_num']    = site_num

#     # ============ Set the output file name ============
#     message = ''

#     if day_time:
#         message = message + '_daytime'

#     if IGBP_type !=None:
#         message = message + '_IGBP='+IGBP_type

#     if clim_type !=None:
#         message = message + '_clim='+clim_type

#     # save data
#     if len(low_bound) >1 and len(high_bound) >1:
#         if low_bound[1] > 1:
#             var_dry.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(low_bound[0])+'-'+str(low_bound[1])+'th_'+method+'_coarse.csv')
#             var_wet.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(high_bound[0])+'-'+str(high_bound[1])+'th_'+method+'_coarse.csv')
#         else:
#             var_dry.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(low_bound[0])+'-'+str(low_bound[1])+'_'+method+'_coarse.csv')
#             var_wet.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(high_bound[0])+'-'+str(high_bound[1])+'_'+method+'_coarse.csv')
#     elif len(low_bound) == 1 and len(high_bound) == 1:
#         if low_bound > 1:
#             var_dry.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(low_bound)+'th_'+method+'_coarse.csv')
#             var_wet.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(high_bound)+'th_'+method+'_coarse.csv')
#         else:
#             var_dry.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(low_bound)+'_'+method+'_coarse.csv')
#             var_wet.to_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_VPD'+message+'_'+bin_by+'_'+str(high_bound)+'_'+method+'_coarse.csv')

    return



In [10]:
if __name__ == "__main__":

    # Report the RLIMIT_RSS hard limit (second element of the getrlimit
    # pair), converted from bytes to MB
    max_memory = resource.getrlimit(resource.RLIMIT_RSS)[1]
    print(f"Maximum memory usage: {max_memory / 1024 / 1024} MB")

    # Path of PLUMBER 2 dataset
    PLUMBER2_path  = "/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/nc_files/"

    # What to process and how to split it
    var_name       = 'Qle'  #'TVeg'
    bin_by         = 'EF_model' #'EF_model' #'EF_obs'#
    method         = 'bin_by_vpd' #'GAM'
    day_time       = True

    # The energy-budget correction is switched off for NEE only
    energy_cor     = (var_name != 'NEE')

    # Default site / IGBP / climate lists
    site_names, IGBP_types, clim_types = load_default_list()

    # # ================== 0-0.4 ==================
    low_bound      = [0,0.2] #30
    high_bound     = [0.2,0.4] #70

    write_var_VPD(
        var_name, site_names, PLUMBER2_path,
        bin_by=bin_by,
        low_bound=low_bound,
        high_bound=high_bound,
        day_time=day_time,
        energy_cor=energy_cor,
        method=method,
    )
    gc.collect()



Maximum memory usage: 129024.0 MB
Check point 1, np.any(~np.isnan(var_output["model_CABLE"]))= True
Point 1, site_num= 170
Finish reading csv file
Check 1 np.unique(var_output["site_name"]) ['AR-SLu' 'AT-Neu' 'AU-ASM' 'AU-Cow' 'AU-Cpr' 'AU-Ctr' 'AU-Cum' 'AU-DaP'
 'AU-DaS' 'AU-Dry' 'AU-Emr' 'AU-GWW' 'AU-Gin' 'AU-How' 'AU-Lit' 'AU-Otw'
 'AU-Rig' 'AU-Rob' 'AU-Sam' 'AU-Stp' 'AU-TTE' 'AU-Tum' 'AU-Whr' 'AU-Wrr'
 'AU-Ync' 'BE-Bra' 'BE-Lon' 'BE-Vie' 'BR-Sa3' 'BW-Ma1' 'CA-NS1' 'CA-NS2'
 'CA-NS4' 'CA-NS5' 'CA-NS6' 'CA-NS7' 'CA-Qcu' 'CA-Qfo' 'CA-SF1' 'CA-SF2'
 'CA-SF3' 'CH-Cha' 'CH-Dav' 'CH-Fru' 'CH-Oe1' 'CN-Cha' 'CN-Cng' 'CN-Dan'
 'CN-Din' 'CN-Du2' 'CN-HaM' 'CN-Qia' 'CZ-wet' 'DE-Bay' 'DE-Geb' 'DE-Gri'
 'DE-Hai' 'DE-Kli' 'DE-Meh' 'DE-Obe' 'DE-Seh' 'DE-SfN' 'DE-Tha' 'DE-Wet'
 'DK-Fou' 'DK-Lva' 'DK-Ris' 'DK-Sor' 'DK-ZaH' 'ES-ES1' 'ES-ES2' 'ES-LMa'
 'ES-LgS' 'ES-VDA' 'FI-Hyy' 'FI-Kaa' 'FI-Lom' 'FI-Sod' 'FR-Fon' 'FR-Gri'
 'FR-Hes' 'FR-LBr' 'FR-Lq1' 'FR-Lq2' 'FR-Pue' 'GF-Guy' 'HU-Bug' 'ID-Pag'
 'IE-Ca

KeyError: 3261024

In [12]:

# Debugging cell: repeats the driver configuration at notebook (global) scope
# and loads the csv directly, so the following cells can inspect the table
# step by step instead of going through write_var_VPD.

# Check memory
# Get the maximum amount of memory that the process can use
# (index 1 of the getrlimit pair, i.e. the hard limit)
max_memory = resource.getrlimit(resource.RLIMIT_RSS)[1]

# Print the maximum amount of memory
print("Maximum memory usage: {} MB".format(max_memory / 1024 / 1024))

# Path of PLUMBER 2 dataset
PLUMBER2_path  = "/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/nc_files/"

var_name       = 'Qle'  #'TVeg'
bin_by         = 'EF_model' #'EF_model' #'EF_obs'#
site_names, IGBP_types, clim_types = load_default_list()

day_time       = True
energy_cor     = True
method         = 'bin_by_vpd' #'GAM'

# The energy-budget correction is switched off for NEE
if var_name == 'NEE':
    energy_cor     = False

# # ================== 0-0.4 ==================
low_bound      = [0,0.2] #30
high_bound     = [0.2,0.4] #70


# ========== read the data ==========
# Empty csv fields are read as NaN (na_values=[''])
var_output    = pd.read_csv(f'/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2/txt/{var_name}_all_sites.csv',na_values=[''])



Maximum memory usage: 129024.0 MB


In [21]:
# Display the table loaded from the csv (the notebook shows a truncated
# head/tail view of the rows and columns)
var_output

Unnamed: 0.1,Unnamed: 0,time,model_1lin,1lin_EF,model_3km27,3km27_EF,model_6km729,6km729_EF,model_6km729lag,6km729lag_EF,...,VPD,obs_Tair,obs_Qair,obs_Precip,month,hour,site_name,IGBP_type,climate_type,half_hrs_after_precip
0,0,2010-01-01 00:30:00,7.140990,,9.085434,,9.143805,,23.089573,,...,1.2924,292.650,0.006580,0.000000,1.0,0.0,AR-SLu,MF,BSk,1.0
1,1,2010-01-01 01:00:00,7.140990,,10.523390,,8.069214,,13.489099,,...,0.8980,291.850,0.008465,0.000000,1.0,1.0,AR-SLu,MF,BSk,2.0
2,2,2010-01-01 01:30:00,7.140990,,10.327738,,19.550978,,20.617836,,...,0.9025,291.450,0.008073,0.000444,1.0,1.0,AR-SLu,MF,BSk,3.0
3,3,2010-01-01 02:00:00,7.140990,,10.455215,,10.478658,,20.202003,,...,0.9070,291.550,0.008145,0.000000,1.0,2.0,AR-SLu,MF,BSk,0.0
4,4,2010-01-01 02:30:00,7.140990,,10.238870,,24.238617,,31.716244,,...,1.0735,291.950,0.007412,0.000056,1.0,2.0,AR-SLu,MF,BSk,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17137987,17137987,2008-12-31 22:00:00,7.148216,,13.786309,,9.284344,,5.085342,,...,0.6075,295.750,0.015418,0.000000,12.0,22.0,ZM-Mon,DBF,Cwa,11549.0
17137988,17137988,2008-12-31 22:30:00,7.148216,,12.832660,,7.875926,,4.381216,,...,0.5068,295.147,0.015416,0.000000,12.0,22.0,ZM-Mon,DBF,Cwa,11550.0
17137989,17137989,2008-12-31 23:00:00,7.148216,,12.002188,,4.790282,,0.133078,,...,0.3987,294.627,0.015573,0.000000,12.0,23.0,ZM-Mon,DBF,Cwa,11551.0
17137990,17137990,2008-12-31 23:30:00,7.148216,,11.834283,,17.480413,,1.371061,,...,0.3522,294.527,0.015785,0.000000,12.0,23.0,ZM-Mon,DBF,Cwa,11552.0


In [22]:
# Using AR-SLu.nc file to get the model namelist
# NOTE(review): `f` is left open because the next cell still reads from it;
# no f.close() appears in the visible cells - close it once the model list
# has been built.
f             = nc.Dataset(PLUMBER2_path+"/AR-SLu.nc", mode='r')
# netCDF variable holding the model names for this var_name
model_in_list = f.variables[var_name + '_models']
# Reference length: number of entries in CABLE's time variable
ntime         = len(f.variables['CABLE_time'])
# Filled by the following cells
model_out_list= []
model_in_list

<class 'netCDF4._netCDF4.Variable'>
vlen Qle_models(Qle_models)
    standard_name: Qle_models
vlen data type: <class 'str'>
unlimited dimensions: 
current shape = (32,)

In [23]:
# Compare each model's output time interval with CABLE hourly interval
# If the model has hourly output then use the model simulation
# (the test itself: keep a model when its '<model>_time' variable has the
# same length as 'CABLE_time')
# NOTE(review): this appends to the list created in the previous cell, so
# running this cell twice without re-running that one duplicates every entry.
for model_in in model_in_list:
    if len(f.variables[f"{model_in}_time"]) == ntime:
        model_out_list.append(model_in)
model_out_list

['1lin',
 '3km27',
 '6km729',
 '6km729lag',
 'ACASA',
 'CABLE',
 'CABLE-POP-CN',
 'CHTESSEL_ERA5_3',
 'CHTESSEL_Ref_exp1',
 'CLM5a',
 'GFDL',
 'JULES_GL9_withLAI',
 'JULES_test',
 'LSTM_eb',
 'LSTM_raw',
 'Manabe',
 'ManabeV2',
 'MATSIRO',
 'MuSICA',
 'NASAEnt',
 'NoahMPv401',
 'ORC2_r6593',
 'ORC2_r6593_CO2',
 'ORC3_r7245_NEE',
 'ORC3_r8120',
 'PenmanMonteith',
 'QUINCY',
 'RF_eb',
 'RF_raw',
 'STEMMUS-SCOPE']

In [25]:
# add obs to draw-out namelist
# NOTE(review): like the model loop, this appends to a list created in an
# earlier cell; re-running the cell adds 'obs' again.
if var_name in ['Qle','Qh']:
    model_out_list.append('obs')
    # model_out_list.append('obs_cor')

if var_name in ['NEE']:
    model_out_list.append('obs')

# total site number (count of distinct values in the 'site_name' column)
site_num    = len(np.unique(var_output["site_name"]))
# The cell displays the 'site_name' column itself, not site_num
var_output["site_name"]

0           AR-SLu
1           AR-SLu
2           AR-SLu
3           AR-SLu
4           AR-SLu
             ...  
17137987    ZM-Mon
17137988    ZM-Mon
17137989    ZM-Mon
17137990    ZM-Mon
17137991    ZM-Mon
Name: site_name, Length: 17137992, dtype: object

In [26]:
# Show only the rows of site 'CA-NS1' (boolean-mask row selection); its first
# row label is 3261024, the same number as the KeyError recorded under the
# driver cell
var_output[var_output["site_name"]=='CA-NS1']

Unnamed: 0.1,Unnamed: 0,time,model_1lin,1lin_EF,model_3km27,3km27_EF,model_6km729,6km729_EF,model_6km729lag,6km729lag_EF,...,VPD,obs_Tair,obs_Qair,obs_Precip,month,hour,site_name,IGBP_type,climate_type,half_hrs_after_precip
3261024,3261024,2003-01-01 00:30:00,7.165850,,1.923796,,-0.553359,,2.198900,,...,0.026,265.530,0.002034,0.0,1.0,0.0,CA-NS1,ENF,Dfc,1.0
3261025,3261025,2003-01-01 01:00:00,7.197248,,2.094288,,0.092126,,2.463660,,...,0.029,265.899,0.002078,0.0,1.0,1.0,CA-NS1,ENF,Dfc,2.0
3261026,3261026,2003-01-01 01:30:00,7.165850,,2.152761,,-0.054032,,2.394293,,...,0.031,266.031,0.002089,0.0,1.0,1.0,CA-NS1,ENF,Dfc,3.0
3261027,3261027,2003-01-01 02:00:00,7.165850,,2.104097,,0.626721,,1.709947,,...,0.029,265.951,0.002087,0.0,1.0,2.0,CA-NS1,ENF,Dfc,4.0
3261028,3261028,2003-01-01 02:30:00,7.165850,,2.008636,,0.593111,,1.728376,,...,0.028,265.772,0.002062,0.0,1.0,2.0,CA-NS1,ENF,Dfc,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3278539,3278539,2003-12-31 22:00:00,7.165850,,0.785722,,0.542913,,0.396711,,...,0.024,255.662,0.000837,0.0,12.0,22.0,CA-NS1,ENF,Dfc,1939.0
3278540,3278540,2003-12-31 22:30:00,7.165850,,0.828404,,0.294389,,0.393343,,...,0.024,255.846,0.000853,0.0,12.0,22.0,CA-NS1,ENF,Dfc,1940.0
3278541,3278541,2003-12-31 23:00:00,7.165850,,0.879422,,0.335966,,0.393220,,...,0.024,255.923,0.000859,0.0,12.0,23.0,CA-NS1,ENF,Dfc,1941.0
3278542,3278542,2003-12-31 23:30:00,7.165850,,0.851427,,0.334989,,0.387731,,...,0.024,255.814,0.000850,0.0,12.0,23.0,CA-NS1,ENF,Dfc,1942.0
