### Notes:
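If Qle or Qh has NaN time steps within a day, that day's EF is set to NaN (note: `custom_agg` below only does this when more than 6 time steps in the day are NaN). In addition, a day's EF is set to NaN for all models/obs if it is NaN for any one of them.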

In [37]:
import os
import gc
import sys
import glob
import numpy as np
import pandas as pd
import netCDF4 as nc
from datetime import datetime, timedelta
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import colors
import matplotlib.ticker as mticker
from copy import deepcopy

In [38]:
# PLUMBER2_VPD_common_utils is imported by its bare module name, so switch the
# working directory to the folder that contains it before importing.
# The relative './txt/...' paths used later in this notebook are resolved
# against this working directory as well.
os.chdir('/g/data/w97/mm3972/scripts/PLUMBER2/LSM_VPD_PLUMBER2')
from PLUMBER2_VPD_common_utils import *

In [39]:
# Aggregator handed to groupby(...).agg(...) when reducing time steps to days
def custom_agg(series):
    """Return the mean of *series*, or NaN when too many entries are missing.

    A group counts as unusable when strictly more than 6 of its entries are
    null; six or fewer missing entries are tolerated and the mean is taken
    over the remaining values (pandas skips NaN in ``mean`` by default).
    """
    n_missing = series.isnull().sum()
    return series.mean() if n_missing <= 6 else np.nan

In [40]:
# Default lists used throughout the notebook. load_default_list is not defined
# in this file; presumably it comes from the star import of
# PLUMBER2_VPD_common_utils -- confirm. Below, site_names is iterated per site
# and model_names is indexed with 'model_select_new'.
site_names, IGBP_types, clim_types, model_names = load_default_list()
X_day     = 1      # smoothing window in days; the rolling mean is only applied when > 1
use_Rnet  = False  # True: EF = Qle / Rnet; False: EF = Qle / (Qle + Qh)

In [41]:
# Read the per-time-step Qle table and its partner flux (Rnet when use_Rnet is
# True, otherwise Qh) and reduce both to one row per (year, month, day, site).
#
# Names defined here for later cells: Qle_input, Rnet_input or Qh_input,
# grouping_cols, columns_to_aggregate, agg_dict, daily_Qle, and daily_Rnet or
# daily_Qh.

# Columns requested from every process1 table.
# NOTE(review): the original comment on the Rnet read said "no
# 'model_JULES_GL9'", yet that column is requested for Rnet too. read_csv
# raises ValueError when a usecols entry is absent from the file -- confirm
# the Rnet file really contains it before running with use_Rnet = True.
flux_usecols = ['time', 'month', 'site_name', 'model_CABLE', 'model_CABLE-POP-CN',
                'model_CHTESSEL_Ref_exp1', 'model_CLM5a', 'model_GFDL',
                'model_JULES_GL9', 'model_JULES_GL9_withLAI', 'model_MATSIRO',
                'model_MuSICA', 'model_NASAEnt', 'model_NoahMPv401', 'model_ORC2_r6593',
                'model_ORC3_r8120', 'model_QUINCY', 'model_STEMMUS-SCOPE', 'obs']

# Keys that identify one calendar day at one site
grouping_cols = ['year', 'month', 'day', 'site_name']


def _prepare_flux_table(flux):
    """Add 'year'/'day' columns, blank out -9999 and drop 'time' (in place).

    Parameters
    ----------
    flux : pandas.DataFrame
        Table with a 'time' column of "%Y-%m-%d %H:%M:%S" strings.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place, returned for convenience.

    Raises
    ------
    ValueError
        If the table has no 'time' column, or a timestamp does not match
        the expected format.
    """
    if 'time' not in flux.columns:
        raise ValueError("The input files do not contain a 'time' column")

    # Parse the whole column once, vectorised, instead of calling
    # datetime.strptime on every row twice (once for year, once for day).
    timestamps = pd.to_datetime(flux['time'], format="%Y-%m-%d %H:%M:%S")
    flux['year'] = timestamps.dt.year.astype('int64')
    flux['day'] = timestamps.dt.day.astype('int64')

    # -9999 is the missing-value sentinel in these tables
    flux.replace(-9999, np.nan, inplace=True)
    flux.drop(columns=['time'], inplace=True)
    return flux


Qle_input = _prepare_flux_table(
    pd.read_csv('./txt/process1_output/Qle_all_sites.csv',
                na_values=[''],
                usecols=flux_usecols))

# Every non-key column is aggregated with custom_agg (defined earlier in this
# notebook), which returns NaN for a day with more than 6 missing time steps
# and the mean of the available values otherwise.
columns_to_aggregate = [col for col in Qle_input.columns if col not in grouping_cols]
agg_dict = {col: custom_agg for col in columns_to_aggregate}

daily_Qle = Qle_input.groupby(grouping_cols).agg(agg_dict).reset_index()

# The partner table derives year/day from its OWN 'time' column. Copying them
# from Qle_input by row index gives the same result only if both files have
# identical row order and length, and silently mislabels days otherwise.
if use_Rnet:
    Rnet_input = _prepare_flux_table(
        pd.read_csv('./txt/process1_output/Rnet_all_sites.csv',
                    na_values=[''],
                    usecols=flux_usecols))
    daily_Rnet = Rnet_input.groupby(grouping_cols).agg(agg_dict).reset_index()
else:
    Qh_input = _prepare_flux_table(
        pd.read_csv('./txt/process1_output/Qh_all_sites.csv',
                    na_values=['-9999'],
                    usecols=flux_usecols))
    daily_Qh = Qh_input.groupby(grouping_cols).agg(agg_dict).reset_index()

In [42]:
# Display the value columns (every non-key column of Qle_input) that were aggregated to daily values
columns_to_aggregate

['model_CABLE',
 'model_CABLE-POP-CN',
 'model_CHTESSEL_Ref_exp1',
 'model_CLM5a',
 'model_GFDL',
 'model_JULES_GL9',
 'model_JULES_GL9_withLAI',
 'model_MATSIRO',
 'model_MuSICA',
 'model_NASAEnt',
 'model_NoahMPv401',
 'model_ORC2_r6593',
 'model_ORC3_r8120',
 'model_QUINCY',
 'model_STEMMUS-SCOPE',
 'obs']

In [43]:
# Build the daily EF table from the daily flux tables:
#   use_Rnet True  -> EF = Qle / Rnet
#   use_Rnet False -> EF = Qle / (Qle + Qh)
# Both frames start as copies of daily_Qle so they carry the key columns
# (year, month, day, site_name). Any value column that is not listed in
# model_names['model_select_new'] therefore keeps its Qle values.
daily_EF          = deepcopy(daily_Qle)
daily_EF_output   = deepcopy(daily_Qle)

for model_name in model_names['model_select_new']:
    # The observation column is called 'obs'; model columns carry a 'model_' prefix
    head = '' if model_name == 'obs' else 'model_'
    column = head + model_name

    # No range QC is applied; a zero denominator propagates as inf/NaN.
    if use_Rnet:
        # qc_mask = (daily_Qle[head + model_name]<=800) & (daily_Qle[head + model_name]>=-300) & (daily_Rnet[head + model_name]<=2000) & (daily_Rnet[head + model_name]>=-1000)
        daily_EF.loc[:, column] = daily_Qle[column]/daily_Rnet[column]
    else:
        # qc_mask = (daily_Qle[head + model_name]<=800) & (daily_Qle[head + model_name]>=-300) & (daily_Qh[head + model_name]<=800) & (daily_Qh[head + model_name]>=-300)
        daily_EF.loc[:, column] = daily_Qle[column]/(daily_Qle[column]+daily_Qh[column])

    if X_day > 1:
        for site_name in site_names:
            # Fixed: the mask was built from the undefined name
            # `daily_EF_smoothed`, which raised NameError whenever X_day > 1.
            site_mask = (daily_EF_output['site_name']==site_name)

            # Trailing X_day-row rolling mean within the site; min_periods=1
            # keeps a value as long as one row in the window is not NaN.
            daily_EF_output.loc[site_mask, column] = daily_EF.loc[site_mask, column].rolling(window=X_day, min_periods=1).mean()

if X_day <= 1:
    # No smoothing requested: the output IS the daily table. This binds both
    # names to the same DataFrame (as the original loop did), so later
    # in-place edits of daily_EF_output also change daily_EF.
    daily_EF_output = daily_EF

### If a day's EF is NaN for any model/obs, set that day's EF to NaN for all models/obs

In [47]:
# From here on model_names is the flat list of value columns (it replaces the
# structure returned by load_default_list).
model_names = columns_to_aggregate

# Rows where at least one model/obs column is NaN get NaN in every one of those columns
row_has_gap = daily_EF_output[model_names].isna().any(axis=1)
daily_EF_output[model_names] = daily_EF_output[model_names].mask(row_has_gap)

In [48]:
# Sites listed for removal; only referenced by the disabled check at the end of this cell
remove_site=['AU-Rig','AU-Rob','AU-Whr','AU-Ync','CA-NS1','CA-NS2','CA-NS4','CA-NS5','CA-NS6', 
             'CA-NS7','CA-SF1','CA-SF2','CA-SF3','RU-Che','RU-Zot','UK-PL3','US-SP1',
             'AU-Wrr','CN-Din','US-WCr','ZM-Mon']

# For every site print: name, number of NaN days in 'model_CABLE', and that count as a percentage
for site_name in site_names:
    site_mask = (daily_EF_output['site_name'] == site_name)
    cable_ef = daily_EF_output.loc[site_mask, 'model_CABLE']
    n_missing = np.sum(np.isnan(cable_ef))
    lost_percent = (n_missing/len(cable_ef))*100
    print(site_name, n_missing, lost_percent)
    # Disabled check: flag sites that lost every day but are not in remove_site
    # if lost_percent == 100 and not site_name in  remove_site:
    #     print('lose ', site_name)

AR-SLu 0 0.0
AT-Neu 26 0.6469270962926101
AU-ASM 18 0.7036747458952306
AU-Cow 20 0.9124087591240875
AU-Cpr 4 0.1563721657544957
AU-Ctr 5 0.17105713308244952
AU-Cum 3 0.13686131386861314
AU-DaP 2 0.13679890560875513
AU-DaS 4 0.13684570646595962
AU-Dry 2 0.10946907498631638
AU-Emr 2 0.273224043715847
AU-GWW 3 0.16420361247947454
AU-Gin 22 1.0031919744642042
AU-How 8 0.145985401459854
AU-Lit 2 0.273224043715847
AU-Otw 1 0.13679890560875513
AU-Rig 5 0.22799817601459188
AU-Rob 1462 100.0
AU-Sam 14 0.547302580140735
AU-Stp 11 0.376325692781389
AU-TTE 2 0.10946907498631638
AU-Tum 3 0.05132591958939265
AU-Whr 19 2.5956284153005464
AU-Wrr 732 100.0
AU-Ync 2558 100.0
BE-Bra 7 0.17417267977108733
BE-Lon 16 0.43799616753353404
BE-Vie 8 0.12167300380228137
BR-Sa3 9 0.8211678832116789
BW-Ma1 2 0.544959128065395
CA-NS1 0 0.0
CA-NS2 1097 100.0
CA-NS4 2 0.273224043715847
CA-NS5 2 0.273224043715847
CA-NS6 1097 100.0
CA-NS7 732 100.0
CA-Qcu 229 12.534209085933224
CA-Qfo 4 0.1563721657544957
CA-SF1 32 2.9

### Save the daily values

In [49]:
# Strip the flux value columns from Qle_input so that only the per-time-step
# key columns remain; the daily EF values are merged back onto these keys in
# the next cell. Re-running this cell raises KeyError because the columns are
# already gone.
flux_value_cols = ['model_CABLE', 'model_CABLE-POP-CN', 'model_CHTESSEL_Ref_exp1', 'model_CLM5a',
                   'model_GFDL', 'model_JULES_GL9', 'model_JULES_GL9_withLAI', 'model_MATSIRO',
                   'model_MuSICA', 'model_NASAEnt', 'model_NoahMPv401', 'model_ORC2_r6593',
                   'model_ORC3_r8120', 'model_QUINCY', 'model_STEMMUS-SCOPE', 'obs']
Qle_input.drop(flux_value_cols, axis=1, inplace=True)

Qle_input

Unnamed: 0,month,site_name,year,day
0,1.0,AR-SLu,2010,1
1,1.0,AR-SLu,2010,1
2,1.0,AR-SLu,2010,1
3,1.0,AR-SLu,2010,1
4,1.0,AR-SLu,2010,1
...,...,...,...,...
17137987,12.0,ZM-Mon,2008,31
17137988,12.0,ZM-Mon,2008,31
17137989,12.0,ZM-Mon,2008,31
17137990,12.0,ZM-Mon,2008,31


In [50]:
# Left-join the daily EF table onto the rows of Qle_input, matching on the
# day/site keys, so every Qle_input row receives its day's EF values.
day_keys = ['year', 'month', 'day', 'site_name']
var_output = Qle_input.merge(daily_EF_output, how='left', on=day_keys)

In [51]:
# Write the daily tables and the merged EF table to ./txt/process2_output/.
# Daily Qle goes to the same file in either mode; the other file names depend
# on use_Rnet.
daily_Qle.to_csv('./txt/process2_output/Qle_all_sites_daily_mean.csv', index=False)

if use_Rnet:
    daily_Rnet.to_csv('./txt/process2_output/Rnet_all_sites_daily_mean.csv', index=False)
    daily_EF.to_csv('./txt/process2_output/EF_all_sites_daily_mean_use_Rnet.csv', index=False)
    var_output.to_csv(f'./txt/process2_output/EF_all_sites_{X_day}_day_mean_use_Rnet.csv', index=False)
else:
    daily_Qh.to_csv('./txt/process2_output/Qh_all_sites_daily_mean.csv', index=False)
    daily_EF.to_csv('./txt/process2_output/EF_all_sites_daily_mean.csv', index=False)
    var_output.to_csv(f'./txt/process2_output/EF_all_sites_{X_day}_day_mean.csv', index=False)
    

# Calculate the 5-day mean

In [52]:
# Smooth the daily EF with a trailing X_day-row rolling mean, site by site.
# daily_EF_smooth starts as a copy of daily_Qle, so it carries the key columns;
# any value column not listed in model_names['model_select_new'] keeps its Qle
# values.
X_day             = 5
daily_EF_smooth   = deepcopy(daily_Qle)
# Reload the default lists (model_names was rebound to a column list earlier)
site_names, IGBP_types, clim_types, model_names = load_default_list()

for model_name in model_names['model_select_new']:
    # 'obs' has no prefix; model columns are named 'model_<name>'
    head = '' if model_name == 'obs' else 'model_'
    column = head + model_name

    if X_day <= 1:
        continue

    for site_name in site_names:
        site_mask = (daily_EF_smooth['site_name']==site_name)

        # Rolling mean over this site's rows only; min_periods=1 yields a value
        # as soon as one row in the window is not NaN.
        site_series = daily_EF.loc[site_mask, column]
        daily_EF_smooth.loc[site_mask, column] = site_series.rolling(window=X_day, min_periods=1).mean()

In [53]:
# Switch model_names back to the flat list of value columns
model_names = columns_to_aggregate

# Rows where at least one model/obs column is NaN get NaN in every one of those columns
row_has_gap = daily_EF_smooth[model_names].isna().any(axis=1)
daily_EF_smooth[model_names] = daily_EF_smooth[model_names].mask(row_has_gap)

In [54]:
# Left-join the smoothed EF table onto the rows of Qle_input, matching on the
# day/site keys, so every Qle_input row receives its day's smoothed EF values.
day_keys = ['year', 'month', 'day', 'site_name']
var_output_5day = Qle_input.merge(daily_EF_smooth, how='left', on=day_keys)

In [56]:
# Write the smoothed EF table; the file name encodes X_day and whether Rnet was used
if not use_Rnet:
    var_output_5day.to_csv(f'./txt/process2_output/EF_all_sites_{X_day}_day_mean.csv', index=False)
else:
    var_output_5day.to_csv(f'./txt/process2_output/EF_all_sites_{X_day}_day_mean_use_Rnet.csv', index=False)
    

In [58]:
# Compare how many 'model_CABLE' rows are NaN in the first merged table vs the 5-day one
nan_count_daily = np.sum(np.isnan(var_output['model_CABLE']))
nan_count_5day = np.sum(np.isnan(var_output_5day['model_CABLE']))
print(nan_count_daily, nan_count_5day)

1145179 1016370
