# Effect of Forcings on CAMELs Simulations

Now we can look at the output and see if there are any patterns across the variables or across basin characteristics.

First we load the imports.

In [None]:
%pylab inline
#import cartopy
#import geoviews as gv
#import geopandas as gpd
#import holoviews as hv
import pandas as pd
import xarray as xr
#import seaborn as sns
import ogr
from scipy import stats

# Notebook-wide matplotlib defaults: figure size and resolution (dpi).
# `pylab` and `plt` are not imported explicitly; both names come from the
# `%pylab inline` magic at the top of this cell.
pylab.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 96
#hv.notebook_extension('bokeh')

<br>
Set the directory paths used throughout the notebook.

In [None]:
# Directory layout: work area -> CAMELs project folder -> summa_camels folder.
# Every file path used later in the notebook is built from `folder` or `folders`.
top = '/glade/work/ashleyvb'
folder = f'{top}/CAMELs'
folders = f'{folder}/summa_camels'

<br>

# Summary Statistics of Error on output
Let's look at some error metrics by HRU.
KGE equals 1 for perfect agreement, and a value below 0 means the mean is a better guess. 
Bias equals 0 for perfect agreement, and larger values mean larger error. 
A 1 is added to the terms of both metrics so that we never divide by 0. 

In [None]:
# Reference ("truth") run that every other simulation is compared against.
sim_truth = xr.open_dataset(f'{folders}/output/merged_day/NLDAStruth_hru.nc')

In [None]:
# Labels for the experiment dimensions.
# cm_vars x est_kind select the simulation file that is compared with the truth run;
# error_kind and seas_kind label the metrics computed for each comparison.
cm_vars = [
    'all', 'airpres', 'airtemp', 'LWRadAtm',
    'pptrate', 'spechum', 'SWRadAtm', 'windspd',
]
error_kind = ['bias', 'kge']
est_kind = ['constant', 'metsim']
seas_kind = ['YEAR', 'DJF', 'MAM', 'JJA', 'SON']

# Simulated variables to evaluate, grouped by kind.
var_sim = (
    # forcing
    ['airpres', 'airtemp', 'LWRadAtm', 'pptrate', 'spechum', 'SWRadAtm', 'windspd']
    # liquid water fluxes for the soil domain
    + ['scalarSurfaceRunoff', 'scalarAquiferBaseflow', 'scalarInfiltration',
       'scalarRainPlusMelt', 'scalarSoilDrainage']
    # turbulent heat transfer
    + ['scalarLatHeatTotal', 'scalarSenHeatTotal', 'scalarSnowSublimation']
    # snow
    + ['scalarSWE']
    # vegetation
    + ['scalarCanopyWat']
    # derived
    + ['scalarNetRadiation', 'scalarTotalET', 'scalarTotalRunoff', 'scalarTotalSoilWat']
)

In [None]:
# Helpers for the correlation term of the KGE
def covariance(x, y, dims=None):
    """Covariance of ``x`` and ``y`` reduced over ``dims``.

    Each input is centred on its own mean over ``dims``; the centred values are
    multiplied and summed with ``xr.dot`` and the sum is divided by
    ``x.count(dims)``.
    """
    x_anom = x - x.mean(dims)
    y_anom = y - y.mean(dims)
    n_obs = x.count(dims)
    return xr.dot(x_anom, y_anom, dims=dims) / n_obs

def correlation(x, y, dims=None):
    """Correlation of ``x`` and ``y`` over ``dims``: covariance divided by the product of the two standard deviations."""
    spread = x.std(dims) * y.std(dims)
    return covariance(x, y, dims) / spread

In [None]:
# Allocate a NaN-filled container for the error metrics: one array per simulated
# variable, indexed by (hru, var, estimation, error, season).
hrud = sim_truth['hru']  # indices here are 0 to number of basins
dims = ('hru', 'var', 'estimation', 'error', 'season')
coords = dict(zip(dims, (hrud, cm_vars, est_kind, error_kind, seas_kind)))
shape = tuple(len(coords[d]) for d in dims)
error_data = xr.Dataset(coords=coords)
for s in var_sim:
    error_data[s] = xr.DataArray(np.full(shape, np.nan), coords=coords, dims=dims, name=s)

<br>
Now run the actual computations on KGE. This takes 35 min using all 671 basins. 

In [None]:
%%time
# Fill `error_data` with the KGE and bias of every simulation against the truth run,
# for the whole record and for each season.
truth0 = sim_truth.drop_vars('hruId').load()
for v in cm_vars:
    for c in est_kind:
        # Read the run fully into memory inside a `with` block so the netCDF file
        # handle is released immediately instead of staying open for every run.
        sim_path = folders+'/output/merged_day/NLDAS' + c + '_' + v +'_hru.nc'
        with xr.open_dataset(sim_path) as sim_file:
            sim0 = sim_file.drop_vars('hruId').load()
        for i, t in enumerate(seas_kind):
            if i == 0:
                # first entry of seas_kind ('YEAR'): use the whole record
                truth = truth0
                sim = sim0
            else:
                # remaining entries are season labels: keep only matching time steps
                truth = truth0.sel(time=truth0['time.season']==t)
                sim = sim0.sel(time=sim0['time.season']==t)

            # Start from a time-reduced dataset so `r` has the right layout, then
            # overwrite each evaluated variable with its correlation
            # (xr.dot is not supported on a whole Dataset, hence the loop).
            r = sim.mean(dim='time')
            for s in var_sim:
                r[s] = correlation(sim[s],truth[s],dims='time')
            # KGE value for each hru: 1 - sqrt((r-1)^2 + (std ratio - 1)^2 + (mean ratio - 1)^2).
            # 1 is added to numerator and denominator of both ratios so no nan.
            ds = 1 - np.sqrt( np.square(r-1)
                + np.square( (sim.std(dim='time')+1)/(truth.std(dim='time')+1) - 1)
                + np.square( (sim.mean(dim='time')+1)/(truth.mean(dim='time')+1) - 1) )
            ds0 = ds.load()
            # bias value for each hru: time mean of |sim - truth| / (truth + 1), add 1 so no nan
            ds = np.abs(sim-truth)/(truth+1)
            ds1 = ds.mean(dim='time').load()
            for s in var_sim:
                error_data[s].loc[:,v,c,'kge',t]  = ds0[s]
                error_data[s].loc[:,v,c,'bias',t] = ds1[s]
    print(v)  # progress: one line per finished forcing variable

In [None]:
# Add hruId as coordinates: relabel the 'hru' axis of error_data with the basin ids
# stored in the truth file. The reassignment is built from `error_data` itself, so
# the cell runs on a fresh kernel and re-running it gives the same result.
hru_ids = np.array(sim_truth['hruId'])
error_data = error_data.assign_coords(hru=hru_ids)
# Basin ids in the order used by error_data; used later to select matching attribute rows.
the_hru = np.array(error_data['hru'])

In [None]:
# Checkpoint the error metrics to disk so the long computation above does not
# have to be repeated if the kernel hangs.
error_data.to_netcdf(folder+'/regress_data/error_data.nc') #save this in case it hangs up

<br>
KGE does not need to be normalized. We plot the HRU error as a stack of values, one colored layer per forcing; a layer with no error has a height of 1. Values less than 0 are plotted as 0. 

In [None]:
#error_data =  xr.open_dataset(folder+'/regress_data/error_data.nc') #read this in case it hangs up

In [None]:
# Setup plots
x = np.arange(len(hrud))  # one bar position per basin
col_vars = ['gray','y','r','g','orange','c','m','b']  # one colour per entry of cm_vars
letter = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')  # panel labels
# Panels are laid out on 3 rows. The column count must be a true integer:
# a float such as 7.0 is rejected by plt.subplot in current matplotlib.
wid = int(np.ceil(len(var_sim)/3))
# Put a tick roughly every tenth basin, but never use a step smaller than 1.
inc = max(1, len(hrud)//10)
xtic = list(range(1, len(hrud)+1, inc))
xtics = [str(i) for i in xtic]
labels = ["V"+i for i in xtics]

In [None]:
%%time
# Just plot all-- takes 8 min. Maybe add winter, and summer if you want to see more detail.
# One figure per estimation kind and selected season, one panel per simulated variable.
ind = [0]#,1,3]
seas_kind0 = [ seas_kind[i] for i in ind]
for c in est_kind:
    for t in seas_kind0:
        plot1 = plt.figure(1, figsize = (20,10))

        for i, s in enumerate(var_sim):
            data0 = error_data[s].loc[:,:,c,'kge',t]
            data = data0.where(data0>0,0) #make the negative values be 0
            data_Master = [0] * len(hrud)

            # int(): the subplot grid size must be an integer even if `wid` was computed as a float
            plot2 = plt.subplot(3,int(wid),i+1)
            for j, v in enumerate(cm_vars):
                plt.bar(height = data.loc[:,v], x = x, width = 1.0, color = col_vars[j], bottom = data_Master)
                # Each forcing gets its own unit-high band: the next layer starts at the
                # integer j+1 rather than on top of the previous bar.
                #data_Master = [m + n for m, n in zip(data_Master, data.loc[:,v])]
                data_Master = [j+1] * len(hrud)

            plt.title('('+letter[i]+') '+s)
            plt.ylim(0,len(cm_vars))
            plt.xticks(xtic, labels, fontsize = 3)
            plt.yticks(np.arange(0, len(cm_vars)+.05, 1).tolist())
            plt.tick_params(axis = "x", which = "both", bottom = False, top = False)
            plt.xlabel("CAMELS basin (v1-v671)", fontsize = 9)
            plt.ylabel("KGE", fontsize = 9)

        plt.subplots_adjust(hspace = .4)

        # Empty scatter series exist only to create one legend entry per forcing colour.
        for j, v in enumerate(cm_vars):
            plt.scatter([],[], color = col_vars[j], label = t + '_NLDAS_' + c + '_' + v)
        plt.figlegend(loc = 'lower right')
        plt.show()

<br>
We see that the pptrate and air pressure would be better off constant than at MetSim values (thinner orange and yellow layers in the MetSim plots), but that the air pressure does not matter in the variable calculation (except simulation of air pressure itself). Air temperature has less error in MetSim. 
By season, there is more error in the winter in both Metsim and Constant.

<br>

# Correlations of Error and Basin Attributes
We look at the basin attributes to see if there are any patterns with the error sizes. 
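We use the Kendall non-parametric correlation based on ranks, so that error magnitude (that is likely more affected by calibrated or not calibrated parameters) is not a factor. (Note: the code below currently computes the Pearson correlation with `stats.pearsonr`, and the plot axis is labelled accordingly.) 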
The attribute file that SUMMA uses does not have many continuous variables in it, so we use the raw attribute data that would have been used to derive the SUMMA attribute file. 
**TODO:** test the Budyko relationship for each setup?

In [None]:
# Here is the file used by SUMMA. It does not have many values and a bunch of them are indices.
# It is printed so its contents can be inspected; only `hruId` is taken from it further down.
attrib0 = xr.open_dataset(folders+'/settings.v1/attributes.nc')
print(attrib0)

In [None]:
# And here are the attribute data
# variables to regress, take only floats
lr_attrib0 = attrib0.get(['hruId'])
lr_attrib0 = lr_attrib0.assign_coords(hru=lr_attrib0['hruId'])
file_name = ['clim','geol','hydro','soil','topo','vege']
# Running totals used later as tick positions, one per attribute file.
# This must be its own list: binding it to `file_name` would overwrite the group
# names with numbers while the loop runs.
n_attrib = [0] * len(file_name)
for i, f in enumerate(file_name):
    df = pd.read_csv(folder+'/regress_data/camels_'+f+'.txt',delimiter=';')
    # Positional row index so the table becomes a dataset with an 'hru' dimension.
    df['hru'] = range(len(df))
    xr_tmp = df.set_index(['hru']).to_xarray()
    # Keep only float64 variables. The integer 'hru' index is presumably dropped here
    # as well, in which case the merge below lines rows up by position -- TODO confirm.
    xr_att = xr_tmp.drop_vars([ var for var in xr_tmp.variables if not 'float64'==xr_tmp[var].dtype ])
    if i == 0:
        n_attrib[i] = len(xr_att.variables)-1
    else:
        n_attrib[i] = len(xr_att.variables)+n_attrib[i-1]
    lr_attrib0 = xr.merge([lr_attrib0,xr_att])

In [None]:
# Select basins and print results.
# lr_attrib0 already carries hruId as its 'hru' coordinate (assigned when it was
# created), so hruId is only dropped as a data variable here. The former first line
# of this cell referenced names that are never defined and its result was
# overwritten immediately, so it has been removed.
lr_attrib = lr_attrib0.drop_vars('hruId').load()
lr_attrib = lr_attrib.sel(hru=the_hru)  # same basins, same order as error_data
attrib_kind = list(lr_attrib.variables.keys())
print(n_attrib)
print(lr_attrib)

<br>
Now run the regressions and plot. 

In [None]:
# Allocate a NaN-filled container for the correlations: one array per simulated
# variable, indexed by (attrib, var, estimation, error, season).
attrib_num = ['clim','geol','hydro','soil','topo','vege']
dims = ('attrib', 'var', 'estimation', 'error', 'season')
coords = dict(zip(dims, (attrib_kind, cm_vars, est_kind, error_kind, seas_kind)))
shape = tuple(len(coords[d]) for d in dims)
corr_data = xr.Dataset(coords=coords)
for s in var_sim:
    corr_data[s] = xr.DataArray(np.full(shape, np.nan), coords=coords, dims=dims, name=s)

In [None]:
def r_cor(x, y, pthres = 0.01, direction = False, method = 'pearson'):
    """
    Significance-filtered correlation between two 1-D samples, using scipy.stats.

    Pairs in which either value is NaN are ignored.

    :x: first sample (array-like of numbers)
    :y: second sample, same length as ``x``
    :pthres: significance threshold applied to the p-value of the test
    :direction: if True, return only the sign of the correlation (-1, 0 or 1)
    :method: 'pearson' (default), 'spearman' or 'kendall'
    :return: NaN if fewer than 10 valid pairs remain, if either sample is
             (near) constant, or if the statistic is undefined; 0 if the
             correlation is not significant at ``pthres``; otherwise the
             correlation statistic, or its sign when ``direction`` is True
    :raises ValueError: if ``method`` is not one of the supported names
    """
    tests = {
        'pearson': stats.pearsonr,
        'spearman': stats.spearmanr,
        'kendall': stats.kendalltau,
    }
    if method not in tests:
        raise ValueError("method must be one of " + ", ".join(sorted(tests)))

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Keep only the pairs where both values are present
    valid = ~np.logical_or(np.isnan(x), np.isnan(y))
    xv, yv = x[valid], y[valid]
    if len(xv) < 10: # If fewer than 10 data return nan
        return np.nan

    # A (near) constant sample has no defined correlation. `<=` rather than `<`
    # so that a sample that is exactly constant at 0 is caught too.
    for sample in (xv, yv):
        centre = np.mean(sample)
        if np.linalg.norm(sample - centre) <= 1e-13 * abs(centre):
            return np.nan

    stat, p_value = tests[method](xv, yv)
    if np.isnan(stat) or np.isnan(p_value):
        # The test could not be evaluated; do not report this as "not significant"
        return np.nan

    # Only significant correlations are reported
    if p_value < pthres:
        if direction:
            return int(np.sign(stat))
        return stat
    return 0

# Wrapper called by the correlation loop to run the correlation test
def rank_correlation(x,y):
    """Apply ``r_cor`` (default arguments) to ``x`` and ``y`` through ``xr.apply_ufunc``."""
    result = xr.apply_ufunc(r_cor, x, y)
    return result

In [None]:
%%time
# This takes ~7 min
# Correlate every basin attribute with every error series across basins.
# The (var, estimation, error, season) combinations are listed once, in the same
# order the nested loops would visit them.
combos = [(v, c, k, t)
          for v in cm_vars
          for c in est_kind
          for k in error_kind
          for t in seas_kind]
for a in attrib_kind:
    ds0 = lr_attrib[a]
    for v, c, k, t in combos:
        for s in var_sim:
            ds1 = error_data[s].loc[:, v, c, k, t]
            value = rank_correlation(ds0.values, ds1.values)
            corr_data[s].loc[a, v, c, k, t] = value

In [None]:
# Setup plots
x = np.arange(len(attrib_kind))  # one bar position per attribute
col_vars = ['gray','y','r','g','orange','c','m','b']  # one colour per entry of cm_vars
letter = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')  # panel labels
# Panels are laid out on 3 rows. The column count must be a true integer:
# a float such as 7.0 is rejected by plt.subplot in current matplotlib.
wid = int(np.ceil(len(var_sim)/3))
# Tick positions and labels: one tick per attribute file
xtic = n_attrib
labels = file_name

In [None]:
%%time
# Plot only the whole-record ('YEAR') results; add indices 1 and 3 to `ind` for winter and summer.
# One figure per estimation kind and selected season, one panel per simulated variable.
ind = [0] #,1,3]
seas_kind0 = [ seas_kind[i] for i in ind]
for c in est_kind:
    for t in seas_kind0:
        plot1 = plt.figure(1, figsize = (20,10))

        for i, s in enumerate(var_sim):
            data0 = corr_data[s].loc[:,:,c,'kge',t]
            #data = abs(data0)
            data = data0
            data_Master = [0] * len(attrib_kind)

            # int(): the subplot grid size must be an integer even if `wid` was computed as a float
            plot2 = plt.subplot(3,int(wid),i+1)
            for j, v in enumerate(cm_vars):
                plt.bar(height = data.loc[:,v], x = x, width = 1.0, color = col_vars[j], bottom = data_Master)
                # Each forcing gets its own unit-high band: the next layer starts at the
                # integer j+1 rather than on top of the previous bar.
                #data_Master = [m + n for m, n in zip(data_Master, data.loc[:,v])]
                data_Master = [j+1] * len(attrib_kind)

            plt.title('('+letter[i]+') '+s)
            plt.xticks(xtic, labels, fontsize = 3)
            #plt.ylim(0,(len(cm_vars)/1.8))
            #plt.yticks(np.arange(0, (len(cm_vars)/2.0)+.55, 0.5).tolist())
            # NOTE(review): the first band starts at 0 and the axis also starts at 0, so
            # negative correlations of the first forcing fall outside the visible range.
            plt.ylim(0,len(cm_vars))
            plt.yticks(np.arange(-0.5, len(cm_vars)+.05, 1).tolist())
            plt.tick_params(axis = "x", which = "both", bottom = False, top = False)
            plt.xlabel("CAMELS Attrib (a1-a52)", fontsize = 9)
            plt.ylabel("Pearson Correlation with KGE", fontsize = 9)

        plt.subplots_adjust(hspace = .4)

        # Empty scatter series exist only to create one legend entry per forcing colour.
        for j, v in enumerate(cm_vars):
            plt.scatter([],[], color = col_vars[j], label = t + '_NLDAS_' + c + '_' + v)
        plt.figlegend(loc = 'lower right')
        plt.show()