In [None]:
# standard python utilities
import os
from os.path import join, basename,dirname, exists, expanduser
import sys
import glob
import pandas as pd
import numpy as np
import time

# standard python plotting utilities
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

# standard geospatial python utilities
# import pyproj # for converting proj4string
# import shapely
import geopandas as gpd
# import rasterio

# mapping utilities
import contextily as ctx
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm
from matplotlib.ticker import MaxNLocator


In [None]:
# Root folders. Everything is resolved relative to the user's home directory.
usr_dir = expanduser('~')
doc_dir = join(usr_dir, 'Documents')

# dir of all gwfm data (sits next to Documents, inside the Box sync folder)
gwfm_dir = dirname(doc_dir)+'/Box/research_cosumnes/GWFlowModel'
# project dir; the trailing slash is relied on by the string concatenations below
proj_dir = gwfm_dir + '/Oneto_Denier/'
# dir of stream level data for seepage study
dat_dir = proj_dir+'Stream_level_data/'

# proj_dir already ends with '/', so no leading separator here (avoids '//')
fig_dir = proj_dir+'Streambed_seepage/figures/'
hob_dir = join(gwfm_dir, 'HOB_data')
sfr_dir = gwfm_dir+'/SFR_data/'




In [None]:
def add_path(fxn_dir):
    """Put `fxn_dir` at the front of `sys.path` unless it is already listed.

    Front placement means modules found in `fxn_dir` are picked up before
    same-named modules further down the search path.
    """
    if fxn_dir in sys.path:
        return
    sys.path.insert(0, fxn_dir)

# Put the local flopy checkout first on sys.path before importing it, so that
# copy is the one picked up by the import below.
add_path(doc_dir+'/GitHub/flopy')
import flopy 
# Project helper modules (mf_utility, map_cln) are imported from this folder.
py_dir = join(doc_dir,'GitHub/CosumnesRiverRecharge/python_utilities')
add_path(py_dir)
from mf_utility import get_dates, get_layer_from_elev, clean_wb
from map_cln import gdf_bnds, plt_cln

# from importlib import reload
# import mf_utility
# reload(mf_utility)

In [None]:
# scenario specific function
from OD_utility import run_stats

In [None]:
# scenario = '' # baseline, levee removal occurred in 2014
# Scenario identifier. A non-empty value is appended to the model workspace
# name when the model is loaded; this one is the case where levee removal
# didn't occur.
scenario = 'no_reconnection'

In [None]:
# Model runs live under WRDAPP on either the external (F:) or the local (C:)
# drive; the external drive is preferred when both exist.
ext_dir = 'F:/WRDAPP'
c_dir = 'C:/WRDAPP'
if os.path.exists(ext_dir):
    loadpth = ext_dir
elif os.path.exists(c_dir):
    loadpth = c_dir
else:
    # fail with a clear message instead of a NameError on the line below
    raise FileNotFoundError(
        'Neither ' + ext_dir + ' nor ' + c_dir + ' exists; cannot locate the model runs')
loadpth +=  '/GWFlowModel/Cosumnes/Stream_seepage'

upscale = 'upscale4x_'
# model_nam = 'oneto_denier_'+upscale+'2014_2018'
model_nam = 'oneto_denier_'+upscale+'2014_2020'
model_ws = join(loadpth,model_nam)

# scenario runs are stored in a sibling folder named <model_nam>_<scenario>
if scenario != '':
    model_ws += '_' + scenario

# model_ws = join(loadpth,'parallel_oneto_denier','realization000')
# only the packages listed here are loaded from the name file
load_only = ['DIS','BAS6','UPW','SFR','OC', "EVT",'LAK']
m = flopy.modflow.Modflow.load('MF.nam', model_ws= model_ws,
                                exe_name='mf-owhm.exe', version='mfnwt',
                              load_only=load_only,
                              )


In [None]:
# number of rows and columns in the loaded model's discretization
nrow, ncol = (m.dis.nrow, m.dis.ncol)

In [None]:
# Workspace of the run without a scenario suffix; objects loaded from it
# further down carry a trailing 0 (hdobj0, wb0, sfrdf0).
model_ws0 = join(loadpth,model_nam)

In [None]:
# Print the same set of quantiles for horizontal (HK) and vertical (VKA)
# hydraulic conductivity of the loaded UPW package.
quantile_levels = [0,0.5,0.6,0.75,1]
print('Quantiles: ',quantile_levels)
for prop_label, prop_array in (('HK :', m.upw.hk.array), ('VKA :', m.upw.vka.array)):
    print(prop_label, np.quantile(prop_array, quantile_levels))

In [None]:
# Load the model grid polygons and build a single-polygon model domain.
model_grp = 'inset_oneto_denier'
grid_dir = join(gwfm_dir, 'DIS_data/streambed_seepage/grid')
grid_fn = join(grid_dir, model_grp,'rm_only_grid.shp')
grid_p = gpd.read_file(grid_fn)
# NOTE(review): this assigns the CRS label directly; no reprojection call is
# made here — confirm the shapefile coordinates are already EPSG:32610.
grid_p.crs='epsg:32610'
# one-row GeoDataFrame whose geometry is the union of all grid cells
m_domain = gpd.GeoDataFrame(pd.DataFrame([0]), geometry = [grid_p.unary_union], crs=grid_p.crs)

In [None]:
# Cross-section table stored with the model run, converted to point geometries
# from its Easting/Northing columns.
XSg = pd.read_csv(join(model_ws,'04_XSg_filled.csv'))
XSg = gpd.GeoDataFrame(XSg, geometry = gpd.points_from_xy(XSg.Easting, XSg.Northing), crs='epsg:32610')

# overwrite SFR segment/reach input relevant to seepage
# sensor_dict = pd.read_csv(join(model_ws, 'sensor_xs_dict.csv'), index_col=0)
# XS_params = sensor_dict.join(params.set_index('Sensor'), on='Sensor')

In [None]:
# Build a lookup table (vka_quants) that maps ranges of vertical conductivity
# to the four zones listed in ZonePropertiesInitial.csv.
params = pd.read_csv(model_ws+'/ZonePropertiesInitial.csv', index_col='Zone')
# convert from m/s to m/d
params['K_m_d'] = params.K_m_s * 86400 
vka = m.upw.vka.array
# zone ids 1..4
tprogs_vals = np.arange(1,5)
# proportions, reversed so the last listed value (0.058) comes first
# NOTE(review): presumably these are the facies proportions for zones 1..4 — confirm
tprogs_hist = np.flip([0.590, 0.155, 0.197, 0.058])
# descending quantile edges: [1, 0.942, 0.745, 0.59, 0]
tprogs_quants = 1-np.append([0], np.cumsum(tprogs_hist)/np.sum(tprogs_hist))
# 'quant' is the lower quantile edge of each zone
vka_quants = pd.DataFrame(tprogs_quants[1:], columns=['quant'], index=tprogs_vals)
# dataframe summarizing dominant facies based on quantiles
# lower/upper vka bound of each zone; zone 1 spans the highest values
vka_quants['vka_min'] = np.quantile(vka, tprogs_quants[1:])
vka_quants['vka_max'] = np.quantile(vka, tprogs_quants[:-1])
vka_quants['facies'] = params.loc[tprogs_vals].Lithology.values

In [None]:
# Attach grid geometry to every SFR reach and label each reach with the facies
# whose vka range contains the reach cell's vertical conductivity.
sfr_reach = pd.DataFrame(m.sfr.reach_data)
# reach_data i/j are 0-based while the grid row/column are 1-based
grid_sfr = grid_p.set_index(['row','column']).loc[list(zip(sfr_reach.i+1,sfr_reach.j+1))].reset_index(drop=True)
grid_sfr = pd.concat((grid_sfr,sfr_reach),axis=1)
# group sfrdf by vka quantiles
sfr_vka = vka[grid_sfr.k, grid_sfr.i, grid_sfr.j]

# the largest upper bound is quantile(vka, 1), i.e. the maximum vka of the model
vka_top = vka_quants.vka_max.max()
for p in vka_quants.index:
    facies = vka_quants.loc[p]
    if facies.vka_max >= vka_top:
        # top bin is closed on the right so a reach sitting exactly on the
        # maximum vka is still classified instead of being left as NaN
        below_max = sfr_vka <= facies.vka_max
    else:
        below_max = sfr_vka < facies.vka_max
    grid_sfr.loc[below_max & (sfr_vka >= facies.vka_min), 'facies'] = facies.facies
#     # add color for facies plots
# grid_sfr = grid_sfr.join(gel_color.set_index('geology')[['color']], on='facies')

In [None]:
# Build zone arrays: zon_lak flags grid cells matched to the polygon(s) read
# below, zon_mod flags every cell.
lak_shp = join(gwfm_dir,'LAK_data/floodplain_delineation')
# NOTE(review): the original comments here referred to a rectangle around
# "the Dam" / Blodgett Dam, but the file read is LCRFR_2DArea_2015.shp from the
# floodplain_delineation folder — the wording looks copied from another
# notebook; confirm what this polygon represents.
lak_gpd = gpd.read_file(join(lak_shp,'LCRFR_ModelDom_2017/LCRFR_2DArea_2015.shp' )).to_crs('epsg:32610')

# grid cells lying within the polygon(s)
lak_cells = gpd.sjoin(grid_p,lak_gpd,how='right',predicate='within').drop(columns='index_left')

# 1 where a cell was matched above, 0 elsewhere (row/column are 1-based)
zon_lak = np.zeros((grid_p.row.max(),grid_p.column.max()),dtype=int)
zon_lak[lak_cells.row-1,lak_cells.column-1]=1

# single zone covering the whole grid
zon_mod = np.ones((grid_p.row.max(),grid_p.column.max()),dtype=int)

In [None]:
# Lookup tables for water budget terms, read from the workbook in the current
# working directory. The 'owhm_wb_dict' sheet is parsed once and reused for
# both the colour and the display-name dictionaries (keyed by 'flux').
owhm_wb_dict = pd.read_excel('mf_wb_color_dict.xlsx',sheet_name='owhm_wb_dict', header=0, index_col='flux',comment='#')
zon_color_dict = owhm_wb_dict['color'].to_dict()
zon_name_dict = owhm_wb_dict['name'].to_dict()

# maps the 'flopy' column to the 'owhm' column of the 'flopy_to_owhm' sheet
zb_alt = pd.read_excel('mf_wb_color_dict.xlsx',sheet_name='flopy_to_owhm', header=0, index_col='flopy',comment='#').owhm.to_dict()


## Sensor data and XS data

In [None]:
# Monitoring-well table: read, convert lon/lat (EPSG:4326) to point geometries
# and reproject to the grid CRS.
rm_grid = pd.read_csv(join(proj_dir, 'mw_hob_cleaned.csv'))
rm_grid = gpd.GeoDataFrame(rm_grid, geometry = gpd.points_from_xy(rm_grid.Longitude,rm_grid.Latitude), 
                           crs='epsg:4326').to_crs(grid_p.crs)
# 0-based row/column of each well (the table stores them 1-based)
hob_row = rm_grid.row.values-1
hob_col = rm_grid.column.values-1

In [None]:
# long-format groundwater level table stored with the model run; 'dt' is parsed as datetime
gwl_long = pd.read_csv(join(model_ws,'gwl_long.csv'), parse_dates=['dt'])

In [None]:
# XS are every 100 m
# Cross-section point elevations as point geometries (Easting/Northing, EPSG:32610).
xs_all = pd.read_csv(dat_dir+'XS_point_elevations.csv',index_col=0)
xs_all = gpd.GeoDataFrame(xs_all,geometry = gpd.points_from_xy(xs_all.Easting,xs_all.Northing), crs='epsg:32610')


In [None]:

# correspond XS to sensors
# how='right' keeps every row of rm_grid and attaches its nearest XSg row
rm_elev = gpd.sjoin_nearest(XSg, rm_grid, how='right',lsuffix='xs', rsuffix='rm')
#MW_11, MW_CP1 had doubles with sjoin_nearest due to XS duplicates from Oneto_Denier
# keep one row per (cross-section, sensor) pair
rm_elev = rm_elev.drop_duplicates(['xs_num','Sensor'])

## Model output - time variant

In [None]:
# Head output of both runs: hdobj0 from model_ws0, hdobj from model_ws.
hdobj0 = flopy.utils.HeadFile(model_ws0+'/MF.hds')
hdobj = flopy.utils.HeadFile(model_ws+'/MF.hds')
# (time step, stress period) pairs and times saved in the scenario head file
spd_stp = hdobj.get_kstpkper()
times = hdobj.get_times()


In [None]:
# start/end dates and reference datetime derived from the DIS package by the
# project helper get_dates
strt_date, end_date, dt_ref = get_dates(m.dis, ref='strt')


In [None]:
# Water budget of the model_ws0 run plus a quick-look figure:
# percent error, outflow terms and inflow terms on three stacked panels.
wb0, out_cols, in_cols = clean_wb(model_ws0, dt_ref)
wb0_cols = np.append(out_cols, in_cols)

fig,ax= plt.subplots(3,1, sharex=True)
wb0.plot(y='PERCENT_ERROR', ax=ax[0])
wb0.plot(y=out_cols, ax=ax[1], legend=True)
wb0.plot(y=in_cols, ax=ax[2], legend=True)


In [None]:
# Same quick-look for the model_ws (scenario) run. Note that out_cols and
# in_cols are reassigned here with the values returned for this run.
wb, out_cols, in_cols = clean_wb(model_ws, dt_ref)
wb_cols = np.append(out_cols, in_cols)
fig,ax= plt.subplots(3,1, sharex=True)
wb.plot(y='PERCENT_ERROR', ax=ax[0])
wb.plot(y=out_cols, ax=ax[1], legend=True)
wb.plot(y=in_cols, ax=ax[2], legend=True)


## Water budget change
The expected components to change in the water budget are evapotranspiration, groundwater inflow and outflow, and change in storage. Plot each of these on the same plot or with a differenced line.

- Amy Yoder performed a t-test with levee-restoration status as the grouping variable and recharge volume as the response variable (she did this for each recharge event; in this case I would do it for each year). She also plotted cumulative flow volume against recharge for each event and applied a power regression equation to fit groups of pre- and post-restoration events to show how restoration ideally leads to more recharge per cumulative flow.

- If I apply a t-test, we are comparing the annual recharge and storage change between the scenarios to see if they have significantly different average values. Should also plot linear regression to use slope as an explanation of how water budget terms change from the scenarios.

- **Water budget can be compared with riparian zone water budget stats**

The statistic is calculated as (np.mean(a) - np.mean(b))/se, where se is the standard error. Therefore, the statistic will be positive when the sample mean of a is greater than the sample mean of b and negative when the sample mean of a is less than the sample mean of b.  We apply a related t-test because samples are paired by datetime or location.  
- Standard error is $\frac{\sigma}{\sqrt{n}}$ of the differences between all pairs

Two-sample t-test: Decide if the population means for two different groups are equal or not  
Paired t-test: Decide if the difference between paired measurements for a population is zero or not  

The water budget data is generally log-normally distributed so ideally I should apply a log transform before data analysis.

"The paired t test provides an hypothesis test of the difference between population means for a pair of random samples whose differences are approximately normally distributed. Please note that a pair of samples, each of which are not from a normal distribution, often yields differences that are normally distributed." To gain normality one could apply a regular one-sample t-test to the differenced data against an expected mean of 0; the results from the one-sample t-test are the same as those from the paired t-test.



In [None]:
from scipy.stats import ttest_rel, ttest_1samp, linregress


In [None]:
def lin_reg_plt(a, b):
    """Scatter `b` against `a` and overlay the least-squares regression line.

    The fitted equation is annotated on the axes. Drawing goes to the current
    pyplot axes; nothing is returned or printed.

    Parameters
    ----------
    a, b : array-like of equal length
        x and y values; the axis labels name them 'Baseline' and
        'No Reconnection' respectively.
    """
    fit = linregress(a, b)
    slope, intercept = fit.slope, fit.intercept

    plt.scatter(a, b)
    # draw the line across the combined range of both inputs
    lo, hi = np.min((a, b)), np.max((a, b))
    x_range = np.array([[lo], [hi]])
    plt.plot(x_range, slope*x_range + intercept, color='black', linewidth=1)
    equation = 'y = '+str(np.round(slope,3))+'x + '+ str(np.round(intercept,2))
    plt.annotate(equation, (0.1,0.8), xycoords='axes fraction')
    plt.xlabel('Baseline')
    plt.ylabel('No Reconnection')
def run_ttest(a, b, term, freq):
    """Run a paired t-test on `a` and `b` and summarize it as a one-row table.

    Parameters
    ----------
    a, b : array-like of equal length
        Paired samples passed to `scipy.stats.ttest_rel(a, b)`.
    term : str
        Label stored in the 'term' column.
    freq : str
        Currently unused inside this function (the caller adds the 'freq'
        column itself); kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row with columns statistic, pvalue, term, mean_a, mean_b and
        perc_diff_in_means. The percent difference is relative to the average
        of the two means and is replaced by '-' when the rounded p-value is
        >= 0.05.
    """
    t_out = ttest_rel(a, b)
    mean_a = np.mean(a)
    mean_b = np.mean(b)

    t_df = pd.DataFrame({'statistic': [t_out.statistic], 'pvalue': [t_out.pvalue]})
    t_df['term'] = term
    # t_df['freq'] = freq
    # t_df['season'] = season
    t_df['mean_a'] = mean_a
    t_df['mean_b'] = mean_b
    t_df['perc_diff_in_means'] = 100*(mean_a-mean_b)/np.abs((mean_a+mean_b)/2)

    # rounding to clean up output
    t_df.statistic = t_df.statistic.round(3)
    t_df.pvalue = t_df.pvalue.round(4)
    t_df.perc_diff_in_means = t_df.perc_diff_in_means.round(2)

    # if pvalue is insignificant then don't include
    # the column holds either a float or the '-' placeholder, so make it an
    # object column first; writing a string into a float64 column is a
    # deprecated silent upcast in pandas
    t_df['perc_diff_in_means'] = t_df['perc_diff_in_means'].astype(object)
    t_df.loc[t_df.pvalue>=0.05,'perc_diff_in_means'] = '-'
    return t_df

By cutting off the wet season in April, there is a disconnect in ET. It might be best to switch to winter, spring, summer, and fall.

In [None]:
# wet_months = [11,12,1,2,3,4]
# dry_months = [5,6,7,8,9,10]
# fall_months=[9,10,11]

def run_stats(wb, wb0, term, season=None, freq='monthly', plot=False):
    """Paired t-test of one water budget term between two runs.

    Both tables are optionally restricted to a season, summed to the requested
    frequency and passed to `run_ttest` with `wb0` as sample a and `wb` as
    sample b.

    NOTE(review): this definition replaces the `run_stats` imported from
    `OD_utility` earlier in the notebook — confirm which one is intended.

    Parameters
    ----------
    wb, wb0 : pandas.DataFrame
        Water budget tables with a DatetimeIndex and a column named `term`.
    term : str
        Column to test.
    season : {'Winter', 'Spring', 'Summer', 'Fall'} or None
        Months kept before aggregation; None keeps all months.
    freq : {'annual', 'monthly', 'daily'}
        Aggregation period. 'annual' uses years starting in October.
    plot : bool
        If True, print the test result and show a scatter/regression plot.

    Returns
    -------
    pandas.DataFrame
        The one-row table from `run_ttest` with 'freq' and 'season' added.

    Raises
    ------
    ValueError
        If `season` or `freq` is not one of the supported values.
    """
    season_months = {
        'Winter': [12,1,2],
        'Spring': [3,4,5],
        'Summer': [6,7,8],
        'Fall': [9,10,11],
    }
    # 'YS-OCT' is the non-deprecated spelling of the 'AS-Oct' alias
    resample_rules = {'annual': 'YS-OCT', 'monthly': 'MS', 'daily': 'D'}

    if season is not None:
        if season not in season_months:
            raise ValueError('season must be None or one of ' + str(list(season_months)))
        months = season_months[season]
        wb = wb[wb.index.month.isin(months)]
        wb0 = wb0[wb0.index.month.isin(months)]
    if freq not in resample_rules:
        raise ValueError('freq must be one of ' + str(list(resample_rules)))
    rule = resample_rules[freq]

    # Sum per period. A plain resample().sum() returns 0 for periods that hold
    # no rows (e.g. the out-of-season months after the filter above), which
    # would add spurious zero pairs to the test and pull the means toward 0.
    # min_count=1 marks such periods as NaN so they can be dropped, and the
    # join keeps the two samples paired on the same periods.
    paired = pd.concat(
        [wb0[term].resample(rule).sum(min_count=1),
         wb[term].resample(rule).sum(min_count=1)],
        axis=1, keys=['a', 'b']).dropna()
    a = paired['a'].values
    b = paired['b'].values

    t_df = run_ttest(a, b, term, freq)
    t_df['freq'] = freq
    t_df['season'] = season
    if plot:
        print('T-test statistic: %.2f' %t_df.statistic.iloc[0], 'and pvalue: %.4f' %t_df.pvalue.iloc[0])
        lin_reg_plt(a, b)
        plt.title(term)
        plt.show()

    return t_df

# run_stats(wb, wb0, 'SFR_IN', season='Wet', freq='monthly', plot=True)

Do we want to use the slope of the linear regression as a way to show the relationship at each point versus the relationship on average (t-test)? The slope shows the relationship in a more specific way while the t-test helps decide the net effect. Assuming our water years are representative then the t-test can be a decider of effectiveness while the slope helps show the significance of the benefits?  
- Linear regression is helpful for understanding but should be presented in an appendix.

In [None]:
# run_stats(wb, wb0, 'SFR_OUT', freq='annual')
# t_out = run_stats(wb, wb0, 'ET_OUT', freq='annual', plot=True)
# t_out = run_stats(wb, wb0, 'SFR_IN', freq='annual', plot=True, season = 'Dry')
# plt.show()
# t_out = run_stats(wb, wb0, 'SFR_IN', freq='annual', plot=True, season = 'Dry')

 - There is a tight relationship of recharge with and without levee removal, with the slope indicating a reduction in change in storage going from the baseline scenario to a no reconnection scenario. This would go further that when there are losses in storage they are larger than in the baseline. This linear relationship exists both on an annual and monthly scale, the slope is reduced at a monthly scale which shows that on a monthly scale there are larger recharge gains under levee removal. There is not a statistically significant difference in mean change in storage.
 - The baseflow has a statistically significant difference in means. The slope is 0 because there is no baseflow in the no reconnection scenario. The statistically significant relationship only exists in the wet season.
 - For streamflow seepage there is a relationship in both the dry and wet seasons, but the dry-season relationship shows that the scenarios, while having different means (t-test), have similar monthly stream seepage. The wet season shows a stronger relationship that stream seepage is large without levee removal.  
 - Groundwater in/outflow is statistically significantly different, but the slope is close to one, so it is not worth presenting.

Using the daily values for the t-test input keeps most of the percent differences the same and adds a few new significant differences in the dry season for floodplain recharge and baseflow which show much bigger differences because the no reconnection case has zero values. It's helpful to see but not necessary for this first presentation.

In [None]:
# output folder for the t-test tables; created if it does not exist yet
ttest_dir = join(fig_dir, 'ttest_results')
os.makedirs(ttest_dir, exist_ok=True)

In [None]:
# Seasonal t-tests for every combination of frequency, budget term and season.
# Results are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
# Loop variable names are unchanged because they stay in the notebook
# namespace after the loop finishes.
ttest_frames = []
for freq in ['annual','monthly','daily']:
    for t in ['dSTORAGE_sum','LAK_IN', 'ET_OUT', 'GHB_NET', 'SFR_IN', 'SFR_OUT']:
        for s in ['Winter','Spring','Summer','Fall']:
            t_df = run_stats(wb, wb0, t, freq=freq, season=s)
            ttest_frames.append(t_df)
ttest_all = pd.concat(ttest_frames)
# replace term with clean name
ttest_all['term'] = [zon_name_dict[t] for t in ttest_all.term]

In [None]:
# Write one table per aggregation frequency; the 'freq' column is dropped
# because it is constant within each file.
ttest_out = ttest_all[['freq','term','season','statistic','pvalue', 'perc_diff_in_means']]

for out_freq, out_name in (('monthly', 'wb_seasonal_monthly.csv'),
                           ('annual', 'wb_seasonal_annual.csv')):
    freq_rows = ttest_out[ttest_out.freq==out_freq]
    freq_rows.drop(columns=['freq']).to_csv(join(ttest_dir,out_name), index=False)

# ttest_out[ttest_out.freq=='monthly']

In [None]:
# plotting the means
# One panel per budget term: seasonal means of both runs (monthly t-test
# table) with a star on the seasons where the difference is significant.

ttest_monthly = ttest_all[ttest_all.freq=='monthly']
plt_terms = ttest_monthly.term.unique()
# labels are matched to plt_terms by position
plt_labels=['Cumulative\nStorage\nChange', 'Floodplain\nRecharge', 'GW ET',
            'Net\nGW Flow','Stream\nRecharge', 'Stream\nBaseflow']
scale=1E3
# scale=1E6 # 
fig,ax = plt.subplots(len(plt_terms),1, figsize=(5,6.5), dpi=300,sharex=True)
for n, t in enumerate(plt_terms):
    df = ttest_monthly[ttest_monthly.term==t].copy()
    df_mean = df.set_index('season')[['mean_a','mean_b']].multiply(1/scale)
    # df_mean.plot(y=['mean_a','mean_b'], kind='bar', ax=ax[n], legend=False)
    # pandas draws a string-indexed line plot at integer positions 0..len-1
    df_mean.plot(y=['mean_a','mean_b'], kind='line', ax=ax[n], legend=False)
    # Place the stars at those same integer positions. Passing the season
    # strings as x would go through matplotlib's categorical axis, which
    # numbers categories in order of first appearance, so a star could land on
    # the wrong season whenever some seasons are not significant.
    sig_pos = np.flatnonzero((df.pvalue<0.05).to_numpy())
    ax[n].scatter(sig_pos, df_mean.mean_a.to_numpy()[sig_pos], marker='*', s=100)
    ax[n].set_ylabel(plt_labels[n])
    ax[n].set_xticks(np.arange(len(df_mean)))
    ax[n].set_xticklabels(df_mean.index)
    ax[n].set_xticks([], minor=True)
plt.xticks(rotation=0);
plt.xlabel(None);
# NOTE(review): the plotted values are means of monthly sums divided by
# `scale`; confirm that '/day' is the right unit for this label.
fig.supylabel('Flux (thousand $m^3$/day)')
fig.tight_layout()
fig.legend(['Restoration','Baseline', 'Significant'], ncol=3, 
           loc='outside upper center', bbox_to_anchor=(0.5, 1.05),)# 0.4, 0.95 no tight layout
# ax[5].legend(['Restoration','Baseline', 'Significant'], ncol=1, loc='upper right')
# fig.legend(['Restoration','Baseline', 'Significant'], ncol=1, loc='outside center right', bbox_to_anchor=(0.95, 0.5),)


### Reference values
Use the output saved in the table for the percent changes

In [None]:
# ttest_monthly

# WY comparison
- good linear fit all WY: dSTORAGE_sum, ET_OUT, GHB_NET
- for LAK_IN the linearity is worse in

Need to use the full year to avoid cutting off months with seasonal definitions.

In [None]:
## closer review for understanding
# t = 'LAK_IN'
# freq='monthly'
# s='Wet'
# for wy in np.arange(2015,2021):
#     yr_strt = str(wy-1)+'-10-1'
#     yr_end = str(wy)+'-9-30'
#     run_stats(wb.loc[yr_strt:yr_end], wb0.loc[yr_strt:yr_end], t, freq=freq,  plot=True)

In [None]:
# Monthly t-tests per water year (1 Oct - 30 Sep) using all months of the year.
# freq is set explicitly: it was previously left to whatever value an earlier
# cell's loop happened to leave behind ('daily'), while the results are saved
# as the *monthly* water-year table.
freq = 'monthly'
wy_frames = []
for t in ['dSTORAGE_sum','ET_OUT','SFR_IN', 'LAK_IN', 'SFR_OUT']:
    for wy in np.arange(2015,2021):
        yr_strt = str(wy-1)+'-10-1'
        yr_end = str(wy)+'-9-30'
        t_df = run_stats(wb.loc[yr_strt:yr_end], wb0.loc[yr_strt:yr_end], t, freq=freq)

        wy_frames.append(t_df.assign(wy=wy))
ttest_all = pd.concat(wy_frames)
# replace term with clean name
ttest_all['term'] = [zon_name_dict[t] for t in ttest_all.term]

In [None]:
# save the per-water-year t-test table
ttest_out = ttest_all[['term','season','wy','statistic','pvalue', 'perc_diff_in_means']]

ttest_out.to_csv(join(ttest_dir, 'wb_wy_monthly.csv'), index=False)
# ttest_out

## Monthly t-test to plot
If the t-tests are done on a monthly scale we can then count the number of months by season and water year that are significantly different while keeping the standard error based on the daily data so these would be truer t-tests since the monthly summed data might be reducing the variance.  

We want to know the number of months that are significantly different and in which seasons and years they occur.
- histogram by season and year
- time series or histogram for percent difference?

In [None]:
# One paired t-test per calendar month and budget term, using daily values
# within that month.
freq='daily'
s=None
ttest1_all = pd.DataFrame()
# first day of every month in the simulation, excluding the last one
months = pd.date_range(strt_date, end_date, freq='MS')[:-1]
# collect the one-row results and concatenate once after the loops
monthly_frames = []
for t in ['dSTORAGE_sum', 'LAK_IN','ET_OUT','GHB_NET', 'SFR_IN',  'SFR_OUT']:
    for month in months:
        m_strt = month
        m_end = month +pd.offsets.MonthEnd()
        t_df = run_stats(wb.loc[m_strt:m_end], wb0.loc[m_strt:m_end], t, freq=freq)
        monthly_frames.append(t_df.assign(month=m_strt))
        # t1_df = ttest_1samp((wb.loc[m_strt:m_end,t]-wb0.loc[m_strt:m_end,t]).values, 0) # ttest of difference
        # ttest1_all = pd.concat((ttest1_all, pd.DataFrame(t1_df).transpose().assign(month=m_strt, term=t)))
ttest_all = pd.concat(monthly_frames)
# replace term with clean name
# (stored in a separate column, so 'term' keeps the raw budget column name)
ttest_all['term_name'] = [zon_name_dict[t] for t in ttest_all.term]

In [None]:
# ttest1_all = ttest1_all.rename(columns={0:'statistic',1:'pvalue'})
# ttest1_all['sig'] = 0
# ttest1_all.loc[ttest1_all.pvalue<0.05,'sig'] = 1


In [None]:
# something simple for the histograms
# sig = 1 where the p-value is below 0.05, else 0
ttest_all['sig'] = 0
ttest_all.loc[ttest_all.pvalue<0.05,'sig'] = 1

# water year: calendar year, plus one for October through December
ttest_all['wy'] = ttest_all.month.dt.year
ttest_all.loc[ttest_all.month.dt.month>=10, 'wy'] +=1
# ttest_all

In [None]:
# # Helen though this was boring, could be represented with a simpler table giving the number of sig months if needed
# # very slow with sns.histplot
# plt_terms = ['dSTORAGE_sum','LAK_IN', 'ET_OUT','GHB_NET', 'SFR_IN', 'SFR_OUT']
# plt_labels=['Cumulative\nStorage\nChange', 'Floodplain\nRecharge', 'GW ET',
#             'Net\nGW Flow','Stream\nRecharge', 'Stream\nBaseflow']
# fig, ax = plt.subplots(len(plt_terms),1, figsize=(6.5,len(plt_terms)*1), dpi=300,sharey=True, sharex=True)

# for n,t in enumerate(plt_terms):
#     t_df = ttest_all[ttest_all.term==t]
#     t_df.groupby('wy').sum(numeric_only=True).plot(y='sig',kind='bar', ax=ax[n], legend=False)
# plt.xlabel(None)
# ax[n].set_yticks(np.arange(0,14,6));
# ax[n].set_yticks(np.arange(0,12,3), minor=True);
# for n, t in enumerate(plt_terms):
#     ax[n].grid(which='both',axis='y',linestyle='--', alpha=0.8)
#     ax[n].set_ylabel(plt_labels[n])

# plt.xticks(rotation=0);

In [None]:

def sig_fill(ttest_all, ax):
    """Shade the months flagged as significant on `ax`.

    For each distinct value in `ttest_all.month` whose first row has
    `sig == 1`, a translucent blue band is drawn from 15 days before to
    15 days after that date, spanning the y-limits the axes had when the
    function was called.
    """
    y_low, y_high = ax.get_ylim()
    half_window = pd.DateOffset(days=15)
    for month_val in ttest_all.month.unique():
        month_rows = ttest_all.loc[ttest_all.month == month_val]
        if month_rows.sig.values[0] != 1:
            continue
        band = [month_rows.month.min() - half_window,
                month_rows.month.max() + half_window]
        # hatch is too busy, so a plain translucent fill is used
        ax.fill_between(band, y_high, y_low,
                        color='blue', edgecolor='blue', alpha=0.2)
# step/interpolate doesn't affect the box location
# could also use 'where=y > threshold' which would avoid the need for a loop

In [None]:
# def sig_fill(ttest_all, ax):
#     ylim = ax.get_ylim()
#     ax.fill_between(ttest_all.month.values, 
#                      ylim[1], ylim[0], where= ttest_all.sig==1, 
#                     step='mid', interpolate=True,
#                     color='blue', edgecolor='blue', alpha=0.2) # hatch is too busy
# the fill doesn't work like this because for individual months it is a very thin line

In [None]:
# fig, ax = plt.subplots()
# # for n, var in enumerate(plt_cols):
# var='ET_OUT'
# ttest_chk = ttest_all[(ttest_all.month>='2015-10-1')&(ttest_all.month<'2016-10-1')]
# # plt_wb(wb.resample('MS').sum(), wb0.resample('MS').sum(), plt_cols, plt_labels, ax)

# wb.resample('MS').sum().plot(y=var, ax=ax)
# ax.set_xlim(ttest_chk.month.min(), ttest_chk.month.max())

# # after plotting the water budget lines the fill between is reset the offset
# sig_fill(ttest_chk[ttest_chk.term==var], ax)


# Time series comparison
Rather than plotting the significance over the monthly summed data (it's not averaged here), it would make sense to plot the difference of the monthly average values and the standard deviation, with highlighting for significance.

In [None]:
# wb_rip = pd.read_csv(join(model_ws, 'MF_zonebud_riparian_monthly.csv'))
def load_zb_cln(filename, zb_alt):
    """Read a zone budget CSV and return it indexed by date with renamed columns.

    Parameters
    ----------
    filename : str
        CSV with a 'totim' date column, the columns named by the keys of
        `zb_alt`, and 'FROM_ZONE_0' / 'TO_ZONE_0'.
    zb_alt : dict
        Maps column names in the file to the names used in the output.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'totim' shifted back by `pd.DateOffset(1)`; holds the
        renamed columns, 'FROM_ZONE_0', 'TO_ZONE_0' and
        'GHB_NET' = FROM_ZONE_0 - TO_ZONE_0.
    """
    zb_raw = pd.read_csv(filename, parse_dates=['totim'])
    # fix dates
    zb_raw['totim'] = zb_raw['totim'] - pd.DateOffset(1)
    # select and rename relevant columns
    keep_cols = list(zb_alt.keys()) + ['FROM_ZONE_0','TO_ZONE_0']
    wb_df = zb_raw.set_index('totim')[keep_cols].rename(columns=zb_alt)
    wb_df['GHB_NET'] = wb_df['FROM_ZONE_0'] - wb_df['TO_ZONE_0']
    return wb_df

In [None]:
## sum GHB_IN and GHB_OUT, LAK_IN and LAK_OUT to show net effect
# SFR is separate because of interest in baseflow
# the cumulative change in storage is more intuitive to plot than plain change in storage
# budget columns to plot and their panel labels (matched by position)
plt_cols = ['dSTORAGE_sum','LAK_IN', 'ET_OUT','GHB_NET', 'SFR_IN', 'SFR_OUT']
plt_labels=['Cumulative\nStorage Change', 'Floodplain\nRecharge', 'GW ET',
            'Net\nGW Flow','Stream\nRecharge', 'Stream\nBaseflow']
def plt_wb(wb, wb0, plt_cols, plt_labels, ax, scale=1E-6):
    """Plot each budget column of both runs on its own panel.

    Panel i shows column `plt_cols[i]` of `wb0` (labelled 'Restoration') and
    then of `wb` (labelled 'Baseline'), both multiplied by `scale`, with
    `plt_labels[i]` as the y-axis label.

    NOTE(review): elsewhere in this notebook `wb` is described as
    'No Reconnection' and `wb0` as 'Baseline' — confirm the intended naming.
    """
    for idx in range(len(plt_cols)):
        col = plt_cols[idx]
        panel = ax[idx]
        # wb0 first, then wb, so the line colours follow that order
        for table, run_label in ((wb0, 'Restoration'), (wb, 'Baseline')):
            table[col].multiply(scale).plot(ax=panel, label=run_label, legend=False)
        panel.set_ylabel(plt_labels[idx])

        panel.ticklabel_format(style='plain', axis='y')
        panel.set_xlabel(None)
#         ax[n].set_yscale('log')

#     fig.savefig(join(fig_dir, 'monthly_wb_lines.png'), bbox_inches='tight')
    
# plt_wb(wb.resample('AS-Oct').sum(), wb0.resample('AS-Oct').sum())
# Monthly-mean time series of both runs, one panel per budget column.
fig,ax= plt.subplots(len(plt_cols),1, sharex=True,  figsize=(6.5, len(plt_cols)*1),dpi=300)
# plt_wb(wb.resample('MS').sum(), wb0.resample('MS').sum(), plt_cols, plt_labels, ax)
plt_wb(wb.resample('MS').mean(numeric_only=True), wb0.resample('MS').mean(numeric_only=True),
       plt_cols, plt_labels, ax, scale=1E-3)

# for n, var in enumerate(plt_cols):
#     sig_fill(ttest_all[ttest_all.term==var], ax[n])
    # sig_fill(ttest1_all[ttest1_all.term==var], ax[n])

# the legend labels are given explicitly here, in the order the lines are
# drawn inside plt_wb (wb0 first, then wb)
fig.legend(['Restoration','Baseline'], ncol=2, loc='outside upper center', bbox_to_anchor=(0.5, 1.05),)
# fig.supylabel('Flux (MCM)')
fig.supylabel('Flux (thousand $m^3$/day)')
fig.tight_layout(h_pad=0.1)

# plt.savefig(join(fig_dir, 'wb_monthly_timeseries.png'), bbox_inches='tight')

### Text reference values
The monthly values should provide context for the results so it may be most helpful to present the ranges of differences or values rather than average

In [None]:
def wb_range(wb, wb0, plt_cols, months=(3, 4, 5)):
    """Print the range of the ratio wb0/wb for each column over selected months.

    Parameters
    ----------
    wb, wb0 : pandas.DataFrame
        Tables with a DatetimeIndex and the columns in `plt_cols`.
    plt_cols : list of str
        Columns to report.
    months : iterable of int, optional
        Calendar months to keep. Defaults to March-May, the selection that was
        previously hard-coded.

    For each column this prints the column name, the minimum and maximum of
    wb0[col] / wb[col], and the dates at which they occur. Rows where wb[col]
    is 0 give inf or NaN ratios; these are not filtered out.
    """
    months = list(months)
    wb = wb[wb.index.month.isin(months)]
    wb0 = wb0[wb0.index.month.isin(months)]
    for var in plt_cols:
        wb_frac = wb0[var]/wb[var]
        print(var)
        print('Min %.2f' %wb_frac.min(), 'Max %.2f' %wb_frac.max())
        print(wb_frac.index.date[wb_frac.argmin()], wb_frac.index.date[wb_frac.argmax()])
        
# wb_range(wb.resample('MS').sum(), wb0.resample('MS').sum(), plt_cols, )
# report the ratio ranges on monthly means of both runs
wb_range(wb.resample('MS').mean(numeric_only=True), 
         wb0.resample('MS').mean(numeric_only=True), plt_cols)

In [None]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
# Proxy artists for the legend of the difference plot.
dif_lgd = [
    # Patch(facecolor='tab:blue', alpha=0.5, label='Reconnected Floodplain'),
    Line2D([0], [0],color='black',label='Difference'),
    # raw string: \p and \s are not valid Python escape sequences, so the
    # non-raw form triggers an invalid-escape warning (text is unchanged)
    Line2D([0], [0], color='grey', label=r'Difference $\pm 1\sigma$'),
]

In [None]:
def plt_wb_diff(wb_diff, plt_cols, plt_labels, ax, color, scale = 1E-3):
    """Plot each column of `wb_diff` on its own panel as a single line.

    Panel i shows `wb_diff[plt_cols[i]] * scale` drawn in `color`, with
    `plt_labels[i]` as the y-axis label, plain tick formatting and no
    x-axis label.
    """
    for idx in range(len(plt_cols)):
        panel = ax[idx]
        scaled = wb_diff[plt_cols[idx]].multiply(scale)
        scaled.plot(ax=panel, label='Difference', legend=False, color=color)
        panel.set_ylabel(plt_labels[idx])

        panel.ticklabel_format(style='plain', axis='y')
        panel.set_xlabel(None)
# Difference plot: monthly mean of wb0 minus monthly mean of wb (black), with
# grey lines one standard deviation above and below, and shaded months where
# the per-month t-test flagged a significant difference.
fig,ax= plt.subplots(len(plt_cols),1, sharex=True,  figsize=(6.5, len(plt_cols)*1),dpi=300)
wb_diff = wb0.resample('MS').mean(numeric_only=True) - wb.resample('MS').mean(numeric_only=True)
# standard deviation, within each month, of the row-wise differences between
# the runs (taken as wb - wb0; the sign does not change a standard deviation)
wb_std = (wb[plt_cols]-wb0[plt_cols]).resample('MS').std(numeric_only=True)
plt_wb_diff(wb_diff+wb_std, plt_cols, plt_labels, ax, color='gray')
plt_wb_diff(wb_diff-wb_std, plt_cols, plt_labels, ax, color='gray')

plt_wb_diff(wb_diff, plt_cols, plt_labels, ax, color='black')


# shading is matched on ttest_all.term, which must hold the raw column names
# used in plt_cols at this point
for n, var in enumerate(plt_cols):
    sig_fill(ttest_all[ttest_all.term==var], ax[n])

# fig.legend(['Difference','Std Dev'], ncol=2, loc='outside upper center', bbox_to_anchor=(0.5, 1.05),)
fig.legend(handles = dif_lgd, ncol=2, loc='outside upper center', bbox_to_anchor=(0.5, 1.05),)
#     ax[0].legend(['No Reconnection','Baseline'], ncol=2)
# fig.supylabel('Difference in Mean Flux (MCM)')
fig.supylabel('Difference in Flux (thousand $m^3$/day)')
fig.tight_layout(h_pad=0.1)

# plt.savefig(join(fig_dir, 'wb_monthly_timeseries.png'), bbox_inches='tight')

In the riparian zone we see:
- even smaller differences in storage change because both are trending upward.
- The pattern of GW ET is the same with smaller magnitude
- Net GW pattern is more pronounced with greater outflow in winter under baseline
In the floodplain:
- much clearer difference of floodplain storage in dry years
- GW ET has a weird dynamic with no reconnection greater
- net gw is more pronounced in winter with much greater baseline outflow  
*May not be worth showing these separately*

In [None]:
# # riparian vs floodplain (spatial)
# wb_rip = load_zb_cln(join(model_ws, 'MF_zonebud_riparian_daily.csv'), zb_alt).assign(scenario='no reconnection')
# wb_rip0 = load_zb_cln(join(model_ws0, 'MF_zonebud_riparian_daily.csv'), zb_alt).assign(scenario='baseline')
# wb_fp = load_zb_cln(join(model_ws, 'MF_zonebud_floodplain_daily.csv'), zb_alt).assign(scenario='no reconnection')
# wb_fp0 = load_zb_cln(join(model_ws0, 'MF_zonebud_floodplain_daily.csv'), zb_alt).assign(scenario='baseline')


In [None]:
# plt_cols = ['dSTORAGE_sum','ET_OUT','GHB_NET']
# plt_labels=['Cumulative\nStorage Change', 'GW ET',
#             'Net\nGW Flow']
# plt_wb_diff(wb_rip.resample('MS').sum(), wb_rip0.resample('MS').sum(), plt_cols, plt_labels)
# plt_wb_diff(wb_fp.resample('MS').sum(), wb_fp0.resample('MS').sum(), plt_cols, plt_labels)


In [None]:
# # decided it wasn't worth showing, better already with mechanism plot earlier on and differences are harder to see
# # coarse vs fine comparison (heterogeneity)
# wb_fine = load_zb_cln(join(model_ws, 'MF_zonebud_fine_daily.csv'), zb_alt).assign(scenario='no reconnection')
# wb_fine0 = load_zb_cln(join(model_ws0, 'MF_zonebud_fine_daily.csv'), zb_alt).assign(scenario='baseline')
# wb_coarse = load_zb_cln(join(model_ws, 'MF_zonebud_coarse_daily.csv'), zb_alt).assign(scenario='no reconnection')
# wb_coarse0 = load_zb_cln(join(model_ws0, 'MF_zonebud_coarse_daily.csv'), zb_alt).assign(scenario='baseline')


In [None]:
# plt_cols = ['dSTORAGE_sum','LAK_IN', 'ET_OUT','GHB_NET', 'SFR_IN', 'SFR_OUT']
# plt_labels=['Cumulative\nStorage\nChange', 'Floodplain\nRecharge', 'GW ET',
#             'Net\nGW Flow','Stream\nRecharge', 'Stream\nBaseflow']
# fig,ax= plt.subplots(len(plt_cols),2, sharex=True,  figsize=(6.5, len(plt_cols)*1),dpi=300, sharey='row')

# # fig.legend(['Baseline','No Reconnection'], ncol=2, loc='outside upper center', bbox_to_anchor=(0.5, 1.05),)
# #     ax[0].legend(['No Reconnection','Baseline'], ncol=2)


# plt_wb_diff(wb_fine.resample('MS').sum(), wb_fine0.resample('MS').sum(), plt_cols, plt_labels, ax=ax[:,0])

# plt_wb_diff(wb_coarse.resample('MS').sum(), wb_coarse0.resample('MS').sum(), plt_cols, plt_labels, ax=ax[:,1])

# fig.supylabel('Flux (MCM)')
# fig.tight_layout(h_pad=0.1)
# ax[0,0].set_title('Fine')
# ax[0,1].set_title('Coarse')

# plt.savefig(join(fig_dir, 'wb_monthly_by_facies_time_series.png'), bbox_inches='tight')

Between coarse and fine there are some interesting patterns such as the Baseflow is nearly all from the fines, the stream recharge in coarse doesn't change much but in fines it is reduced as the mounding remains in place for longer periods of time.

# SFR Plotting

In [None]:
# Re-import the local utility module and reload it so that edits made to
# mf_utility since the kernel started are picked up before clean_sfr_df is bound.
import mf_utility
from importlib import reload
reload(mf_utility)
from mf_utility import clean_sfr_df

In [None]:
# Stream (SFR) results for the run stored in model_ws.
# (double check the no_reconnection has updated fully)

# keep only reaches with a non-zero streambed conductivity
grid_sfr = (
    pd.read_csv(join(model_ws, 'grid_sfr.csv'), index_col=0)
    .loc[lambda reaches: reaches.strhc1 != 0]
)
# vertical conductivity of the model cell (k, i, j) under each reach
grid_sfr['vka'] = vka[grid_sfr.k, grid_sfr.i, grid_sfr.j]

# reach attributes that clean_sfr_df receives alongside the model output
reach_cols = ['rchlen', 'strtop', 'facies', 'strthick']
pd_sfr = grid_sfr.set_index(['iseg', 'ireach'])[reach_cols]
pd_sfr['Total distance (m)'] = pd_sfr.rchlen.cumsum()
# if 'Logger Location' in XSg.columns:
#     drop_iseg = XSg[~XSg['Logger Location'].isna()].iseg.values
#     # remove stream segments for routing purposes only
#     grid_sfr = grid_sfr[~grid_sfr.iseg.isin(drop_iseg)]
sfrdf = clean_sfr_df(model_ws, dt_ref, pd_sfr, name='MF')
# gradient = (stage - h_aquifer)/strthick (strthick = 1), so solve for h_aquifer
sfrdf['h_aquifer'] = sfrdf.stage - sfrdf.gradient*sfrdf.strthick

In [None]:
# fraction of the retained stream reaches that falls in each facies
grid_sfr.groupby('facies')['vka'].count() / len(grid_sfr)

In [None]:
# SFR output for both runs without the reach-attribute table (no pd_sfr passed)
sfrdf_full, sfrdf0_full = (
    clean_sfr_df(ws, dt_ref, name='MF') for ws in (model_ws, model_ws0)
)

In [None]:
# Stream (SFR) results for the run stored in model_ws0.
grid_sfr = pd.read_csv(join(model_ws0, 'grid_sfr.csv'), index_col=0)
# reaches with zero streambed conductivity: remember their segments, then drop them
no_seepage = grid_sfr.strhc1 == 0
drop_iseg = grid_sfr.loc[no_seepage, 'iseg'].values
grid_sfr = grid_sfr[~no_seepage]
# vertical conductivity of the model cell (k, i, j) under each reach
grid_sfr['vka'] = vka[grid_sfr.k, grid_sfr.i, grid_sfr.j]

reach_cols = ['rchlen', 'strtop', 'facies', 'strthick']
pd_sfr = grid_sfr.set_index(['iseg', 'ireach'])[reach_cols]
pd_sfr['Total distance (m)'] = pd_sfr.rchlen.cumsum()

sfrdf0 = clean_sfr_df(model_ws0, dt_ref, pd_sfr, name='MF')
# gradient = (stage - h_aquifer)/strthick (strthick = 1), so solve for h_aquifer
sfrdf0['h_aquifer'] = sfrdf0.stage - sfrdf0.gradient*sfrdf0.strthick


In [None]:
# segment 4 shows no negative flow in the time-series
# sfrdf0[sfrdf0.segment==ns].loc['2017-1-1':'2017-5-1'].plot(y='Qout')

In [None]:
# Along-stream snapshots of the model_ws0 run on two individual days
dist_col = 'Total distance (m)'

# outflow along the channel on 2017-1-1 (pandas opens its own figure)
sfrdf0.loc['2017-1-1'].plot(x=dist_col, y='Qout')

# stage along the channel on 2017-5-1, with a fixed aspect ratio
fig, ax = plt.subplots()
sfrdf0.loc['2017-5-1'].plot(x=dist_col, y='stage', ax=ax)
ax.set_aspect(500)

In [None]:
# Even after switching to 1 cross-section every 4 segments there is still big
# variability in depth, if not more than before.
ns = 4  # segment inspected in the close-up plots

# time-average of every numeric column at each distance along the stream
chk = (
    sfrdf0.loc['2017-1-1':'2017-5-1']
    .groupby('Total distance (m)')
    .mean(numeric_only=True)
)

fig, ax = plt.subplots(figsize=(6.5, 2), dpi=300)
chk.plot(y=['stage'], ax=ax)

# close-ups of the single segment ns (each call opens its own figure)
chk_seg = chk[chk.segment == ns]
chk_seg.plot(y=['stage', 'strtop', 'depth'])
chk_seg.plot(y=['Qout'])

# On a close-up scale the depth is fairly uniform within segments, with some
# showing slight increases or decreases; only one segment has an up-down pattern.

# For some reason, after averaging by distance, seg 4 shows negative flow which is not true.
# There is greater variability under lower flows, when XS shape has more impact.

In [None]:
# Stack both runs into one long frame tagged by scenario:
# sfrdf (model_ws) -> 'baseline', sfrdf0 (model_ws0) -> 'restoration'.
# Alternatives tried:
# id_cols = ['layer','row','column','segment','reach']
# sfrdf_all = sfrdf.join(sfrdf0.set_index(id_cols, append=True), on=['dt']+id_cols, rsuffix='0')
# sfrdf_all = pd.concat((sfrdf0.assign(scenario='baseline'), sfrdf.assign(scenario='restoration')))
sfrdf_all = pd.concat([
    run_df.assign(scenario=run_label)
    for run_label, run_df in (('baseline', sfrdf), ('restoration', sfrdf0))
])


In [None]:
# facies_sum = sfrdf_all.groupby(['dt','facies','scenario']).sum()
# facies_mean = sfrdf_all.groupby(['dt','facies','scenario']).mean()

The mean, median, min depth across reaches doesn't help show a significant change except that the baseline has slightly lower peaks.  

The segments with flow is odd because right now it might include the segments that need to be dropped.

*The days with flow doesn't greatly change between scenarios so not worth showing*

In [None]:
# fig, ax = plt.subplots(figsize=(6.5,3),dpi=300)
# # sfrdf.resample('D').median(numeric_only=True).plot(y='depth', ax=ax, label='No reconnection')
# # sfrdf0.resample('D').median(numeric_only=True).plot(y='depth',ax=ax,label='Baseline')

# sfrdf.resample('D').sum(numeric_only=True).plot(y='flowing', ax=ax, label='No reconnection')
# sfrdf0.resample('D').sum(numeric_only=True).plot(y='flowing',ax=ax,label='Baseline')
# plt.ylabel('Segments with flow')
# plt.xlabel('Date')

The plot of streamflow at the outlet visualizes the differences better without a log scale, unless only the summertime flows are plotted. Without the log scale it becomes noticeable that the floodplain reconnection leads to higher winter baseflow levels.
- the streamflow peaks are certainly higher under the baseline scenario so we should check what is driving that whether it is baseflow or lake seepage out or just the lack of streambed losses.
- the elevated winter baseflow levels should be coplotted with lake volume to determine if they are driven by floodplain storage releases or groundwater releases.

**To understand the cause of the difference we need to coplot with seepage and lake storage which is down below**

In [None]:
# Outlet hydrograph (log scale) for both runs.
# Can't use a seven-day rolling average because it smooths out some key points.
# plt_dates = pd.date_range('2020-1-1','2020-9-30')  # alternative: single-year close-up
plt_dates = pd.date_range('2014-10-1','2020-9-30')

fig, ax = plt.subplots(figsize=(6.5,3), dpi=300)

# Each frame is filtered with a mask built from ITSELF. A boolean Series made
# from sfrdf can only index sfrdf0 if both frames have identical indexes.
seg_plt = (sfrdf.segment==sfrdf.segment.max())
seg_plt0 = (sfrdf0.segment==sfrdf0.segment.max())
# seg_plt = (sfrdf.segment==sfrdf.segment.median())
# seg_plt = (sfrdf.segment==33)

sfrdf0[seg_plt0].loc[plt_dates].plot(y='Qout', ax=ax, label='Restoration',linewidth=0.5)
sfrdf[seg_plt].loc[plt_dates].plot(y='Qout', ax=ax, label='Baseline',linewidth=0.5)
# sfrdf[sfrdf.segment==1].loc[plt_dates].plot(y='Qin', ax=ax, label='Inflow', linewidth=0.5)

ax.set_yscale('log')
ax.set_ylabel('Outlet Streamflow\n($m^3/day$)')
ax.set_xlabel('Date')


What if we plot streamflow by wet and dry season? Easier to show scales
There is nothing very interesting in the summer. The most interesting is likely the late winter to spring.

In all cases the baseline leads to greater streamflows because of the floodplain building early storage and then adding to large flows despite recharge losses. Because we are running the simulation on a daily scale there are some timing issues that don't appear so it is not appropriate to discuss the impact on peak streamflows and since there is no difference in low flows then it isn't appropriate either. What might make the most sense is plotting log scale to compare winter baseflows but then we need to show the cause

In [None]:
# # can't use seven day roling average because this smooths out some key points
# plt_dates = pd.date_range('2020-1-1','2020-9-30')
# plt_dates = pd.date_range('2014-10-1','2020-9-30')
# # plt_dates = plt_dates[plt_dates.month.isin([11,12,1,2,3,4])]

# fig, axes = plt.subplots(6,1, figsize=(6.5,6.5),dpi=300, sharex=False)
# seg_plt = (sfrdf.segment==sfrdf.segment.max())
# # seg_plt = (sfrdf.segment==sfrdf.segment.median())
# # seg_plt = (sfrdf.segment==33)
# for n, y in enumerate(np.arange(2015,2021)):
#     ax = axes[n]
#     plt_dates = pd.date_range(str(y)+'-1-1', str(y)+'-6-1')
#     # plt_dates = pd.date_range(str(y)+'-5-1', str(y)+'-10-31')
#     # plt_dates = plt_dates[plt_dates.isin(sfrdf.index)]
#     sfrdf[seg_plt].loc[plt_dates].plot(y='Qout', ax=ax, label='No Reconnection',linewidth=0.5, legend=False)
#     sfrdf0[seg_plt].loc[plt_dates].plot(y='Qout', ax=ax, label='Baseline',linewidth=0.5, legend=False)
#     ax.set_yscale('log')

# # sfrdf[sfrdf.segment==1].loc[plt_dates].plot(y='Qin', ax=ax, label='Inflow', linewidth=0.5)

# axes[0].legend()
# fig.supylabel('Outlet Streamflow ($m^3/day$)')
# plt.xlabel('Date')


In [None]:
## the days with flow doesn't seem to provide anything distinctly new

# fig,ax=plt.subplots(figsize=(6.5,6))
# seg_plt = (sfrdf.segment==sfrdf.segment.max())
# freq = 'MS'
# freq='AS-Oct'
# sfrdf[seg_plt].resample(freq).sum(numeric_only=True).plot(y='flowing',ax=ax,label='No Reconnection', kind='bar')
# sfrdf0[seg_plt].resample(freq).sum(numeric_only=True).plot(y='flowing', ax=ax, label='Baseline', kind='bar',alpha=0.7)
# plt.ylabel('Days with flow')
# plt.xlabel('Date')

In [None]:
# Run the scenario comparison statistics (run_stats) for outlet streamflow and
# for stream recharge summed by facies, at annual and monthly frequency, by season.

# Loop-invariant inputs are built once.
# NOTE(review): this assumes run_stats does not modify the frames it is given - confirm.
# Each run is filtered with its OWN mask: a mask built from sfrdf can only index
# sfrdf0 if both frames have identical indexes.
outlet_flow = sfrdf[sfrdf.segment==sfrdf.segment.max()]
outlet_flow0 = sfrdf0[sfrdf0.segment==sfrdf0.segment.max()]
facies_names = ['Gravel','Sand','Sandy Mud','Mud']
facies_daily = {f: sfrdf[sfrdf.facies==f].groupby('dt').sum(numeric_only=True) for f in facies_names}
facies_daily0 = {f: sfrdf0[sfrdf0.facies==f].groupby('dt').sum(numeric_only=True) for f in facies_names}

# collect the pieces in a list and concatenate once instead of growing a frame
ttest_parts = []
for freq in ['annual','monthly']:
    for s in ['Winter','Spring','Summer','Fall']:
        # streamflow isn't relevant to facies really
        # only flow from the last segment is compared (cumulative impact)
        t_df = run_stats(outlet_flow, outlet_flow0, 'Qout', freq=freq, season=s)
        ttest_parts.append(t_df)
        for f in facies_names:
            t_df = run_stats(facies_daily[f], facies_daily0[f], 'Qrech', freq=freq, season=s)
            ttest_parts.append(t_df.assign(facies=f))
ttest_all = pd.concat(ttest_parts)

# ttest_all.columns=['z_stat','pvalue','term','freq']
sfr_name_dict = {'Qout':'Outlet Streamflow', 'Qrech':'Stream Recharge'}
# replace term with clean name
ttest_all['term'] = [sfr_name_dict[t] for t in ttest_all.term]

Need to decide if statistics should be based on comparing daily data for streamflow or monthly. Why should I use monthly instead of daily?

In [None]:
# Write the statistics tables to ttest_dir: one CSV per (term group, frequency).
out_cols = ['freq','term','season','facies','statistic','pvalue', 'perc_diff_in_means']
ttest_out = ttest_all[out_cols]

# outlet streamflow has no facies dimension
ttest_flow = ttest_out[ttest_out.term=='Outlet Streamflow'].drop(columns='facies')
# every row that carries a facies label
ttest_facies = ttest_out[~ttest_out.facies.isna()]

ttest_exports = (
    (ttest_flow, 'monthly', 'flow_monthly.csv'),
    (ttest_flow, 'annual', 'flow_annual.csv'),
    (ttest_facies, 'monthly', 'facies_monthly.csv'),
    (ttest_facies, 'annual', 'facies_annual.csv'),
)
for ttest_tbl, ttest_freq, ttest_csv in ttest_exports:
    ttest_sel = ttest_tbl[ttest_tbl.freq==ttest_freq].drop(columns=['freq'])
    ttest_sel.to_csv(join(ttest_dir, ttest_csv), index=False)

# ttest_facies[ttest_facies.freq=='monthly']

There is not any baseflow in the scenario without floodplain reconnection, and the only baseflow with levee removal comes from Mud. In this case it makes more sense to present the results in terms of stream leakage where lower leakage means less water is infiltrating to the aquifer.

Qrech (Qaquifer), losing, connected show how mud and baseline vs no reconnection have differences. Qbase shows the starkest difference because only the Mud has baseflow.

When looking at Qout averaged across the facies the peak flows are slightly higher in the baseline scenario. We need to make a distinction between when the levee removal improves conditions and worsens. We should also note flood flow reduction value.

In [None]:
# plt_df = facies_sum.copy()
# plt_df = facies_mean.copy()
# g = sns.relplot(plt_df, 
#             x='dt',y='Qrech', col='facies', hue='scenario', col_wrap=2, kind='line')
# # g.set(yscale='log') # doesn't improve visualization
# # g.set(yscale="log")

The mud are the only facies that contribute to baseflow in the baseline scenario likely because they are the only facies to hold on to water long enough to maintain a higher gradient. It would also be interesting to map whether ET relates to facies as well.

Recharge and baseflow show that with levee removal there is elevated groundwater elevations that reduce the streambed seepage to the aquifer and create conditions for baseflow to occur.  

If we zoom in on Mud we see a much bigger contrast between baseline and no reconnection for streambed seepage.

In [None]:
# fig,ax= plt.subplots(len(plt_cols),1, sharex=True,  figsize=(6.5, len(plt_cols)*1),dpi=300)


In [None]:
# Mean of every numeric column per water year, segment and scenario.
# Start simple with just year by segment (could add 'month', 'facies').
# plt_df[~sfrdf_all.facies.isin(['Mud'])] = np.nan
plt_df = sfrdf_all.groupby(['WY','segment','scenario']).mean(numeric_only=True)

# one panel per water year; scenarios overlaid by colour
relplot_kw = dict(
    x='Total distance (m)', y='Qrech',
    col='WY', col_wrap=2, hue='scenario',
    kind='line',  # alternative: kind='scatter'
)
sns.relplot(plt_df, **relplot_kw)

The sum plots of days with flow doesn't show distinct differences even with certain facies.

## Flow duration curve
Helen suggested plotting the flow duration curves as a way of looking at how probability changes for flow events.

The OSU Streamflow guide and USGS method suggest grouping a streamflow record into 20-30 bins and then to count the flow events that fall within each bin to create the flow-duration curve

- The flow duration curve shows an increase in the exceedance of smaller flow events (1E1 - 1E4 m3/d) and an increase in high flow events (1E6 - 1E8 m3/d). The increase in low flows is due to lowered summer seepage, while the increase in high flow events is because the lake builds up storage, which in MODFLOW results in higher peak flows; this isn't realistic since it's not a 2D hydraulic model.

In [None]:
def flow_curve(sfrdf):
    """Build a flow-duration curve from the highest-numbered stream segment.

    Parameters
    ----------
    sfrdf : pandas.DataFrame
        Must contain the columns 'segment' and 'Qout'. Rows where
        ``segment == segment.max()`` are used.

    Returns
    -------
    pandas.DataFrame
        'Qout' sorted from largest to smallest, the original index restored as
        a column by ``reset_index()``, and 'P', the exceedance probability.

    Notes
    -----
    'P' uses the Weibull plotting position ``m / (n + 1)``, where the rank
    ``m`` runs from 1 (largest flow) to ``n``. The probabilities therefore lie
    strictly between 0 and 1 and are symmetric about 0.5.
    """
    outlet_q = sfrdf.loc[sfrdf.segment == sfrdf.segment.max(), 'Qout']
    curve = outlet_q.sort_values(ascending=False).reset_index()
    n_obs = curve.shape[0]
    # rank starts at 1, not 0: the largest flow is exceeded with P = 1/(n+1)
    curve['P'] = np.arange(1, n_obs + 1) / (n_obs + 1)
    return curve
    
# Flow-duration curves of both runs on one log-scaled axis
fig, ax = plt.subplots()

sfr_last_sort0 = flow_curve(sfrdf0)
sfr_last_sort = flow_curve(sfrdf)
for fdc, fdc_label in ((sfr_last_sort0, 'Restoration'), (sfr_last_sort, 'Baseline')):
    fdc.plot(x='P', y='Qout', ax=ax, label=fdc_label)

ax.set_yscale('log')
# ax.set_xlim(.50,.65)
ax.set_ylabel('Discharge ($m^3/d$)')
ax.set_xlabel('Exceedance Probability')

# switching from daily to monthly streamflow kept the shift upward in discharge

In [None]:

# def flow_curve_cat(sfrdf):
#     sfr_last = sfrdf[sfrdf.segment==sfrdf.segment.max()].copy()
#     # categorized - OSU plotted the exceedance at the upper end, linear
#     flow_group = np.linspace(sfr_last.Qin.min(), sfr_last.Qin.max(), 30)
#     # logarithmic
#     Qmin = np.max((0, np.log10(sfr_last.Qin.min())))
#     Qmax = np.log(sfr_last.Qin.max())
#     flow_group = np.exp(np.linspace(Qmin, Qmax, 30))
#     flow_group[0] = Qmin # reset 0

#     sfr_last['flow_group'] = 0
#     for n in np.arange(0,len(flow_group)-1):
#         sfr_last.loc[(sfr_last.Qin>flow_group[n])&(sfr_last.Qin<flow_group[n+1]), 'flow_group'] = flow_group[n+1]
#     # sfr_last['flow_count'] = sfr_last.groupby('flow_group').count()['Qin']
#     sfr_last_sort = sfr_last.groupby('flow_group').count()['Qout'].reset_index()
#     sfr_last_sort = sfr_last_sort.sort_values('flow_group', ascending=False)
#     sfr_last_sort['cum_flow_count'] = sfr_last_sort.Qout.cumsum()
#     sfr_last_sort['P'] = sfr_last_sort.cum_flow_count/sfr_last_sort.Qout.sum()
#     return(sfr_last_sort)

# fig, ax = plt.subplots()
# flow_curve_cat(sfrdf0).plot(x='P',y='flow_group', ax=ax, label='Restoration')
# flow_curve_cat(sfrdf).plot(x='P',y='flow_group', ax=ax, label='Baseline')
# plt.yscale('log')

# # the monthly plots show the same result as the daily so it's not really needed to characterize

### Compare water budget fluxes to cumulative discharge
Monthly scatter plots (annual shows similar result but more simplified)
- For floodplain recharge there is logarithmic trend in both scenarios as at some point greater discharge doesn't increase recharge. There is a noticeable offset in trends.
- the trend is similar for stream recharge but the difference is much less noticeable. big difference in trend for baseflow
- ET increases logarithmically as well, with most differences at larger flows
- cumulative storage doesn't have a clear pattern but has increases, similar with GHB_NET where there is more outflow but no trend with discharge
- the discharge at the inlet vs outlet suggests that there is an increase in peak flows with restoration

In [None]:
# Scatter of a water-budget flux against cumulative inflow, per resample period.
# Cumulative discharge in is equal between runs today, but each run is paired
# with its OWN inflow so the plot stays correct if the inlet segment changes.
rs = 'MS' #'AS'
# numeric_only avoids summing text columns such as facies
sfr_sum = sfrdf[sfrdf.segment==1].resample(rs).sum(numeric_only=True)
sfr0_sum = sfrdf0[sfrdf0.segment==1].resample(rs).sum(numeric_only=True)

# outlet totals (only used by the inflow-vs-outflow variant below);
# each frame is filtered by its own last segment
sfr_last_sum = sfrdf[sfrdf.segment==sfrdf.segment.max()].resample(rs).sum(numeric_only=True)
sfr0_last_sum = sfrdf0[sfrdf0.segment==sfrdf0.segment.max()].resample(rs).sum(numeric_only=True)

# water-budget column to plot; alternative: var = 'ET_OUT'
var = 'LAK_IN'

fig, ax = plt.subplots()
# sfr0_sum / wb0 are plotted as 'Restoration', sfr_sum / wb as 'Baseline'
ax.scatter(sfr0_sum.Qin, wb0[var].resample(rs).sum(), label='Restoration')
ax.scatter(sfr_sum.Qin, wb[var].resample(rs).sum(), label='Baseline')
ax.set_ylabel(var+' Monthly Flux ($m^3/month$)')

# ax.scatter(sfr0_sum.Qin, sfr0_last_sum.Qout, label='Restoration')
# ax.scatter(sfr_sum.Qin, sfr_last_sum.Qout, label='Baseline')
# ax.set_ylabel('Monthly Discharge out ($m^3/month$)')
# # the flow in vs flow out shows that restoration increases peak outflows

ax.legend()
ax.set_xlabel('Monthly Discharge in ($m^3/month$)')

# maybe the flow fraction into the floodplain is overestimated
# as less flow could go onto the floodplain and the benefits would remain since only 1% is recharged

# Spatial comparison of seepage

In [None]:
# Vertical hydraulic conductivity array taken from the UPW package of model m.
# NOTE(review): the SFR Plotting cells earlier in the notebook already index
# `vka`; confirm it is also defined before them, otherwise Restart & Run All fails.
vka = m.upw.vka.array

In [None]:
# Open the head output file of each run (model_ws -> hdobj, model_ws0 -> hdobj0)
hdobj, hdobj0 = (
    flopy.utils.HeadFile(ws + '/MF.hds') for ws in (model_ws, model_ws0)
)


In [None]:
def plot_vka(vka, ax, sfr_nodata, k_max=None):
    """Draw the conductivity cross-section beneath the stream cells as an image.

    Relies on notebook globals: ``sfr_rows``/``sfr_cols`` (cell indices under
    the stream), ``vmin``/``vmax`` (colour limits), ``m`` (the flopy model) and
    ``plt_segs`` (used only for the x-tick spacing).

    Parameters
    ----------
    vka : ndarray
        Conductivity array indexed as [layer, row, column].
    ax : matplotlib Axes
        Axes to draw on.
    sfr_nodata : ndarray of bool
        Mask indexed as [layer, stream cell]; True cells are hidden.
    k_max : int, optional
        Number of top layers to show. Defaults to every layer in ``sfr_nodata``.

    Returns
    -------
    (sfr_hk, im) : the masked array that was drawn and the image artist.
    """
    if k_max is None:
        k_max = sfr_nodata.shape[0]
    sfr_hk = vka[:k_max][:, sfr_rows, sfr_cols]
    sfr_hk = np.ma.masked_where(sfr_nodata[:k_max], sfr_hk)
    im = ax.imshow(sfr_hk, norm = mpl.colors.LogNorm(vmin=vmin, vmax=vmax),
                   aspect='auto', cmap='viridis_r')
    # plt.xticks([]);
    # Derive the labels from the tick positions so both always have the same
    # length. Two separate ranges disagree whenever k_max % 5 == 1.
    # Labels are m.dis.botm values at row 0, column 0 of the layer above each tick.
    tick_rows = np.arange(1, k_max, 5)
    botm_col = m.dis.botm.array[:,0,0]
    ax.set_yticks(ticks = tick_rows, labels=botm_col[tick_rows-1])
    ax.set_xticks(ticks = np.arange(0, len(plt_segs),10), labels=np.arange(0, len(plt_segs),10), rotation=90)
    return sfr_hk, im

In [None]:
# Unique segment numbers present in sfrdf; later cells use its length to size
# the head arrays and to space the x-ticks of the cross-section plot.
plt_segs = sfrdf.segment.unique()

In [None]:
# Set up and draw the conductivity cross-section beneath the stream.

# stream reaches to show, without the segments listed in drop_iseg
sfr_hk_plt = grid_sfr[~grid_sfr.iseg.isin(drop_iseg)]
# colour limits shared by every plot_vka call
vmin = sfr_hk_plt.vka.min()
vmax = sfr_hk_plt.vka.max()

# one entry per model cell ('node') under the stream
sfr_seg = sfr_hk_plt.drop_duplicates('node')
sfr_rows = sfr_seg.i.values
sfr_cols = sfr_seg.j.values
sfr_lays = sfr_seg.k.values

# All layers except the bottom one. The previous int(sfr_hk_plt.k.max()) value
# was overwritten immediately, so it is dropped.
k_max = m.dis.nlay-1

# define by active cells
# NOTE(review): the old comment said "new: don't include bottom 10", but the
# slice [:-1] only drops the bottom layer - confirm which is intended.
sfr_ibound = ~m.bas6.ibound.array[:-1, sfr_rows, sfr_cols].astype(bool)
# Identify where data should be removed because it's above land: for each
# stream cell, every layer index smaller than the stream's layer is masked.
sfr_nodata = np.arange(k_max)[:, np.newaxis] < sfr_lays[np.newaxis, :]

# plot only data below ground
fig, ax = plt.subplots(figsize=(6.5,2))
sfr_hk, im = plot_vka(vka, ax, sfr_nodata, k_max = 15)

fig.supylabel('Layer')
# cbar_ax=ax.ravel().tolist()
fig.colorbar(im, ax=ax, orientation='vertical', label='$K_{vert}$\n($m/day$)', shrink=1, location='right')


In [None]:
plt_dates = pd.date_range('2017-1-1','2017-5-30')
# plt_dates = pd.date_range('2017-6-1','2017-9-30')

def sfr_load_hds(hdobj, plt_dates):
    """Extract heads beneath the stream cells for each requested date.

    Relies on notebook globals: ``dt_ref`` (columns 'dt' and 'kstpkper'),
    ``plt_segs`` (sets the number of columns) and ``sfr_lays``/``sfr_rows``/
    ``sfr_cols`` (cell indices under the stream).

    Parameters
    ----------
    hdobj : head-file object with ``get_data(kstpkper)``
    plt_dates : iterable of dates, each present in ``dt_ref.dt``

    Returns
    -------
    (sfr_heads, avg_heads) : two float arrays of shape (dates, stream cells).
        ``sfr_heads`` is the head in the stream's own layer; ``avg_heads`` is
        the mean head over the top 10 layers. Cells equal to -999.99
        (presumably the no-data flag - confirm) are returned as NaN.
    """
    # runs pretty quickly with hdobj.get_data
    sfr_heads = np.zeros((len(plt_dates), len(plt_segs)))
    avg_heads = np.zeros((len(plt_dates), len(plt_segs)))
    for n, plt_date in enumerate(plt_dates):
        spd = dt_ref.loc[dt_ref.dt==plt_date, 'kstpkper'].values[0]

        head = hdobj.get_data(spd)
        head = np.ma.masked_where(head ==-999.99, head)
        # Assigning a masked array into a plain ndarray silently drops the mask
        # and stores -999.99, so masked cells are converted to NaN first.
        sfr_heads[n,:] = np.ma.filled(head[sfr_lays, sfr_rows, sfr_cols].astype(float), np.nan)
        # pull head for top 10 layers to compare (mean ignores masked cells)
        top_mean = np.ma.mean(head[:10, sfr_rows, sfr_cols], axis=0)
        avg_heads[n,:] = np.ma.filled(top_mean.astype(float), np.nan)
    return(sfr_heads, avg_heads)

sfr_heads, avg_heads = sfr_load_hds(hdobj, plt_dates)
sfr_heads0, avg_heads0 = sfr_load_hds(hdobj0, plt_dates)

In [None]:
# grid_sfr.plot(x='iseg',y='strtop')

In [None]:
# Segment-averaged stream stage (top) and back-calculated aquifer head (bottom)
# over plt_dates, for both runs.
fig, ax = plt.subplots(2, 1, figsize=(6.5, 3), dpi=300)

# (axes, column, frame, colour, legend label) in drawing order
profile_specs = (
    (ax[0], 'stage', sfrdf0, 'blue', 'Rest. Stream Stage'),
    (ax[0], 'stage', sfrdf, 'lightblue', 'Base Stream Stage'),
    # can also plot with back calculated head from gradient
    (ax[1], 'h_aquifer', sfrdf0, 'brown', 'Rest. GW Head'),
    (ax[1], 'h_aquifer', sfrdf, 'tan', 'Base GW Head'),
)
for prof_ax, prof_col, prof_df, prof_color, prof_label in profile_specs:
    prof_mean = prof_df.loc[plt_dates].groupby('segment').mean(numeric_only=True)
    prof_ax.plot(prof_mean[prof_col].values, color=prof_color, label=prof_label)
# plt.plot(grid_sfr.strtop, color='brown',label='Stream Top')

# alternatives using heads read from the head file:
# spd = dt_ref.loc[dt_ref.dt==plt_date, 'kstpkper'].values[0]
# ax.plot(np.mean(sfr_heads0, axis=0))
# ax.plot(np.mean(sfr_heads, axis=0))
# ax.plot(np.mean(avg_heads0, axis=0))
# ax.plot(np.mean(avg_heads, axis=0))

for prof_ax in ax:
    prof_ax.legend()
for prof_ax in ax:
    prof_ax.set_aspect(2)

The idea that baseflow may be driven more by the decrease in stream stage than by the increase in groundwater level is interesting in itself, because it perhaps reveals a natural way that alternate flow paths develop. We can caveat, if needed, that there is uncertainty in the flow in the channel, but either way there would be a significant flow path from the floodplain to the stream (could look at floodplain loggers vs stream loggers stage or flood rasters from Whipple).   
- One way to clarify the baseflow due to stage vs groundwater would be to look at days when the stage is equal between scenarios (e.g., >71.6 cms) then look at how groundwater differs.
- The idea of floodplain filtering is interesting as well, but I would need justification for the removal of contaminants or the addition of helpful things like the primary production. MODPATH would give us the residence time of floodplain recharge and the fate more precisely than water budgets, but I'm not sure we can directly link it to quality improvements. 

In [None]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Proxy legend handles shared by the longitudinal-profile figures:
# tab:blue marks the 'Restoration' lines and tab:orange the 'Baseline' lines.
profile_legend_elements = [
    # Patch(facecolor='tab:blue', alpha=0.5, label='Floodplain'),
    Line2D([0], [0], color='tab:blue',  linestyle='-', label='Restoration'),
    Line2D([0], [0],color='tab:orange',label='Baseline'),
]


In [None]:
# Six-panel longitudinal profile: conductivity section, stream stage, aquifer
# head, vertical gradient, recharge and baseflow, averaged by segment.
# Averaged across all time we see baseflow only in the floodplain, but within
# that it is variable, and seepage shows hotter spots.
fig, ax = plt.subplots(
    6, 1, sharex=True, sharey=False, layout='constrained', dpi=300,
    figsize=(8, 6.5),
    # gridspec_kw={'height_ratios':(3,2, 2,2, 2, 2)}
)

sfr_hk, im = plot_vka(vka, ax[0], sfr_nodata, k_max=15)

ax[0].set_ylabel('Layer\nElevation\n(m AMSL)')
# grid_sfr.reset_index().vka.plot(ax=ax[1], color='black')
# ax[1].set_ylabel('Stream\nVKA (m/d)')


def plt_profile(sfrdf, plt_dates, ax, color):
    """Plot segment-mean recharge, baseflow and gradient on the last three axes.

    sfrdf is sliced to plt_dates and averaged by 'segment'. 'gradient' goes on
    ax[-3] (with a zero line), 'Qrech' on ax[-2] and 'Qbase' on ax[-1], all
    drawn in `color`.
    """
    seg_mean = sfrdf.loc[plt_dates].groupby('segment').mean(numeric_only=True).reset_index()
    grad_ax, rech_ax, base_ax = ax[-3], ax[-2], ax[-1]

    seg_mean.plot(y='Qrech', ax=rech_ax, legend=False, color=color)
    rech_ax.set_ylabel('Recharge\n($m^3/day$)')

    seg_mean.plot(y='Qbase', ax=base_ax, legend=False, color=color)
    base_ax.set_ylabel('Baseflow\n($m^3/day$)')

    # show transition from gaining to losing
    grad_ax.axhline(y=0, color='black', alpha=0.5)
    seg_mean.plot(y='gradient', ax=grad_ax, legend=False, color=color)
    grad_ax.set_ylabel('Vertical\nGradient')

# plt_dates = pd.date_range('2017-1-1','2017-5-30')
plt_dates = pd.date_range('2017-3-1','2017-6-30')
# plt_dates = '2017-1-16'
plt_profile(sfrdf0, plt_dates, ax, color='tab:blue')
plt_profile(sfrdf, plt_dates, ax, color='tab:orange')

# Stage and aquifer-head panels. sfrdf0 is drawn first, so the default colour
# cycle gives it the first colour and sfrdf the second.
# (longer label variants: 'Stream\nStage (m)', 'Groundwater\nElevation (m)')
panel_specs = (
    (ax[1], 'stage', 'Stream\nStage\n(m)'),
    (ax[2], 'h_aquifer', 'GW\nElevation\n(m)'),
)
for prof_ax, prof_col, prof_ylabel in panel_specs:
    for prof_df in (sfrdf0, sfrdf):
        prof_mean = prof_df.loc[plt_dates].groupby('segment').mean(numeric_only=True)
        prof_ax.plot(prof_mean[prof_col].values)
    prof_ax.set_ylabel(prof_ylabel)



fig.tight_layout(h_pad=0.1)

fig.legend(handles=profile_legend_elements, loc='center', bbox_to_anchor=[0.3, 0.99], ncol=1)
fig.colorbar(im, orientation = 'horizontal', location='top', label='$K_{vert}$ ($m/day$)', shrink=0.3)
plt.xlabel('Stream Segment')
# plt.savefig(join(fig_dir, 'longitudinal_profile_stream_aquifer.png'), bbox_inches='tight')

In [None]:
# Segment-mean values of each run over plt_dates
df_mean, df_mean0 = (
    run_df.loc[plt_dates].groupby('segment').mean(numeric_only=True).reset_index()
    for run_df in (sfrdf, sfrdf0)
)

def _mean_pct_reduction(col):
    """100 * (1 - mean over rows 10..50 of the df_mean0/df_mean ratio of `col`)."""
    ratio = df_mean0.loc[10:50, col] / df_mean.loc[10:50, col]
    return 100*(1 - ratio.mean())

# average reduction
red_rch = _mean_pct_reduction('Qrech')
print('Avg %.2f %% reduction in stream recharge' %red_rch)

red_grad = _mean_pct_reduction('gradient')
print('Avg %.2f %% reduction in stream gradient' %red_grad)

In [None]:
# Five-panel longitudinal profile: conductivity section, streambed VKA,
# gradient, recharge and baseflow, averaged by segment.
# Averaged across all time we see baseflow only in the floodplain, but within
# that it is variable, and seepage shows hotter spots.
fig,ax = plt.subplots(5,1,sharex=True, sharey=False, layout='constrained',
                      gridspec_kw={'height_ratios':(3,2,2, 2, 2)}, dpi=300)

# plot_vka requires k_max; omitting it raised a TypeError. 15 matches the other
# cross-section figures in this notebook.
sfr_hk, im = plot_vka(vka, ax[0], sfr_nodata, k_max=15)
ax[0].set_ylabel('Elevation\n(m AMSL)')
grid_sfr.reset_index().vka.plot(ax=ax[1], color='black')
ax[1].set_ylabel('Stream\nVKA (m/d)')

# NOTE(review): this redefines plt_profile with a different axes layout (the
# gradient goes on ax[2]); whichever cell ran last wins for any later caller.
def plt_profile(sfrdf, plt_dates, ax, color):
    """Plot segment-mean gradient (ax[2]), recharge (ax[-2]) and baseflow (ax[-1]).

    sfrdf is sliced to plt_dates and averaged by 'segment'; every line is
    drawn in `color`.
    """
    df_mean = sfrdf.loc[plt_dates].groupby('segment').mean(numeric_only=True).reset_index()
    df_mean.plot(y='Qrech', ax=ax[-2], legend=False, color=color)
    ax[-2].set_ylabel('Recharge\n($m^3/day$)')
    df_mean.plot(y='Qbase', ax=ax[-1], legend=False, color=color)
    ax[-1].set_ylabel('Baseflow\n($m^3/day$)')
    ax[2].axhline(y=0, color='black', alpha=0.5) # show transition from gaining to losing
    df_mean.plot(y='gradient', ax=ax[2], legend=False, color=color)
    ax[2].set_ylabel('Gradient')

plt_dates = pd.date_range('2017-1-1','2017-5-30')
# plt_dates = '2017-1-16'
plt_profile(sfrdf0, plt_dates, ax, color='tab:blue')
plt_profile(sfrdf, plt_dates, ax, color='tab:orange')

# NOTE(review): tight_layout on a figure created with layout='constrained'
# replaces the layout engine - confirm which of the two is intended.
fig.tight_layout(h_pad=0.1)

fig.legend(handles=profile_legend_elements, loc='center', bbox_to_anchor=[0.3, 0.99], ncol=1)
fig.colorbar(im, orientation = 'horizontal', location='top', label='VKA ($m/day$)', shrink=0.3)
plt.xlabel('Stream Segment')
# plt.savefig(join(fig_dir, 'longitudinal_profile_stream_aquifer.png'), bbox_inches='tight')