In [None]:
import pandas as pd
import sys
import numpy as np
import pathlib as pl
import zipfile
sys.path.append('../dependencies/')
import pyemu
from datetime import datetime as dt
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# identifier of the model being post-processed; it is embedded in every path below
curr_model = '01473000'
# root name of the run; later cells use it as the prefix of the .pst, .pdc.csv,
# .phi.meas.csv and .0.obs.csv files they read
curr_run_root = 'prior_mc_reweight'
# directory holding the control (.pst) file for this model
pstdir = pl.Path(f'../NHM_extractions/20230110_pois_haj/{curr_model}/')
# zipped results of the run
results_file = pl.Path(f'../results/{curr_run_root}.{curr_model}.zip/')
# directory the zip is extracted into (note the model/run order is swapped
# relative to the zip file name)
tmp_res_path = pl.Path(f'../results/{curr_model}.{curr_run_root}')

In [None]:
# directory that receives the figures written for this model/run combination
fig_dir = pl.Path(f'../postprocessing/figures/{curr_model}/{curr_run_root}.{curr_model}')
# exist_ok=True makes the cell safe to re-run without a separate exists() check
fig_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# strptime format applied by plot_o_matic to monthly date strings (year_month)
datfmtmon = '%Y_%m'
# strptime format applied by plot_o_matic to daily date strings (year_month_day)
datfmtdaily = '%Y_%m_%d'

## need pst file to get observation data

In [None]:
# load the control file for this run (the path is passed to pyemu as a string)
pst = pyemu.Pst(str(pstdir / f'{curr_run_root}.pst'))
# observation table; later cells read its obsnme, obgnme, obsval and weight columns
obs = pst.observation_data
obs

In [None]:
pst.nnz_obs

## all engineering is reengineering --- let's make 0 weights.....again

In [None]:
obs.loc['sca_daily:2000_10_10:5621']

In [None]:
# rows whose obsval equals the -9999 sentinel get a zero weight and a NaN value;
# the mask is built once, before obsval is overwritten
nodata_rows = obs['obsval'].eq(-9999)
obs.loc[nodata_rows, 'weight'] = 0
obs.loc[nodata_rows, 'obsval'] = np.nan


In [None]:
pst.nnz_obs

## extract the model results

In [None]:
# unpack the zipped run results into the working results directory
with zipfile.ZipFile(results_file) as results_zip:
    results_zip.extractall(path=tmp_res_path)

In [None]:
# read the run's .pdc.csv table, using its first column as the index
# (presumably the prior-data-conflict summary; the index is later joined to obs
# by observation name -- TODO confirm)
pdc = pd.read_csv(tmp_res_path / f'{curr_run_root}.pdc.csv', index_col=0)

In [None]:
pdc 

## bring in weights and groups to pdc

In [None]:
# attach each observation's weight and group to the pdc rows (joined on the index).
# Any weight/obgnme columns left over from a previous run of this cell are
# dropped first, so re-running it does not create weight_x/weight_y duplicates.
pdc = (
    pdc.drop(columns=['weight', 'obgnme'], errors='ignore')
    .merge(obs[['weight', 'obgnme']], left_index=True, right_index=True)
)

### drop all zero weighted obs

In [None]:
# keep only the rows whose observation carries a non-zero weight
has_weight = pdc['weight'] != 0
pdc = pdc[has_weight]
pdc

In [None]:
pdc.obgnme.unique()

In [None]:
obs.obgnme.unique()

### we notice some SCA values are simulated as 0 when obsval != 0. is that a lot?

In [None]:
pdc.loc[pdc.sim_mean<=1e-3].obgnme.unique()

In [None]:
# rows whose sim_mean is at or below 1e-3; despite the name, the filter is not
# restricted to a single observation group
pdc_sca = pdc.loc[pdc.sim_mean<=1e-3]
# distribution of the obs_mean values for those rows
pdc_sca.obs_mean.hist(bins=50)

In [None]:
len(pdc_sca)

### yeah - hella. like half

In [None]:
pdc.loc[pdc.sim_mean==0].obs_mean.describe()

### trim the remaining PDC and look for clues 🧐

In [None]:
# drop the near-zero rows examined above: keep only sim_mean above the 1e-3 threshold
# (this overwrites pdc, so every later cell sees the trimmed table)
pdc = pdc.loc[pdc.sim_mean>1e-3]

In [None]:
pdc.obgnme.unique()

### seems like still a few sca_daily issues to sort out. Noise obs will likely solve ALL of these

In [None]:
# every remaining row outside the sca_daily group
# NOTE(review): pdc_therest is not referenced again in the visible cells
pdc_therest = pdc.loc[pdc.obgnme != 'sca_daily']

### one of these kids is not like the other ----> let's check out soil moisture

In [None]:
# take an independent copy so adding a column below neither touches pdc nor
# triggers pandas' SettingWithCopyWarning
pdc_sm = pdc.loc[pdc.obgnme == 'soil_moist_mon'].copy()
# is it more tilted toward an earlier timeframe?
# the year is the first '_'-separated token of the second ':'-separated field
# of each observation name
pdc_sm['year'] = [int(name.split(':')[1].split('_')[0]) for name in pdc_sm.index]

In [None]:
pdc_sm.year.hist(bins=10)

#### nope

In [None]:
pdc_sm[['obs_mean','sim_mean', 'sim_min',	'sim_max']].plot()

In [None]:
obs.loc[obs.obgnme=='soil_moist_mon'].obsval.plot()

In [None]:
len(pdc_sm.loc[pdc_sm.obgnme=='soil_moist_mon']), len(obs.loc[obs.obgnme=='soil_moist_mon'])

### for the PDC `soil_moist_mon` obs they are consistently simulated way too low

### what's left? Streamflow?

In [None]:
# whatever is left once the two groups examined above are excluded
already_examined = ['sca_daily', 'soil_moist_mon']
pdc_streamflow = pdc.loc[~pdc['obgnme'].isin(already_examined)].copy()

In [None]:
len(pdc_streamflow)

In [None]:
# one log-scale figure per remaining observation group. The axes are created
# explicitly and handed to pandas: DataFrame.plot() without ax opens its own
# figure, which used to leave an empty figure behind for every group.
for cn, cg in pdc_streamflow.groupby('obgnme'):
    fig, ax = plt.subplots()
    cg[['obs_mean', 'sim_mean', 'sim_min', 'sim_max']].plot(ax=ax)
    ax.set_title(cn)
    ax.set_yscale('log')

In [None]:
# signed difference between the observed and simulated means (obs minus sim)
pdc_streamflow['diff_bias'] = pdc_streamflow['obs_mean'].sub(pdc_streamflow['sim_mean'])

In [None]:
# histogram of diff_bias, one figure per observation group
for group_name, group_rows in pdc_streamflow.groupby('obgnme'):
    fig, ax = plt.subplots()
    group_rows['diff_bias'].hist(bins=50, ax=ax)
    ax.set_title(group_name)


## look at a histogram of PHI

In [None]:
# read the measured-phi file, transpose it so the CSV columns become rows, and
# discard the first six of them
# NOTE(review): presumably those six are iteration/summary-statistic columns that
# precede the per-realization values -- confirm against the file header
phi = pd.read_csv(tmp_res_path / f'{curr_run_root}.phi.meas.csv').T.iloc[6:]

In [None]:
phi.hist(bins=50)

## read in the observations ensemble from the Prior MC

In [None]:
# observation ensemble: the first CSV column becomes the index and the remaining
# column labels are observation names (plot_o_matic selects columns by obsnme)
obens = pd.read_csv(tmp_res_path / f'{curr_run_root}.0.obs.csv', low_memory=False, index_col=0)

In [None]:
obens.T

In [None]:
def plot_o_matic(obs_df, obens_df, curr_group, fig_dir):
    """Write PDF plots of the ensemble against the observed values for one group.

    Every page shows the ensemble members as thin grey traces with the observed
    values (``obsval``) drawn on top in orange.

    Parameters
    ----------
    obs_df : pandas.DataFrame
        Observation table indexed by observation name; the ``obsnme``,
        ``obgnme`` and ``obsval`` columns are read.
    obens_df : pandas.DataFrame
        Observation ensemble whose column labels are observation names; each
        row is plotted as one ensemble trace.
    curr_group : str
        Observation group to plot. The name selects the layout:

        * contains ``'mon'`` but not ``'mean'``: one PDF named after the
          group, one page per location, x axis = month dates;
        * contains ``'mean_mon'``: one PDF, one page per location,
          x axis = month number;
        * contains ``'ann'``: one PDF, one page per location, x axis = year;
        * contains ``'daily'``: one PDF per location, one page per year,
          x axis = daily dates.

        A group matching none of these produces no output.
    fig_dir : pathlib.Path
        Directory that receives the PDF files.

    Notes
    -----
    Observation names are split on ``':'``; the second field is taken as the
    date/month/year token and the last field as the location. The date formats
    come from the module-level ``datfmtmon`` and ``datfmtdaily``.
    """
    # some metadata
    plot_lw = 0.01
    plot_alpha = 0.15

    ### TODO: purge the zero padding garbage once we put it in the original construction code!!!!
    print(f'postprocessing group: {curr_group}')
    # get a list of all the obs names in the group
    curr_obs = obs_df.loc[obs_df.obgnme == curr_group, 'obsnme'].to_list()
    # truncate the ensemble data; after the transpose each row is one observation
    curr_obs_df = obens_df[curr_obs].T.copy()
    # remember which columns hold ensemble values before any metadata columns are
    # appended, so plotting never depends on counting trailing columns
    real_cols = list(curr_obs_df.columns)
    # parse the obs names for useful metadata
    curr_obs_df['obs_location'] = [i.split(':')[-1] for i in curr_obs_df.index]
    curr_obs_df['datestring'] = [i.split(':')[1] for i in curr_obs_df.index]
    # get the obsval for plotting
    curr_obs_df = curr_obs_df.merge(obs_df['obsval'], left_index=True, right_index=True)

    def _save_page(page_df, x_col, title, outpdf):
        # draw one page (grey ensemble traces + orange observed values) into outpdf
        page_df = page_df.sort_values(by=x_col).set_index(x_col)
        # explicit figure/axes: pandas would otherwise open its own figure
        fig, ax = plt.subplots()
        page_df[real_cols].plot(ax=ax, legend=False, linewidth=plot_lw,
                                color='grey', alpha=plot_alpha)
        page_df['obsval'].plot(ax=ax, color='orange')
        ax.set_title(title)
        outpdf.savefig(fig)
        # close only this figure so the loop does not accumulate open figures
        plt.close(fig)

    # parse based on the meas freq
    if ('mon' in curr_group) and ('mean' not in curr_group):
        # zero-pad the month so the string matches datfmtmon
        curr_obs_df['datestring'] = [f'{int(i.split("_")[0]):4d}_{int(i.split("_")[1]):02d}'
                                     for i in curr_obs_df['datestring']]
        curr_obs_df['datetime'] = [dt.strptime(i, datfmtmon) for i in curr_obs_df['datestring']]
        with PdfPages(fig_dir / f'{curr_group}.pdf') as outpdf:
            for cn, cg in curr_obs_df.groupby('obs_location'):
                _save_page(cg, 'datetime', f'location = {cn}', outpdf)

    elif 'mean_mon' in curr_group:
        curr_obs_df['month'] = [int(i.split(':')[1]) for i in curr_obs_df.index]
        with PdfPages(fig_dir / f'{curr_group}.pdf') as outpdf:
            for cn, cg in curr_obs_df.groupby('obs_location'):
                _save_page(cg, 'month', f'location = {cn}', outpdf)

    elif 'ann' in curr_group:
        curr_obs_df['year'] = [int(i.split(':')[1]) for i in curr_obs_df.index]
        with PdfPages(fig_dir / f'{curr_group}.pdf') as outpdf:
            for cn, cg in curr_obs_df.groupby('obs_location'):
                _save_page(cg, 'year', f'location = {cn}', outpdf)

    elif 'daily' in curr_group:
        # zero-pad month and day so the string matches datfmtdaily
        curr_obs_df['datestring'] = [f'{int(i.split("_")[0]):4d}_{int(i.split("_")[1]):02d}_{int(i.split("_")[2]):02d}'
                                     for i in curr_obs_df['datestring']]
        curr_obs_df['datetime'] = [dt.strptime(i, datfmtdaily) for i in curr_obs_df['datestring']]
        curr_obs_df['year'] = [int(i.split("_")[0]) for i in curr_obs_df.datestring]
        group_root = curr_group.replace('_daily','')
        for cn, cg in curr_obs_df.groupby('obs_location'):
            print(f'subpostprocessing location: {cn}')
            with PdfPages(fig_dir / f'{group_root}_{cn}.pdf') as outpdf:
                # groupby yields the years in ascending order; each page is
                # sorted by date inside _save_page
                for cn2, cg2 in cg.groupby('year'):
                    _save_page(cg2, 'datetime', f'year = {cn2}', outpdf)

In [None]:
# run them all at once
for curr_group in pst.obs_groups:
    plot_o_matic(obs, obens, curr_group, fig_dir)

In [None]:
# NOTE(review): adjust_weights is called with no arguments and the control file is
# not written back out in the visible cells -- confirm against the pyemu
# documentation what (if anything) this call changes
pst.adjust_weights()