## Evaluating Uncertainty and Predictive Performance of Probabilistic Models

<b>Raymond Leung</b><br>
Rio Tinto Centre, Faculty of Engineering<br>
The University of Sydney, 2024

`SPDX-FileCopyrightText: 2024 Raymond Leung and Alexander Lowe <raymond.leung@sydney.edu.au>`<br>
`SPDX-License-Identifier: `[`BSD-3-Clause`](https://opensource.org/license/BSD-3-Clause)<br>

### Examining Geostatistical Properties across Domains and Inference Periods

#### <font color="#cc0066">Pre-requisite</font>
- Complete the experiments for all geological domains (gD) and inference periods (mA) first, typically by running the bash script `run_experiments.sh` on a compute node.
- Alternatively, use the cached results from the `../archive/` directory.

#### Available statistics
- Uncorrelated error: `RMSE`
- Histogram differences (model, groundtruth): `h(psChi2)`, `h(JS)`, `h(IOU)`, `h(EM)`, `h(rank)`
- Variogram-based spatial fidelity: `Variogram Ratios`, `Spatial Fidelity`
- S-scores and Uncertainty-based measures: `|s|_L`, `|s|_U`, `Consensus`, `Accuracy(.05)`, `Precision`, `Goodness`, `Tightness`

In [None]:
# Required Python modules
import ast
import copy
import matplotlib as mpl
import numpy as np
import pandas as pd
import skgstat as skg
import os
import pickle
import re
import sys
import string
import warnings

sys.path.append(os.getcwd().replace('notebook', 'code'))

from IPython.display import Image, display
from collections import OrderedDict
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial.distance import jensenshannon
from scipy.stats import ttest_rel, ttest_ind
from rtc_evaluation_metrics import *

In [None]:
#---------------------------------------------------------------------------
# CONFIGURE

# Inference mode
inference_mode = 'future-bench-prediction'

# Specify directories: code/ and data/ are derived from the working directory
# by substituting the 'notebook' path component
base_dir = os.getcwd()
code_dir, data_dir = (base_dir.replace('notebook', sibling) #$experiment/code/, $experiment/data/
                      for sibling in ('code', 'data'))
# result_dir may point to your $experiment/results/ instead
result_dir = re.sub('notebook', f"archive/{inference_mode}", base_dir)
#---------------------------------------------------------------------------

# File names depend on whether in-situ regression is being analysed
if "in-situ-regression" in inference_mode:
    inference_mode_prefix = "insitu_"
    blocks_csv = 'blocks_insitu_tagged.csv'
else:
    inference_mode_prefix = ""
    blocks_csv = 'blocks_to_estimate_tagged.csv'

# Domain properties (the '# mA' header is normalised to 'mA')
gdp_csv = os.path.join(data_dir, inference_mode_prefix + "domain_samples_summary.csv")
df_gdp = pd.read_csv(gdp_csv).rename(columns={'# mA': 'mA'})

def data_available_for(inference_period, domain_id):
    """Return True when df_gdp has a row for this (inference period, domain) pair."""
    same_period = df_gdp['mA'] == inference_period
    same_domain = df_gdp['domain'] == domain_id
    return bool((same_period & same_domain).any())

# Column vectors taken from the domain summary table
inference_periods, domain_ids, n_blastholes, n_inference_pts = (
    df_gdp[column].values
    for column in ('mA', 'domain', 'n_blastholes', 'n_inference_pts')
)
uniq_inference_periods = np.unique(inference_periods)
uniq_domain_ids = np.unique(domain_ids)

# Performance statistics: one example file supplies the list of model names
example_stats_csv = os.path.join(result_dir, 'learning_rotated/04_05_06/analysis-2310.csv')
df_example = pd.read_csv(example_stats_csv, index_col=0, header=0)
model_names = list(df_example.index)
# Numeric suffixes of the 'GP_CRF_from_<n>' models; the largest fixes the exponent count
two_powers = [int(name.replace('GP_CRF_from_', ''))
              for name in model_names
              if 'CRF_from' in name]
n_exponents = int(np.log2(max(two_powers)))

In [None]:
# Some definitions for computing semi-variograms
specs = {}
variogram_use_nugget = specs.get('variogram:use_nugget', False)
variogram_max_lag = specs.get('variogram:max_lag', 250.0)
variogram_num_lag = specs.get('variogram:num_lag', 45)
variogram_required_samples = specs.get('variogram:required_samples', 30)
variogram_model = specs.get('kriging:covariance_fn', 'matern')
fixed_nu = specs.get('kriging:matern_smoothness', None)
max_range = 2 * variogram_max_lag

# Optional user-supplied fit bounds. When left as None and a Matern smoothness
# is fixed, bounds are derived per data set inside make_variogram (the previous
# module-level computation referenced an undefined `max_var` and raised NameError).
constraints = None


def variogram_fit_bounds(max_var, nu=None, use_nugget=None, range_limit=None):
    """
    Build (lower, upper) parameter bounds that pin the Matern smoothness.

    :param max_var: upper limit for the sill; half of it caps the nugget
    :param nu: smoothness to pin within +/- 0.0001 (default: fixed_nu)
    :param use_nugget: keep the nugget entry unless this is False (default: variogram_use_nugget)
    :param range_limit: upper limit for the effective range (default: max_range)
    :return: tuple (lower, upper) of lists ordered as [range, sill, smoothness(, nugget)],
             or None when no smoothness is fixed
    """
    nu = fixed_nu if nu is None else nu
    use_nugget = variogram_use_nugget if use_nugget is None else use_nugget
    range_limit = max_range if range_limit is None else range_limit
    if not nu:
        return None
    lower = [0., 0., nu - 0.0001, 0.]
    upper = [range_limit, max_var, nu + 0.0001, 0.5 * max_var]
    if use_nugget is False:
        # no nugget parameter is fitted, so drop its bound
        lower, upper = lower[:-1], upper[:-1]
    return (lower, upper)


def make_variogram(x, y):
    """
    Fit a semi-variogram to values y observed at coordinates x.

    :param x: sample coordinates
    :param y: sample values
    :return: skgstat Variogram configured with the settings above
    """
    bounds = constraints
    if bounds is None and fixed_nu:
        # NOTE(review): the sample variance is used as the sill limit because the
        # intended definition of max_var is not visible in this notebook - confirm
        bounds = variogram_fit_bounds(np.nanvar(y))
    return skg.Variogram(
               coordinates=x,
               values=y,
               estimator='matheron',
               model=variogram_model,
               use_nugget=variogram_use_nugget,
               maxlag=max_range,
               n_lags=variogram_num_lag,
               fit_bounds=bounds)

# Extend results to reference model(s)
def compute_results_for_reference_models(attribute, mA, domain_id):
    """
    Compute a statistic for the long-range reference model in one domain and period.

    The block estimates (column 'cu_50') are compared with the ground truth
    (column 'cu_bh_nn') read from the blocks CSV of the given inference period.

    :param attribute: 'RMSE', one of 'h(psChi2)', 'h(JS)', 'h(IOU)', 'h(EM)',
                      or 'Variogram Ratios'; any other value yields NaN
    :param mA: inference period, used to locate the '<mA>_<mA+1>_<mA+2>' data folder
    :param domain_id: geological domain to select from the 'domain' column
    :return: scalar statistic, or NaN when the domain has no blocks, the
             attribute is unsupported, or the variogram cannot be computed
    """
    # Specify scope
    mA_mB_mC = '%02d_%02d_%02d' % (mA, mA+1, mA+2)
    data_path = os.path.join(data_dir, mA_mB_mC)
    df_k = pd.read_csv(f"{data_path}/{blocks_csv}")
    df_domain_infer = pd.DataFrame(df_k[df_k['domain'] == domain_id])
    if df_domain_infer.empty:
        # a scalar keeps the return type consistent with every other path
        return np.nan
    mu_0 = df_domain_infer['cu_bh_nn'].values
    # Retrieve grades for the long-range model
    gt_long_range = df_domain_infer['cu_50'].values
    # only blocks with a finite long-range estimate take part in the comparison
    retain = np.isfinite(gt_long_range)
    result = np.nan
    # Compute relevant statistic
    if attribute == 'RMSE':
        result = compute_root_mean_squared_error(mu_0[retain], gt_long_range[retain])
    elif 'h(' in attribute and attribute != 'h(rank)':
        name_of = {'h(psChi2)': 'psym-chi-square',
                   'h(JS)': 'jensen-shannon',
                   'h(IOU)': 'ruzicka',
                   'h(EM)': 'wasserstein',
                   'h(rank)': None}
        histogram_stats = compute_histogram_statistics(mu_0[retain], gt_long_range[retain])
        result = histogram_stats[name_of[attribute]]
    elif attribute == 'Variogram Ratios':
        x = df_domain_infer[['X','Y','Z']].values
        names = ['GroundTruth(blocks)', 'RTK_LongRangeModel']
        grades = [mu_0, gt_long_range]
        if len(mu_0) >= variogram_required_samples:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                try:
                    vgram = OrderedDict()
                    for k, y in zip(names, grades):
                        finite = np.isfinite(y)
                        vgram[k] = make_variogram(x[finite], y[finite])
                    # Stack the variograms and keep lags defined for both
                    v = np.vstack([vgram[k].experimental for k in names])
                    v = v[:, np.isfinite(np.sum(v, axis=0))]
                    # only compute 50th percentile of the model/groundtruth ratios
                    result = np.median(v[1] / v[0])
                except Exception:
                    pass # Ignore "All-NaN slice encountered" or "All input values are the same"
    return result

In [None]:
#group models by logical partition [SK_like | OK_like | GP(G)_like | GP(L)_like]
# each family takes two leading entries followed by its run of k sequential entries
j, k = 8, n_exponents
ordered_model_names = [
    name
    for start, stop in ((0, 2), (j, j+k),
                        (2, 4), (j+k, j+2*k),
                        (6, 8), (j+3*k, j+4*k),
                        (4, 6), (j+2*k, j+3*k))
    for name in model_names[start:stop]
]

# Matrix dimensions: one row per model, one column per (period, domain) pair
n_gd = len(uniq_domain_ids)
n_rows = len(df_example)
n_columns = len(uniq_inference_periods) * n_gd

# Map "<L><G><P><R>" domain IDs to alphabets
domain_symbols = {gd: letter for gd, letter in zip(uniq_domain_ids, string.ascii_uppercase)}

'''
Implementation of a custom image visualisation function
'''
def imshow_matrix_common(rspace, attribute, colourmap, cfg=None):
    """
    @brief  Render an attribute matrix M as a colour image,
            M.shape=(rows,cols) where rows = #model, cols = (#domain * #inference_period)
    @param  rspace     (str) results sub-directory under result_dir, e.g. 'learning_rotated'
    @param  attribute  (str) column read from each "analysis-<domain>.csv" file
    @param  colourmap  colour map handed to plt.imshow
    @param  cfg        (dict) options listed below; a new dict is used when None.
                       The dict is updated in place with f"results:{rspace}_{attribute}"
    @return None after plotting. When 'skip_plot' is True, the matrix M
            (or cfg if 'return_config_only' is True)
    @detail Options available via cfg dict
     - 'group_by' (str) ['domain,inference', 'inference,domain']
            Outer and inner FOR loops w.r.t. x axis, whether to have
            geological domain or inference period as the major category
     - 'include_reference_models' (bool)
            Add an extra row for the non-probabilistic LongRangeModel
     - 'v_min', 'v_max' (float)
            Set value limits for the data and colour bar
     - 'eps' (float) default: 1e-6
            Added to the denominator when 'compute_ratio' is used
     - 'hide_unmodelled_columns' (bool)
            Remove (domain,period) columns where groundtruth is unavailable (modelling is not done)
     - 'decorate_nans' (bool)
            Use marker to represent NaNs to distinguish them from high-intensity values
     - 'fontsize' (int)
            Font size applicable to the x-axis, y-axis and the graph title
     - 'title' (str)
            Heading shown at the top of the figure
     - 'colorbar_title' (str)
            Optional label for the colour bar (default: "")
     - 'compute_ratio' (str) variable abbreviated as k
            If k is non-empty, the result for the current attribute will be divided by
            a previously calculated attribute matrix, cfg[f"results:{k}"]['M'], and
            rendered as an image.
     - 'skip_plot' (bool) default: False
            Enabled by user with the intent of obtaining the attribute matrix only
     - 'return_config_only' (bool) default: False
            Used in the first-pass to retrieve the updated config dictionary populated
            with f"results:{rspace}_{attribute}" instead of the matrix M when 'skip_plot'
            is True. The intent is to contrast the results with rspace='learning_rotated'
            and rspace='not_rotated' and plotting their ratios in a subsequent call with
            'compute_ratio' set to f"{rspace}_{attribute}"
    """
    # A fresh dict per call: a mutable default would be shared between calls
    if cfg is None:
        cfg = {}
    col = 0
    n_ip = len(uniq_inference_periods)
    n_gd = len(uniq_domain_ids)
    include_reference_models = cfg.get('include_reference_models', False)
    # ('BenchAboveModel' is currently not part of the reference rows)
    non_probabilistic_models = ['LongRangeModel']
    total_rows = n_rows + len(non_probabilistic_models) if include_reference_models else n_rows
    M = np.nan * np.ones((total_rows, n_columns))
    group_by = cfg.get('group_by', 'domain,inference')
    v_min = cfg.get('v_min', -np.inf)
    v_max = cfg.get('v_max', np.inf)
    eps = cfg.get('eps', 1e-6)
    modelling_required = []

    if group_by == 'domain,inference':
        outer_vars = uniq_domain_ids
        inner_vars = uniq_inference_periods
    else:
        outer_vars = uniq_inference_periods
        inner_vars = uniq_domain_ids

    for vo in outer_vars:

        for vi in inner_vars:
            gd = vo if group_by == 'domain,inference' else vi
            mA = vi if group_by == 'domain,inference' else vo
            modelling_required.append(data_available_for(mA, gd))
            mA_mB_mC = "%02d_%02d_%02d" % (mA, mA+1, mA+2)
            analysis_csv = os.path.join(result_dir, rspace, mA_mB_mC, f"analysis-{gd}.csv")
            try:
                df = pd.read_csv(analysis_csv, index_col=0, header=0)
                df.rename(columns={'Likelihood': 'Consensus'}, inplace=True)
                M[:n_rows, col] = np.minimum(np.maximum(
                                  [df.loc[m, attribute] for m in ordered_model_names], v_min), v_max)
                if include_reference_models:
                    local_attribute = 'Variogram Ratios' if attribute == 'Spatial Fidelity' else attribute
                    values = compute_results_for_reference_models(local_attribute, mA, gd)
                    if attribute == 'Spatial Fidelity':
                        values = np.sqrt(1 - np.abs(np.minimum(values, 2) - 1))
                    M[n_rows:, col] = np.minimum(np.maximum(values, v_min), v_max)
            except Exception:
                # Best effort: a missing or incomplete analysis file leaves the column as NaN.
                # KeyboardInterrupt/SystemExit are deliberately not intercepted.
                pass
            col += 1

    all_ordered_model_names = ordered_model_names + (non_probabilistic_models if include_reference_models else [])
    all_model_names_for_display = [x + ('*' if x in non_probabilistic_models else '') for x in all_ordered_model_names]

    # (Option) Hide columns where there is no groundtruth data (no modelling is done)
    frac_columns_removed = 0.0
    if cfg.get('hide_unmodelled_columns', False):
        # - truncate matrix by discarding columns with no data, less distraction for the viewer
        modelling_required = np.array(modelling_required, dtype=bool)
        frac_columns_removed = np.count_nonzero(~modelling_required) / n_columns
        M = M[:, modelling_required]

    # Make data available to user via the cfg dictionary
    cfg[f"results:{rspace}_{attribute}"] = {'ordered_model_names': all_ordered_model_names, 'M': M}

    # (Option) Compute ratio using a previously calculated attribute matrix
    if cfg.get('compute_ratio', False):
        k = cfg['compute_ratio']
        # Not done in place: M is the array just stored in cfg and must stay intact
        M = M / (cfg[f"results:{k}"]['M'] + eps)
        M = np.minimum(np.maximum(M, v_min), v_max)
        cfg['skip_plot'] = False # automatically disabled

    if cfg.get('skip_plot', False):
        return cfg if cfg.get('return_config_only', False) else M

    fig = plt.figure(figsize=(15,10)) #width, height
    im = plt.imshow(M, cmap=colourmap, aspect=2)

    # (Option) Mark NaN locations with a distinctive marker to distinguish from bright intensity values
    if cfg.get('decorate_nans', False):
        pos = np.where(np.isnan(M))
        plt.scatter(pos[1], pos[0], s=4, c='k', marker='o')

    # Horizontal dividers after every group of (2 + n_exponents) rows
    for i in range(int(M.shape[0] / (2 + n_exponents))):
        plt.plot([-0.5, (M.shape[1]-1)+0.5], [(i+1)*(2+n_exponents)-0.5]*2, 'w')

    ax = plt.gca()
    _ = ax.set_xticks(np.arange(M.shape[1]))
    _ = ax.set_xticklabels([]) #leave xtick labels as blank to avoid overcrowding
    _ = ax.set_yticks(np.arange(M.shape[0]))
    _ = ax.set_yticklabels(all_model_names_for_display, fontsize=8)
    plt.text(0.0, 1.005,'* denotes non-probabilistic models', fontsize=8,
             horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes)

    # Zigzag placement of primary x-labels
    if group_by == 'domain,inference':
        # inner-axis describes inference
        complete_inner_labels = list(uniq_inference_periods) * len(uniq_domain_ids)
        x1_spacing_odd = '     '
    else:
        # inner-axis describes domain
        complete_inner_labels = list(uniq_domain_ids) * len(uniq_inference_periods)
        x1_spacing_odd = '         '

    # - Make adjustment if columns have been discarded
    if cfg.get('hide_unmodelled_columns', False):
        complete_inner_labels = [x for i,x in enumerate(complete_inner_labels) if modelling_required[i]]

    complete_inner_ticks = np.arange(M.shape[1])
    desc_x1_even = complete_inner_labels[::2]
    desc_x1_odd = [f"{x}{x1_spacing_odd}" for x in complete_inner_labels[1::2]]
    ticks_x1_even = complete_inner_ticks[::2]
    ticks_x1_odd = complete_inner_ticks[1::2]
    prim_even = ax.secondary_xaxis(location=0)
    prim_even.set_xticks(ticks_x1_even, labels=desc_x1_even, ha='center', fontsize=8, rotation='vertical')
    prim_even.tick_params('x', length=0.05)
    prim_odd = ax.secondary_xaxis(location=0)
    prim_odd.set_xticks(ticks_x1_odd, labels=desc_x1_odd, ha='center', fontsize=8, rotation='vertical')
    prim_odd.tick_params('x', length=0.05)

    # Nested secondary x-labels (outer-axis)
    x2_vert_spacing = '\n\n\n' if group_by == 'domain,inference' else '\n\n\n\n\n'
    xlabel_vert_spacing = '\n\n\n\n' if group_by == 'domain,inference' else '\n\n\n\n\n'
    desc_x2 = [f"{x2_vert_spacing}{x}" for x in outer_vars]
    if group_by == 'domain,inference':
        ticks_x2 = [n_ip/2. + i * n_ip for i in range(len(uniq_domain_ids))]
        ticks_line_x2 = np.array([0] + [(i+1) * n_ip for i in range(len(uniq_domain_ids))]) - 0.5
        outer_axis_y_offset = -0.07 * (1 - frac_columns_removed)
        x_desc = '(Inner) Inference Period / (Outer) Domain Label'
    else:
        ticks_x2 = [n_gd/2. + i * n_gd for i in range(len(uniq_inference_periods))]
        ticks_line_x2 = np.array([0] + [(i+1) * n_gd for i in range(len(uniq_inference_periods))]) - 0.5
        outer_axis_y_offset = -0.12 * (1 - frac_columns_removed)
        x_desc = '(Inner) Domain Label / (Outer) Inference Period'

    # - Make adjustment to horizontal spacing if columns have been discarded
    if cfg.get('hide_unmodelled_columns', False):
        discard = 1 - np.array(modelling_required, dtype=int)
        xspace_reduction = np.array([sum(discard[:int(x+0.5)]) for x in ticks_line_x2[1:]])
        ticks_line_x2 -= np.hstack([[0], xspace_reduction])
        ticks_x2 = [(left + right)/2. for left, right in zip(ticks_line_x2[:-1], ticks_line_x2[1:])]

    sec = ax.secondary_xaxis(location=0)
    sec.set_xticks(ticks_x2, labels=desc_x2, ha='center')
    sec.tick_params('x', length=0)

    # lines between outer labels
    sec2 = ax.secondary_xaxis(location=outer_axis_y_offset)
    sec2.set_xticks(ticks_line_x2, labels=[])
    sec2.tick_params('x', length=18, width=1)

    fs = cfg.get('fontsize', 12)
    plt.xlabel(f"{xlabel_vert_spacing}{x_desc}", fontsize=fs)
    plt.ylabel('Probabilistic Model', fontsize=fs)
    plt.title(cfg.get('title', ''), y=1.01, fontsize=fs)
    plt.colorbar(im, fraction=0.025, pad=0.02, label=cfg.get('colorbar_title',''))

def imshow_matrix_group_by_inference_domain(rspace, attribute, colourmap, cfg=None):
    """
    Render the attribute matrix with inference period as the outer x-axis category.

    :param cfg: options dict forwarded to imshow_matrix_common; its 'group_by'
                entry is overwritten in place. A new dict is created when None,
                so repeated calls no longer share one default dictionary.
    """
    if cfg is None:
        cfg = {}
    cfg['group_by'] = 'inference,domain'
    imshow_matrix_common(rspace, attribute, colourmap, cfg)

def imshow_matrix_group_by_domain_inference(rspace, attribute, colourmap, cfg=None):
    """
    Render the attribute matrix with geological domain as the outer x-axis category.

    :param cfg: options dict forwarded to imshow_matrix_common; its 'group_by'
                entry is overwritten in place. A new dict is created when None,
                so repeated calls no longer share one default dictionary.
    """
    if cfg is None:
        cfg = {}
    cfg['group_by'] = 'domain,inference'
    imshow_matrix_common(rspace, attribute, colourmap, cfg)

def create_inverted_colormap(gamma=1, levels=256, monochrome=False, lscm_upper=None, lscm_lower=None,
                             flip_upper=False, flip_lower=False):
    """
    Build a listed colour map from two gamma-warped half palettes (lower, then upper)
    :param gamma: exponent applied to the sampling positions in [0,1]
                  (< 1 to expand, > 1 to compress small values nearest to zero)
    :param levels: discretisation (number of colours in palette); each half gets levels//2
    :param monochrome: keep only the lower half palette
    :param lscm_upper: colour map sampled for the upper half (default: mpl.cm.Reds)
    :param lscm_lower: colour map sampled for the lower half (default: mpl.cm.Blues)
    :param flip_upper: reverse the row order of the upper half
    :param flip_lower: reverse the row order of the lower half
    :return: matplotlib.colors.ListedColormap
    """
    half = levels // 2
    grid = np.linspace(0, 1, half)
    warped = grid**gamma
    source_upper = mpl.cm.Reds if lscm_upper is None else lscm_upper
    source_lower = mpl.cm.Blues if lscm_lower is None else lscm_lower
    rgba_upper = source_upper(np.linspace(0, 1, half))
    rgba_lower = source_lower(np.linspace(0, 1, half))
    # Resample every RGBA channel at the warped positions; the upper half is reversed
    upper = np.column_stack([np.interp(warped, grid, rgba_upper[:, channel])[::-1]
                             for channel in range(4)])
    lower = np.column_stack([np.interp(warped, grid, rgba_lower[:, channel])
                             for channel in range(4)])
    if flip_upper:
        upper = upper[::-1]
    if flip_lower:
        lower = lower[::-1]
    palette = np.r_[lower] if monochrome else np.r_[lower, upper]
    return mpl.colors.ListedColormap(palette)

def fetch_property(rspace, attribute, cfg):
    """
    Call imshow_matrix_common with 'skip_plot' enabled on a deep copy of cfg,
    so the caller's dictionary is not modified, and return its result.
    """
    silent_cfg = copy.deepcopy(cfg)
    silent_cfg.update({'skip_plot': True})
    placeholder_cmap = plt.get_cmap('Blues_r')
    return imshow_matrix_common(rspace, attribute, placeholder_cmap, silent_cfg)

In [None]:
# Convenient function for significance testing
def t_test(x, y, which='dependent-t', ha='greater'):
    '''
    Run a dependent (paired) t-test or Welch's t-test on x and y, omitting NaNs
    :param which: 'welch' selects Welch's test (independent samples, unequal
                  variance); any other value selects the dependent t-test
    :param ha: alternative hypothesis type, one of ['less', 'greater', 'two-sided']
    :return: (t-statistic, p-value, degree-of-freedom, CI.lower, CI.upper)
             statistic, p-value and degree-of-freedom come from the test under `ha`,
             with the p-value limited to [0.0001, 0.9999]; the 95% confidence
             interval comes from the two-sided test
    '''
    def run(alternative):
        if which == 'welch': #assumes independence/unequal variance
            return ttest_ind(x, y, nan_policy='omit', equal_var=False, alternative=alternative)
        return ttest_rel(x, y, nan_policy='omit', alternative=alternative)

    two_sided = run('two-sided')
    directional = run(ha)
    interval = two_sided.confidence_interval(confidence_level=0.95)
    clamped_pval = min(max(directional.pvalue, 0.0001), 0.9999)
    return (directional.statistic,
            np.round(clamped_pval, 6),
            directional.df,
            np.round(interval.low, 6),
            np.round(interval.high, 6))

### Visualise Cu grade histograms in different domains and inference periods

In [None]:
# Cache (pickle) and rendered figure live in the working directory
histogram_matrix_pfile = os.path.join(os.getcwd(), 'histogram_matrix.p')
histogram_matrix_png = os.path.join(os.getcwd(), 'histogram_matrix.png')

show_jsd_similarity = True

# n_bins regular bins on [0.05, 1.6] plus one open-ended bin on either side
n_bins = 40
bin_edges = np.concatenate(([-np.inf], np.linspace(0.05, 1.6, n_bins+1), [np.inf]))
delta = np.diff(bin_edges[1:-1])
centroids = (bin_edges[1:-2] + bin_edges[2:-1]) / 2
# the open-ended bins are represented one bin width beyond the outermost centroids
bin_representative_values = np.concatenate(([centroids[0] - delta[0]],
                                            centroids,
                                            [centroids[-1] + delta[-1]]))
w = np.median(np.diff(bin_representative_values))

# Containers keyed by "<mA>:<domain>"
hist_training, hist_true_inference, training_groundtruth_hist_similarity = {}, {}, {}
eps = 1e-9

# load the plot if it was previously saved
if os.path.exists(histogram_matrix_png):
    img = Image(filename=histogram_matrix_png)
    display(img)

else: # create the plot from scratch
    if os.path.exists(histogram_matrix_pfile):
        # NOTE: pickle must only be used on this locally generated cache file
        with open(histogram_matrix_pfile, 'rb') as f:
            (hist_training, hist_true_inference, training_groundtruth_hist_similarity) = pickle.load(f)
    else:
        for mA in uniq_inference_periods:
            mA_mB_mC = "%02d_%02d_%02d" % (mA, mA+1, mA+2)
            # both files depend on the inference period only: read them once per period
            data_path = os.path.join(data_dir, mA_mB_mC)
            df_bh = pd.read_csv(f"{data_path}/blastholes_tagged.csv")
            df_bh = df_bh.rename(columns = {'EAST': 'X', 'NORTH': 'Y', 'RL': 'Z', 'PL_CU': 'V'})
            df_k = pd.read_csv(f"{data_path}/{blocks_csv}")
            for gd in uniq_domain_ids:
                # grade values from training data and groundtruth data for inferenced blocks
                df_domain_bh = df_bh[(df_bh['lookup_domain'] == gd) & np.isfinite(df_bh['V'].values)]
                training_grades = df_domain_bh['V'].values
                df_domain_k = pd.DataFrame(df_k[df_k['domain'] == gd])
                inference_true_grades = df_domain_k['cu_bh_nn'].values
                # compute and normalise histograms
                counts_t, _ = np.histogram(training_grades, bin_edges)
                counts_i, _ = np.histogram(inference_true_grades, bin_edges)
                key = f"{mA}:{gd}"
                hist_training[key] = counts_t / (sum(counts_t) + eps)
                hist_true_inference[key] = counts_i / (sum(counts_i) + eps)
                # similarity is only defined when both histograms are non-empty
                if sum(hist_training[key]) > 0 and sum(hist_true_inference[key]) > 0:
                    training_groundtruth_hist_similarity[key] = 1 - jensenshannon(counts_t, counts_i)
        with open(histogram_matrix_pfile, 'wb') as hdl:
            variables = [hist_training, hist_true_inference, training_groundtruth_hist_similarity]
            pickle.dump(variables, hdl, protocol=4)

    # make graphs
    f_width, f_height = 15., 12. # ideally, width=15, height=20 (but we will compress the y-axis to save space)
    fig = plt.figure(figsize=(f_width, f_height))
    count = 0

    sim = training_groundtruth_hist_similarity
    x_max = np.max(bin_representative_values)
    for mA in uniq_inference_periods:
        mA_mB_mC = "%02d_%02d_%02d" % (mA, mA+1, mA+2)
        for gd in uniq_domain_ids:
            key = f"{mA}:{gd}"
            # Convention: blue bars = training data, black rectangles = ground truth
            plt.subplot(len(uniq_inference_periods), len(uniq_domain_ids), count+1) #rows, columns
            plt.bar(bin_representative_values, hist_training[key], edgecolor=None, width=w)
            plt.bar(bin_representative_values, hist_true_inference[key], facecolor=None, fill=False, edgecolor='k', width=w)
            # Add annotation to show degree of similarity between training data and groundtruth distributions
            if show_jsd_similarity:
                annot = "%.1f%%" % (100 * sim[key]) if np.isfinite(sim.get(key,np.nan)) else ""
                y_max = np.max(np.r_[hist_training[key], hist_true_inference[key]])
                plt.text(x_max, y_max, annot, fontsize=7, color='#7f7f7f', ha='right', va='top')
                # the explanatory note is attached once, to the bottom-left panel
                if mA == uniq_inference_periods[-1] and gd == uniq_domain_ids[0]:
                    explanation = '\n\n\n% indicates JSD similarity between ' \
                                  'training data and groundtruth distributions'
                    axL = plt.gca().secondary_xaxis(location=0)
                    axL.set_xticks([0], labels=[explanation], fontsize=9, color='#7f7f7f', ha='left')
            count += 1
            if gd == uniq_domain_ids[0]:
                plt.ylabel(str(mA))
            plt.gca().set_yticks([])
            if mA == uniq_inference_periods[-1]:
                plt.xlabel(str(gd))
            else:
                plt.gca().set_xticks([])

    fig.supylabel('Inference Period', fontsize=14)
    fig.supxlabel('Geological Domain', y=-0.01, fontsize=14)
    # backslashes are doubled so the mathtext spacing commands are valid string escapes
    plt.suptitle("$\\bf{Copper\\ Grade\\ Probability\\ Mass\\ Functions}$\n"
                 "Histograms: Blue represents training data. Black represents groundtruth for predicted blocks",
                 y=1.0, fontsize=14)
    # - make grids super-tight
    plt.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95, wspace=0.05, hspace=0.05)

    for fmt in ['pdf', 'png']:
        plt.savefig(os.path.join(os.getcwd(), f"histogram_matrix.{fmt}"), bbox_inches='tight', pad_inches=0.05)

### Jensen-Shannon (JS) Histogram Difference Matrix
- Graph 1: Grouped by Inference Period, then Geological Domain
- Graph 2: Grouped by Geological Domain, then Inference Period

In [None]:
#===============================
rspace, attribute = 'learning_rotated', 'h(JS)'
# work on a copy so the NaN colour does not alter the registered colour map
cmapBl = plt.get_cmap('Blues_r').copy()
cmapBl.set_bad(color='#7f7f7f')
cfg = dict(title=r"Jensen-Shannon histogram distances $h_{JS}$ (model vs groundtruth)",
           fontsize=14)
#===============================

# Order experiments by Inference Period (outer loop: mA), then by Domain (inner loop: gd)
imshow_matrix_group_by_inference_domain(rspace, attribute, cmapBl, cfg)

In [None]:
cmapBl = create_inverted_colormap(gamma=0.75, monochrome=True, lscm_lower=mpl.cm.Blues, flip_lower=True)
cmapBl.set_bad(color='#dddddd')

# Style 1: Use distinctive marker to represent NaNs to differentiate from high intensity values
cfg = dict(title=r"Jensen-Shannon histogram distances $h_{JS}$ (model vs groundtruth)",
           fontsize=14,
           colorbar_title=attribute,
           decorate_nans=True,
           include_reference_models=True)
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapBl, cfg)

# Style 2: Remove combinations where groundtruth is not available and modelling is not done
#          Shift x-labels and line separator accordingly
cfg = dict(title=r"Jensen-Shannon histogram distances $h_{JS}$ (model vs groundtruth)",
           fontsize=14,
           colorbar_title=attribute,
           hide_unmodelled_columns=True,
           include_reference_models=True)
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapBl, cfg)
plt.savefig(os.path.join(os.getcwd(), "image-jensen-shannon-hist-dist.pdf"),
            bbox_inches='tight', pad_inches=0.05)

### <font color="#7f7f7f">Experimental:</font> Bellman-Ford min-cost path via Dijkstra algorithm

In [None]:
import skimage.graph

def find_min_cost_path(rspace, attribute, cmap, cfg):
    """
    Trace least-cost left-to-right routes through the attribute matrix and
    overlay the best ones on the matrix image.

    Routes start in the first column and end in the last column, with both end
    rows taken from the same group of (2 + n_exponents) consecutive rows.
    Options read from cfg:
     - 'dijkstra:cost_fn' (callable) transformation applied to the matrix
     - 'nan_cost' (float) value given to non-finite cells (default: nan-mean)
     - 'dijkstra:minimise' (bool) when False the costs are inverted (default: True)
     - 'dijkstra:categorical_judgment' (bool) keep one route per row group
            instead of one route per start row
     - 'num_paths' (int) number of routes drawn, limited to 1..6 (default: 3)
    """
    categorical = cfg.get('dijkstra:categorical_judgment', False)

    # Cost matrix
    cost = fetch_property(rspace, attribute, cfg)
    # - apply custom cost function (if specified)
    cost_fn = cfg.get('dijkstra:cost_fn', None)
    if cost_fn is not None:
        cost = cost_fn(cost)
    # - rectify cells with undefined values
    cost[~np.isfinite(cost)] = cfg.get('nan_cost', np.nanmean(cost))
    # - option to maximise instead of minimise
    if cfg.get('dijkstra:minimise', True):
        objective = 'min'
    else:
        cost = np.max(cost) - cost + 1
        objective = 'max'

    # Find optimal passage for every (start row, end row) pair within a group
    group_size = 2 + n_exponents
    last_column = cost.shape[1] - 1
    paths, costs = {}, {}
    for group in range(int(cost.shape[0] / group_size)):
        members = range(group * group_size, (group + 1) * group_size)
        for first_row in members:
            for final_row in members:
                label = f"{first_row},{final_row}"
                paths[label], costs[label] = skimage.graph.route_through_array(
                    cost, start=(first_row, 0), end=(final_row, last_column),
                    fully_connected=True)

    # Rank paths by cost in ascending order
    keys = list(paths.keys())
    ranks = np.argsort([costs[label] for label in keys])
    ordered_keys = [keys[r] for r in ranks]
    if not categorical:
        best = ordered_keys[0]
        print(f"{attribute}: {objective}_key={best}, {objective}_cost={costs[best]}")

    # Prune preferences - retain only the best option for each unique source node
    visited_sources = set()
    uniq_source_key_preferences = []
    for label in ordered_keys:
        src = int(label.split(',')[0])
        if src in visited_sources:
            continue
        uniq_source_key_preferences.append(label)
        if categorical:
            # discount remaining options in the current group
            # instead, determine best path for other categories
            group = int(np.floor(src / group_size))
            visited_sources.update(range(group * group_size, (group + 1) * group_size))
        else:
            visited_sources.add(src)

    # Specify number of paths to display
    nD = min(max(cfg.get('num_paths', 3), 1), 6)
    imshow_matrix_group_by_domain_inference(rspace, attribute, cmap, cfg)
    line_colours = ['#fcf403','#fcba03','#fca903','#fc9003','#fc8003','#fc6703']
    for order, (label, rgb) in enumerate(zip(uniq_source_key_preferences[:nD], line_colours)):
        yx = np.array(paths[label])
        plt.plot(yx[:,1], yx[:,0], '.-', color=rgb, linewidth=3-order*0.5)
        if categorical:
            print(f"{attribute}: {objective}_key={label}, {objective}_cost={costs[label]}")

In [None]:
# Draw the single cheapest route over the Jensen-Shannon matrix
cfg_dijk = dict(title=r"Jensen-Shannon histogram distances $h_{JS}$ min-cost path",
                hide_unmodelled_columns=True,
                include_reference_models=False,
                num_paths=1)
find_min_cost_path(rspace, 'h(JS)', cmapBl, cfg_dijk)

### Wasserstein (EM) Histogram Difference Matrix

Henceforth, grouped by geological domain, then inference period

In [None]:
#===============================
rspace, attribute = 'learning_rotated', 'h(EM)'
cmapBl = create_inverted_colormap(gamma=2.0, monochrome=True, lscm_lower=mpl.cm.Blues, flip_lower=True)
cmapBl.set_bad(color='#7f7f7f')
# Reusing configuration to preserve results from previous run
cfg.update({'title': r"Wasserstein histogram distances $h_{EM}$ (model vs groundtruth)",
            'colorbar_title': attribute})
#===============================
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapBl, cfg)
plt.savefig(os.path.join(os.getcwd(), "image-wasserstein-hist-dist.pdf"),
            bbox_inches='tight', pad_inches=0.05)

In [None]:
def standard_error(x, fmt='%.6f'):
    """
    Format std/sqrt(n) computed over the finite entries of array x,
    where std is numpy's default (population) standard deviation.
    """
    flat = x.flatten()
    finite_values = flat[np.isfinite(flat)]
    error = np.std(finite_values) / np.sqrt(finite_values.size)
    return fmt % error

def mean_of(x, fmt='%.6f'):
    """Format the mean of x, ignoring NaNs."""
    average = np.nanmean(x)
    return fmt % average

def median_of(x, fmt='%.6f'):
    """
    @brief  Format the NaN-ignoring median of x with a printf-style pattern
    """
    middle = np.nanmedian(x)
    return fmt % middle

### Grade Prediction RMSE (model vs groundtruth)

In [None]:
#===============================
attribute = 'RMSE'
cmapBl = create_inverted_colormap(gamma=0.75, monochrome=True, lscm_lower=mpl.cm.Blues, flip_lower=True)
cmapBl.set_bad(color='#7f7f7f')
# Reusing configuration to preserve results from previous run
cfg['title'] = r"Grade Prediction Root Mean Squared Error"
cfg['colorbar_title'] = 'Uncorrelated RMS Error'
#===============================

imshow_matrix_group_by_domain_inference(rspace, attribute, cmapBl, cfg) #plt.grid('on')

# Matrices looked up from `cfg`. The h(EM) and h(JS) entries are expected to
# be present already, presumably from earlier cells that plotted those
# attributes with this same `cfg` -- verify if cells are run out of order.
h_EM = cfg[f"results:{rspace}_h(EM)"]['M']
h_JS = cfg[f"results:{rspace}_h(JS)"]['M']
d_RMSE = cfg[f"results:{rspace}_RMSE"]['M']

# Per-family flattened values, reused by the T-tests further down
cached_h_EM = dict()
cached_h_JS = dict()
cached_d_RMSE = dict()

# Each model family occupies a contiguous band of (n_exponents+2) matrix rows
model_families = ['SimpleKriging','OrdinaryKriging','GaussianProcess(G)-CRF','GaussianProcess(L)-SGS']

print("\nAverage histogram difference h(Jensen-Shannon)")
for i,cat in enumerate(model_families):
    values = h_JS[i*(n_exponents+2):(i+1)*(n_exponents+2),:]
    # Jensen-Shannon values belong in the JS cache (previously written to the EM cache)
    cached_h_JS[cat] = values.flatten()
    print(f"- {'{:<30}'.format(cat + ' models')} {mean_of(values)} +/- {standard_error(values)}")

print("\nAverage histogram difference h(Wasserstein.EM)")
for i,cat in enumerate(model_families):
    values = h_EM[i*(n_exponents+2):(i+1)*(n_exponents+2),:]
    # Wasserstein values belong in the EM cache (previously written to the JS cache)
    cached_h_EM[cat] = values.flatten()
    print(f"- {'{:<30}'.format(cat + ' models')} {mean_of(values)} +/- {standard_error(values)}")

print("\nAverage prediction RMSE")
for i,cat in enumerate(model_families):
    values = d_RMSE[i*(n_exponents+2):(i+1)*(n_exponents+2),:]
    cached_d_RMSE[cat] = values.flatten()
    print(f"- {'{:<30}'.format(cat + ' models')} {mean_of(values)} +/- {standard_error(values)}")

# Correlate only the cells where all three metrics are finite
vect_h_EM = h_EM.flatten()
vect_h_JS = h_JS.flatten()
vect_d_RMSE = d_RMSE.flatten()
valid = np.isfinite(vect_h_EM + vect_h_JS + vect_d_RMSE)
print("\nPearson correlation")
print(f"- rho(h(Wasserstein.EM),RMSE) = {np.corrcoef(vect_h_EM[valid], vect_d_RMSE[valid])[0,1]}\n")

# significance testing using the dependent T-test
# https://www.investopedia.com/terms/t/t-test.asp
# Every other family is compared against GaussianProcess(L)-SGS with ha='greater'
ft = lambda x: np.round(x,6)
print("Dependent T-test on h(Jensen-Shannon) w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_h_JS[cat], cached_h_JS['GaussianProcess(L)-SGS'], ha='greater')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

print("Dependent T-test on h(Wasserstein.EM) w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_h_EM[cat], cached_h_EM['GaussianProcess(L)-SGS'], ha='greater')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

print("Dependent T-test on prediction RMSE w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_d_RMSE[cat], cached_d_RMSE['GaussianProcess(L)-SGS'], ha='greater')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

### Spatial Fidelity Matrix based on Variogram-Ratios

In [None]:
rspace = 'learning_rotated'
attribute = 'Variogram Ratios'

# --- colour version: Reds as the lower map, Purples as the upper map ---
cmapBi = create_inverted_colormap(
    gamma=0.75,
    monochrome=False,
    lscm_upper=mpl.cm.Purples,  # YlOrBr
    lscm_lower=mpl.cm.Reds,
    flip_lower=False,
    flip_upper=False,
)
cmapBi.set_bad(color='#ffffff')
cfg_sf = {
    'title': r"Variogram Ratios (model vs groundtruth)",
    'v_min': 0,
    'v_max': 2.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': True,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapBi, cfg_sf)
plt.savefig(
    os.path.join(os.getcwd(), "image-variogram-ratios.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

# --- gray-scale version ---
# `cfg_sf` is rebound to a brand-new dict here, so the dict used for the
# colour plot is no longer reachable through this name afterwards.
attribute = 'Spatial Fidelity'
cmapK = create_inverted_colormap(
    gamma=1.5,
    monochrome=True,
    lscm_upper=mpl.cm.gray,
    lscm_lower=mpl.cm.gray,
    flip_lower=True,
    flip_upper=True,
)
cmapK.set_bad(color='#ffffff')
cfg_sf = {
    'title': r"Variogram-derived Spatial Fidelity (model vs groundtruth)",
    'v_min': -0.01,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': True,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapK, cfg_sf)
plt.savefig(
    os.path.join(os.getcwd(), "image-variogram-based-spatial-fidelity.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

In [None]:
# Path search on 'Spatial Fidelity' with minimisation switched off and a
# custom cost function sqrt(1 - |x - 1|), which is largest at x = 1.
cfg_dijk = {
    'title': r"Variogram-derived $Spatial\ Fidelity$ (model vs groundtruth)",
    'v_min': 0,
    'v_max': 2.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
    'num_paths': 1,
    'dijkstra:minimise': False,
    'dijkstra:cost_fn': lambda x: np.sqrt(1 - np.abs(x - 1)),
}
find_min_cost_path(rspace, 'Spatial Fidelity', cmapK, cfg_dijk)

In [None]:
# Want a symmetric score about x=1 that peaks there.
# The desired expression is y = sqrt(1 - |x-1|); on [0, 2] this is a concave
# (not convex) function. Inputs above 2 are clipped to 2, which maps them to 0.
attribute = 'Spatial Fidelity'

def cost_sf(x):
    return np.sqrt(1 - np.abs(np.minimum(x, 2) - 1))

# NOTE(review): both matrices come from the same 'Spatial Fidelity' entry of
# cfg_sf -- the "variogram ratios" are the stored values as-is and the
# "spatial fidelity" is cost_sf applied to them. Confirm the stored matrix
# really holds untransformed ratios.
C_variogram_ratios = cfg_sf[f"results:{rspace}_{attribute}"]['M']
C_spatial_fidelity = cost_sf(C_variogram_ratios)

cached_variogram_ratios = dict()
cached_spatial_fidelity = dict()
print("\nAverage Variogram Ratios")
for i,cat in enumerate(model_families):
    # family i occupies rows [i*(n_exponents+2), (i+1)*(n_exponents+2))
    cells = C_variogram_ratios[i*(n_exponents+2):(i+1)*(n_exponents+2)]
    cached_variogram_ratios[cat] = cells.flatten()
    print(f"- {cat + ' models':<30} {mean_of(cells)} +/- {standard_error(cells)}")

print("\nAverage Spatial Fidelity")
for i,cat in enumerate(model_families):
    cells = C_spatial_fidelity[i*(n_exponents+2):(i+1)*(n_exponents+2)]
    cached_spatial_fidelity[cat] = cells.flatten()
    print(f"- {cat + ' models':<30} {mean_of(cells)} +/- {standard_error(cells)}")

# significance testing using the dependent T-test
# (each remaining family vs GaussianProcess(L)-SGS, alternative 'less')
ft = lambda x: np.round(x,6)
print("Dependent T-test on Spatial Fidelity w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_spatial_fidelity[cat], cached_spatial_fidelity['GaussianProcess(L)-SGS'], ha='less')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

### <font color="#cc0044">Uncertainty-based Measures</font>

### Deutsch's Accuracy for Probabilistic Predictions

In [None]:
rspace = 'learning_rotated'
attribute = 'Accuracy(.05)'

cmapRd = create_inverted_colormap(
    gamma=1, monochrome=True, lscm_lower=mpl.cm.Reds, flip_lower=False)
cmapRd.set_bad(color='#7f7f7f')
cfg_a = {
    'title': r"Deutsch's Accuracy for Probabilistic Predictions",
    'v_min': 0,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapRd, cfg_a)

# Accuracy matrix read back from the config dict after plotting
um_A = cfg_a[f"results:{rspace}_{attribute}"]['M']

cached_A = dict()
print("\nAverage Accuracy of Probabilistic Predictions")
# Special note: we exclude rows labelled '*_[SGS|CRF]_from_{m}' where m=2,4,8
# (offsets 2, 3 and 4 within each family's band of n_exponents+2 rows)
mask = [True]*2 + [False]*3 + [True]*(n_exponents-3) 
for i,cat in enumerate(model_families):
    rows = i*(n_exponents+2) + np.arange(n_exponents+2)[mask]
    values = um_A[rows]
    cached_A[cat] = values.flatten()
    print(f"- {cat + ' models':<30} {mean_of(values)} +/- {standard_error(values)}")
plt.savefig(
    os.path.join(os.getcwd(), "image-distribution-accuracy.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

# significance testing using the dependent T-test
ft = lambda x: np.round(x,6)
print("Dependent T-test on Deutsch Accuracy w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_A[cat], cached_A['GaussianProcess(L)-SGS'], ha='less')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

### Conditional Precision for Probabilistic Predictions

In [None]:
rspace = 'learning_rotated'
attribute = 'Precision'

cmapRd = create_inverted_colormap(
    gamma=1, monochrome=True, lscm_lower=mpl.cm.Reds, flip_lower=False)
cmapRd.set_bad(color='#7f7f7f')
cfg_p = {
    'title': r"Conditional Precision for Probabilistic Predictions",
    'v_min': 0,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapRd, cfg_p)

# Precision matrix read back from the config dict after plotting
um_P = cfg_p[f"results:{rspace}_{attribute}"]['M']

cached_P = dict()
print("\nAverage Precision of Probabilistic Predictions")
# No rows are excluded here (every mask entry is True)
mask = [True]*2 + [True]*3 + [True]*(n_exponents-3)
for i,cat in enumerate(model_families):
    rows = i*(n_exponents+2) + np.arange(n_exponents+2)[mask]
    # The printed mean/SE only use cells whose accuracy (um_A, taken from the
    # Accuracy cell) is positive; cached_P keeps every cell.
    select = um_A[rows] > 0.0
    values = um_P[rows]
    cached_P[cat] = values.flatten()
    print(f"- {cat + ' models':<30} {mean_of(values[select])} +/- {standard_error(values[select])}")
plt.savefig(
    os.path.join(os.getcwd(), "image-distribution-precision.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

# significance testing using the dependent T-test
ft = lambda x: np.round(x,6)
print("Dependent T-test on Precision w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_P[cat], cached_P['GaussianProcess(L)-SGS'], ha='less')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

### Local Consensus for Probabilistic Predictions

In [None]:
rspace = 'learning_rotated'
attribute = 'Consensus'

cmapRd = create_inverted_colormap(gamma=0.5, monochrome=True,
                                  lscm_lower=mpl.cm.Reds, flip_lower=False)
cmapRd.set_bad(color='#7f7f7f')
cfg_l = {'title': r"Local Consensus for Probabilistic Predictions",
         'v_min': 0, 'v_max': 1., 'fontsize': 14, 'colorbar_title': attribute,
         'hide_unmodelled_columns': True, 'decorate_nans': True,
         'include_reference_models': False}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapRd, cfg_l)

# Consensus matrix read back from the config dict, plus the '|s|_L' and
# '|s|_U' properties that are reported next to it as qL / qU
um_L = cfg_l[f"results:{rspace}_{attribute}"]['M']
um_sL = fetch_property(rspace, '|s|_L', cfg_l)
um_sU = fetch_property(rspace, '|s|_U', cfg_l)

# No rows are excluded here (every mask entry is True)
mask = [True]*2 + [True]*3 + [True]*(n_exponents-3)
cached_L = dict()
print("\nLocal Consensus for Probabilistic Predictions")
for i,cat in enumerate(model_families):
    rows = np.arange(i*(n_exponents+2),(i+1)*(n_exponents+2))[mask]
    values = um_L[rows,:]
    cached_L[cat] = values.flatten()
    # The median is reported for consensus; qL and qU are means
    print(f"- {'{:<30}'.format(cat + ' models')} median={median_of(values)} "
          f"[qL={mean_of(um_sL[rows,:])}, qU={mean_of(um_sU[rows,:])}]")
plt.savefig(os.path.join(os.getcwd(), f"image-distribution-consensus.pdf"), bbox_inches='tight', pad_inches=0.05)

# significance testing using the dependent T-test
ft = lambda x: np.round(x,6)
print("Dependent T-test on Consensus w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_L[cat], cached_L['GaussianProcess(L)-SGS'], ha='less')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

# significance testing using Welch's T-test
# (header previously misspelt the test as "Walsh's"; the call uses which='welch')
print("Independent Welch's T-test on Consensus w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_L[cat], cached_L['GaussianProcess(L)-SGS'], ha='less', which='welch')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

### Goodness of Probabilistic Predictions

In [None]:
rspace = 'learning_rotated'
attribute = 'Goodness'

cmapRd = create_inverted_colormap(
    gamma=1, monochrome=True, lscm_lower=mpl.cm.Reds, flip_lower=False)
cmapRd.set_bad(color='#7f7f7f')
cfg_g = {
    'title': r"Goodness of Probabilistic Predictions",
    'v_min': 0,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapRd, cfg_g)

# Goodness matrix read back from the config dict after plotting
um_G = cfg_g[f"results:{rspace}_{attribute}"]['M']

# No rows are excluded here (every mask entry is True)
mask = [True]*2 + [True]*3 + [True]*(n_exponents-3)
cached_G = dict()
print("\nAverage Goodness of Probabilistic Predictions")
for i,cat in enumerate(model_families):
    rows = i*(n_exponents+2) + np.arange(n_exponents+2)[mask]
    values = um_G[rows]
    cached_G[cat] = values.flatten()
    print(f"- {cat + ' models':<30} {mean_of(values)} +/- {standard_error(values)}")
plt.savefig(
    os.path.join(os.getcwd(), "image-distribution-goodness.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

# significance testing using the dependent T-test
ft = lambda x: np.round(x,6)
print("Dependent T-test on Goodness w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_G[cat], cached_G['GaussianProcess(L)-SGS'], ha='less')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

In [None]:
# Settings for a path search over the Goodness matrix with minimisation
# switched off. The search call itself is currently disabled (see last line).
cfg_dijk = {
    'title': r"Goodness of Probabilistic Predictions",
    'v_min': 0,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
    'num_paths': 1,
    'dijkstra:minimise': False,
}
cmapGray = create_inverted_colormap(
    gamma=1, monochrome=True, lscm_lower=mpl.cm.gray, flip_lower=True)
#find_min_cost_path(rspace, 'Goodness', cmapGray, cfg_dijk)

### Tightness of Probabilistic Predictions (Normalised by $\sigma_Y$)

In [None]:
rspace = 'learning_rotated'
attribute = 'Tightness'

# Note: despite the name, this colormap is built from Blues
cmapRd = create_inverted_colormap(
    gamma=0.4, monochrome=True, lscm_lower=mpl.cm.Blues, flip_lower=True)
cmapRd.set_bad(color='#ffffff')
cfg_t = {
    'title': r"Tightness of Probabilistic Predictions (Normalised by $\sigma_Y$)",
    'v_min': 0,
    'v_max': 1.,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
}
imshow_matrix_group_by_domain_inference(rspace, attribute, cmapRd, cfg_t)

# Tightness matrix read back from the config dict after plotting
um_T = cfg_t[f"results:{rspace}_{attribute}"]['M']

# No rows are excluded here (every mask entry is True)
mask = [True]*2 + [True]*2 + [True]*(n_exponents-2) 
cached_T = dict()
print("\nAverage Tightness of Probabilistic Predictions")
for i,cat in enumerate(model_families):
    rows = i*(n_exponents+2) + np.arange(n_exponents+2)[mask]
    values = um_T[rows]
    cached_T[cat] = values.flatten()
    print(f"- {cat + ' models':<30} {mean_of(values)} +/- {standard_error(values)}")
plt.savefig(
    os.path.join(os.getcwd(), "image-distribution-tightness.pdf"),
    bbox_inches='tight',
    pad_inches=0.05,
)

# significance testing using the dependent T-test (alternative 'greater')
print("Dependent T-test on Tightness w.r.t. GaussianProcess(L)-SGS")
for i,cat in enumerate([m for m in model_families if m != 'GaussianProcess(L)-SGS']):
    r = t_test(cached_T[cat], cached_T['GaussianProcess(L)-SGS'], ha='greater')
    print(f"- {cat}: T={r[0]}, p={r[1]}, df={r[2]}, CI=[{r[3]},{r[4]}]")

In [None]:
# Settings for a path search over the Tightness matrix; the
# 'dijkstra:categorical_judgment' flag is switched on. The search call itself
# is currently disabled (see last line).
cfg_dijk = {
    'title': r"Tightness of Probabilistic Predictions (Normalised by $\sigma_Y$)",
    'v_min': 0,
    'v_max': 1,
    'fontsize': 14,
    'colorbar_title': attribute,
    'hide_unmodelled_columns': True,
    'decorate_nans': True,
    'include_reference_models': False,
    'num_paths': 1,
    'dijkstra:categorical_judgment': True,
}
cmapGray = create_inverted_colormap(
    gamma=1, monochrome=True, lscm_lower=mpl.cm.gray, flip_lower=True)
#find_min_cost_path(rspace, 'Tightness', cmapGray, cfg_dijk)

### Average statistics weighted by domain sample count
- Due to the complexity of the Cochran formula for weighted standard error of mean (wSEM), we will not be using "add-and-save" accumulation to save memory. Instead, we will maintain the (w*M)[j] and w[j] data points for j=1:n_obsv, where n_obsv = len(uniq_domains)*len(uniq_inference_periods)
- We perform an extra step and compute Spatial Fidelity as sqrt(1 - |x - 1|)

In [None]:
def compute_weighted_stats(rspace, cfg=None):
    """
    @brief  Compute sample-weighted means and weighted standard errors (wSEM)
            of every statistic in the per-domain analysis CSV files, pooling
            over all (domain, inference period) combinations.
    @param  rspace  name of the sub-directory of `result_dir` holding the results
    @param  cfg     optional dict; 'group_by' (default 'domain,inference')
                    selects whether domains or inference periods form the
                    outer loop, i.e. the ordering of the observation axis
    @return tuple (df_weighted_mean, df_weighted_se, df_combined, X_bar_w, wSEM):
            the three DataFrames hold strings formatted with '%.4f' and are
            reindexed by `ordered_model_names`; X_bar_w and wSEM are float
            arrays of shape (rows, cols) in the row order of the CSV files.
    @note   The 'Spatial Fidelity' column is replaced by sqrt(1 - |x - 1|)
            before averaging. Each observation is weighted by its
            `n_inference_pts`; non-finite cells get zero weight.
    @note   Relies on notebook globals: uniq_inference_periods,
            uniq_domain_ids, df_example, df_gdp, result_dir, ordered_model_names.
    """
    cfg = {} if cfg is None else cfg
    n_obsv = len(uniq_inference_periods) * len(uniq_domain_ids)
    n_rows = len(df_example)
    n_cols = df_example.shape[1]

    # One slice along the last axis per (domain, inference period) pair;
    # pairs without a matching record keep zero weight.
    array_wX = np.zeros((n_rows, n_cols, n_obsv))
    array_w = np.zeros((n_rows, n_cols, n_obsv))
    group_by = cfg.get('group_by', 'domain,inference')
    domain_first = (group_by == 'domain,inference')

    if domain_first:
        outer_vars = uniq_domain_ids
        inner_vars = uniq_inference_periods
    else:
        outer_vars = uniq_inference_periods
        inner_vars = uniq_domain_ids

    def cost_f(x):
        # sqrt(1 - |x - 1|) for finite x, NaN otherwise
        return np.sqrt(1 - np.abs(x - 1)) if np.isfinite(x) else np.nan

    df = None
    obsv = 0
    for vo in outer_vars:
        for vi in inner_vars:
            gd, mA = (vo, vi) if domain_first else (vi, vo)
            mA_mB_mC = "%02d_%02d_%02d" % (mA, mA+1, mA+2)
            analysis_csv = os.path.join(result_dir, rspace, f"{mA_mB_mC}", f"analysis-{gd}.csv")
            record = df_gdp.query(f"mA == {mA} & domain == {gd}")
            if record.shape[0] > 0:
                n_pts = record.n_inference_pts.values[0]
                df = pd.read_csv(analysis_csv, index_col=0, header=0)
                df.rename(columns={'Likelihood': 'Consensus'}, inplace=True)
                df['Spatial Fidelity'] = df['Spatial Fidelity'].apply(cost_f)
                X = df.values
                array_w[:,:,obsv] = n_pts * np.isfinite(X)
                array_wX[:,:,obsv] = array_w[:,:,obsv] * X
            obsv += 1
    if df is None:
        raise ValueError(f"no analysis records found in df_gdp for rspace='{rspace}'")
    # 0 * NaN yields NaN; zero those products so they do not poison the sums
    array_wX[~np.isfinite(array_wX)] = 0

    # Compute (wSEM)^2, see (Gatz and Smith, 1995)
    # NOTE(review): in the published formula n is the number of observations;
    # here n is the per-cell sum of weights, and w_bar averages over all
    # n_obsv slices including zero-weight ones -- confirm this is intended.
    n = np.sum(array_w, axis=2)
    w = array_w
    wX = array_wX
    X_bar_w = np.sum(wX, axis=2) / np.sum(w, axis=2)
    w_bar = np.mean(w, axis=2)
    t1 = (n / ((n-1) * np.sum(w, axis=2)**2))
    t2 = np.sum((wX - (w_bar * X_bar_w)[:,:,np.newaxis])**2, axis=2)
    t3 = - 2 * X_bar_w * np.sum((w - w_bar[:,:,np.newaxis]) * (wX - (w_bar * X_bar_w)[:,:,np.newaxis]), axis=2)
    t4 = X_bar_w**2 * np.sum((w - w_bar[:,:,np.newaxis])**2, axis=2)
    wSEM = np.sqrt(t1 * (t2 + t3 + t4))

    # Present results in DataFrame, using the last CSV read as the template
    # for index and columns. Columns are replaced wholesale (df[c] = ...)
    # because formatted strings are being stored in place of numeric values.
    df_combined = df.copy()
    df_weighted_mean = df.copy()
    df_weighted_se = df.copy()
    for j, c in enumerate(df.columns):
        df_combined[c] = ['%.4f (%.4f)' % (x,se) for x, se in zip(X_bar_w[:,j], wSEM[:,j])]
        df_weighted_mean[c] = ['%.4f' % (x) for x in X_bar_w[:,j]]
        df_weighted_se[c] = ['%.4f' % (se) for se in wSEM[:,j]]

    return (df_weighted_mean.reindex(ordered_model_names),
            df_weighted_se.reindex(ordered_model_names),
            df_combined.reindex(ordered_model_names),
            X_bar_w, wSEM)

In [None]:
# Sample-weighted statistics for the current `rspace` with default settings.
# The last two outputs receive the function's raw weighted-mean and
# weighted-SEM arrays (its 4th and 5th return values).
df_weighted_mean, df_weighted_se, df_combined, wX, wSE = compute_weighted_stats(rspace)

In [None]:
# Bare expression: displays the weighted-mean table as the cell output
df_weighted_mean

In [None]:
print("\nSample-weighted Average Statistics")

# Fourth column from the right; presumably 'Accuracy(.05)' -- verify against
# df_weighted_mean.columns.
column_A = -4
# Offsets 2 and 3 inside each of the four family bands of (n_exponents+2) rows.
# NOTE(review): the Accuracy cell masks three offsets (2, 3, 4) per family,
# whereas only two are blanked here -- confirm which is intended.
rows_ignored = np.array([i*(n_exponents+2)+np.r_[2,3] for i in np.arange(4)]).flatten()

# Table cells are '%.4f'-formatted strings, or float NaN for index labels that
# reindex() could not match. float() parses both (including 'nan'), whereas
# eval() fails on either and would execute arbitrary text.
values = np.array([[float(col) for col in row] for row in df_weighted_mean.values])
values[rows_ignored, column_A] = np.nan
family_mean = []
family_se = []

# `mask` is whatever the most recently executed statistics cell left behind
for i,cat in enumerate(model_families):
    rows = np.arange(i*(n_exponents+2),(i+1)*(n_exponents+2))[mask]
    block = values[rows,:]
    family_mean.append(np.nanmean(block, axis=0))
    # standard error per column, counting only the finite cells
    family_se.append(np.nanstd(block, axis=0) / np.sqrt(np.sum(np.isfinite(block), axis=0)))

df_family_means = pd.DataFrame(data=family_mean, index=model_families, columns=df_weighted_mean.columns)
df_family_stderrs = pd.DataFrame(data=family_se, index=model_families, columns=df_weighted_mean.columns)

print('Family means')
df_family_means

In [None]:
# Print a heading, then display the per-family standard-error table as the cell output
print('Family standard errors')
df_family_stderrs