# Measure performance
This notebook loads a file with precomputed values of two metrics available in Quantus (*faithfulness_correlations* & *monotonicity_correlations*) for a set of rankings of a given instance of the dataset, and measures how well the alternative versions of each metric (qbasX and qinv) perform.

**Note:** There is a `.py` version of this notebook in the `src` folder; that script, not this notebook, is the one that should be used to generate the files:
`src/2-measure-performance-quantus.py`

## 1. Load libraries, model and data

In [44]:
FILENAME = 'avila_11_measures.npz'

# Import the necessary libraries
import sys
import os
PROJ_DIR = os.path.realpath(os.path.dirname(os.path.abspath('')))
sys.path.append(os.path.join(PROJ_DIR,'src'))
import xai_faithfulness_experiments_lib_edits as fl
import numpy as np

# Load the precomputed measures for the selected file
data = fl.load_generated_data(os.path.join(PROJ_DIR, 'results', FILENAME))

# Load the lookup array consumed by compute_qinv. The context manager closes
# the .npz archive as soon as the array has been read into memory.
with np.load(os.path.join(PROJ_DIR, 'results', 'avila_permutations_inv_lookup.npz')) as inv_lookup_archive:
    inv_lookup = inv_lookup_archive['inv_lookup']

# Raw metric values. The *_basX lists start empty and are filled by the
# baseline loop that follows compute_qbas.
faithfulness_correlations = data['faithfulness_correlations']
faithfulness_correlations_basX = []
monotonicity_correlations = data['monotonicity_correlations']
monotonicity_correlations_basX = []

# NOTE: faithfulness_correlations_inv / monotonicity_correlations_inv are
# deliberately not assigned here. They are derived with compute_qinv further
# down in this cell; the earlier assignments made at this point were mutually
# inconsistent and were always overwritten before being used.
# Compute the baseline with varying number of samples
def compute_qbas(measure, num_samples, rng=None):
    '''
    Centres every score on a baseline estimated from randomly sampled scores.

    For each of the measure.shape[0] elements, num_samples indices are drawn uniformly at random
    (with replacement) from the whole array and the mean of the scores at those indices is
    subtracted from the element's own score.

    The difference is intentionally NOT divided by the standard deviation of the sample
    (i.e. std is treated as 1). Two alternatives were considered for handling std == 0,
    adding a small epsilon (1e-10) to std or replacing zero stds with 1, but the std is
    always ignored, so it is not computed at all.

    Inputs:
        - measure: array of scores, indexed along its first axis
        - num_samples: number of random scores used to estimate each baseline mean
        - rng: optional np.random.Generator used to draw the indices (for reproducible runs).
               When None, the global NumPy random state is used.
    Output:
        - Array with measure minus the per-element baseline mean
    '''
    num_elems = measure.shape[0]
    sample_shape = (num_elems, num_samples)
    if rng is None:
        random_indices = np.random.randint(0, num_elems, sample_shape)
    else:
        random_indices = rng.integers(0, num_elems, size=sample_shape)

    # One row of sampled scores per element; its mean is that element's baseline
    baseline_mean = np.mean(measure[random_indices], axis=1)
    return measure - baseline_mean

# Build one baseline-adjusted array per sample count (1 to 10) for each metric
sample_counts = range(1, 11)
for num_samples in sample_counts:
    faithfulness_bas = compute_qbas(faithfulness_correlations, num_samples)
    monotonicity_bas = compute_qbas(monotonicity_correlations, num_samples)
    faithfulness_correlations_basX.append(faithfulness_bas)
    monotonicity_correlations_basX.append(monotonicity_bas)

# Compute the qinv version
def compute_qinv(measure, inv_lookup):
    '''
    Subtracts from every score the score found at its looked-up position.

    Inputs:
        - measure: array of scores
        - inv_lookup: index array; inv_lookup[i] is the position in measure whose score is
                      subtracted from measure[i] (presumably the inverse ranking - confirm
                      against the script that generates the lookup file)
    Output:
        - Array with measure[i] - measure[inv_lookup[i]] for every i
    '''
    looked_up_scores = measure[inv_lookup]
    return measure - looked_up_scores

# Derive the qinv variant of each metric through inv_lookup
faithfulness_correlations_inv = compute_qinv(faithfulness_correlations, inv_lookup)
monotonicity_correlations_inv = compute_qinv(monotonicity_correlations, inv_lookup)

# One (name, q, qbasX, qinv) tuple per metric; the cells below unpack it in this order
MEASURES = [
    (
        'faithfulness_correlations',
        faithfulness_correlations,
        faithfulness_correlations_basX,
        faithfulness_correlations_inv,
    ),
    (
        'monotonicity_correlations',
        monotonicity_correlations,
        monotonicity_correlations_basX,
        monotonicity_correlations_inv,
    ),
]

# Lower limits of the z-score bands; each band spans [limit, next limit) and the
# last one is open-ended
boundaries = [float('-inf'), 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
upper_limits = boundaries[1:] + [float('inf')]

level_indices_by_measure = {}
for name, q, qbasX, qinv in MEASURES:
    # Standardise the raw metric so the bands mean the same thing for every metric
    z_scores = ((q - np.mean(q)) / np.std(q)).flatten()

    # For every band keep the positions of the elements that fall inside it,
    # together with the band limits
    bands = []
    for bottom_limit, top_limit in zip(boundaries, upper_limits):
        in_band = (z_scores >= bottom_limit) & (z_scores < top_limit)
        bands.append((np.flatnonzero(in_band).astype(int), (bottom_limit, top_limit)))

    level_indices_by_measure[name] = bands

row
label
rankings
faithfulness_correlations
faithfulness_correlations_inv
monotonicity_correlations_inv
pixel_flippings
qmeans
qmean_invs
qmean_bas
qargmaxs
qargmax_invs
qargmax_bas
qaucs
qauc_invs
qauc_bas
output_curves
is_hit_curves
output_curves_inv
is_hit_curves_inv
output_curves_bas
is_hit_curves_bas


In [31]:
# Spot-check the first raw scores, their qinv counterparts, the scores found
# through inv_lookup and a slice of the lookup itself
for preview in (
    faithfulness_correlations[:10],
    faithfulness_correlations_inv[:10],
    faithfulness_correlations[inv_lookup[:10]],
    inv_lookup[-1000:-990],
):
    print(preview)

[ 0.0040658   0.19762787  0.0938892  -0.4524167  -0.38390726 -0.56621015
 -0.22212431 -0.33148053  0.28807935  0.07943838]
[ 0.21191725  0.7692199   0.37717423 -0.11780098 -0.23927434 -0.36162317
 -0.4605855  -0.4075103  -0.20984188  0.6636807 ]
[-0.25803754 -0.09681147 -0.81192577  0.12770975  0.2759414   0.34141186
 -0.13912895  0.13981551  0.10794356  0.02536194]
[999 998 997 996 995 994 993 992 991 990]


In [27]:
from scipy.stats import spearmanr, pearsonr

# Compare the raw metric with the `faithfulness_correlations_inv` array stored in the data file
f = data['faithfulness_correlations']
f_inv = data['faithfulness_correlations_inv']

# Location and spread of each array
f_mean, f_std = np.mean(f), np.std(f)
print(f'f - mean:{f_mean:.4f} std:{f_std:.4f}')
f_inv_mean, f_inv_std = np.mean(f_inv), np.std(f_inv)
print(f'f_inv - mean:{f_inv_mean:.4f} std:{f_inv_std:.4f}')

# Rank and linear correlation between the two arrays
spearman_coefficient = spearmanr(f, f_inv)[0]
print(f'spearman {spearman_coefficient}')
pearson_coefficient = pearsonr(f, f_inv)[0]
print(f'pearson {pearson_coefficient}')

f - mean:-0.0000 std:0.3545
f_inv - mean:0.0001 std:0.3545
spearman -0.14381271647166918
pearson -0.14575276887313865


In [17]:
# Mean of f_inv restricted to the elements whose f is above 0.8
high_f_mask = f > 0.8
print(np.mean(f_inv[high_f_mask]))

-0.11841377


## 2. Measure performance
### 2.1 Order preservation
 1. The issue with using qmean directly is that it doesn't have a fixed scale and you don't get an idea of how good your explanation is compared to other explanations
 2. To address this, ideally you would determine the distribution of all qmeans and then compute the z-score. That's very costly, so you either:
    1. Estimate the qmeans distribution with X samples $\rightarrow$ qbasX
    2. Calculate an alternative to the z-score directly $\rightarrow$ qinv
 3. The problem with both alternatives is that you adulterate the value of your original qmean measurement, so you may end up in a situation where $qmean_i<qmean_j$ but $qinv_i>qinv_j$, which is undesirable
 4. Hence, we measure how many times that happens for each measure.

 (This may be measuring the same as the Spearman correlation, which is computed below)

In [39]:
def measure_correct_orderings(truths, estimators, rng=None):
    '''
    Estimates how often the estimators preserve the pairwise ordering of the truths.

    Creates truths.size (x, y) index pairs by pairing two independent random permutations and
    computes the fraction of them for which (truths[x] < truths[y]) == (estimators[x] < estimators[y]).
    Besides the pairs that are ordered the same way in both arrays, this also counts as hits any
    pair that is "not less than" in both arrays, which includes ties and pairs where x == y.
    Inputs:
        - truths & estimators: arrays containing num_elems floats each
        - rng: optional np.random.Generator used to draw the permutations (for reproducible runs).
               When None, the global NumPy random state is used.
    Output:
        - Float representing the fraction of correctly ordered pairings
    '''
    num_elems = truths.size
    if rng is None:
        xs = np.random.permutation(num_elems)
        ys = np.random.permutation(num_elems)
    else:
        xs = rng.permutation(num_elems)
        ys = rng.permutation(num_elems)

    # A pair is a hit when both arrays agree on whether element x is smaller than element y
    truth_x_lt_y = truths[xs] < truths[ys]
    estimator_x_lt_y = estimators[xs] < estimators[ys]
    hits = truth_x_lt_y == estimator_x_lt_y
    return hits.sum() / num_elems

correct_pairings_basX_by_measure = {}
correct_pairings_inv_by_measure = {}
for name, q, qbasX, qinv in MEASURES:
    # Fraction of correctly ordered pairs for every qbasX (labelled by its sample count)
    correct_pairings_basX = []
    for num_samples, qbas in enumerate(qbasX, start=1):
        fraction = measure_correct_orderings(q, qbas)
        correct_pairings_basX.append(fraction)
        print(f'{name}_bas{num_samples}: {fraction:.4f}')

    # Same measurement for qinv
    correct_pairings_inv = measure_correct_orderings(q, qinv)
    print(f'{name}_inv: {correct_pairings_inv:.4f}')

    correct_pairings_basX_by_measure[name] = correct_pairings_basX
    correct_pairings_inv_by_measure[name] = correct_pairings_inv

faithfulness_correlations_bas1: 0.7499
faithfulness_correlations_bas2: 0.8088
faithfulness_correlations_bas3: 0.8383
faithfulness_correlations_bas4: 0.8578
faithfulness_correlations_bas5: 0.8713
faithfulness_correlations_bas6: 0.8813
faithfulness_correlations_bas7: 0.8899
faithfulness_correlations_bas8: 0.8968
faithfulness_correlations_bas9: 0.9018
faithfulness_correlations_bas10: 0.9069
faithfulness_correlations_inv: 0.8245
monotonicity_correlations_bas1: 0.7507
monotonicity_correlations_bas2: 0.8088
monotonicity_correlations_bas3: 0.8382
monotonicity_correlations_bas4: 0.8574
monotonicity_correlations_bas5: 0.8708
monotonicity_correlations_bas6: 0.8807
monotonicity_correlations_bas7: 0.8886
monotonicity_correlations_bas8: 0.8952
monotonicity_correlations_bas9: 0.9007
monotonicity_correlations_bas10: 0.9053
monotonicity_correlations_inv: 1.0000


### 2.2. Spearman correlation
Same question as above: is the order of qmeans preserved in qbasX/qinv?

In [40]:
from scipy.stats import spearmanr

spearman_basX_by_measure = {}
spearman_inv_by_measure = {}
for name, q, qbasX, qinv in MEASURES:
    # Rank correlation between the raw metric and every qbasX (labelled by its sample count)
    spearman_basX = []
    for num_samples, qbas in enumerate(qbasX, start=1):
        coefficient = spearmanr(q, qbas)[0]
        spearman_basX.append(coefficient)
        print(f'{name}_bas{num_samples}: {coefficient:.4f}')

    # Rank correlation between the raw metric and qinv
    spearman_inv = spearmanr(q, qinv)[0]
    print(f'{name}_inv: {spearman_inv:.4f}')

    spearman_basX_by_measure[name] = spearman_basX
    spearman_inv_by_measure[name] = spearman_inv

faithfulness_correlations_bas1: 0.6981
faithfulness_correlations_bas2: 0.8185
faithfulness_correlations_bas3: 0.8697
faithfulness_correlations_bas4: 0.8984
faithfulness_correlations_bas5: 0.9167
faithfulness_correlations_bas6: 0.9293
faithfulness_correlations_bas7: 0.9386
faithfulness_correlations_bas8: 0.9459
faithfulness_correlations_bas9: 0.9515
faithfulness_correlations_bas10: 0.9561
faithfulness_correlations_inv: 0.8435
monotonicity_correlations_bas1: 0.6983
monotonicity_correlations_bas2: 0.8203
monotonicity_correlations_bas3: 0.8711
monotonicity_correlations_bas4: 0.9000
monotonicity_correlations_bas5: 0.9180
monotonicity_correlations_bas6: 0.9304
monotonicity_correlations_bas7: 0.9397
monotonicity_correlations_bas8: 0.9466
monotonicity_correlations_bas9: 0.9522
monotonicity_correlations_bas10: 0.9567
monotonicity_correlations_inv: 1.0000


### 2.3. Ability to detect exceptionally good rankings
As stated above, there are some ordering errors in the estimators. Are they in the relevant part of the distribution? i.e. Do they affect the ability to identify exceptionally good rankings?

In [41]:
from sklearn import metrics

def measure_detection(target_indices, estimator):
    '''
    Scores how well the estimator separates the target elements from the rest, as a ROC AUC.

    Inputs:
        - target_indices: positions of the elements that should be detected (the positive class)
        - estimator: array of scores, one per element, used to rank the elements
    Output:
        - ROC AUC of the estimator against the 0/1 labels built from target_indices,
          or 1 when target_indices is empty (there is nothing to detect)
    '''
    has_targets = len(target_indices) != 0
    if not has_targets:
        return 1

    # Binary labels: 1 at the target positions, 0 everywhere else
    labels = np.zeros_like(estimator, dtype=int)
    labels[target_indices] = 1
    auc = metrics.roc_auc_score(labels, estimator)
    return auc

aucs_basX_by_measure = {}
aucs_inv_by_measure = {}
for name, q, qbasX, qinv in MEASURES:
    aucs_inv = []
    aucs_basX = [[] for _ in qbasX]

    # Skip the first two z-score bands and measure, for each remaining band, how well
    # every estimator detects the elements that belong to it
    for indices, (bottom_limit, upper_limit) in level_indices_by_measure[name][2:]:
        aucs_inv.append(measure_detection(indices, qinv))
        for band_aucs, qbas in zip(aucs_basX, qbasX):
            band_aucs.append(measure_detection(indices, qbas))

    # Label every qbasX by its 1-based position in the list, matching the
    # `_bas{i+1}` labels printed in sections 2.1 and 2.2
    for num_samples, band_aucs in enumerate(aucs_basX, start=1):
        print(f'{name}_auc_bas{num_samples} ' + ' | '.join(f'{auc:.4f}' for auc in band_aucs))
    print(f'{name}_aucs_inv ' + ' | '.join(f'{auc:.4f}' for auc in aucs_inv))

    aucs_basX_by_measure[name] = aucs_basX
    aucs_inv_by_measure[name] = aucs_inv


faithfulness_correlations_auc_bas0 0.6888 | 0.7870 | 0.8528 | 0.8873 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas1 0.7267 | 0.8398 | 0.9064 | 0.9341 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas2 0.7430 | 0.8618 | 0.9287 | 0.9531 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas3 0.7519 | 0.8746 | 0.9412 | 0.9631 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas4 0.7577 | 0.8828 | 0.9495 | 0.9693 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas5 0.7617 | 0.8883 | 0.9550 | 0.9738 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas6 0.7642 | 0.8926 | 0.9593 | 0.9771 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas7 0.7666 | 0.8956 | 0.9628 | 0.9794 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas8 0.7681 | 0.8981 | 0.9655 | 0.9815 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_auc_bas9 0.7694 | 0.9002 | 0.9678 | 0.9832 | 1.0000 | 1.0000 | 1.0000
faithfulness_correlations_aucs_inv 0.729

### 2.4 Ability to rank exceptionally good rankings
How well is the order preserved for exceptionally good rankings?

In [42]:
spearman_exceptional_basX_by_measure = {}
spearman_exceptional_inv_by_measure = {}
for name, q, qbasX, qinv in MEASURES:

    spearman_exceptional_inv = []
    spearman_exceptional_basX = [[] for _ in qbasX]

    # Skip the first two z-score bands and compute, within each remaining band, the rank
    # correlation between the raw metric and every estimator
    for indices, (bottom_limit, upper_limit) in level_indices_by_measure[name][2:]:
        q_band = q[indices]
        spearman_exceptional_inv.append(spearmanr(q_band, qinv[indices])[0])
        for band_correlations, qbas in zip(spearman_exceptional_basX, qbasX):
            band_correlations.append(spearmanr(q_band, qbas[indices])[0])

    # Label every qbasX by its 1-based position in the list, matching the
    # `_bas{i+1}` labels printed in sections 2.1 and 2.2
    for num_samples, band_correlations in enumerate(spearman_exceptional_basX, start=1):
        print(f'{name}_spearman_exceptional_bas{num_samples} ' + ' | '.join(f'{rho:.4f}' for rho in band_correlations))
    print(f'{name}_spearman_exceptional_inv ' + ' | '.join(f'{rho:.4f}' for rho in spearman_exceptional_inv))

    spearman_exceptional_basX_by_measure[name] = spearman_exceptional_basX
    spearman_exceptional_inv_by_measure[name] = spearman_exceptional_inv

faithfulness_correlations_spearman_exceptional_bas0 0.1317 | 0.1309 | 0.1274 | 0.0616 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas1 0.1892 | 0.1897 | 0.1829 | 0.1075 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas2 0.2335 | 0.2334 | 0.2193 | 0.1259 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas3 0.2705 | 0.2678 | 0.2556 | 0.1411 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas4 0.2987 | 0.2984 | 0.2832 | 0.1682 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas5 0.3218 | 0.3250 | 0.3070 | 0.1846 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas6 0.3493 | 0.3506 | 0.3357 | 0.1927 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas7 0.3711 | 0.3683 | 0.3551 | 0.2175 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas8 0.3915 | 0.3899 | 0.3739 | 0.2169 | nan | nan | nan
faithfulness_correlations_spearman_exceptional_bas9 0.4083 | 0.4

## 3. Save

In [20]:
# Write one results archive per metric next to the input file
for name, q, qbasX, qinv in MEASURES:
    results_filename = FILENAME.replace('_measures', f'_results_{name}')
    results_path = os.path.join(PROJ_DIR, 'results', results_filename)
    np.savez(
        results_path,
        correct_pairings_inv=correct_pairings_inv_by_measure[name],
        correct_pairings_basX=correct_pairings_basX_by_measure[name],
        spearman_inv=spearman_inv_by_measure[name],
        spearman_basX=spearman_basX_by_measure[name],
        aucs_inv=aucs_inv_by_measure[name],
        aucs_basX=aucs_basX_by_measure[name],
        spearman_exceptional_inv=spearman_exceptional_inv_by_measure[name],
        spearman_exceptional_basX=spearman_exceptional_basX_by_measure[name],
    )