# Measure performance
This notebook loads a file with precomputed measures (*qmeans*, *qbas* & *qinv*) for a set of rankings for a given instance of the dataset and evaluates how well the alternative measures (*qbasX* and *qinv*) perform.

## 1. Load libraries, model and data

In [15]:
FILENAME = 'avila_70_measures.npz'
SEED = 0  # Seed for NumPy's global RNG; change it to draw a different random sample

# Import the necessary libraries
import sys
import os
# abspath('') resolves to the current working directory, so PROJ_DIR is its parent directory
PROJ_DIR = os.path.realpath(os.path.dirname(os.path.abspath('')))
# The project library is imported from <PROJ_DIR>/src, which must be on sys.path before the import
sys.path.append(os.path.join(PROJ_DIR,'src'))
import xai_faithfulness_experiments_lib_edits as fl
import numpy as np

# This notebook draws random numbers through NumPy's global RNG (np.random.randint,
# np.random.permutation). Seeding it here makes Restart & Run All reproduce the same figures.
np.random.seed(SEED)

# Load data
data = fl.load_generated_data(os.path.join(PROJ_DIR, 'results', FILENAME))
qmeans = data['qmeans']
# List of baseline-corrected measures; entry 0 comes precomputed from the file
qmeans_basX = [data['qmean_bas']]
qmeans_inv = data['qmean_invs']
# Compute qmeans_bas[2-10]
def compute_qbas(measure, num_samples, rng=None):
    '''
    Baseline-corrects every entry of `measure` by subtracting the mean of `num_samples`
    values drawn uniformly at random (with replacement) from `measure` itself.
    Inputs:
        - measure: NumPy array; the random values are drawn along its first axis
        - num_samples: number of random values averaged into the baseline of each entry
        - rng: optional np.random.Generator used for the draws. When None (default) the
          global np.random state is used, so np.random.seed() controls reproducibility
    Output:
        - Array shaped like `measure` containing measure - mean(random sample)
    '''
    num_elems = measure.shape[0]
    sample_shape = (num_elems, num_samples)
    if rng is None:
        random_indices = np.random.randint(0, num_elems, sample_shape)
    else:
        random_indices = rng.integers(0, num_elems, sample_shape)

    # One independent sample of size num_samples per entry, averaged over the sample axis
    baseline = np.mean(measure[random_indices], axis=1)

    # The result is intentionally NOT divided by the sample std. Two ways of handling
    # std == 0 were tried (adding an epsilon of 1e-10, or replacing zero stds by 1) but the
    # std was always overridden with 1 afterwards, so it is no longer computed at all.
    return measure - baseline
# Append the baselines estimated with 2, 3, ..., 10 random samples (in that order)
qmeans_basX.extend(compute_qbas(qmeans, num_samples) for num_samples in range(2, 11))

# Standardise qmeans into z-scores using the mean and std of the whole set
qmean_mean = np.mean(qmeans)
qmean_std = np.std(qmeans)
z_scores = ((qmeans - qmean_mean) / qmean_std).flatten()

# Stratify the z-scores to be able to compare performance on different parts of the spectrum.
# Each level is the half-open band [bottom_limit, top_limit); the last band is open-ended.
boundaries = [float('-inf'), 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
indices = np.arange(z_scores.shape[0])
z_scores_numbered = np.vstack((z_scores, indices))
level_indices = []
for bottom_limit, top_limit in zip(boundaries, boundaries[1:] + [float('inf')]):
    in_level = (z_scores >= bottom_limit) & (z_scores < top_limit)
    # Row 1 of z_scores_numbered holds the positions of the elements; keep those inside the band
    level_members = z_scores_numbered[1, in_level].astype(int)
    level_indices.append((level_members, (bottom_limit, top_limit)))

## 2. Measure performance
### 2.1 Order preservation
 1. The issue with using qmean directly is that it doesn't have a fixed scale and you don't get an idea of how good your explanation is compared to other explanations
 2. To address this, ideally you would determine the distribution of all qmeans and then compute the z-score. That's very costly, so you either:
    1. Estimate the qmeans distribution with X samples $\rightarrow$ qbasX
    2. Calculate an alternative to the z-score directly $\rightarrow$ qinv
 3. The problem with both alternatives is that you adulterate the value of your original qmean measurement, so you may end up in a situation where $qmean_i<qmean_j$ but $qinv_i>qinv_j$, which is undesirable
 4. Hence, we measure how many times that happens for each measure.

 (This may be measuring much the same thing as the Spearman rank correlation, which is computed below)

In [16]:
def measure_correct_orderings(truths, estimators):
    '''
    Estimates how often `estimators` orders a random pair of elements the same way as `truths`.

    Creates truths.size (x, y) index pairs from two independent random permutations of the
    indices and computes the fraction of them for which
    (truths[x] < truths[y]) == (estimators[x] < estimators[y]).
    Pairs in which neither side is strictly smaller on both measures (ties, or x == y) are
    therefore counted as hits as well.
    Inputs:
        - Truths & estimators contain num_elems floats each. Both are flattened first, so an
          (n,) array and an (n, 1) array can be mixed without broadcasting to (n, n)
    Output:
        - Float representing the fraction of correctly ordered pairings (nan for empty input)
    Raises:
        - ValueError if truths and estimators do not contain the same number of elements
    '''
    truths = np.asarray(truths).ravel()
    estimators = np.asarray(estimators).ravel()
    if truths.size != estimators.size:
        raise ValueError(
            f'truths and estimators must have the same number of elements '
            f'({truths.size} != {estimators.size})')

    # The pairs are drawn through the global np.random state
    xs = np.random.permutation(truths.size)
    ys = np.random.permutation(truths.size)
    truthX_lt_Y = truths[xs] < truths[ys]
    estimatorX_lt_Y = estimators[xs] < estimators[ys]
    hits = truthX_lt_Y == estimatorX_lt_Y
    return hits.sum()/truths.size

# Fraction of correctly ordered random pairs for every qbasX estimate, then for qinv
correct_pairings_basX = [measure_correct_orderings(qmeans, qbas) for qbas in qmeans_basX]
for i in range(len(correct_pairings_basX)):
    print(f'qmeans_bas{i+1}: {correct_pairings_basX[i]:.4f}')

correct_pairings_inv = measure_correct_orderings(qmeans, qmeans_inv)
print(f'qmeans_inv: {correct_pairings_inv:.4f}')

qmeans_bas1: 0.7499
qmeans_bas2: 0.7886
qmeans_bas3: 0.8143
qmeans_bas4: 0.8318
qmeans_bas5: 0.8452
qmeans_bas6: 0.8551
qmeans_bas7: 0.8632
qmeans_bas8: 0.8697
qmeans_bas9: 0.8758
qmeans_bas10: 0.8808
qmeans_inv: 0.8342


### 2.2. Spearman correlation
Same thing, is the order of qmeans preserved in qbasX/qinv?

In [17]:
from scipy.stats import spearmanr
# Element [0] of spearmanr's result is the correlation coefficient
spearman_basX = [spearmanr(qmeans, qbas)[0] for qbas in qmeans_basX]
for i in range(len(spearman_basX)):
    print(f'qmeans_bas{i+1}: {spearman_basX[i]:.4f}')

spearman_inv = spearmanr(qmeans, qmeans_inv)[0]
print(f'qmeans_inv: {spearman_inv:.4f}')

qmeans_bas1: 0.6758
qmeans_bas2: 0.7689
qmeans_bas3: 0.8196
qmeans_bas4: 0.8508
qmeans_bas5: 0.8722
qmeans_bas6: 0.8878
qmeans_bas7: 0.8995
qmeans_bas8: 0.9091
qmeans_bas9: 0.9167
qmeans_bas10: 0.9232
qmeans_inv: 0.8474


### 2.3. Ability to detect exceptionally good rankings
As stated above, there are some ordering errors in the estimators. Are they in the relevant part of the distribution? i.e. Do they affect the ability to identify exceptionally good rankings?

In [18]:
from sklearn import metrics

def measure_detection(target_indices, estimator):
    '''
    Scores how well `estimator` separates the elements at `target_indices` from all others.
    Inputs:
        - target_indices: positions (into estimator) of the elements labelled as positives
        - estimator: array of scores, one per element
    Output:
        - ROC AUC of `estimator` against the 0/1 labels built from `target_indices`;
          1 when `target_indices` is empty (there is nothing to detect)
    '''
    if len(target_indices) > 0:
        # Binary ground truth: 1 at the target positions, 0 everywhere else
        labels = np.zeros_like(estimator, dtype=int)
        labels[target_indices] = 1
        return metrics.roc_auc_score(labels, estimator)
    return 1

# One AUC per z-score band for qinv, and one list of per-band AUCs for every qbasX estimate
aucs_inv = []
aucs_basX = [[] for i in qmeans_basX]

# level_indices[2:] skips the first two bands in level_indices
for indices, (bottom_limit, upper_limit) in level_indices[2:]:
    aucs_inv.append(measure_detection(indices, qmeans_inv))
    for i in range(len(qmeans_basX)):
        aucs_basX[i].append(measure_detection(indices, qmeans_basX[i]))

# Labels are 1-based (bas1..bas10) so that they match the qmeans_bas{i+1} labels printed
# by the order-preservation and Spearman cells for the same entries of qmeans_basX
for i in range(len(qmeans_basX)):
    print(f'aucs_bas{i+1} ' + ' | '.join(f'{x:.4f}' for x in aucs_basX[i]))
print('aucs_inv ' + ' | '.join(f'{x:.4f}' for x in aucs_inv))


aucs_bas0 0.7046 | 0.7956 | 0.8614 | 0.9070 | 0.9399 | 0.9635 | 0.9792
aucs_bas1 0.7430 | 0.8395 | 0.9024 | 0.9408 | 0.9663 | 0.9828 | 0.9916
aucs_bas2 0.7649 | 0.8596 | 0.9178 | 0.9523 | 0.9745 | 0.9876 | 0.9949
aucs_bas3 0.7773 | 0.8698 | 0.9258 | 0.9580 | 0.9784 | 0.9900 | 0.9964
aucs_bas4 0.7856 | 0.8762 | 0.9305 | 0.9611 | 0.9805 | 0.9914 | 0.9971
aucs_bas5 0.7914 | 0.8804 | 0.9334 | 0.9632 | 0.9818 | 0.9923 | 0.9976
aucs_bas6 0.7955 | 0.8836 | 0.9355 | 0.9647 | 0.9827 | 0.9929 | 0.9980
aucs_bas7 0.7986 | 0.8860 | 0.9371 | 0.9657 | 0.9836 | 0.9933 | 0.9982
aucs_bas8 0.8010 | 0.8876 | 0.9384 | 0.9667 | 0.9840 | 0.9937 | 0.9984
aucs_bas9 0.8027 | 0.8891 | 0.9394 | 0.9673 | 0.9845 | 0.9940 | 0.9986
aucs_inv 0.7668 | 0.8730 | 0.9360 | 0.9669 | 0.9852 | 0.9952 | 0.9999


### 2.4 Ability to rank exceptionally good rankings
How well is the order preserved for exceptionally good rankings?

In [19]:
# Spearman correlation restricted to the elements of each z-score band:
# one value per band for qinv, one list of per-band values for every qbasX estimate
spearman_exceptional_inv = []
spearman_exceptional_basX = [[] for i in qmeans_basX]

# level_indices[2:] skips the first two bands in level_indices
for indices, (bottom_limit, upper_limit) in level_indices[2:]:
    spearman_exceptional_inv.append(spearmanr(qmeans[indices], qmeans_inv[indices])[0])
    for i in range(len(qmeans_basX)):
        spearman_exceptional_basX[i].append(spearmanr(qmeans[indices], qmeans_basX[i][indices])[0])

# Labels are 1-based (bas1..bas10) so that they match the qmeans_bas{i+1} labels printed
# by the order-preservation and Spearman cells for the same entries of qmeans_basX
for i in range(len(qmeans_basX)):
    print(f'spearman_exceptional_bas{i+1} ' + ' | '.join(f'{x:.4f}' for x in spearman_exceptional_basX[i]))
print('spearman_exceptional_inv ' + ' | '.join(f'{x:.4f}' for x in spearman_exceptional_inv))

spearman_exceptional_bas0 0.1646 | 0.1731 | 0.1784 | 0.1824 | 0.1800 | 0.1708 | 0.2107
spearman_exceptional_bas1 0.1867 | 0.1993 | 0.2052 | 0.2085 | 0.2095 | 0.1915 | 0.2546
spearman_exceptional_bas2 0.2196 | 0.2316 | 0.2452 | 0.2484 | 0.2411 | 0.2401 | 0.2854
spearman_exceptional_bas3 0.2471 | 0.2630 | 0.2752 | 0.2829 | 0.2684 | 0.2775 | 0.3261
spearman_exceptional_bas4 0.2737 | 0.2885 | 0.3082 | 0.3096 | 0.3019 | 0.2875 | 0.3589
spearman_exceptional_bas5 0.2971 | 0.3137 | 0.3294 | 0.3352 | 0.3281 | 0.3231 | 0.3828
spearman_exceptional_bas6 0.3217 | 0.3392 | 0.3538 | 0.3587 | 0.3518 | 0.3457 | 0.4167
spearman_exceptional_bas7 0.3399 | 0.3567 | 0.3708 | 0.3819 | 0.3796 | 0.3663 | 0.4321
spearman_exceptional_bas8 0.3558 | 0.3749 | 0.3975 | 0.3996 | 0.3977 | 0.3911 | 0.4587
spearman_exceptional_bas9 0.3724 | 0.3957 | 0.4149 | 0.4178 | 0.4079 | 0.3977 | 0.4613
spearman_exceptional_inv 0.3165 | 0.3189 | 0.4045 | 0.5779 | 0.6483 | 0.8027 | 0.9421


## 3. Save

In [20]:
# Store every metric next to the input file, swapping '_measures' for '_results' in its name
results_path = os.path.join(PROJ_DIR, 'results', FILENAME.replace('_measures','_results'))
results = {
    'correct_pairings_inv': correct_pairings_inv,
    'correct_pairings_basX': correct_pairings_basX,
    'spearman_inv': spearman_inv,
    'spearman_basX': spearman_basX,
    'aucs_inv': aucs_inv,
    'aucs_basX': aucs_basX,
    'spearman_exceptional_inv': spearman_exceptional_inv,
    'spearman_exceptional_basX': spearman_exceptional_basX,
}
np.savez(results_path, **results)