## Preliminaries

In [2]:
# Mount Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install packages.
from IPython.display import clear_output
!pip install scipy==1.7.3
!pip install captum opencv-python krippendorff xmltodict
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

# Imports general.
import sys
import gc
import warnings
import uuid
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torchvision
from torchvision import transforms
import captum
from captum.attr import *
import random
import os
import cv2
from sklearn.linear_model import LinearRegression
import time
import scipy
import json
import copy
import scipy
import datetime

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting captum
  Downloading captum-0.5.0-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 26.4 MB/s 
Collecting krippendorff
  Downloading krippendorff-0.5.1-py3-none-any.whl (17 kB)
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, krippendorff, captum
Successfully installed captum-0.5.0 krippendorff-0.5.1 xmltodict-0.13.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (1982.2 MB)
[K     |█████████████▌          

In [3]:
# Import local packages.
path = "/content/drive/MyDrive/Projects"

sys.path.append(f'{path}/quantus')
import quantus

sys.path.append(f'{path}/MetaQuantus')
import metaquantus
from metaquantus.utils import *
from metaquantus.models import *
import configs
#from configs import setup_xai_methods, setup_estimators

# Free unreferenced Python objects and PyTorch's unused cached GPU memory,
# then switch on seaborn's plotting defaults.
gc.collect()
torch.cuda.empty_cache()
sns.set()

# Notebook settings: choose the compute device and mute noisy warning categories.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
for _category in (UserWarning, RuntimeWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)
del _category  # do not leave the loop variable in the notebook namespace

# Autoload!
#%load_ext autoreload
#%autoreload 2 

# Wipe the output produced so far by this cell.
clear_output()

# Setting device on GPU if available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using cuda.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # torch.cuda.memory_cached() is deprecated; memory_reserved() is its replacement
    # and reports the memory held by the caching allocator.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# Asset and result locations, both derived from the Drive project root `path`.
# The assets path used to be relative ("drive/MyDrive/..."), which only resolved
# when the working directory happened to be /content; it is now absolute like
# the results path.
path_assets = f"{path}/assets/"
path_results = f"{path}/MetaQuantus/results/"

!nvidia-smi

Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
Thu Nov 24 14:24:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------

### Experimental Setup

- Different datasets (MNIST, fMNIST, cMNIST, ImageNet)
- Different explanation techniques (xai methods)
- Different baseline estimators (faithfulness, robustness, randomisation and localisation metrics)
- Different tests:
    1. Model Perturbation Test (Noise Resilience, Adversary Reactivity)
    2. Input Perturbation Test (Noise Resilience, Adversary Reactivity)


### Load models & datasets

In [4]:
# Paths.
path_mnist_model = path_assets + "models/mnist_lenet"
path_mnist_assets = path_assets + "test_sets/mnist_test_set.npy"

# Restore the trained MNIST LeNet.
# map_location="cpu" lets the checkpoint load on a runtime without a GPU (the
# notebook falls back to CPU in that case); the weights are copied into the
# freshly built model either way.
model_mnist = LeNet()
model_mnist.load_state_dict(torch.load(path_mnist_model, map_location="cpu"))

# The test set is stored as a pickled object inside a .npy file; .item() unwraps it
# so that it can be indexed by key.
# NOTE: allow_pickle=True executes pickle code on load - only use trusted asset files.
assets_mnist = np.load(path_mnist_assets, allow_pickle=True).item()
x_batch_mnist = assets_mnist["x_batch"]
y_batch_mnist = assets_mnist["y_batch"]
s_batch_mnist = assets_mnist["s_batch"]

# Give the segmentation masks an explicit single-channel 28x28 layout.
s_batch_mnist = s_batch_mnist.reshape(len(x_batch_mnist), 1, 28, 28)

# Paths.
path_fmnist_model = path_assets + "models/fmnist_lenet_model"
path_fmnist_assets = path_assets + "test_sets/fmnist_test_set.npy"

# Restore the trained fMNIST LeNet.
# map_location="cpu" lets the checkpoint load on a runtime without a GPU (the
# notebook falls back to CPU in that case); the weights are copied into the
# freshly built model either way.
model_fmnist = LeNet()
model_fmnist.load_state_dict(torch.load(path_fmnist_model, map_location="cpu"))

# The test set is stored as a pickled object inside a .npy file; .item() unwraps it
# so that it can be indexed by key.
# NOTE: allow_pickle=True executes pickle code on load - only use trusted asset files.
assets_fmnist = np.load(path_fmnist_assets, allow_pickle=True).item()
x_batch_fmnist = assets_fmnist["x_batch"]
y_batch_fmnist = assets_fmnist["y_batch"]
s_batch_fmnist = assets_fmnist["s_batch"]

# NOTE(review): unlike MNIST and cMNIST, the fMNIST masks are not reshaped here;
# the reshape below was left disabled - confirm that this is intended.
#s_batch_fmnist = s_batch_fmnist.reshape(len(x_batch_fmnist), 1, 28, 28)

# Paths.
path_cmnist_model = path_assets + "models/cmnist_resnet9.ckpt"
path_cmnist_assets = path_assets + "test_sets/cmnist_test_set.npy"
# Selects which mask variant is read from the assets (key "s_batch_<s_type>").
s_type = "box"

# Restore the trained cMNIST ResNet9 (3 input channels, 10 classes).
# map_location="cpu" lets the checkpoint load on a runtime without a GPU (the
# notebook falls back to CPU in that case); the weights are copied into the
# freshly built model either way.
model_cmnist = ResNet9(nr_channels=3, nr_classes=10)
model_cmnist.load_state_dict(torch.load(path_cmnist_model, map_location="cpu"))

# The test set is stored as a pickled object inside a .npy file; .item() unwraps it
# so that it can be indexed by key.
# NOTE: allow_pickle=True executes pickle code on load - only use trusted asset files.
assets_cmnist = np.load(path_cmnist_assets, allow_pickle=True).item()
# Inputs and labels are converted to NumPy via .detach().numpy().
x_batch_cmnist = assets_cmnist["x_batch"].detach().numpy()
y_batch_cmnist = assets_cmnist["y_batch"].detach().numpy()
s_batch_cmnist = assets_cmnist[f"s_batch_{s_type}"]

# Give the segmentation masks an explicit single-channel 32x32 layout.
s_batch_cmnist = s_batch_cmnist.reshape(len(x_batch_cmnist), 1, 32, 32)

# Paths.
#path_imagenet_model = path_assets + "models/imagenet_resnet18_model"
#path_imagenet_assets = path_assets + "test_sets/imagenet_test_set.npy"
#batch_size_test = 206

# Example for how to reload assets and models to notebook.
#model_imagenet_resnet18 = torchvision.models.resnet18(pretrained=True) 
#model_imagenet_vgg16 = torchvision.models.vgg16(pretrained=True) 
#model_imagenet_alexnet = torchvision.models.alexnet(pretrained=True) 

#assets_imagenet = np.load(path_imagenet_assets, allow_pickle=True).item()
#x_batch_imagenet = assets_imagenet["x_batch"]
#y_batch_imagenet = assets_imagenet["y_batch"]
#s_batch_imagenet = assets_imagenet["s_batch"]

#s_batch_imagenet = s_batch_imagenet.reshape(len(x_batch_imagenet), 1, 224, 224)

def _make_dataset_entry(x_batch, y_batch, s_batch, model_name, model, gc_layer, img_size):
    """Assemble the settings of one dataset.

    `gc_layer` is a Python expression (kept as a string) that selects the layer
    used for GradCAM; `img_size` also determines the "features" estimator setting
    (twice the image side length).
    """
    return {
        "x_batch": x_batch,
        "y_batch": y_batch,
        "s_batch": s_batch,
        "models": {model_name: model},
        "gc_layers": {model_name: gc_layer},
        "estimator_kwargs": {
            "features": img_size * 2,
            "num_classes": 10,
            "img_size": img_size,
            "percentage": 0.1,
        },
    }


# Per-dataset data, model, GradCAM layer and estimator settings, keyed by dataset name.
SETTINGS = {
    "MNIST": _make_dataset_entry(
        x_batch_mnist, y_batch_mnist, s_batch_mnist,
        model_name="LeNet",
        model=model_mnist,
        gc_layer='list(model.named_modules())[3][1]',
        img_size=28,
    ),
    "fMNIST": _make_dataset_entry(
        x_batch_fmnist, y_batch_fmnist, s_batch_fmnist,
        model_name="LeNet",
        model=model_fmnist,
        gc_layer='list(model.named_modules())[3][1]',
        img_size=28,
    ),
    "cMNIST": _make_dataset_entry(
        x_batch_cmnist, y_batch_cmnist, s_batch_cmnist,
        model_name="ResNet9",
        model=model_cmnist,
        gc_layer='list(model.named_modules())[1][1][-6]',
        img_size=32,
    ),
    # ImageNet is currently disabled (its assets are not loaded above).
    #"ImageNet": {
    #    "x_batch": x_batch_imagenet, 
    #    "y_batch": y_batch_imagenet, 
    #    "s_batch": s_batch_imagenet, 
    #    "models": {
    #        "ResNet18": model_imagenet_resnet18, 
    #        "VGG16": model_imagenet_vgg16,
    #        },
    #    "gc_layers": {
    #        "ResNet18": 'list(model.named_modules())[61][1]', 
    #        "VGG16": 'list(model_imagenet_vgg16.named_modules())[28][1]',
    #        }, 
    #    "estimator_kwargs": {
    #        "num_classes": 1000,
    #        "img_size": 224,
    #        }
    #    }
}

### Define configs

In [5]:
from typing import Dict
import numpy as np
import quantus
from quantus.metrics import *
from quantus.functions import (
    perturb_func,
    similarity_func,
    norm_func,
    normalise_func,
)


def setup_xai_methods(
    gc_layer: str,
    img_size: int = 28,
) -> Dict:
    """Build the explanation-method configuration used by the benchmark.

    Args:
        gc_layer: layer selector forwarded to GradCAM under the "gc_layer" key.
        img_size: side length to which the GradCAM map is interpolated.

    Returns:
        Dict mapping each method name to its keyword arguments. Gradient,
        Saliency and IntegratedGradients get empty kwargs; GradCAM gets the
        layer and bilinear interpolation to (img_size, img_size).
    """
    # Methods without extra options each receive their own empty dict.
    methods: Dict = {name: {} for name in ("Gradient", "Saliency", "IntegratedGradients")}
    methods["GradCAM"] = dict(
        gc_layer=gc_layer,
        interpolate=(img_size, img_size),
        interpolate_mode="bilinear",
    )
    return methods


def setup_estimators(
    features: int,
    num_classes: int,
    img_size: int,
    percentage: float,
) -> Dict:
    """Instantiate the Quantus estimators that are benchmarked, grouped by category.

    Args:
        features: used as `subset_size` for Faithfulness Correlation and as
            `features_in_step` for Pixel-Flipping.
        num_classes: number of classes, forwarded to Random Logit.
        img_size: image side length; only referenced by the disabled
            Top-K Intersection entry.
        percentage: fraction of pixels (the notebook passes 0.1); only referenced
            by the disabled Top-K Intersection entry.

    Returns:
        Nested dict `category -> estimator name -> (estimator, flag)`. The notebook
        forwards the boolean flag as `lower_is_better` when running the
        meta-evaluation.
    """
    # Options that every estimator below is configured with identically.
    shared = {
        "abs": False,
        "normalise": True,
        "normalise_func": normalise_func.normalise_by_max,
        "return_aggregate": False,
        "aggregate_func": np.mean,
        "disable_warnings": True,
    }

    return {
        "Robustness": {
            "Max-Sensitivity": (quantus.MaxSensitivity(
                nr_samples=10,
                perturb_func=perturb_func.uniform_noise,
                norm_numerator=norm_func.fro_norm,
                norm_denominator=norm_func.fro_norm,
                lower_bound=0.01,
                **shared,
            ), True),
            "Local Lipschitz Estimate": (quantus.LocalLipschitzEstimate(
                nr_samples=10,
                perturb_func=perturb_func.gaussian_noise,
                norm_numerator=similarity_func.distance_euclidean,
                norm_denominator=similarity_func.distance_euclidean,
                perturb_std=0.01,
                **shared,
            ), True),
        },
        "Randomisation": {
            "Random Logit": (quantus.RandomLogit(
                similarity_func=similarity_func.correlation_spearman,
                num_classes=num_classes,
                **shared,
            ), False),
            "Model Parameter Randomisation Test": (quantus.ModelParameterRandomisation(
                similarity_func=similarity_func.correlation_spearman,
                return_sample_correlation=True,
                **shared,
            ), True),
        },
        "Faithfulness": {
            "Faithfulness Correlation": (quantus.FaithfulnessCorrelation(
                subset_size=features,
                perturb_baseline="uniform",
                perturb_func=perturb_func.baseline_replacement_by_indices,
                nr_runs=10,
                **shared,
            ), False),
            "Pixel-Flipping": (quantus.PixelFlipping(
                features_in_step=features,
                perturb_baseline="uniform",
                perturb_func=perturb_func.baseline_replacement_by_indices,
                return_auc_per_sample=True,
                **shared,
            ), False),
        },
        "Complexity": {
            "Sparseness": (quantus.Sparseness(**shared), False),
            "Complexity": (quantus.Complexity(**shared), True),
        },
        "Localisation": {
            "Pointing-Game": (quantus.PointingGame(**shared), False),
            # Disabled estimator, kept for reference:
            #"Top-K Intersection": (quantus.TopKIntersection(
            #    k=int((img_size*img_size)*percentage),
            #    **shared,
            #), False),
            "Relevance Rank Accuracy": (quantus.RelevanceRankAccuracy(**shared), False),
            # Disabled estimator, kept for reference:
            #"Relevance Mass Accuracy": (quantus.RelevanceMassAccuracy(**shared), False),
        },
    }



### Prepare analysers

In [6]:
# Analysers: one resilience (tiny noise) and one adversary (large noise) test,
# each for model-parameter perturbation and for input perturbation.
analyser_suite = {}

analyser_suite["Model Resilience Test"] = metaquantus.ModelPerturbationTest(
    noise_type="multiplicative",
    mean=1.0,
    std=0.0001,
    type="Resilience",
)
analyser_suite["Model Adversary Test"] = metaquantus.ModelPerturbationTest(
    noise_type="multiplicative",
    mean=1.0,
    std=2.0,
    type="Adversary",
)
analyser_suite["Input Resilience Test"] = metaquantus.InputPerturbationTest(
    noise=0.0001,
    type="Resilience",
)
analyser_suite["Input Adversary Test"] = metaquantus.InputPerturbationTest(
    noise=10.0,
    type="Adversary",
)
    

## Sanity Checking

### Dummy=both

In [24]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# XAI methods (GradCAM needs the model-specific layer and the input resolution).
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 10
iterations = 5

# Meta-evaluation in sanity-check mode "NR"; results are written to
# path_results + "adversarial/" under the "Complement_NR" file-name tag.
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="Complement_NR",
    write_to_file=True,
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
    sanity_check="NR",
    path=path_results+"adversarial/"
)

# NOTE(review): the Metric base class is passed as the estimator; presumably the
# sanity-check mode substitutes its own dummy scores - confirm in MetaQuantus.
# The model is looked up via model_name (it was hard-coded to "LeNet" before), so
# changing model_name above switches the explained model as well.
master(estimator=quantus.Metric,
       model=dataset_settings[dataset_name]["models"][model_name],
       x_batch=dataset_settings[dataset_name]["x_batch"],
       y_batch=dataset_settings[dataset_name]["y_batch"],
       a_batch=None,
       s_batch=dataset_settings[dataset_name]["s_batch"],
       channel_first=True,
       softmax=False,
       device=device
       )

# Keep the run id: it is part of the names of the files written by this run.
uid_nr = master.uid

UID=ed25
Model Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

In [25]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# XAI methods (GradCAM needs the model-specific layer and the input resolution).
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 10
iterations = 5

# Meta-evaluation in sanity-check mode "AR"; results are written to
# path_results + "adversarial/" under the "Complement_AR" file-name tag.
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="Complement_AR",
    write_to_file=True,
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
    sanity_check="AR",
    path=path_results+"adversarial/"
)

# NOTE(review): the Metric base class is passed as the estimator; presumably the
# sanity-check mode substitutes its own dummy scores - confirm in MetaQuantus.
# The model is looked up via model_name (it was hard-coded to "LeNet" before), so
# changing model_name above switches the explained model as well.
master(estimator=quantus.Metric,
       model=dataset_settings[dataset_name]["models"][model_name],
       x_batch=dataset_settings[dataset_name]["x_batch"],
       y_batch=dataset_settings[dataset_name]["y_batch"],
       a_batch=None,
       s_batch=dataset_settings[dataset_name]["s_batch"],
       channel_first=True,
       softmax=False,
       device=device
       )

# Keep the run id: it is part of the names of the files written by this run.
uid_ar = master.uid

UID=5bdc
Model Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

In [27]:
from datetime import datetime
today = datetime.today().strftime("%d%m%Y")

def get_mean_std(score_type: str, expectation: str, scores: np.array):
    """Print the mean and standard deviation of the averaged `scores`.

    Args:
        score_type: label printed in front of the numbers. If it contains "IAC",
            `scores` is averaged over axes 0 and 2; otherwise over axis 1.
        expectation: text appended directly after the word "Expectation".
        scores: array of scores; 3-D for "IAC" labels, 2-D otherwise (in this
            notebook the callers reshape them to (methods, iterations,
            perturbations) and (iterations, perturbations) respectively).

    Returns:
        None; the summary line is written to stdout.
    """
    reduce_axes = (0, 2) if "IAC" in score_type else 1
    averaged = scores.mean(axis=reduce_axes)
    summary = f"\t{score_type}={averaged.mean():.4f} ({averaged.std():.3f}) \t-----\tExpectation{expectation}"
    print(summary)
    
perturbation_type = "Input"


def _load_sanity_scores(fname_tag: str, uid: str) -> dict:
    """Load the inter/intra score files of one sanity-check run.

    Each JSON file is read once (it used to be read once per test type). The
    file names are built from `today`, so the run must have been written on the
    same day this cell is executed.

    Returns:
        Dict mapping "Resilience" / "Adversary" to a tuple
        `(inter_scores, intra_scores)`, reshaped to (iterations, nr_perturbations)
        and (len(xai_methods), iterations, nr_perturbations) respectively.
    """
    folder = path_results+"adversarial/"
    inter_all = load_obj(folder, fname=f"{today}_{fname_tag}_inter_scores_{uid}", use_json=True)
    intra_all = load_obj(folder, fname=f"{today}_{fname_tag}_intra_scores_{uid}", use_json=True)

    scores = {}
    for test_type in ("Resilience", "Adversary"):
        analyser = f"{perturbation_type} {test_type} Test"
        inter = np.array(inter_all[analyser]).reshape(iterations, nr_perturbations)
        intra = np.array(list(intra_all[analyser].values())).reshape(len(xai_methods), iterations, nr_perturbations)
        scores[test_type] = (inter, intra)
    return scores


# Manual override to analyse an earlier run:
#uid_ar = "9b45"
print(f"\nControlled scenario 1: the estimator always returns the same score, independent of perturbation (deterministic sampling). uid={uid_ar}\n")
print(f"{perturbation_type} Perturbation Test")

_scores = _load_sanity_scores("Complement_AR", uid_ar)
inter_scores_nr, intra_scores_nr = _scores["Resilience"]
inter_scores_ar, intra_scores_ar = _scores["Adversary"]

get_mean_std(score_type="IAC_{NR}", expectation="=1.0 (should succeed: scores are the same!)", scores=intra_scores_nr)
get_mean_std(score_type="IAC_{AR}", expectation="=0.0 (should fail: scores are not different!)", scores=intra_scores_ar)
get_mean_std(score_type="IEC_{NR}", expectation="=1.0 (should succed: scores, then rankings are the same!)", scores=inter_scores_nr) 
get_mean_std(score_type="IEC_{AR}", expectation="=0.0 (should fail: does not fulfil ranking condition '<' since '=')", scores=inter_scores_ar) 

# Manual override to analyse an earlier run:
#uid_nr = "908d"
print(f"\nControlled scenario 2: the estimator always returns scores from a different distribution (stochastic sampling). uid={uid_nr}\n")
print(f"{perturbation_type} Perturbation Test")

_scores = _load_sanity_scores("Complement_NR", uid_nr)
inter_scores_nr, intra_scores_nr = _scores["Resilience"]
inter_scores_ar, intra_scores_ar = _scores["Adversary"]

get_mean_std(score_type="IAC_{NR}", expectation="≈0.0 (should fail: scores are different!)", scores=intra_scores_nr)
get_mean_std(score_type="IAC_{AR}", expectation="≈1.0 (should succeed: scores are different!)", scores=intra_scores_ar)
get_mean_std(score_type="IEC_{NR}", expectation="≈0.25 (should be =1/L, where L=4: no diff in scores between explainers)", scores=inter_scores_nr)
get_mean_std(score_type="IEC_{AR}", expectation="≈0.0 (depends on the sampling distributions and its variation!)", scores=inter_scores_ar) 


Controlled scenario 1: the estimator always returns the same score, independent of perturbation (deterministic sampling). uid=5bdc

Input Perturbation Test
	IAC_{NR}=1.0000 (0.000) 	-----	Expectation=1.0 (should succeed: scores are the same!)
	IAC_{AR}=0.0000 (0.000) 	-----	Expectation=0.0 (should fail: scores are not different!)
	IEC_{NR}=1.0000 (0.000) 	-----	Expectation=1.0 (should succed: scores, then rankings are the same!)
	IEC_{AR}=0.0000 (0.000) 	-----	Expectation=0.0 (should fail: does not fulfil ranking condition '<' since '=')

Controlled scenario 2: the estimator always returns scores from a different distribution (stochastic sampling). uid=ed25

Input Perturbation Test
	IAC_{NR}=0.0000 (0.000) 	-----	Expectation≈0.0 (should fail: scores are different!)
	IAC_{AR}=1.0000 (0.000) 	-----	Expectation≈1.0 (should succeed: scores are different!)
	IEC_{NR}=0.2499 (0.002) 	-----	Expectation≈0.25 (should be =1/L, where L=4: no diff in scores between explainers)
	IEC_{AR}=0.0000 (0.

### Dummy=diff

In [37]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# XAI methods (GradCAM needs the model-specific layer and the input resolution).
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 10
iterations = 5

# Define master! Only the two resilience tests are run in this scenario.
# NOTE(review): MasterAnalyserColab is not defined in any cell shown in this
# notebook (the recorded output places it in __main__), so this cell depends on a
# definition made elsewhere in the session. The original trailing comment pointed
# at metaquantus.MetaEvaluation as the counterpart - confirm before swapping.
master = MasterAnalyserColab( #metaquantus.MetaEvaluation
    analyser_suite={
        "Model Resilience Test": metaquantus.ModelPerturbationTest(
            noise_type="multiplicative",
            mean=1.0,
            std=0.0001,
            type="Resilience",
        ),
        "Input Resilience Test": metaquantus.InputPerturbationTest(
            noise=0.0001,
            type="Resilience",
        ),
    },
    xai_methods=xai_methods,
    iterations=iterations,
    fname="Adversarial_NR",
    write_to_file=True,
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
    sanity_check="NR",
    path=path_results+"adversarial/"
)

# The model is looked up via model_name (it was hard-coded to "LeNet" before), so
# changing model_name above switches the explained model as well.
master(estimator=quantus.Metric,
       model=dataset_settings[dataset_name]["models"][model_name],
       x_batch=dataset_settings[dataset_name]["x_batch"],
       y_batch=dataset_settings[dataset_name]["y_batch"],
       a_batch=None,
       s_batch=dataset_settings[dataset_name]["s_batch"],
       channel_first=True,
       softmax=False,
       device=device
       )
    

UID=f9ca
Model Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Input Resilience Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones


<__main__.MasterAnalyserColab at 0x7fca00b8fd50>

### Dummy=same

In [43]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# XAI methods (GradCAM needs the model-specific layer and the input resolution).
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 10
iterations = 5

# Define master! Only the two adversary tests are run in this scenario.
# NOTE(review): MasterAnalyserColab is not defined in any cell shown in this
# notebook (the recorded output places it in __main__), so this cell depends on a
# definition made elsewhere in the session. The original trailing comment pointed
# at metaquantus.MetaEvaluation as the counterpart - confirm before swapping.
master = MasterAnalyserColab( #metaquantus.MetaEvaluation
    analyser_suite={
        "Model Adversary Test": metaquantus.ModelPerturbationTest(
            noise_type="multiplicative",
            mean=1.0,
            std=2.0,
            type="Adversary",
        ),
        "Input Adversary Test": metaquantus.InputPerturbationTest(
            noise=10.0,
            type="Adversary",
        ),
    },
    xai_methods=xai_methods,
    iterations=iterations,
    fname="Adversarial_AR",
    write_to_file=True,
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
    sanity_check="AR",
    path=path_results+"adversarial/"
)

# The model is looked up via model_name (it was hard-coded to "LeNet" before), so
# changing model_name above switches the explained model as well.
master(estimator=quantus.Metric,
       model=dataset_settings[dataset_name]["models"][model_name],
       x_batch=dataset_settings[dataset_name]["x_batch"],
       y_batch=dataset_settings[dataset_name]["y_batch"],
       a_batch=None,
       s_batch=dataset_settings[dataset_name]["s_batch"],
       channel_first=True,
       softmax=False,
       device=device
       )
    

UID=8da9
Model Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Input Adversary Test


Iterations:   0%|          | 0/5 [00:00<?, ?it/s]

Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones
Sanity check dones


<__main__.MasterAnalyserColab at 0x7fca02775890>

## Run master

In [None]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full set, then pick the one to meta-evaluate.
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimator_category = "Randomisation"
estimator_name = "Random Logit"

# XAI methods (GradCAM needs the model-specific layer and the input resolution).
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 5
iterations = 2 

# Define master! Only the two model-perturbation tests are run here.
master = metaquantus.MetaEvaluation(
    analyser_suite={
        "Model Resilience Test": metaquantus.ModelPerturbationTest(
            noise_type="multiplicative",
            mean=1.0,
            std=0.0001,
            type="Resilience",
        ),
        "Model Adversary Test": metaquantus.ModelPerturbationTest(
            noise_type="multiplicative",
            mean=1.0,
            std=2.0,
            type="Adversary",
        ),
    },
    xai_methods=xai_methods,
    iterations=iterations,
    fname="debug_pval",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Each estimator entry is a tuple: element 0 is the estimator instance and
# element 1 is the flag forwarded as lower_is_better.
# The model is looked up via model_name (it was hard-coded to "LeNet" before), so
# changing model_name above switches the explained model as well.
master(estimator=estimators[estimator_category][estimator_name][0],
    model=dataset_settings[dataset_name]["models"][model_name],
    x_batch=dataset_settings[dataset_name]["x_batch"],
    y_batch=dataset_settings[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"],
    channel_first=True,
    softmax=False,
    device=device,
    lower_is_better=estimators[estimator_category][estimator_name][1],
    )

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
UID=3b1c
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Pool all intra scores recorded for the model adversary test into one flat array
# and show their mean and standard deviation.
iac = np.array([*master.intra_scores["Model Adversary Test"].values()]).flatten()
(iac.mean(), iac.std())

In [None]:
# Pool all intra scores recorded for the model resilience test into one flat array
# and show their mean and standard deviation.
iac = np.array([*master.intra_scores["Model Resilience Test"].values()]).flatten()
(iac.mean(), iac.std())

In [16]:
# Same summary as the previous cell (mean and standard deviation of the pooled
# intra scores of the model resilience test), kept with its recorded output.
iac = np.array([*master.intra_scores["Model Resilience Test"].values()]).flatten()
(iac.mean(), iac.std())

(0.0619437024580487, 0.16386525659719392)

In [10]:
# Mean and standard deviation of the pooled intra scores of the model resilience
# test, kept with the output recorded for a different run.
iac = np.array([*master.intra_scores["Model Resilience Test"].values()]).flatten()
(iac.mean(), iac.std())

(0.059484326464210514, 0.15888485883087192)

In [None]:
from scipy.stats import wilcoxon

perturbation_type = "Resilience"
analyser_name = f"Model {perturbation_type} Test"
method = "IntegratedGradients"

# Iteration and perturbation index to inspect. These were previously read from
# whatever values `i` and `p` happened to hold in the kernel, which raises a
# NameError on a fresh run; the first iteration/perturbation is selected here.
i, p = 0, 0

# Unperturbed scores vs. the scores of perturbation `p` in iteration `i`,
# both restricted to the sample indices recorded for that perturbation.
q=master.eval_scores[analyser_name][method]
q_hat=master.eval_scores_perturbed[analyser_name][i][method][p]
indices = master.indices_perturbed[analyser_name][i][p]
q = np.array(q)[np.array(indices)]
q_hat = np.array(q_hat)[np.array(indices)]

# Index 1 of the result is the p-value of the two-sided Wilcoxon signed-rank test.
print(wilcoxon(q, q_hat, alternative="two-sided", zero_method="zsplit")[1])

In [7]:
from scipy.stats import wilcoxon

perturbation_type = "Resilience"
analyser_name = f"Model {perturbation_type} Test"
method = "IntegratedGradients"

# The unperturbed scores do not depend on the loop indices, so read them once and
# never overwrite them inside the loop.
scores_unperturbed = np.array(master.eval_scores[analyser_name][method])

# Paired Wilcoxon signed-rank test per (iteration, perturbation) run.
# The loop body must be indented under the inner loop; it previously sat at the
# level of the `for p` header, which is an IndentationError.
for i in range(iterations):
    for p in range(nr_perturbations):
        # Restrict both score vectors to the samples selected for this run.
        indices = np.array(master.indices_perturbed[analyser_name][i][p])
        q = scores_unperturbed[indices]
        q_hat = np.array(master.eval_scores_perturbed[analyser_name][i][method][p])[indices]

        # Element [1] of the Wilcoxon result is the p-value.
        print(wilcoxon(q, q_hat, alternative="two-sided", zero_method="zsplit")[1])

2.591944078509876e-12

NameError: ignored

## Benchmarking

In [6]:
# The cells below are set up so that explanations are computed with respect to the originally predicted class.

### MNIST

In [7]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full registry, then keep the five categories to benchmark
# (same categories, same order as before).
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimators_sub = {
    category: estimators[category]
    for category in ("Localisation", "Complexity", "Randomisation", "Robustness", "Faithfulness")
}

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! The configured object is called immediately; `benchmark` holds the
# value returned by that call.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
MNIST
  LeNet
    Localisation
      Pointing-Game
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Relevance Rank Accuracy
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Complexity
      Sparseness
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Complexity
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Randomisation
      Random Logit
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Model Parameter Randomisation Test
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Robustness
      Max-Sensitivity
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Local Lipschitz Estimate
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Faithfulness
      Faithfulness Correlation
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Pixel-Flipping
UID=b274
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

### fMNIST

In [8]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "fMNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full registry, then keep the five categories to benchmark.
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimators_sub = {
    "Localisation": estimators["Localisation"],
    "Complexity": estimators["Complexity"],
    "Randomisation": estimators["Randomisation"],
    "Robustness": estimators["Robustness"],
    "Faithfulness": estimators["Faithfulness"],
}

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

# Previously misspelled as `nr_nr_perturbations`, which left `nr_perturbations`
# holding whatever value an earlier cell had assigned.
nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! The configured object is called immediately; `benchmark` holds the
# value returned by that call.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
fMNIST
  LeNet
    Localisation
      Pointing-Game
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Relevance Rank Accuracy
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Complexity
      Sparseness
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Complexity
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Randomisation
      Random Logit
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Model Parameter Randomisation Test
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Robustness
      Max-Sensitivity
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Local Lipschitz Estimate
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Faithfulness
      Faithfulness Correlation
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Pixel-Flipping
UID=e169
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

### cMNIST

In [9]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "cMNIST"
model_name = "ResNet9"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full registry, then keep the five categories to benchmark.
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimators_sub = {
    "Localisation": estimators["Localisation"],
    "Complexity": estimators["Complexity"],
    "Randomisation": estimators["Randomisation"],
    "Robustness": estimators["Robustness"],
    "Faithfulness": estimators["Faithfulness"],
}

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

# Previously misspelled as `nr_nr_perturbations`, which left `nr_perturbations`
# holding whatever value an earlier cell had assigned.
nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! The configured object is called immediately; `benchmark` holds the
# value returned by that call.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
cMNIST
  ResNet9
    Localisation
      Pointing-Game
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Relevance Rank Accuracy
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Complexity
      Sparseness
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Complexity
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Randomisation
      Random Logit
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Model Parameter Randomisation Test
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Robustness
      Max-Sensitivity
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Local Lipschitz Estimate
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Faithfulness
      Faithfulness Correlation
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Pixel-Flipping
UID=aefa
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

## Hyperparameter tuning

In [None]:
def setup_faithfulness_estimators(
    features: int,
    patch_size: int,
    num_classes: int,
    img_size: int,
    percentage: int,
) -> Dict:
    """Build the registry of Faithfulness estimators used for hyperparameter tuning.

    Args:
        features: Passed as ``subset_size`` to Faithfulness Correlation and as
            ``features_in_step`` to Pixel-Flipping and Monotonicity Correlation.
        patch_size: The single patch size given to Infidelity through
            ``perturb_patch_sizes``.
        num_classes: Not used by this function.
        img_size: Not used by this function.
        percentage: Not used by this function.

    Returns:
        A dict ``{"Faithfulness": {name: (estimator, flag)}}``. Callers in this
        notebook pass element ``[0]`` as the estimator and element ``[1]`` as
        ``lower_is_better``; the flag is ``False`` for every entry here.
    """
    # Settings shared by every estimator in this registry.
    common_kwargs = dict(
        abs=False,
        normalise=True,
        normalise_func=normalise_func.normalise_by_max,
        return_aggregate=False,
        aggregate_func=np.mean,
        disable_warnings=True,
    )

    return {
        "Faithfulness": {
            "Faithfulness Correlation": (quantus.FaithfulnessCorrelation(
                subset_size=features,
                perturb_baseline="uniform",
                perturb_func=quantus.perturb_func.baseline_replacement_by_indices,
                nr_runs=10,
                **common_kwargs,
            ), False),
            "Pixel-Flipping": (quantus.PixelFlipping(
                features_in_step=features,
                perturb_baseline="uniform",
                perturb_func=quantus.perturb_func.baseline_replacement_by_indices,
                return_auc_per_sample=True,
                **common_kwargs,
            ), False),
            # NOTE(review): `return_auc_per_sample` is kept as in the original for ROAD
            # and Monotonicity Correlation — confirm the installed quantus accepts it.
            "ROAD": (quantus.ROAD( # Implement aggregation.
                noise=0.1,
                perturb_func=quantus.perturb_func.noisy_linear_imputation,
                percentages=list(range(1, 100, 5)),
                return_auc_per_sample=True,
                **common_kwargs,
            ), False),
            # Key was previously misspelled "MonotinicityCorrelation".
            "MonotonicityCorrelation": (quantus.MonotonicityCorrelation(
                nr_samples=10,
                features_in_step=features,
                perturb_baseline="uniform",
                perturb_func=quantus.perturb_func.baseline_replacement_by_indices,
                similarity_func=quantus.similarity_func.correlation_spearman,
                return_auc_per_sample=True,
                **common_kwargs,
            ), False),
            # This entry previously appeared twice (the duplicate key silently
            # overwrote itself) and instantiated quantus.PixelFlipping, although
            # `n_perturb_samples` / `perturb_patch_sizes` are Infidelity arguments.
            # `return_auc_per_sample` was copied from Pixel-Flipping and is dropped.
            # NOTE(review): Infidelity is a loss-style score; confirm the
            # `lower_is_better` flag (False, unchanged) is what you intend.
            "Infidelity": (quantus.Infidelity(
                perturb_baseline="uniform",
                perturb_func=quantus.perturb_func.baseline_replacement_by_indices,
                n_perturb_samples=10,
                perturb_patch_sizes=[patch_size],
                **common_kwargs,
            ), False),
        },
    }


In [None]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: faithfulness-only registry for hyperparameter tuning.
# `patch_size` is a required argument of setup_faithfulness_estimators and was
# previously not passed at all.
# NOTE(review): assumes the dataset's "estimator_kwargs" provide a "patch_size"
# entry — confirm against SETTINGS (a missing key fails loudly with a KeyError).
estimators = setup_faithfulness_estimators(
    features=dataset_kwargs["features"],
    patch_size=dataset_kwargs["patch_size"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

# Previously misspelled as `nr_nr_perturbations`, which left `nr_perturbations`
# holding whatever value an earlier cell had assigned.
nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! Use the faithfulness registry built above; this cell previously passed
# `estimators_sub`, a leftover from an earlier benchmark cell, so the estimators
# defined for tuning were never evaluated.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

## Other experiments

### MNIST non-normalised explanations

In [None]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "MNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full registry, then keep the five categories to benchmark.
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimators_sub = {
    "Localisation": estimators["Localisation"],
    "Complexity": estimators["Complexity"],
    "Randomisation": estimators["Randomisation"],
    "Robustness": estimators["Robustness"],
    "Faithfulness": estimators["Faithfulness"],
}

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

# Previously misspelled as `nr_nr_perturbations`, which left `nr_perturbations`
# holding whatever value an earlier cell had assigned.
nr_perturbations = 10
iterations = 3

# Define master! `fname` tags this run as the non-normalised-explanations experiment.
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="non_normalised_xai",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! The configured object is called immediately; `benchmark` holds the
# value returned by that call.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
MNIST
  LeNet
    Localisation
      Pointing-Game
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Relevance Rank Accuracy
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Complexity
      Sparseness
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Complexity
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Randomisation
      Random Logit
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Model Parameter Randomisation Test
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Robustness
      Max-Sensitivity
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Local Lipschitz Estimate
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Faithfulness
      Faithfulness Correlation
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Pixel-Flipping
UID=28d0
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

### fMNIST non-normalised explanations

In [None]:
##############################
# Dataset-specific settings. #
##############################

dataset_name = "fMNIST"
model_name = "LeNet"
dataset_settings = {dataset_name: SETTINGS[dataset_name]}
dataset_kwargs = dataset_settings[dataset_name]["estimator_kwargs"]

# Estimators: build the full registry, then keep the five categories to benchmark.
estimators = setup_estimators(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    img_size=dataset_kwargs["img_size"],
    percentage=dataset_kwargs["percentage"],
)
estimators_sub = {
    "Localisation": estimators["Localisation"],
    "Complexity": estimators["Complexity"],
    "Randomisation": estimators["Randomisation"],
    "Robustness": estimators["Robustness"],
    "Faithfulness": estimators["Faithfulness"],
}

# XAI methods.
xai_methods = setup_xai_methods(
    gc_layer=dataset_settings[dataset_name]["gc_layers"][model_name],
    img_size=dataset_kwargs["img_size"],
)

##########################
# Benchmarking settings. #
##########################

# Previously misspelled as `nr_nr_perturbations`, which left `nr_perturbations`
# holding whatever value an earlier cell had assigned.
nr_perturbations = 10
iterations = 3

# Define master! `fname` tags this run as the non-normalised-explanations experiment.
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="non_normalised_xai",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Benchmark! The configured object is called immediately; `benchmark` holds the
# value returned by that call.
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
)()

An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
An absolute operation should be applied on the attributions, otherwise inconsistent results can be expected. Re-set 'abs' parameter.
fMNIST
  LeNet
    Localisation
      Pointing-Game
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Relevance Rank Accuracy
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Complexity
      Sparseness
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Complexity
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Randomisation
      Random Logit
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Model Parameter Randomisation Test
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Robustness
      Max-Sensitivity
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Local Lipschitz Estimate
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

    Faithfulness
      Faithfulness Correlation
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

      Pixel-Flipping
UID=1509
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Input Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]


### OPEN QUESTION 1. Select Faithfulness metric.

In [None]:
# Smaller run than the benchmark cells: 3 perturbations, 3 iterations.
nr_perturbations = 3
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Pick one estimator from the registry. Each entry is indexed positionally:
# [0] is the estimator object, [1] is its `lower_is_better` flag.
estimator_category = "Localisation"
estimator_name = "Relevance Rank Accuracy"
estimator_entry = estimators_sub[estimator_category][estimator_name]

# Look the model up via `model_name` (set together with `dataset_name` in the
# dataset-settings cells) instead of a hard-coded "LeNet", so the cell follows
# whichever dataset/model pair is currently configured.
master(
    estimator=estimator_entry[0],
    model=dataset_settings[dataset_name]["models"][model_name],
    x_batch=dataset_settings[dataset_name]["x_batch"],
    y_batch=dataset_settings[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"],
    channel_first=True,
    softmax=False,
    device=device,
    lower_is_better=estimator_entry[1],
)

Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

<metaquantus.master.MetaEvaluation at 0x7f8ad20952d0>

In [None]:
master.get_statistics_inter_scores(test="Model Adversary Test")

(0.6704566810749859, 0.4700473591851151)

In [None]:
master.get_statistics_intra_scores(test="Model Adversary Test", method="Gradient")

(0.021459034158132884, 0.043612734338405006)

In [None]:
# Build a MetaEvaluationMultiple around the existing `master` and call it
# immediately; `benchmark` is bound to the return value of that call, not to
# the MetaEvaluationMultiple object itself.
# Later cells index the result as
# benchmark[dataset][model][category][estimator][score_kind][test].
benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators_sub,
    experimental_settings=dataset_settings,
    keep_results=True,
    channel_first=True,
    softmax=False,
    device=device,
    )()

MNIST
	LeNet
		Localisation
			Pointing-Game
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

			Relevance Rank Accuracy
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

		Complexity
			Sparseness
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

			Complexity
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

		Randomisation
			Random Logit
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

			Model Parameter Randomisation Test
Model Resilience Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Model Adversary Test


Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
benchmark["MNIST"]["LeNet"]["Localisation"]["Pointing-Game"]["intra_scores"]["Model Resilience Test"]

{'Model Resilience Test': {'Gradient': array([[1., 1., 1.],
         [1., 1., 1.]]), 'Saliency': array([[1., 1., 1.],
         [1., 1., 1.]]), 'IntegratedGradients': array([[1., 1., 1.],
         [1., 1., 1.]]), 'GradCAM': array([[0.31731051, 0.31731051, 0.15729921],
         [0.15729921, 0.15729921, 1.        ]])},
 'Model Adversary Test': {'Gradient': array([[2.47912686e-05, 2.45732780e-04, 1.99440258e-08],
         [1.26235212e-07, 3.47524150e-08, 4.26672482e-03]]),
  'Saliency': array([[5.22993552e-08, 5.72436243e-08, 2.26428295e-15],
         [5.35805117e-07, 1.10889112e-07, 1.70187209e-03]]),
  'IntegratedGradients': array([[1., 1., 1.],
         [1., 1., 1.]]),
  'GradCAM': array([[2.88216926e-01, 5.72588710e-02, 4.73975488e-06],
         [3.53792297e-02, 4.38134571e-02, 4.45570906e-05]])}}

In [None]:
pd.DataFrame(np.hstack(benchmark["MNIST"]["LeNet"]["Localisation"]["Relevance Rank Accuracy"]["inter_scores"]["Model Adversary Test"])).describe()

Unnamed: 0,0
count,21368.0
mean,0.584846
std,0.49276
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [None]:
pd.DataFrame(np.hstack(benchmark["MNIST"]["LeNet"]["Localisation"]["Pointing-Game"]["inter_scores"]["Model Adversary Test"])).describe()

Unnamed: 0,0
count,21420.0
mean,0.082166
std,0.274624
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [None]:
pd.DataFrame(np.hstack(benchmark["MNIST"]["LeNet"]["Localisation"]["Relevance Rank Accuracy"]["inter_scores"]["Model Resilience Test"])).describe()

Unnamed: 0,0
count,24576.0
mean,0.995239
std,0.068835
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [None]:
pd.DataFrame(np.hstack(benchmark["MNIST"]["LeNet"]["Localisation"]["Pointing-Game"]["inter_scores"]["Model Resilience Test"])).describe()

Unnamed: 0,0
count,24576.0
mean,0.998698
std,0.036062
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


### P-val experiment

The goal is to see whether the distribution of p-values is generally higher for minor and disruptive perturbations.

- Dataset: MNIST
- Estimators: Complexity, Localisation, Robustness
- Test: 1. Model Perturbation Test, 2. Input Perturbation Test
- Input: various noise levels

- Goal: Generate a continuous

In [None]:
# 15 min

# Settings.
test = "Model Resilience Test"
noise_levels = [0.001, 0.01, 0.1, 0.35, 0.5, 0.75, 1.0, 1.5, 2.0]
#[0.001, 0.01, 0.1, 0.2, 0.35, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
# Previously misspelled `nr_nr_perturbations`, so this value never reached the
# `nr_perturbations=nr_perturbations` argument of MetaEvaluation below.
nr_perturbations = 10
iterations = 1

# Results, each nested as [estimator_category][estimator_name][noise].
p_values = {}         # master.iec_scores
iac_values = {}       # master.iac_scores
correct_indices = {}  # master.indices_perturbed

for estimator_category, estimator_meta in estimators_sub.items():

    p_values[estimator_category] = {}
    iac_values[estimator_category] = {}
    correct_indices[estimator_category] = {}

    for estimator_name, estimator in estimator_meta.items():

        p_values[estimator_category][estimator_name] = {}
        iac_values[estimator_category][estimator_name] = {}
        correct_indices[estimator_category][estimator_name] = {}

        print(f"{estimator_name}")

        for noise in noise_levels:

            print(f"\t{noise}")

            # A single model-perturbation test whose std is the swept noise level.
            analyser_suite = {
                test:
                    metaquantus.ModelPerturbationTest(**{
                        "noise_type": "multiplicative",
                        "mean": 1.0,
                        "std": noise,
                        "type": "Resilience",
                        }
                    ),
                    #"Input Perturbation Test":
                    #metaquantus.InputPerturbationTest(**{
                    #    "noise": 0.0001,
                    #    "type": "Resilience",
                    #    }
                    #),
            }

            # Define master!
            # NOTE(review): other cells in this notebook construct MetaEvaluation
            # with `fname=""` and call it with `estimator=`/`lower_is_better=`;
            # confirm that `path=` and `metric=` match the installed version.
            master = metaquantus.MetaEvaluation(
                analyser_suite=analyser_suite,
                xai_methods=xai_methods,
                iterations=iterations,
                path=path_results,
                nr_perturbations=nr_perturbations,
                explain_func=quantus.explain,
            )

            # Run perturbation analysis and inference.
            master(
                metric=estimators_sub[estimator_category][estimator_name],
                model=dataset_settings[dataset_name]["models"]["LeNet"],
                x_batch=dataset_settings[dataset_name]["x_batch"],
                y_batch=dataset_settings[dataset_name]["y_batch"],
                a_batch=None,
                s_batch=dataset_settings[dataset_name]["s_batch"],
                channel_first=True,
                softmax=False,
                device=device
                )

            # Append results. The original wrote iac_scores and then iec_scores
            # to the same p_values slot, so iac_scores was lost. p_values keeps
            # what it effectively held (iec_scores); iac_scores now has its own
            # dictionary.
            iac_values[estimator_category][estimator_name][noise] = master.iac_scores
            p_values[estimator_category][estimator_name][noise] = master.iec_scores
            correct_indices[estimator_category][estimator_name][noise] = master.indices_perturbed

# Save the results. Positional arrays are stored as arr_0..arr_3; the
# dictionaries are written as pickled 0-d object arrays. iac_values is appended
# last so that arr_0..arr_2 keep their previous meaning.
uid = str(uuid.uuid4())
print(f"Saving as {uid}")
np.savez(path_results + f"p_val_experiment_{uid}.npz", noise_levels, p_values, correct_indices, iac_values)

In [None]:
uid = "847ee319-4dbc-4d80-a0c3-0014db8ce54c"
# The dictionaries in this archive were written by np.savez as pickled 0-d
# object arrays, which np.load refuses to read with allow_pickle=False.
# SECURITY: unpickling can execute arbitrary code - only load archives that
# this notebook produced itself.
data = np.load(path_results + f"p_val_experiment_{uid}.npz", allow_pickle=True)
noise_levels = data['arr_0']
# .item() unwraps the 0-d object array back into the original dictionary, so
# that later cells can index it by estimator category.
p_values = data['arr_1'].item()
correct_indices = data['arr_2'].item()

In [None]:
def legend_without_duplicate_labels(ax):
    """Draw a legend on `ax` that lists every label only once.

    For each label, the handle of its first occurrence (in the order returned
    by `ax.get_legend_handles_labels()`) is kept; later duplicates are dropped.
    """
    handles, labels = ax.get_legend_handles_labels()
    first_seen = []
    for position, pair in enumerate(zip(handles, labels)):
        # Keep the pair only if its label did not appear at an earlier position.
        if pair[1] not in labels[:position]:
            first_seen.append(pair)
    # Transpose [(handle, label), ...] into (handles, labels) for ax.legend.
    ax.legend(*zip(*first_seen))

In [None]:
x = {}
y = {}

markers = {a:b for a, b in zip(xai_methods.keys(), ["o", "^", "s", "X"])}
colours = {a:b for a, b in zip(estimators_sub.keys(), ["black", "green", "red", "blue"])}

# The only explanation method whose distributions are drawn below. The legend
# label previously used the leaked loop variable `xai_method` (the last method
# iterated), so the curves were labelled with the wrong method.
plotted_method = "Saliency"

fig, ax = plt.subplots(figsize=(15, 15))
for estimator_category, estimator_meta in estimators_sub.items():
    for estimator_name, estimator in estimator_meta.items():
        # Mean of the int-cast correct_indices values per noise level
        # (computed here but not drawn in this cell).
        for noise in noise_levels:
            x[noise] = np.array(list(correct_indices[estimator_category][estimator_name][noise][test].values())).astype(int).mean()
        # Raw p-values per explanation method and noise level.
        for xai_method in xai_methods.keys():
            y[xai_method] = {}
            for noise in noise_levels:
                y[xai_method][noise] = np.array(p_values[estimator_category][estimator_name][noise][test][xai_method])
        # One distribution per noise level; the noise level is part of the
        # label so that the legend entries can be told apart.
        for noise in noise_levels:
            sns.distplot(y[plotted_method][noise],
                    label=f"{estimator_name}_{plotted_method}_{noise}",
                    #color=colours[estimator_category],
                    #marker=markers[xai_method]
                    ax=ax)

            #ax.plot(noise_levels,
            #        list(y[xai_method].values()),
            #        label=estimator_name+"_"+xai_method,
            #        color=colours[estimator_category],
            #        marker=markers[xai_method]
            #        )


#plt.xlabel("# Correct predictions/ Noise Level")
#plt.ylabel("Intra-consistency")
plt.legend()
#legend_without_duplicate_labels(ax)

In [None]:
# Line plot over the noise levels, one line per (estimator, explanation method).
x = {}
y = {}

# Marker per explanation method and colour per estimator category.
markers = {a:b for a, b in zip(xai_methods.keys(), ["o", "^", "s", "X"])}
colours = {a:b for a, b in zip(estimators_sub.keys(), ["black", "green", "red", "blue"])}

fig, ax = plt.subplots(figsize=(15, 15))
for estimator_category, estimator_meta in estimators_sub.items():
    for estimator_name, estimator in estimator_meta.items():        
        # x[noise]: mean of the int-cast correct_indices values for this test.
        for noise in noise_levels:
            x[noise] = np.array(list(correct_indices[estimator_category][estimator_name][noise][test].values())).astype(int).mean()
        # y[method][noise]: mean p-value per explanation method and noise level.
        for xai_method in xai_methods.keys():
            y[xai_method] = {}
            for noise in noise_levels:
                y[xai_method][noise] = np.array(p_values[estimator_category][estimator_name][noise][test][xai_method]).mean()

        # NOTE(review): the active plot draws `x`, which does not depend on the
        # explanation method, so every method gets an identical line (only the
        # marker differs). `y` is computed above but only used by the
        # commented-out plot below - confirm which quantity is intended.
        for xai_method in xai_methods.keys():#["Saliency"]:
            ax.plot(noise_levels,  
                    list(x.values()), 
                    label=estimator_name+"_"+xai_method,
                    color=colours[estimator_category],
                    marker=markers[xai_method]
                    )
            
            #ax.plot(noise_levels,
            #        list(y[xai_method].values()), 
            #        label=estimator_name+"_"+xai_method,
            #        color=colours[estimator_category],
            #        marker=markers[xai_method]
            #        )
            

#plt.xlabel("# Correct predictions/ Noise Level")
#plt.ylabel("Intra-consistency")
plt.legend()
#legend_without_duplicate_labels(ax)

### Run Example!

In [None]:
# Estimator to meta-evaluate and the size of the experiment.
estimator_category = "Localisation"
estimator_name = "Relevance Rank Accuracy"
# Previously misspelled `nr_nr_perturbations`, so the value passed as
# `nr_perturbations=` below was whatever an earlier cell had left behind.
nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Step 1: score the explanations with and without perturbation.
master.run_perturbation_analysis(
    metric=estimators_sub[estimator_category][estimator_name],
    model=dataset_settings[dataset_name]["models"]["LeNet"],
    x_batch=dataset_settings[dataset_name]["x_batch"],
    y_batch=dataset_settings[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"],
    channel_first=True,
    softmax=False,
    device=device
    )

# Step 2: run inference on the collected scores; as the last expression its
# return value is displayed by the notebook.
master.run_inference()

In [None]:
master.run_inference()

### Localisation test

In [None]:
# metaquantus.
# Four perturbation tests: model/input space x Resilience/Adversary.
analyser_suite = {
    "Model Resilience Test":
        metaquantus.ModelPerturbationTest(**{
            "noise_type": "multiplicative",
            "mean": 1.0,
            "std": 0.0001,
            "type": "Resilience",
            }
        ),
    "Model Adversary Test":
        metaquantus.ModelPerturbationTest(**{
            "noise_type": "multiplicative",
            "mean": 1.0,
            "std": 2.0,
            "type": "Adversary",
            }
        ),
    "Input Resilience Test":
        metaquantus.InputPerturbationTest(**{
            "noise": 0.001,
            "type": "Resilience",
            }
        ),
    "Input Adversary Test":
        metaquantus.InputPerturbationTest(**{
            "noise": 10.0,
            "type": "Adversary",
            }
        ),
}


estimator_category = "Localisation"
estimator_name = "Relevance Rank Accuracy"
# Previously misspelled `nr_nr_perturbations`, so the value passed as
# `nr_perturbations=` below was whatever an earlier cell had left behind.
nr_perturbations = 10
iterations = 3

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Run the meta-evaluation; as the last expression its return value is
# displayed by the notebook.
master(
    metric=estimators_sub[estimator_category][estimator_name],
    model=dataset_settings[dataset_name]["models"]["LeNet"],
    x_batch=dataset_settings[dataset_name]["x_batch"],
    y_batch=dataset_settings[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"],
    channel_first=True,
    softmax=False,
    device=device
    )

In [None]:
def legend_without_duplicate_labels(ax):
    """Attach a legend to `ax` in which each label appears exactly once.

    Only the first (handle, label) pair of every label is kept, in the order
    reported by `ax.get_legend_handles_labels()`.
    NOTE: the same function is also defined in an earlier cell of this notebook.
    """
    handles, labels = ax.get_legend_handles_labels()
    pairs = list(zip(handles, labels))
    # A pair is a first occurrence when its label is first found at its own index.
    keep = [pair for idx, pair in enumerate(pairs) if labels.index(pair[1]) == idx]
    ax.legend(*zip(*keep))

In [None]:
# Plot it!
fig, axs = plt.subplots(4, 4, sharex=True, figsize=(20, 16))
for ti, test in enumerate(analyser_suite):
    for xi, xai_method in enumerate(xai_methods):
        sns.distplot(master.eval_scores[test][xai_method], color="blue", ax=axs[ti, xi], label="Q*")
        perturbed_scores=[]
        for i in range(iterations):
            perturbed_scores.append(master.eval_scores_perturbed[test][i][xai_method])
        sns.distplot(np.array(perturbed_scores).flatten(), color="red", ax=axs[ti, xi], label="Q")
        axs[ti, xi].set_title(f"{test.replace(' Test', '')} x {xai_method} \n p-val={np.mean(master.iac[test][xai_method]):.2f}")
plt.legend()
plt.show()

### IEC explorations

In [None]:
master.analyser_suite

In [None]:
master.eval_scores_perturbed[test][i]["Gradient"].mean(axis=0)

In [None]:
 master.eval_scores_perturbed[test][iter]["Gradient"].shape

In [None]:
pred

In [None]:
np.argsort(Q_star, axis=0)

In [None]:
np.argsort(Q_hat, axis=0).shape

In [None]:

t = "Adversary" # "Resilience" "Adversary"
test = f'Model {t} Test'

# Row order of the score matrices below.
method_order = ("Gradient", "Saliency", "IntegratedGradients", "GradCAM")
nr_samples = 1024

# Repeat this for iters, K perturbations.
# (`iter` and `k` are kept as loop names because other cells in this notebook
# read them after this cell has run.)
for iter in range(iterations):

    # Unperturbed scores, one row per explanation method.
    Q_star = np.zeros((len(method_order), nr_samples))
    for row, method in enumerate(method_order):
        Q_star[row] = master.eval_scores[test][method]

    for k in range(nr_perturbations):

        # Scores under the k-th perturbation of this iteration.
        Q_hat = np.zeros((len(method_order), nr_samples))
        for row, method in enumerate(method_order):
            Q_hat[row] = master.eval_scores_perturbed[test][iter][method][k]

        # Sample selection recorded by master for this iteration/perturbation.
        kept = master.indices_perturbed[test][iter][k]

        # Create an agreement matrix U \in [0, 1] to specify if the condition is met.
        # The inner loop used to unpack into `t`, overwriting the mode string;
        # after the first perturbation neither branch matched and a stale U
        # was printed. The comparisons no longer rebind `t`.
        U = []
        if t == "Adversary":
            # Condition: the unperturbed score is strictly larger.
            for q_star_row, q_hat_row in zip(Q_star, Q_hat):
                U.extend((q_star_row[kept] > q_hat_row[kept]).astype(int))
        elif t == "Resilience":
            # Condition: the across-method argsort is unchanged.
            R_star = np.argsort(Q_star, axis=0)
            R_hat = np.argsort(Q_hat, axis=0)
            for r_star_row, r_hat_row in zip(R_star, R_hat):
                U.extend((r_star_row[kept] == r_hat_row[kept]).astype(int))

        print(np.array(U).mean(), np.array(U).std())

In [None]:
# THIS IS MAYBE NOT THE BEST FOR RESILIENCE, CAUSE:

# [1,2,3,4] - [1,2,3,4] = -> sign(0, 0, 0, 0) -> 0
# [1,2,3,4] - [4,2,3,1] = -> sign(-3, 0, 0, 3) -> -1, 0, 0, 1 -> 0
# [1,2,3,4] - [2,1,3,4] = -> sign(-1, 1, 0, 0) -> 0
# [1,2,3,4] - [3,2,1,4] = -> sign(-2, 0, 2, 0) -> 0

# the magnitude does matter of the shifts but we want a change in loser to winner to matter, rather than one to second...

t = "Resilience" # "Resilience" "Adversary"
test = f'Model {t} Test'
IEC = []

# Unperturbed scores, one row per explanation method. They do not depend on
# the iteration or perturbation, so they are built once.
Q_star = np.zeros((4, 1024))
Q_star[0] = master.eval_scores[test]["Gradient"]
Q_star[1] = master.eval_scores[test]["Saliency"]
Q_star[2] = master.eval_scores[test]["IntegratedGradients"]
Q_star[3] = master.eval_scores[test]["GradCAM"]

# Repeat this for iters, K perturbations.
# The bounds were hard-coded as range(5) / range(10); they now follow the same
# `iterations` / `nr_perturbations` globals that `master` was constructed with.
for iter in range(iterations):
    for k in range(nr_perturbations):

        Q_hat = np.zeros((4, 1024))
        Q_hat[0] = master.eval_scores_perturbed[test][iter]["Gradient"][k]
        Q_hat[1] = master.eval_scores_perturbed[test][iter]["Saliency"][k]
        Q_hat[2] = master.eval_scores_perturbed[test][iter]["IntegratedGradients"][k]
        Q_hat[3] = master.eval_scores_perturbed[test][iter]["GradCAM"][k]

        # Default (neither mode matches): all zeros, giving an IEC of 0.
        R_star = np.zeros((4*1024))
        R_hat = np.zeros((4*1024))

        # If Resilience, produce a ranking matrix R.
        if t == "Resilience":
            R_star = np.argsort(Q_star, axis=0)
            R_hat = np.argsort(Q_hat, axis=0)
        elif t == "Adversary":
            # 1 for whichever side has the strictly larger score; ties count
            # for the perturbed side, as before.
            R_star = (Q_star.flatten() > Q_hat.flatten()).astype(float)
            R_hat = 1.0 - R_star

        # Element-wise sign of the difference, then aggregate to a single
        # number for the level of agreement.
        IEC_ik = np.sign(R_star.flatten() - R_hat.flatten())
        IEC.append(IEC_ik.mean())

np.array(IEC).mean(), np.array(IEC).std()

In [None]:
"""
if t == "Resilience":
    if true == pred:
        U[i] = 1
    else:
        U[i] = 0
if t == "Adversary":
    if true > pred:
        U[i] = 1
    else:
        U[i] = 0
"""
    

In [None]:
# Aggregate scores!
q_bar = copy.deepcopy(master.eval_scores)
Q_bar = {k : {} for k in analyser_suite}

for test in analyser_suite:
    for xai_method in xai_methods:
        q_bar[test][xai_method] = np.mean(master.eval_scores[test][xai_method])
        Q_bar[test][xai_method] = []
        for i in range(iterations):
            sublist = []
            for p in range(nr_perturbations):
                sublist.append(np.mean(master.eval_scores_perturbed[test][xai_method][i][p]))
            Q_bar[test][xai_method] = np.array(sublist)
        Q_bar[test][xai_method] = np.array(Q_bar[test][xai_method])

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Model Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly greater than Q_bar_test[i, j], otherwise 0.
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] > val).astype(int)
# Display the matrix.
U

In [None]:
U.mean() 

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Input Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly greater than Q_bar_test[i, j], otherwise 0.
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] > val).astype(int)
# Display the matrix.
U

In [None]:
U.mean()

### Complexity test

In [None]:
# Meta-evaluate one Complexity estimator and plot, per test and explanation
# method, the unperturbed vs. perturbed score distributions with p-values.
estimator_category = "Complexity"  # key into estimators_sub
estimator_name = "Sparseness"  # key into estimators_sub[estimator_category]
nr_perturbations = 10
iterations = 5

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Score the explanations with and without perturbation.
master.run_perturbation_analysis(
    metric=estimators_sub[estimator_category][estimator_name],
    model_predict_kwargs={},
    model=dataset_settings[dataset_name]["models"]["LeNet"],
    x_batch=dataset_settings[dataset_name]["x_batch"],
    y_batch=dataset_settings[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"],
    channel_first=True,
    softmax=False,
    device=device
    )


# NOTE: this result is overwritten inside the loop below before it is read.
p_vals = master.run_inference(method="fisher")

# NOTE(review): compute_p_value and compute_joint_p_value are not defined in
# this part of the notebook - presumably an earlier cell provides them; verify.
for test in analyser_suite:
    
    # p-values over every (iteration, perturbation, method) triple of this test.
    p_vals = [compute_p_value(q=master.eval_scores[test][xai_method],
                    q_hat=master.eval_scores_perturbed[test][xai_method][i][p],
                    indices=master.indices_perturbed[test][xai_method][i][p]) for i in range(iterations) for p in range(nr_perturbations) for xai_method in xai_methods]
    p_val = compute_joint_p_value(p_vals)
    
    # Prints the joint p-value followed by the plain mean of the p-values.
    print(f"{test} p-val = {p_val:.3f} {np.mean(p_vals):.2f}")

    for xai_method in xai_methods:
    
        # Blue: unperturbed scores. Red: perturbed scores (indexed with [:, :],
        # so a 2-D array is expected here).
        sns.distplot(master.eval_scores[test][xai_method], color="blue")
        sns.distplot(master.eval_scores_perturbed[test][xai_method][:, :], color="red")
        
        # Same p-value computation, restricted to the current method.
        p_vals = [compute_p_value(q=master.eval_scores[test][xai_method],
                        q_hat=master.eval_scores_perturbed[test][xai_method][i][p],
                        indices=master.indices_perturbed[test][xai_method][i][p]) for i in range(iterations) for p in range(nr_perturbations)]
        p_val = compute_joint_p_value(p_vals)
        
        plt.title(f"{test} x {xai_method} p-val = {p_val:.3f} {np.mean(p_vals):.2f}")
        plt.show()

In [None]:
# Aggregate scores!
q_bar = copy.deepcopy(master.eval_scores)
Q_bar = {k : {} for k in analyser_suite}

for test in analyser_suite:
    for xai_method in xai_methods:
        q_bar[test][xai_method] = np.mean(master.eval_scores[test][xai_method])
        Q_bar[test][xai_method] = []
        for i in range(iterations):
            sublist = []
            for p in range(nr_perturbations):
                sublist.append(np.mean(master.eval_scores_perturbed[test][xai_method][i][p]))
            Q_bar[test][xai_method] = np.array(sublist)
        Q_bar[test][xai_method] = np.array(Q_bar[test][xai_method])

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Model Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly greater than Q_bar_test[i, j], otherwise 0.
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] > val).astype(int)
# Display the matrix.
U

In [None]:
U.mean() 

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Input Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly greater than Q_bar_test[i, j], otherwise 0.
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] > val).astype(int)
# Display the matrix.
U

In [None]:
U.mean()

### Randomisation test

In [None]:
# Meta-evaluate one Randomisation estimator on the first 500 samples and plot,
# per test and explanation method, the unperturbed vs. perturbed score
# distributions with p-values.
estimator_category = "Randomisation"  # key into estimators_sub
estimator_name = "Model Parameter Randomisation Test"  # key into estimators_sub[estimator_category]
nr_perturbations = 10
iterations = 5

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=xai_methods,
    iterations=iterations,
    fname="",
    nr_perturbations=nr_perturbations,
    explain_func=quantus.explain,
)

# Score the explanations with and without perturbation; the batches are cut
# to their first 500 entries.
master.run_perturbation_analysis(
    metric=estimators_sub[estimator_category][estimator_name],
    model_predict_kwargs={},
    model=dataset_settings[dataset_name]["models"]["LeNet"],
    x_batch=dataset_settings[dataset_name]["x_batch"][:500],
    y_batch=dataset_settings[dataset_name]["y_batch"][:500],
    a_batch=None,
    s_batch=dataset_settings[dataset_name]["s_batch"][:500],
    channel_first=True,
    softmax=False,
    device=device
    )


# NOTE: this result is overwritten inside the loop below before it is read.
p_vals = master.run_inference(method="fisher")

# NOTE(review): compute_p_value and compute_joint_p_value are not defined in
# this part of the notebook - presumably an earlier cell provides them; verify.
for test in analyser_suite:
    
    # p-values over every (iteration, perturbation, method) triple of this test.
    p_vals = [compute_p_value(q=master.eval_scores[test][xai_method],
                    q_hat=master.eval_scores_perturbed[test][xai_method][i][p],
                    indices=master.indices_perturbed[test][xai_method][i][p]) for i in range(iterations) for p in range(nr_perturbations) for xai_method in xai_methods]
    p_val = compute_joint_p_value(p_vals)
    
    # Prints the joint p-value followed by the plain mean of the p-values.
    print(f"{test} p-val = {p_val:.3f} {np.mean(p_vals):.2f}")

    for xai_method in xai_methods:
    
        # Blue: unperturbed scores. Red: perturbed scores (indexed with [:, :],
        # so a 2-D array is expected here).
        sns.distplot(master.eval_scores[test][xai_method], color="blue")
        sns.distplot(master.eval_scores_perturbed[test][xai_method][:, :], color="red")
        
        # Same p-value computation, restricted to the current method.
        p_vals = [compute_p_value(q=master.eval_scores[test][xai_method],
                        q_hat=master.eval_scores_perturbed[test][xai_method][i][p],
                        indices=master.indices_perturbed[test][xai_method][i][p]) for i in range(iterations) for p in range(nr_perturbations)]
        p_val = compute_joint_p_value(p_vals)
        
        plt.title(f"{test} x {xai_method} p-val = {p_val:.3f} {np.mean(p_vals):.2f}")
        plt.show()

In [None]:
# Aggregate scores!
q_bar = copy.deepcopy(master.eval_scores)
Q_bar = {k : {} for k in analyser_suite}

for test in analyser_suite:
    for xai_method in xai_methods:
        q_bar[test][xai_method] = np.mean(master.eval_scores[test][xai_method])
        Q_bar[test][xai_method] = []
        for i in range(iterations):
            sublist = []
            for p in range(nr_perturbations):
                sublist.append(np.mean(master.eval_scores_perturbed[test][xai_method][i][p]))
            Q_bar[test][xai_method] = np.array(sublist)
        Q_bar[test][xai_method] = np.array(Q_bar[test][xai_method])

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Model Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly less than Q_bar_test[i, j], otherwise 0.
# (The Localisation and Complexity sections use `>` in the same place.)
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] < val).astype(int)
# Display the matrix.
U

In [None]:
# Pick one test and gather its aggregated scores into arrays, following the
# iteration order of the q_bar / Q_bar dictionaries.
test = 'Input Adversary Test'
# One entry per key of q_bar[test].
q_bar_test = np.array(list(q_bar[test].values()))
# Concatenate the per-key Q_bar arrays and reshape to len(xai_methods) rows.
Q_bar_test = np.hstack(np.array(list(Q_bar[test].values()))).reshape(len(xai_methods), -1)

# Display the unperturbed aggregates.
q_bar_test

In [None]:
Q_bar_test

In [None]:
# Indicator matrix with the shape of Q_bar_test: U[i, j] is 1 when
# q_bar_test[i] is strictly less than Q_bar_test[i, j], otherwise 0.
# (The Localisation and Complexity sections use `>` in the same place.)
U = np.zeros_like(Q_bar_test)
for i, vals in enumerate(Q_bar_test):
    for j, val in enumerate(vals):
        U[i, j] = (q_bar_test[i] < val).astype(int)
# Display the matrix.
U

### Others

In [None]:
from scipy.stats import spearmanr, pearsonr
import numpy as np

In [None]:
## SIMULATE DATA.
# Build q (one mean per explanation method) and Q (K means per method) from
# stored benchmark results.
# NOTE(review): `results_benchmark` is not defined in this part of the
# notebook - presumably loaded in an earlier cell; verify.
test = 'Model Adversary Test - AR'
category = "Complexity"
metric = "Sparseness"
# NOTE: from here on xai_methods is a list of names; earlier cells call
# xai_methods.keys() and therefore expect a dict.
xai_methods = ['Gradient', 'Saliency', 'GradCAM', 'IntegratedGradients']
K = 5

q = np.zeros((len(xai_methods)))
Q = np.zeros((len(xai_methods), K))


for mx, method in enumerate(xai_methods):
    for iter in range(K):
        results = results_benchmark["MNIST"]["LeNet"][category][metric][test][method][iter]
        # q is taken from element 0 of the first entry only ...
        if iter == 0:
            q[mx] = np.mean(results[0])
        # ... while Q takes the mean of element 1 of every entry.
        Q[mx][iter] = np.mean(results[1])

# Transpose so that rows are the K entries and columns are the methods.
Q = Q.T

"""
q_rank = np.argsort(q)
print(q_rank)

Q_ranks = np.argsort(Q)
print(Q_ranks)

U = np.zeros_like(Q_ranks)
for i, q_i in enumerate(Q_ranks):
    U[i] = (q_i == q_rank).astype(int)
    

for q_i in Q_ranks:
    print(spearmanr(q_rank, q_i)[0])

print(np.mean([spearmanr(q_rank, q_i)[0] for q_i in Q_ranks]))

U_same = [np.argsort([Q[i], q], axis=0)[0] for i in range(K)]
print(U_same)
print(np.mean(U_same))

"""

In [None]:
## rows=K x cols=M
# Hard-coded score matrices (5 rows x 4 columns each) and the matching
# 4-element vector q, pasted as literals so that the cells below do not depend
# on stored results.
# NOTE(review): presumably AR / NR stand for the adversary and resilience runs
# and the columns follow the xai_methods order - confirm.

Q_AR = np.array([[0.56089843, 0.5532037 , 0.41645586, 0.92468283],
       [0.58627198, 0.62516774, 0.41122482, 0.93105764],
       [0.5300449 , 0.57782965, 0.43588665, 0.92718048],
       [0.73128368, 0.68010518, 0.40083521, 0.92604811],
       [0.53431268, 0.5185008 , 0.43107529, 0.92810497]])
Q_NR = np.array([[0.6397346 , 0.63902931, 0.42124708, 0.92411322],
       [0.63970828, 0.63895992, 0.42121219, 0.92410557],
       [0.63975087, 0.63899386, 0.42123599, 0.92410654],
       [0.63973842, 0.63905245, 0.4212166 , 0.92410473],
       [0.63973644, 0.63900556, 0.42124647, 0.92410828]])
q = np.array([0.63972324, 0.63899934, 0.42121537, 0.92410432])

# Uniform explanation...
# NOTE(review): the values below are not uniform (0.4, 0.3, 0.2, 0.4) -
# confirm what this baseline is meant to represent.
rB = np.array([0.4 , 0.3, 0.2, 0.4],)

In [None]:
# Argsort of the baseline scores.
rB_rank = np.argsort(rB)
print(rB_rank)

# Row-wise argsort of the Q_NR scores (one row per repetition).
Q_ranks = np.argsort(Q_NR)
print(Q_ranks)

# For every row of Q_ranks: per column, 0 where the Q_ranks entry is the
# smaller of the pair and 1 where the baseline entry is.
# The row count was hard-coded as range(5); it now follows Q_ranks, so a
# different number of repetitions neither raises nor drops rows.
U = [np.argsort([Q_ranks[i], rB_rank], axis=0)[0] for i in range(len(Q_ranks))]


In [None]:
#np.random.uniform(-1, 1, size=(1024, 1, 28, 28))

In [None]:
#Q_AR = Q
Q_AR

In [None]:
Q_AR.mean(axis=0), Q_AR.std(axis=0)

In [None]:
#Q_NR = Q
Q_NR

In [None]:
Q_NR.mean(axis=0), Q_NR.std(axis=0)

In [None]:
q

In [None]:
np.argsort(q)

In [None]:
np.argsort(Q_AR)

In [None]:
np.argsort(Q_NR)

In [None]:
# Print the Pearson correlation coefficient (element [0] of pearsonr's
# result) between q and each row of Q_AR.
for q_i in Q_AR:
    print(pearsonr(q, q_i)[0])

In [None]:
# Print the Pearson correlation coefficient (element [0] of pearsonr's
# result) between q and each row of Q_NR.
for q_i in Q_NR:
    print(pearsonr(q, q_i)[0])

In [None]:
# For each explanation method, is the quality estimator able to consistently discriminate disruptive vs non-disruptive estimates?
# Consistently discriminating between the perturbed and unperturbed quality estimates?

# Is perturbed always ranked higher or lower for every explanation method?
# Each row of U compares Q_AR[i] with q column by column: 0 where Q_AR[i]
# holds the smaller value, 1 where q does.
U = np.array([np.argsort([Q_AR[i], q], axis=0)[0] for i in range(K)])
U

In [None]:
np.std(U, axis=0) / K

In [None]:
np.std(Q)

In [None]:
# Toy check: every row of Q is identical to q, so each column-wise argsort
# below compares two equal values.
Q = np.asarray(([2, 1, 0, 3],
                [2, 1, 0, 3],
                [2, 1, 0, 3],
                [2, 1, 0, 3],
                [2, 1, 0, 3],
                ))
q = np.asarray([2, 1, 0, 3])
# Relies on K from an earlier cell (Q has 5 rows here).
U = np.array([np.argsort([Q[i], q], axis=0)[0] for i in range(K)])

In [None]:
# Configuration for reading stored results.
# Directory holding the result files (hard-coded Google Drive path).
path_results = "/content/drive/MyDrive/Projects/analysers/results/"
datasets = ["cMNIST", "MNIST", "fMNIST", "ImageNet"]
acategories = ['Complexity', 'Localisation', 'Randomisation', 'Faithfulness', 'Robustness']
# NOTE: here xai_methods is a list of names; earlier cells call
# xai_methods.keys() and therefore expect a dict.
xai_methods = ['Gradient', 'Saliency', 'GradCAM', 'IntegratedGradients']
# Model name per dataset.
models = {"MNIST": "LeNet", "fMNIST": "LeNet", "cMNIST": "ResNet9", "ImageNet": "ResNet18"}
# NOTE: this rebinds analyser_suite from the dict of test objects used in
# earlier cells to a list of test names, and the names differ from those dict keys.
analyser_suite = ['Data Variability Test', 'Parameter Sensitivity Test', 'Model Adversary Test', 'Explanation Adversary Test']

# All.
#estimators ={'Complexity': ['Sparseness', 'Complexity', 'Effective Complexity'], 'Faithfulness': ['Faithfulness Correlation', 'Pixel-Flipping'], 'Localisation': ['Pointing-Game', 'Relevance Rank Accuracy', 'Top-K Intersection'], 'Randomisation': ['Random Logit', 'Model Parameter Randomisation Test'], 'Robustness': ['Max-Sensitivity', 'Local Lipschitz Estimate']} 
#metrics = ['Sparseness', 'Complexity', 'Effective Complexity', 'Pointing-Game', 'Relevance Rank Accuracy', 'Top-K Intersection', 'Random Logit', 'Model Parameter Randomisation Test',  'Faithfulness Correlation', 'Pixel-Flipping', 'Max-Sensitivity', 'Local Lipschitz Estimate'] # [item for sublist in list(estimators.values()) for item in sublist]
#{k : list(v.keys()) for k, v in ESTIMATORS.items()}

# 2 of each.
# Estimator names per category, and the same names as one flat list.
estimators = { "Complexity": ["Sparseness", "Complexity"], "Faithfulness": ["Faithfulness Correlation", "Pixel-Flipping"], "Localisation": ["Pointing-Game", "Relevance Rank Accuracy"], "Randomisation": ["Random Logit", "Model Parameter Randomisation Test"], "Robustness": ["Max-Sensitivity", "Local Lipschitz Estimate"], }
metrics = ['Sparseness', 'Complexity', 'Faithfulness Correlation', 'Pixel-Flipping', 'Pointing-Game', 'Relevance Rank Accuracy', 'Random Logit', 'Model Parameter Randomisation Test', 'Max-Sensitivity', 'Local Lipschitz Estimate']
from typing import List, Optional, Dict
import pathlib
import numpy as np
import pandas as pd

#from .utils import load_obj

def get_resources_per_dataset(
    dataset_name: str,
    models: dict,
    estimators: Optional[Dict[str, List[str]]] = None,
    path_results: str = "/content/drive/MyDrive/Projects/analysers/results/",
) -> dict:
    """Load the stored result files of one dataset, keyed by estimator name.

    Files are looked up in the directory ``f"{path_results}{dataset_name}time"`` and
    matched by the file-name prefix
    ``_results_{dataset_name}_{model}_{category}_{metric}_``.

    Parameters
    ----------
    dataset_name: str
        Name of the dataset; also used to look up the model name in ``models``.
    models: dict
        Mapping from dataset name to model name.
    estimators: dict, optional
        Mapping from category name to a list of estimator (metric) names.
        Defaults to two estimators per category.
    path_results: str
        Root directory of the results (expected to end with a path separator,
        since the dataset directory name is appended directly).

    Returns
    -------
    dict
        ``{metric: resource}`` when exactly one file matches, or
        ``{metric: [resource, ...]}`` when several files match (e.g. results
        written in separate parts). Metrics without a matching file, or whose
        files fail to load, are reported on stdout and left out of the dict.

    Raises
    ------
    KeyError
        If ``dataset_name`` is not a key of ``models``.
    """
    if estimators is None:
        estimators = {
            "Complexity": ["Sparseness", "Complexity"],
            "Faithfulness": ["Faithfulness Correlation", "Pixel-Flipping"],
            "Localisation": ["Pointing-Game", "Relevance Rank Accuracy"],
            "Randomisation": ["Random Logit", "Model Parameter Randomisation Test"],
            "Robustness": ["Max-Sensitivity", "Local Lipschitz Estimate"],
        }

    # Get fpaths etc.
    results_dir = f"{path_results}{dataset_name}time"
    files = [p for p in pathlib.Path(results_dir).glob("*") if p.is_file()]
    model = models[dataset_name]

    resources = {}
    for category, category_metrics in estimators.items():
        for metric in category_metrics:
            name_prefix = f"_results_{dataset_name}_{model}_{category}_{metric}_"
            # Match on the file name rather than on the full path string, so the
            # lookup does not depend on how pathlib normalises `path_results`.
            fnames = [str(p) for p in files if p.name.startswith(name_prefix)]
            print(f"{results_dir}/{name_prefix}", fnames)

            if not fnames:
                print(
                    f"ERROR: Couldn't find results file - {dataset_name} - metric {metric} ({category})."
                )
                continue

            try:
                loaded = [load_obj(path=f, fname="", use_json=False) for f in fnames]
            except Exception as error:
                # Best effort: report the failing metric and carry on with the others.
                print(
                    f"ERROR: Couldn't load results file - {dataset_name} - metric {metric} ({category}): {error!r}"
                )
                continue

            # A single file is stored as-is (toy datasets); several files are kept
            # as a list (ImageNet has separate resources).
            resources[metric] = loaded[0] if len(loaded) == 1 else loaded
    return resources


def get_results_from_parts(
    resources_parts: dict, metric: str, analysis_type: str = "intra"
):
    """Summarise the per-analyser scores of one metric as mean and standard deviation.

    For every analyser key under ``resources_parts[metric]`` the entry
    ``[f"results_{analysis_type}_analysis_"][analyser]`` is flattened into one
    array. With ``analysis_type == "inter"`` that entry is iterated directly;
    otherwise its ``.values()`` are used.

    Returns a dict ``{analyser: {"mean": ..., "std": ...}}``.
    """
    section = f"results_{analysis_type}_analysis_"
    per_analyser = resources_parts[metric]

    summary = {}
    for name in per_analyser.keys():
        raw = per_analyser[name][section][name]
        if analysis_type != "inter":
            raw = raw.values()
        scores = np.array(list(raw)).flatten()
        summary[name] = {"mean": scores.mean(), "std": scores.std()}
    return summary


def append_inter_reliability_summary_tables(
    resource: dict,
    metrics: List[str],
    analyser_suite: List[str] = [
        "Parameter Sensitivity Test",
        "Data Variability Test",
        "Model Adversary Test",
        "Explanation Adversary Test",
    ],
) -> dict:
    """Add mean/std summary tables for the inter-reliability results, in place.

    For each metric and each of ``"alphas"``, ``"spearmans"`` and
    ``"average_cohen_kappa"``, the entry
    ``resource[metric][f"results_{inter_metric}_summary_table_"]`` is (re)created
    and filled with ``{analyser: {"mean": ..., "std": ...}}`` computed from
    ``resource[metric][f"results_{inter_metric}_"][analyser]``.

    Parameters
    ----------
    resource: dict
        Results keyed by metric name; modified in place.
    metrics: list of str
        Metric names to process. Names missing from ``resource`` (or whose entry
        is not a dict) are reported on stdout and skipped.
    analyser_suite: list of str
        Analyser names to summarise.

    Returns
    -------
    dict
        The same ``resource`` object that was passed in.

    Raises
    ------
    KeyError
        If a processed metric lacks ``f"results_{inter_metric}_"`` or one of the
        requested analysers.
    """
    inter_metrics = ["alphas", "spearmans", "average_cohen_kappa"]

    for metric in metrics:
        entry = resource.get(metric)
        if not isinstance(entry, dict):
            # Skip instead of failing on the very same lookup a few lines later.
            print(
                f"The metric {metric} does not exist in the resource (or is not a dict). Check spelling."
            )
            continue

        for inter_metric in inter_metrics:
            summary = {}
            for analyser in analyser_suite:
                results = entry[f"results_{inter_metric}_"][analyser]
                summary[analyser] = {
                    "mean": results.mean(),
                    "std": results.std(),
                }
            entry[f"results_{inter_metric}_summary_table_"] = summary
    return resource



def average_dfs_over_datasets(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Return the element-wise mean of several equally-labelled data frames.

    Each frame is stacked into a series, the series are placed side by side,
    averaged row by row, and the result is unstacked back into a frame.
    """
    stacked = [frame.stack() for frame in dfs]
    side_by_side = pd.concat(stacked, axis=1)
    row_means = side_by_side.apply(lambda row: row.mean(), axis=1)
    return row_means.unstack()


def convert_summary_table_to_df(
    resource: dict,
    metrics: List[str],
    analysis_type: str = "intra",
    inter_metric: Optional[str] = None,
    analyser_suite: List[str] = [
        "Parameter Sensitivity Test",
        "Data Variability Test",  
        "Model Adversary Test",
        "Explanation Adversary Test",
    ],
    desc: bool = True,
) -> pd.DataFrame:
    """Collect one summary table per metric into a single data frame.

    The table read from each metric is ``f"results_{inter_metric}_summary_table_"``
    when ``inter_metric`` is given, otherwise
    ``f"results_{analysis_type}_summary_table_"``.

    Parameters
    ----------
    resource: dict
        Results keyed by metric name. A value is either a dict holding the table,
        or a list of such dicts (results split over several parts), in which case
        the tables of all parts are averaged with ``average_dfs_over_datasets``.
    metrics: list of str
        Metrics to include. Metrics missing from ``resource``, metrics without the
        requested table, and metrics with an empty list of parts are reported on
        stdout and skipped.
    analysis_type: str
        Used for the table name when ``inter_metric`` is not set.
    inter_metric: str, optional
        Takes precedence over ``analysis_type`` for the table name.
    analyser_suite: list of str
        Columns (and their order) of the returned frame.
    desc: bool
        If True, print ``analysis_type``.

    Returns
    -------
    pd.DataFrame
        The per-metric tables concatenated with the metric name as outer index
        level, restricted to the ``analyser_suite`` columns.

    Raises
    ------
    ValueError
        If no metric yields a table (nothing to concatenate).
    KeyError
        If an ``analyser_suite`` column is absent from the result.
    """
    if desc:
        print(analysis_type)

    if inter_metric:
        table = f"results_{inter_metric}_summary_table_"
    else:
        table = f"results_{analysis_type}_summary_table_"

    pds = []
    keys = []
    for metric in metrics:
        # Never fall through with the previous metric's data.
        if metric not in resource:
            print(f"The metric {metric} does not exist in the resource. Check spelling.")
            continue

        entry = resource[metric]
        tables_data = None
        data = None
        try:
            # ImageNet data is split over batches, so we need to agg scores per metric.
            if isinstance(entry, list):
                tables_data = [part[table] for part in entry]
                print(f"{len(tables_data)} resources for {metric}.")
            else:
                data = entry[table]
        except (LookupError, TypeError):
            print(
                f"The resource {table} of metric {metric} does not exist. Check spelling."
            )
            continue

        if tables_data is not None:
            if not tables_data:
                print(f"No resources to average for metric {metric}; skipping.")
                continue
            # More than one resource: average the per-part tables.
            data = average_dfs_over_datasets(
                dfs=[pd.DataFrame(d) for d in tables_data]
            )

        pds.append(pd.DataFrame(data))
        keys.append(metric)

    df = pd.concat(pds, keys=keys)

    return df[analyser_suite]



def recalcualte_inter_reliability_summary_table(resource, metrics, analyser_suite, iterations=5):
    """Recompute the inter-reliability summary of the two adversary tests, in place.

    For "Explanation Adversary Test" and "Model Adversary Test", the first
    ``iterations`` entries of ``["results_reliability_data_"][analyser]`` are each
    passed to ``scipy.stats.spearmanr``; the nan-mean of the returned correlation
    is taken per entry, and the mean/std over entries is written to
    ``["results_inter_summary_table_"][analyser]``. Other analysers are left
    untouched.

    ``resource[metric]`` may be a dict or a list of dicts; for a list every part
    is updated and the number of parts is printed once per analyser.

    Returns the same ``resource`` object.
    """
    source_key = "results_reliability_data_"
    target_key = "results_inter_summary_table_"
    adversary_tests = ("Explanation Adversary Test", "Model Adversary Test")

    def _refresh(entry, analyser):
        # One correlation summary per iteration, then mean/std across iterations.
        reliability_data = entry[source_key][analyser]
        correlations = np.array(
            [
                np.nanmean(scipy.stats.spearmanr(reliability_data[k])[0])
                for k in range(iterations)
            ]
        )
        entry[target_key][analyser] = {
            "mean": correlations.mean(),
            "std": correlations.std(),
        }

    for metric in metrics:
        for analyser in analyser_suite:
            split_over_parts = isinstance(resource[metric], list)
            is_adversary = analyser in adversary_tests

            if split_over_parts:
                if is_adversary:
                    for part in resource[metric]:
                        _refresh(part, analyser)
                print(f"{len(resource[metric])} resources for {metric}.")
            elif is_adversary:
                _refresh(resource[metric], analyser)

    return resource



In [None]:
# Load the stored MNIST results and register them under their dataset name;
# the cells below iterate over `resources` as {dataset_name: resource}.
resource_mnist = get_resources_per_dataset(dataset_name="MNIST", models=models, estimators=estimators, path_results="/content/drive/MyDrive/Projects/analysers/results/")
resources = {"MNIST": resource_mnist}

In [None]:
# Gather time data!
# Builds one long-format frame with the columns Dataset / Time / Method / Estimator / Category.
batches = 4  # Number of parts read when a metric's resource is a list.
dfs = []
for dataset_name, resource in resources.items():
    times = {metric: [] for metric in metrics}

    for metric in metrics:
        try:
            if isinstance(resource[metric], list):
                # Average the timings element-wise over the parts.
                time_batches = [
                    np.array(list(resource[metric][b]["results_time_efficiency_"].values())).flatten()
                    for b in range(batches)
                ]
                times[metric] = np.mean(time_batches, axis=0)
            else:
                times[metric] = np.array(list(resource[metric]["results_time_efficiency_"].values())).flatten()

        except Exception as error:
            # Skip this metric entirely; otherwise a block of NaN-timed rows would be appended for it.
            print(f"Didn't work for {metric} in {dataset_name}! ({error!r})")
            continue

        category = [k for k, v in estimators.items() if metric in v][0]
        scores = times[metric]
        # NOTE(review): the label arrays assume 20 scores per metric, cycling through the
        # 4 entries of `xai_methods` 5 times - confirm against the stored timing layout.
        methods = np.tile(xai_methods, 5)
        resource_name = np.tile([dataset_name], 20)
        metric_name = np.tile(metric, len(scores))
        category_name = np.tile(category, len(scores))
        df = pd.DataFrame({"Dataset": pd.Series(resource_name, dtype=str), "Time": pd.Series(scores, dtype=float), "Method":  pd.Series(methods, dtype=str), "Estimator" :  pd.Series(metric_name, dtype=str), "Category" :  pd.Series(category_name, dtype=str)})
        dfs.append(df)

df_times = pd.concat(dfs)
df_times.index = np.arange(0, len(df_times))
df_times

In [None]:
# Horizontal bar chart of the gathered times per estimator category, one bar per XAI method.
with sns.axes_style("whitegrid"), sns.plotting_context("notebook", font_scale=1.5):

    sns.catplot(
        data=df_times,
        x="Time",
        y="Category",
        hue="Method",
        kind="bar",
        orient="h",
        palette="Paired",
        edgecolor=".6",
        height=8,
        aspect=10/8,
        legend=None,  # The legend is drawn manually below.
    )
    plt.xlabel("Time per Estimator Call")
    plt.ylabel("")

    # Legend with a transparent frame.
    legend = plt.legend(loc="upper right")
    legend.get_frame().set_facecolor('none')
    plt.show()

### Others

In [None]:
###############
# Simple run! #
###############

# Calls `master` once for a single estimator (Complexity / Sparseness) on the
# LeNet model and the batch stored in SETTINGS[dataset_name].
#
# NOTE(review): this cell indexes `analyser_suite` by analyser name, but the
# configuration cell above defines `analyser_suite` as a list of names. In the
# visible part of the notebook, the dict of analyser objects as well as
# `master`, `ESTIMATORS` and `PARAMETERISATION` are only defined in cells
# further down - confirm the intended execution order (Restart & Run All).

estimator_category = "Complexity"
estimator_name = "Sparseness"
# Point the parameter sensitivity test at the parameterisation of this category.
analyser_suite["Parameter Sensitivity Test"].parameterisation = PARAMETERISATION[estimator_category]

# ESTIMATORS[category][name] is indexed as a pair: [0] is passed as the metric, [1] as its kwargs.
results = master(
    metric=ESTIMATORS[estimator_category][estimator_name][0],
    metric_kwargs=ESTIMATORS[estimator_category][estimator_name][1],
    model=SETTINGS[dataset_name]["models"]["LeNet"],
    x_batch=SETTINGS[dataset_name]["x_batch"],
    y_batch=SETTINGS[dataset_name]["y_batch"],
    a_batch=None,
    s_batch=None,
    **{"explain_func": quantus.explain, 
       "disable_warnings": True, 
       "gc_layer": 'list(model.named_modules())[3][1]'}
)

In [None]:
# Wilcoxon is sensitive to the range ... higher v....

In [None]:
# Collect the result of `compute_p_value` with the Mann-Whitney U test for 100 pairs
# of independently drawn random integer samples (64 values each, in [0, 10000)).
pvals = []
for _ in range(100):
    pvals.append(
        compute_p_value(
            intra_func=scipy.stats.mannwhitneyu,
            x=np.random.randint(0, 10000, (64, )),
            y=np.random.randint(0, 10000, (64, )),
        )
    )
pvals = np.array(pvals)
pvals.mean(), pvals.std(), pvals.min(), pvals.max()

In [None]:
# Wilcoxon test on 10 pairs of random integer samples (32 values each, in [0, 10000)).
pvals = np.array(
    [
        compute_p_value(
            intra_func=scipy.stats.wilcoxon,
            x=np.random.randint(0, 10000, (32, )),
            y=np.random.randint(0, 10000, (32, )),
        )
        for _ in range(10)
    ]
)
pvals.mean(), pvals.std(), pvals.min(), pvals.max()

In [None]:
# Wilcoxon test on 10 pairs of random integer samples (1024 values each, in [0, 10000)).
pvals = np.array(
    [
        compute_p_value(
            intra_func=scipy.stats.wilcoxon,
            x=np.random.randint(0, 10000, (1024, )),
            y=np.random.randint(0, 10000, (1024, )),
        )
        for _ in range(10)
    ]
)
pvals.mean(), pvals.std(), pvals.min(), pvals.max()

In [None]:
# Mann-Whitney U test on 10 pairs of uniform samples (1024 values each, in [0, 1)).
pvals = np.array(
    [
        compute_p_value(
            intra_func=scipy.stats.mannwhitneyu,
            x=np.random.uniform(0, 1, (1024, )),
            y=np.random.uniform(0, 1, (1024, )),
        )
        for _ in range(10)
    ]
)
pvals.mean(), pvals.std(), pvals.min(), pvals.max()

In [None]:
# Wilcoxon test on 10 pairs of uniform samples (1024 values each, in [0, 1)).
pvals = np.array(
    [
        compute_p_value(
            intra_func=scipy.stats.wilcoxon,
            x=np.random.uniform(0, 1, (1024, )),
            y=np.random.uniform(0, 1, (1024, )),
        )
        for _ in range(10)
    ]
)
pvals.mean(), pvals.std(), pvals.min(), pvals.max()

In [None]:
# Define experimental settings.
from configs import GENERAL_KWARGS, XAI_METHODS, PARAMETERISATION, generate_estimator_configs

# Get settings and define estimators depending on the dataset.
dataset_name = "MNIST"
settings = {dataset_name: SETTINGS[dataset_name]}

dataset_kwargs = settings[dataset_name]["estimator_kwargs"]
ESTIMATORS = generate_estimator_configs(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    k=dataset_kwargs["k"],
)

# Only the Adversarial / "Random Generator" estimator is benchmarked in this cell.
#estimators = ESTIMATORS
#estimators = {"Robustness": {"Local Lipschitz Estimate": ESTIMATORS["Robustness"]["Local Lipschitz Estimate"]}}
estimators = {"Adversarial": {"Random Generator": ESTIMATORS["Adversarial"]["Random Generator"]}}

# Set analysers!
analyser_suite = {
    "Time Efficiency Test": metaquantus.TimeEfficiencyTest(),
    # The parameterisation starts empty; PARAMETERISATION is handed to the benchmark call below.
    "Parameter Sensitivity Test": metaquantus.ParameterSensitivityTest(
        parameterisation={},
    ),
    "Model Adversary Test": metaquantus.ModelAdversaryTest(
        noise_type="multiplicative",
        mean=1.0,
        std=2.0,
        do_model_check=False,
    ),
    "Explanation Adversary Test": metaquantus.ExplanationAdversaryTest(
        perturb_method="uniform",
        noise_lower_bound=0.0,
        noise_upper_bound=1.0,
    ),
    "Data Variability Test": metaquantus.DataVariabilityTest(
        sample_size=dataset_kwargs["sample_size"],
    ),
}

# Draw the run id BEFORE building `master`: `master` receives it as `id=`, and
# without a prior assignment the name `id` resolves to the builtin function
# (or to a stale value left over from an earlier cell).
id = str(f"{np.random.randint(1000)}")
print("ID", id)

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=XAI_METHODS,
    iterations=5,
    use_nominal_inputs=True,
    inter_reliability_metric="spearmans",
    print_results=True,
    write_results=True,
    write_progress=True,
    use_json=False,
    path=path_results,
    dataset_name=dataset_name,
    id=id,
)

# Benchmark!
results_benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators,
    analyser_suite=analyser_suite,
    settings=settings,
    parameterisation=PARAMETERISATION,
    path=path_results,
    id=id,
    keep_results=False,
    save=False,
    **{**GENERAL_KWARGS, "device": device},
)()


In [None]:
# Define experimental settings.
from configs import GENERAL_KWARGS, XAI_METHODS, PARAMETERISATION, generate_estimator_configs

# Get settings and define estimators depending on the dataset.
dataset_name = "fMNIST"
settings = {dataset_name: SETTINGS[dataset_name]}

dataset_kwargs = settings[dataset_name]["estimator_kwargs"]
ESTIMATORS = generate_estimator_configs(
    features=dataset_kwargs["features"],
    num_classes=dataset_kwargs["num_classes"],
    k=dataset_kwargs["k"],
)

# Only the Adversarial / "Random Generator" estimator is benchmarked in this cell.
#estimators = ESTIMATORS
#estimators = {"Robustness": {"Local Lipschitz Estimate": ESTIMATORS["Robustness"]["Local Lipschitz Estimate"]}}
estimators = {"Adversarial": {"Random Generator": ESTIMATORS["Adversarial"]["Random Generator"]}}

# Set analysers!
analyser_suite = {
    "Time Efficiency Test": metaquantus.TimeEfficiencyTest(),
    # The parameterisation starts empty; PARAMETERISATION is handed to the benchmark call below.
    "Parameter Sensitivity Test": metaquantus.ParameterSensitivityTest(
        parameterisation={},
    ),
    "Model Adversary Test": metaquantus.ModelAdversaryTest(
        noise_type="multiplicative",
        mean=1.0,
        std=2.0,
        do_model_check=False,
    ),
    "Explanation Adversary Test": metaquantus.ExplanationAdversaryTest(
        perturb_method="uniform",
        noise_lower_bound=0.0,
        noise_upper_bound=1.0,
    ),
    "Data Variability Test": metaquantus.DataVariabilityTest(
        sample_size=dataset_kwargs["sample_size"],
    ),
}

# Draw the run id BEFORE building `master`: `master` receives it as `id=`, and
# without a prior assignment the name `id` resolves to the builtin function
# (or to a stale value left over from an earlier cell).
id = str(f"{np.random.randint(1000)}")
print("ID", id)

# Define master!
master = metaquantus.MetaEvaluation(
    analyser_suite=analyser_suite,
    xai_methods=XAI_METHODS,
    iterations=5,
    use_nominal_inputs=True,
    inter_reliability_metric="spearmans",
    print_results=True,
    write_results=True,
    write_progress=True,
    use_json=False,
    path=path_results,
    dataset_name=dataset_name,
    id=id,
)

# Benchmark!
results_benchmark = metaquantus.MetaEvaluationMultiple(
    master=master,
    estimators=estimators,
    analyser_suite=analyser_suite,
    settings=settings,
    parameterisation=PARAMETERISATION,
    path=path_results,
    id=id,
    keep_results=False,
    save=False,
    **{**GENERAL_KWARGS, "device": device},
)()


In [None]:
###############
# Simple run! #
###############

#estimator_category = "Randomisation"
#estimator_name = "Random Logit"
#analyser_suite["Parameter Sensitivity Test"].parameterisation = PARAMETERISATION[estimator_category]
#estimators = {"Randomisation": ESTIMATORS["Randomisation"],}

#results = master(
#    metric=ESTIMATORS[estimator_category][estimator_name][0],
#    metric_kwargs=ESTIMATORS[estimator_category][estimator_name][1],
#    model=SETTINGS[dataset_name]["models"]["LeNet"],
#    x_batch=SETTINGS[dataset_name]["x_batch"],
#    y_batch=SETTINGS[dataset_name]["y_batch"],
#    a_batch=None,
#    s_batch=None,
#    **{"explain_func": quantus.explain, 
#       "disable_warnings": True, 
#       "gc_layer": 'list(model.named_modules())[3][1]'}
#)