# An Introduction to XAI for Opaque models and the research objectives

#### Objective:
Develop methods for interpreting black‐box models (like deep neural networks) by comparing techniques such as LIME, SHAP, Anchors, and saliency maps. We will evaluate these methods on different datasets and model architectures, propose a benchmark for explanation quality (e.g., fidelity), and discuss trade‐offs (fidelity vs. interpretability).
#### Environment & Dataset Setup:
How to install the necessary Python packages and load two example datasets: a tabular dataset (Breast Cancer from scikit‐learn) and an image dataset (MNIST).
#### Model Building:
Constructing a feedforward neural network for the tabular dataset and a CNN for MNIST.
#### Explainability Techniques:
- Using LIME and SHAP on the tabular model.
- Explaining a CNN via saliency maps on MNIST.
- Running a simple Anchors explanation (using the Alibi library).
- Implementing a hybrid approach by combining LIME and SHAP explanations.
#### Benchmarking & Evaluation:
A simplified fidelity metric (how much the prediction drops when top features are perturbed) and evaluation over multiple instances.
#### Visualization & Comparative Analysis:
Visualizing explanations and summarizing quantitative results.

## Environmental Setup

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
# Import XAI libraries
import lime
from lime.lime_tabular import LimeTabularExplainer
import shap

print("All libraries imported successfully!")

All libraries imported successfully!


## Data Processing

> ### Image Data Example (MNIST Dataset)

In [3]:
# Load MNIST dataset
mnist = tf.keras.datasets.mnist
(X_train_img, y_train_img), (X_test_img, y_test_img) = mnist.load_data()

# Normalize images and add channel dimension
X_train_img = X_train_img.astype('float32') / 255.0
X_test_img = X_test_img.astype('float32') / 255.0

X_train_img = np.expand_dims(X_train_img, -1)
X_test_img = np.expand_dims(X_test_img, -1)

# One-hot encode labels for classification
y_train_img_cat = to_categorical(y_train_img, 10)
y_test_img_cat = to_categorical(y_test_img, 10)

print("MNIST dataset shape:", X_train_img.shape, X_test_img.shape)

MNIST dataset shape: (60000, 28, 28, 1) (10000, 28, 28, 1)


## Model Building

> ### Image Model: CNN on MNIST

In [6]:
# Build a simple CNN for MNIST
def build_mnist_model():
    model = models.Sequential()
    model.add(layers.Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=X_train_img.shape[1:]))
    model.add(layers.MaxPooling2D(pool_size=(2,2)))
    model.add(layers.Conv2D(64, (3,3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2,2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

mnist_model = build_mnist_model()
history = mnist_model.fit(X_train_img, y_train_img_cat, epochs=5, batch_size=128, validation_split=0.1, verbose=1)
test_loss, test_acc = mnist_model.evaluate(X_test_img, y_test_img_cat)
print("MNIST model accuracy:", test_acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 29ms/step - accuracy: 0.8447 - loss: 0.5319 - val_accuracy: 0.9815 - val_loss: 0.0655
Epoch 2/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 28ms/step - accuracy: 0.9825 - loss: 0.0600 - val_accuracy: 0.9857 - val_loss: 0.0488
Epoch 3/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.9861 - loss: 0.0464 - val_accuracy: 0.9852 - val_loss: 0.0478
Epoch 4/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.9901 - loss: 0.0305 - val_accuracy: 0.9900 - val_loss: 0.0370
Epoch 5/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 30ms/step - accuracy: 0.9924 - loss: 0.0236 - val_accuracy: 0.9910 - val_loss: 0.0336
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9854 - loss: 0.0422
MNIST model accuracy: 0.989300012588501


## Explainability AI Techniques

## Saliency Maps for the CNN on MNIST

In [15]:
import tensorflow.keras.backend as K

def compute_saliency(model, image, target_class):
    """
    Compute a saliency map for a given input image and target class.
    """
    image_tensor = tf.convert_to_tensor(image.reshape(1, 28, 28, 1))
    with tf.GradientTape() as tape:
        tape.watch(image_tensor)
        predictions = model(image_tensor)
        loss = predictions[:, target_class]
    # Compute gradients of the target class score with respect to the input image
    grads = tape.gradient(loss, image_tensor)
    # Process gradients: take maximum across color channels and absolute values
    saliency = tf.reduce_max(tf.abs(grads), axis=-1).numpy().squeeze()
    return saliency

# Select a sample image from the test set
sample_index = 0
sample_image = x_test_img[sample_index]
predicted_class = np.argmax(model_cnn.predict(sample_image.reshape(1, 28, 28, 1)))
saliency_map = compute_saliency(model_cnn, sample_image, predicted_class)

# Visualize the original image and its saliency map
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.title("Original MNIST Image")
plt.imshow(sample_image.squeeze(), cmap='gray')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.title("Saliency Map")
plt.imshow(saliency_map, cmap='jet')
plt.colorbar()
plt.axis('off')
plt.show()

NameError: name 'x_test_img' is not defined

## Hybrid Approach: Combining LIME and SHAP

In [16]:
# Get LIME explanation scores for the selected instance and convert to a dictionary
lime_exp_list = exp_lime.as_list()
lime_scores = dict(lime_exp_list)

# Compute SHAP explanation for the same instance (using KernelExplainer)
# Note: Here we use explainer_shap for one instance. The output is a list of arrays.
shap_val_instance = explainer_shap.shap_values(X_test[instance_idx].reshape(1, -1))[0][0]

# Create a hybrid explanation by averaging scores from LIME and SHAP
hybrid_scores = {}
for idx, feature in enumerate(data.feature_names):
    # LIME might not list every feature (if not important, assume 0)
    lime_score = lime_scores.get(feature, 0)
    shap_score = shap_val_instance[idx]
    hybrid_scores[feature] = (lime_score + shap_score) / 2

print("Hybrid Explanation Scores (Averaged LIME + SHAP):")
for feature, score in hybrid_scores.items():
    print(f"{feature}: {score}")

NameError: name 'exp_lime' is not defined

## Benchmarking and Evaluation

In [None]:
def fidelity_metric(model, instance, explanation, num_features=3):
    """
    Compute a simple fidelity score by perturbing the top features
    identified by the explanation and measuring the drop in prediction probability.
    """
    # Make a copy of the instance and create a baseline (zero vector)
    instance_perturbed = instance.copy()
    baseline = np.zeros_like(instance)
    
    # Get the top features (sorted by the absolute importance score)
    sorted_features = sorted(explanation.items(), key=lambda x: abs(x[1]), reverse=True)
    features_to_remove = [feat for feat, _ in sorted_features[:num_features]]
    
    # Perturb these features (set them to baseline)
    for feature in features_to_remove:
        idx = list(data.feature_names).index(feature)
        instance_perturbed[idx] = baseline[idx]
    
    # Compute prediction before and after perturbation
    original_pred = model.predict(instance.reshape(1, -1))[0][0]
    perturbed_pred = model.predict(instance_perturbed.reshape(1, -1))[0][0]
    
    fidelity = original_pred - perturbed_pred
    return fidelity

# Compute fidelity for our chosen instance using LIME explanation
fidelity_score = fidelity_metric(model_tabular, X_test[instance_idx].copy(), lime_scores, num_features=3)
print("Fidelity Score (LIME):", fidelity_score)

def compute_average_fidelity(model, X, explainer, num_samples=10, num_features=3):
    fidelities = []
    for i in range(num_samples):
        instance = X[i].copy()
        # Get LIME explanation for the instance
        exp = explainer.explain_instance(instance, model_predict, num_features=5)
        exp_dict = dict(exp.as_list())
        fidelity = fidelity_metric(model, instance, exp_dict, num_features=num_features)
        fidelities.append(fidelity)
    return np.mean(fidelities)

avg_fidelity_lime = compute_average_fidelity(model_tabular, X_test, explainer_lime, num_samples=10, num_features=3)
print("Average Fidelity (LIME):", avg_fidelity_lime)

In [None]:
def fidelity_metric(model, instance, explanation, num_features=3):
    """
    Compute a fidelity score by perturbing the top features
    identified by the explanation and measuring the drop in prediction probability.
    """
    instance_perturbed = instance.copy()
    baseline = np.zeros_like(instance)
    
    # Sort features by absolute importance and choose the top ones
    sorted_features = sorted(explanation.items(), key=lambda x: abs(x[1]), reverse=True)
    features_to_remove = [feat for feat, _ in sorted_features[:num_features]]
    
    # Perturb the chosen features (set them to baseline)
    for feature in features_to_remove:
        idx = list(data.feature_names).index(feature)
        instance_perturbed[idx] = baseline[idx]
    
    original_pred = model.predict(instance.reshape(1, -1))[0][0]
    perturbed_pred = model.predict(instance_perturbed.reshape(1, -1))[0][0]
    
    fidelity = original_pred - perturbed_pred
    return fidelity

# Example: Compute fidelity for the previously explained instance using LIME
instance_idx = 1  # index used previously
lime_fidelity = fidelity_metric(model_tabular, X_test[instance_idx].copy(), dict(exp_lime.as_list()), num_features=3)
print("Fidelity Score (LIME) for instance", instance_idx, ":", lime_fidelity)
