# Counterfactuals benchmark on tabular datasets

In [1]:
from tensorflow.compat.v1 import enable_eager_execution
from tensorflow import executing_eagerly
import os
enable_eager_execution()

BASE_PATH = "./counterfactuals"
print("Current working directory:", os.getcwd())

2025-06-17 09:56:43.545762: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-17 09:56:43.837913: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-17 09:56:43.837951: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-17 09:56:43.839422: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-17 09:56:43.967195: I tensorflow/core/platform/cpu_feature_g

Current working directory: /home/ahmed/prototype


## Imports and preprocessing

In [2]:
# Install the dev version of the Alibi package if not already installed
try:
    from alibi import __version__ as alibi_version
    print(f"Alibi version: {alibi_version}")
except ImportError:
    print("Alibi package not found, installing...")
    # Install the dev version of Alibi
    !pip install git+https://github.com/SeldonIO/alibi.git > /dev/null


import logging

alibi_logger = logging.getLogger("alibi")
alibi_logger.setLevel("CRITICAL")


print(f"Is TensorFlow running in eager execution mode? -----→ {executing_eagerly()}")
!nvidia-smi -L

  from .autonotebook import tqdm as notebook_tqdm


Alibi version: 0.9.7.dev0
Is TensorFlow running in eager execution mode? -----→ True
GPU 0: NVIDIA GeForce RTX 4060 Laptop GPU (UUID: GPU-ed7340f2-1910-df12-4a83-29feeba52695)


In [3]:
from datetime import datetime

if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)


date = datetime.now().strftime('%Y-%m-%d')
EXPERIMENT_PATH = f"{BASE_PATH}/diabetes_{date}"
MODELS_EXPERIMENT_PATH = f"{BASE_PATH}/diabetes_2020-09-09"
if not os.path.exists(EXPERIMENT_PATH):
    os.makedirs(EXPERIMENT_PATH)
    

## Data import and preprocessing

In [4]:
import json
# import pickle
# import time
# from matplotlib import offsetbox
# from matplotlib.colors import ListedColormap
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
# from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense, Add, Input, ActivityRegularization, Concatenate, Multiply
from tensorflow.keras import optimizers, Model, regularizers, Input

from tensorflow.keras.models import Sequential
from tensorflow.random import set_seed
# from tensorflow.keras.models import load_model
import os

print("Current working directory:", os.getcwd())

INITIAL_CLASS = 0
DESIRED_CLASS = 1
N_CLASSES = 2
n_training_iterations = 10


np.set_printoptions(precision=2)
set_seed(2020)
np.random.seed(2020)

# Pima indians Diabetes dataset
# https://www.kaggle.com/uciml/pima-indians-diabetes-database
df = pd.read_csv("diabetes.csv", index_col=False)
target_column = "Outcome"
immutable_features = {"Pregnancies", "DiabetesPedigreeFunction", "Age"}

features = set(df.columns) - {target_column}
mutable_features = features - immutable_features
features = list(mutable_features) + list(immutable_features)

x = df[features]
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(df[features].values, y, test_size=0.2)

standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

df[features].sample(5)

Current working directory: /home/ahmed/prototype


Unnamed: 0,SkinThickness,BloodPressure,Insulin,BMI,Glucose,DiabetesPedigreeFunction,Age,Pregnancies
379,39,100,72,43.4,93,1.021,35,0
741,20,44,94,30.8,102,0.4,26,3
102,0,96,0,22.5,125,0.262,21,0
641,0,70,0,34.3,128,0.303,24,4
14,19,72,175,25.8,166,0.587,51,5


In [5]:
def compute_reconstruction_error(x, autoencoder):
    """Compute the reconstruction error for a given autoencoder and data points."""
    preds = autoencoder.predict(x)
    preds_flat = preds.reshape((preds.shape[0], -1))
    x_flat = x.reshape((x.shape[0], -1))
    return np.linalg.norm(x_flat - preds_flat, axis=1)

def format_metric(metric):
    """Return a formatted version of a metric, with the confidence interval."""
    return f"{metric.mean():.3f} ± {1.96*metric.std()/np.sqrt(len(metric)):.3f}"

def compute_metrics(samples, counterfactuals, latencies, classifier, autoencoder,
                    batch_latency=None):
    """ Summarize the relevant metrics in a dictionary. """
    reconstruction_error = compute_reconstruction_error(counterfactuals, autoencoder)
    delta = np.abs(samples-counterfactuals)
    l1_distances = delta.reshape(delta.shape[0], -1).sum(axis=1)
    prediction_gain = (
        classifier.predict(counterfactuals)[:, DESIRED_CLASS] - 
        classifier.predict(samples)[:, DESIRED_CLASS]
    )

    metrics = dict()
    metrics["reconstruction_error"] = format_metric(reconstruction_error)
    metrics["prediction_gain"] = format_metric(prediction_gain)
    metrics["sparsity"] = format_metric(l1_distances)
    metrics["latency"] = format_metric(latencies)
    batch_latency = batch_latency if batch_latency else sum(latencies)
    metrics["latency_batch"] = f"{batch_latency:.3f}"

    return metrics

def save_experiment(method_name, samples, counterfactuals, latencies, 
                    batch_latency=None):
    """Create an experiment folder and save counterfactuals, latencies and metrics."""
    if not os.path.exists(f"{EXPERIMENT_PATH}/{method_name}"):
        os.makedirs(f"{EXPERIMENT_PATH}/{method_name}")   

    np.save(f"{EXPERIMENT_PATH}/{method_name}/counterfactuals.npy", counterfactuals)
    np.save(f"{EXPERIMENT_PATH}/{method_name}/latencies.npy", latencies)

    metrics = compute_metrics(samples, counterfactuals, latencies, classifier, autoencoder)
    json.dump(metrics, open(f"{EXPERIMENT_PATH}/{method_name}/metrics.json", "w"))
    pprint(metrics)

In [6]:
set_seed(2020)
np.random.seed(2020)

def create_classifier(input_shape):
    """Define and compile a neural network binary classifier.""" 
    model = Sequential([
        Dense(20, activation='relu', input_shape=input_shape),
        Dense(20, activation='relu'),
        Dense(2, activation='softmax'),
    ], name="classifier")
    optimizer = optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
    model.compile(optimizer, 'binary_crossentropy', ['accuracy'])
    return model

classifier = create_classifier((x.shape[1],))
print(X_train.dtype, y_train.dtype)
print(X_test.dtype, y_test.dtype)

X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)
training = classifier.fit(X_train, y_train, batch_size=32, epochs=200, verbose=0,
                          validation_data=(X_test, y_test),)
print(f"Training: loss={training.history['loss'][-1]:.4f}, "
      f"accuracy={training.history['accuracy'][-1]:.4f}")
print(f"Validation: loss={training.history['val_loss'][-1]:.4f}, "
      f"accuracy={training.history['val_accuracy'][-1]:.4f}")

classifier.save(f"{EXPERIMENT_PATH}/classifier.keras")



2025-06-17 09:56:51.358161: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-17 09:56:51.504406: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-17 09:56:51.504473: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-17 09:56:51.506954: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-17 09:56:51.506998: I tensorflow/compile

float64 float32
float64 float32


2025-06-17 09:56:53.509081: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7dc875e6fd70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-06-17 09:56:53.509137: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-06-17 09:56:53.516187: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-06-17 09:56:53.543324: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2025-06-17 09:56:53.624589: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Training: loss=0.4222, accuracy=0.7997
Validation: loss=0.4584, accuracy=0.7662


## Estimate density with the reconstruction error of a (denoising) autoencoder


In [7]:
def add_noise(x, noise_factor=1e-6):
    x_noisy = x + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x.shape) 
    return x_noisy

    
def create_autoencoder(in_shape=(x.shape[1],)):
    input_ = Input(shape=in_shape) 

    x = Dense(32, activation="relu")(input_)
    encoded = Dense(8)(x)
    x = Dense(32, activation="relu")(encoded)
    decoded = Dense(in_shape[0], activation="tanh")(x)

    autoencoder = Model(input_, decoded)
    optimizer = optimizers.Nadam()
    autoencoder.compile(optimizer, 'mse')
    return autoencoder

autoencoder = create_autoencoder()
training = autoencoder.fit(
    add_noise(X_train), X_train, epochs=100, batch_size=32, shuffle=True, 
    validation_data=(X_test, X_test), verbose=0
)
print(f"Training loss: {training.history['loss'][-1]:.4f}")
print(f"Validation loss: {training.history['val_loss'][-1]:.4f}")

n_samples = 1000
# Compute the reconstruction error of noise data
samples = np.random.randn(n_samples, X_train.shape[1])
reconstruction_error_noise = compute_reconstruction_error(samples, autoencoder)

# Save and print the autoencoder metrics
reconstruction_error = compute_reconstruction_error(X_test, autoencoder)
autoencoder_metrics = {
    "reconstruction_error": format_metric(reconstruction_error),
    "reconstruction_error_noise": format_metric(reconstruction_error_noise),
}
json.dump(autoencoder_metrics, open(f"{EXPERIMENT_PATH}/autoencoder_metrics.json", "w"))
pprint(autoencoder_metrics)

autoencoder.save(f"{EXPERIMENT_PATH}/autoencoder.keras")



Training loss: 0.2300
Validation loss: 0.2437
{'reconstruction_error': '1.092 ± 0.137',
 'reconstruction_error_noise': '1.083 ± 0.032'}
