### Load the data and train ACXplainer to generate Counterfactual Rules

In [23]:
import numpy as np
from acv_explainers import ACXplainer
from acv_explainers.utils import *
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from utils import MyTabNetClassifier
from utils import DatasetHelper, DATASETS_NAME
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

model= 'X'

np.random.seed(0)

if(model=='L'):
    print('* Classifier: LogisticRegression')
    mdl = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
    print('\t* C: {}'.format(mdl.C)); print('\t* penalty: {}'.format(mdl.penalty));
elif(model=='X'):
    print('* Classifier: LightGBM')
    mdl = LGBMClassifier(n_estimators=50, num_leaves=8)
    print('\t* n_estimators: {}'.format(mdl.n_estimators)); print('\t* num_leaves: {}'.format(mdl.num_leaves));
elif(model=='T'):
    print('* Classifier: TabNet')
    mdl = MyTabNetClassifier(D.feature_types, verbose=0)

seed = 2022    

seed = 2022

GAMMA = 1.0
dataset = 'h'
dataset_name = DATASETS_NAME[dataset]
model= 'X'

np.random.seed(0)
LAMBDA = 0.01
GAMMA = 1.0

D = DatasetHelper(dataset=dataset, feature_prefix_index=False)
# X_tr, X_ts, y_tr, y_ts = D.train_test_split()

seed = 2022

X = pd.read_csv('data/diabetes/train.csv')
y = X.Outcome

X.drop(['Id', 'Outcome'], axis=1, inplace=True)

# X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.20, random_state=seed)

from sklearn.ensemble import IsolationForest

isolation = IsolationForest()
isolation.fit(X_tr)

# y_tr = 1 - y_tr
# y_ts = 1 - y_ts

mdl = LGBMClassifier(n_estimators=50, num_leaves=8)
mdl.fit(X_tr, y_tr)

# X = X_tr[mdl.predict(X_tr)==1]; X_vl = X_ts[mdl.predict(X_ts)==1];
# print('\t* train score: ', mdl.score(X_tr, y_tr)); print('\t* train denied: ', X.shape[0]);
# print('\t* test score: ', mdl.score(X_ts, y_ts)); print('\t* test denied: ', X_vl.shape[0]); print();

x_train = X_tr.copy()
x_test = X_ts.copy()

y_train = mdl.predict(X_tr)
y_test = mdl.predict(X_ts)


x_train = x_train.values
x_test = x_test.values
# y_train = y_train.values
# y_test = y_test.values

### Train Explainer (ACXplainer)
ac_explainer = ACXplainer(classifier=True, n_estimators=20, max_depth=12)
ac_explainer.fit(x_train, y_train)

print('# Trained ACXplainer -- score = {}'.format(accuracy_score(y_test, ac_explainer.predict(x_test))))

# idx = 0
# size = idx + 500
x, y = x_test[:500], y_test[:500]
x_rules, y_rules = x_train[:1000], y_train[:1000]

columns_name = [col.replace(' ', '') for col in X_tr.columns]
# columns_name = D.feature_names

* Classifier: LightGBM
	* n_estimators: 50
	* num_leaves: 8
# Trained ACXplainer -- score = 0.9608540925266904


In [24]:
results = RunExperiments(ac_explainer, x_train, x_test, y_train, y_test, columns_name, model=mdl)

In [25]:
results.run_local_divergent_set(x, y)

### Computing the local divergent set of (x, y)


100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 96.08it/s]
 38%|████████████████▉                            | 3/8 [00:33<00:56, 11.30s/it]


In [26]:
results.run_local_counterfactual_rules(x, y, acc_level=0.9, pi_level=0.9)

### Computing the local counterfactual rules of (x, y)


100%|█████████████████████████████████████████| 281/281 [04:24<00:00,  1.06it/s]


In [27]:
results.run_local_counterfactual_rules(x, y, acc_level=0.9, pi_level=0.9)

### Computing the local counterfactual rules of (x, y)


100%|█████████████████████████████████████████| 281/281 [04:19<00:00,  1.08it/s]


In [28]:
results.run_sampling_local_counterfactuals(x, y, batch=1000, max_iter=1000, temp=0.5)

### Sampling using the local counterfactual rules of (x, y)


100%|█████████████████████████████████████████| 281/281 [15:53<00:00,  3.39s/it]


In [29]:
print('Local Accuracy = {} -- Local Coverage = {}'.format(results.accuracy_local, results.coverage_local))

Local Accuracy = 0.961038961038961 -- Local Coverage = 0.8220640569395018


In [30]:
results.run_sufficient_rules(x_rules, y_rules, pi_level=0.9)

### Computing the Sufficient Explanations and the Sufficient Rules


100%|█████████████████████████████████████████████| 8/8 [00:54<00:00,  6.78s/it]
100%|███████████████████████████████████████| 1000/1000 [07:26<00:00,  2.24it/s]


In [31]:
results.run_regional_divergent_set(stop=True, pi_level=0.9)

### Computing the regional divergent set of (x, y)


 38%|████████████████▉                            | 3/8 [02:49<04:42, 56.42s/it]


In [32]:
results.run_regional_counterfactual_rules(acc_level=0.9, pi_level=0.9)

### Computing the regional counterfactual rules of (x, y)


100%|███████████████████████████████████████| 1000/1000 [11:00<00:00,  1.51it/s]


In [33]:
results.run_sampling_regional_counterfactuals_alltests(max_obs=x_test.shape[0],batch=1000, max_iter=1000, temp=0.5)

### Sampling using the regional counterfactual rules


100%|█████████████████████████████████████████| 281/281 [14:05<00:00,  3.01s/it]


In [34]:
print('Regional Accuracy = {} -- Regional Coverage = {}'.format(results.accuracy_regional, results.coverage_regional))

Regional Accuracy = 0.9948717948717949 -- Regional Coverage = 0.693950177935943


In [39]:
if np.mean(mdl.predict(results.x_test) == results.y_test):
    print('CONSISTENT')
else:
    raise ValueError


CONSISTENT


In [200]:
x = []
for i, c in enumerate(results.counterfactuals_samples_local):
    if len(c) !=0:
        x.append(results.x_test[i])

x = np.array(x)
ce = np.array(results.dist_local)
ce_r = np.array(results.dist_regional)

print('all acc', np.mean(mdl.predict(x_test) != mdl.predict(ce_r)))

all acc 1.0


In [201]:
x_pos = x[mdl.predict(x) == 1]
ce_pos = ce[mdl.predict(x) == 1]

print('LOCAL positive accuracy', np.mean(mdl.predict(x_pos) != mdl.predict(ce_pos)))

LOCAL positive accuracy 0.9714285714285714


In [202]:
print('LOCAL positive sparsity', np.mean(np.sum(x_pos-ce_pos!=0, axis=1)))

LOCAL positive sparsity 3.4857142857142858


In [203]:
inlier_pos = np.mean(results.isolation.predict(ce_pos) == 1)
print('LOCAL positive inlier', inlier_pos)

LOCAL positive inlier 0.9904761904761905


In [204]:
x_neg = x[mdl.predict(x) == 0]
ce_neg = ce[mdl.predict(x) == 0]

print('LOCAL negative accuracy', np.mean(mdl.predict(x_neg) != mdl.predict(ce_neg)))

LOCAL negative accuracy 1.0


In [205]:
print('LOCAL negative sparsity', np.mean(np.sum(x_neg-ce_neg!=0, axis=1)))

LOCAL negative sparsity 3.8174603174603177


In [206]:
inlier_neg = np.mean(results.isolation.predict(ce_neg) == 1)
print('LOCAL negative inlier', inlier_neg)

LOCAL negative inlier 0.8095238095238095


In [209]:
inlier_pos = np.mean(results.isolation.predict(ce_pos_r) == 1)
print('REGIONAL positive inlier', inlier_pos)

REGIONAL positive inlier 0.8782608695652174


In [214]:
print('Regional Accuracy = {}'.format(results.accuracy_regional))

Regional Accuracy = 0.9948717948717949


In [213]:
print('Local Coverage = {} -- Global Coverage {}'.format(results.coverage_local, 
                                                        results.coverage_regional))

Local Coverage = 0.8220640569395018 -- Global Coverage 0.693950177935943


In [42]:
save_model(results, name='DIDiabetesCR_results')