In [None]:
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
%load_ext autoreload
%autoreload 2

import os
import sys

# Location of the local checkout that provides the `anchor` package.
# The original hard-coded path is kept as the default so existing setups keep
# working; set the ANCHOR_PATH environment variable to point elsewhere.
ANCHOR_PATH = os.environ.get('ANCHOR_PATH', '/home/avl/git/fork-anchor/')
# Guard against growing sys.path with duplicates every time this cell is re-run.
if ANCHOR_PATH not in sys.path:
    sys.path.append(ANCHOR_PATH)

from anchor import utils
from anchor import anchor_tabular

### Loading the dataset
This dataset is about predicting whether a person makes more or less than 50,000 dollars a year.

In [None]:
# Base URL the dataset files are read from.
dataset_folder = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/'
# Load the 'adult' dataset through anchor's helper. discretize=False is passed
# here; the explainer is configured with its own discretizer further below.
dataset = utils.load_dataset(
    'adult',
    balance=True,
    discretize=False,
    dataset_folder=dataset_folder,
)

### Create feature transformation pipeline
Create the feature pre-processor. It needs to have 'fit' and 'transform' methods. Different types of pre-processing can be applied to all or part of the features. In the example below we standardize the ordinal features and apply one-hot encoding to the categorical features.

Ordinal features:

In [None]:
# Ordinal columns are all column indices not listed as categorical.
# ['Age', 'Hours per week']
ordinal_features = [
    col_idx
    for col_idx in range(len(dataset.feature_names))
    if col_idx not in dataset.categorical_features
]
# Fill missing values with the column median, then standardize.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

Categorical features:

In [None]:
# Column indices of the categorical features, as reported by the dataset.
categorical_features = list(dataset.categorical_features)
# Impute with the most frequent category rather than the median: the median of
# category codes is not guaranteed to be an existing category (it can fall
# between two codes), which would give the one-hot encoder a value that does
# not correspond to any real category.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

Combine:

In [None]:
# Route each group of columns through its own transformer:
# 'num' handles the ordinal columns, 'cat' handles the categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

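Fit the pre-processor on all data (train + test). This is done only to keep the example simple; in a real workflow, fit on the training split only so that no test-set statistics leak into the pre-processing.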

In [None]:
# Fit the imputers, scaler and one-hot encoder on the full dataset (dataset.data).
preprocessor.fit(dataset.data)

### Train Random Forest model

Fit on pre-processed (imputing, OHE, standardizing) data.

In [None]:
# Seed NumPy's global RNG before building the forest.
np.random.seed(0)
c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
# The model is trained on the pre-processed training split, not on raw rows.
X_train_encoded = preprocessor.transform(dataset.train)
c.fit(X_train_encoded, dataset.labels_train)

Define predict function

In [None]:
def predict_fn(x):
    """Predict class labels for raw (un-encoded) rows.

    The rows are passed through the fitted `preprocessor` and then through the
    trained classifier `c`, so callers can supply data in the same format as
    `dataset.train` / `dataset.test`.

    Parameters
    ----------
    x : 2-D array of raw feature rows.

    Returns
    -------
    The output of `c.predict` on the transformed rows.
    """
    return c.predict(preprocessor.transform(x))


# Accuracy of the full pipeline (pre-processing + model) on both splits.
print('Train', sklearn.metrics.accuracy_score(dataset.labels_train, predict_fn(dataset.train)))
print('Test', sklearn.metrics.accuracy_score(dataset.labels_test, predict_fn(dataset.test)))

### Initiate explainer

In [None]:
# Build the tabular anchor explainer from the prediction function, the raw
# training rows, and the feature / category names; 'quartile' selects the
# discretizer the explainer is configured with.
explainer = anchor_tabular.AnchorTabularExplainer(
    predict_fn,
    dataset.train,
    dataset.feature_names,
    dataset.categorical_names,
    discretizer='quartile',
)

### Getting an anchor

Below, we get an anchor for prediction number 0. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance.

In [None]:
# Index of the test instance to explain.
idx = 0
np.random.seed(0)
# predict_fn is given 2-D input, so the single row is reshaped to (1, n_features).
instance_row = dataset.test[idx].reshape(1, -1)
predicted_class = predict_fn(instance_row)[0]
print('Prediction: ', dataset.class_names[predicted_class])

In [None]:
# Pass the precision target explicitly so the code matches the accompanying
# text, which states that the threshold is set to 0.95.
exp = explainer.explain_instance(dataset.test[idx], threshold=0.95)
# Report the anchor's clauses together with its estimated precision and coverage.
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())

Note that the precision threshold is 0.95, so we are guaranteed (with high probability) that precision will be above 0.95 - that is, predictions on instances where the anchor holds will be the same as the original prediction at least 95% of the time. Let's try it out on the test set:

In [None]:
# Get test examples where the anchor applies
fit_anchor = np.where(np.all(dataset.test[:, exp.features()] == dataset.test[idx][exp.features()], axis=1))[0]
print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(dataset.test.shape[0])))
print('Anchor test precision: %.2f' % (np.mean(predict_fn(dataset.test[fit_anchor]) == predict_fn(dataset.test[idx].reshape(1, -1)))))

### Looking at a partial anchor
You can look at just part of the anchor - for example, the first two clauses. Note how these do not have enough precision, which is why the explainer added a third one

In [None]:
# Inspect the anchor truncated at partial index 1.
partial_clauses = ' AND '.join(exp.names(1))
print('Partial anchor: %s' % (partial_clauses))
print('Partial precision: %.2f' % exp.precision(1))
print('Partial coverage: %.2f' % exp.coverage(1))

In [None]:
# Test rows whose values on the partial anchor's features equal those of the
# explained instance.
# NOTE(review): the dataset was loaded with discretize=False, so numeric columns
# appear to hold raw values here. If the partial anchor includes a binned
# numeric feature, exact equality only matches rows with the identical raw
# value - confirm whether the rows should be discretized before this comparison.
partial_cols = exp.features(1)
partial_holds = np.all(dataset.test[:, partial_cols] == dataset.test[idx][partial_cols], axis=1)
fit_partial = np.flatnonzero(partial_holds)
reference_pred = predict_fn(dataset.test[idx].reshape(1, -1))
print('Partial anchor test precision: %.2f' % (np.mean(predict_fn(dataset.test[fit_partial]) == reference_pred)))
print('Partial anchor test coverage: %.2f' % (fit_partial.shape[0] / float(dataset.test.shape[0])))