In [1]:
import sys

import numpy as np
np.random.seed(1)
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
%load_ext autoreload
%autoreload 2

import sys
# Extend the module search path so the `anchor` imports that follow can be
# resolved from this directory.
# NOTE(review): this is a machine-specific absolute path (presumably a local
# checkout of an anchor fork) -- adjust it, or install the package, before running.
sys.path.append('/home/avl/git/fork-anchor/')

from anchor import utils
from anchor import anchor_tabular

### Loading the dataset
The task for this dataset is to predict whether a person earns more or less than 50,000 dollars per year.

In [2]:
# Base location handed to the loader; here it is a remote URL rather than a local folder.
dataset_folder = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/'

# Load the 'adult' dataset with balance=True and without discretization.
dataset = utils.load_dataset(
    'adult',
    dataset_folder=dataset_folder,
    discretize=False,
    balance=True,
)

### Create feature transformation pipeline
Create a feature pre-processor. It needs to have `fit` and `transform` methods. Different types of pre-processing can be applied to all or only some of the features. In the example below, we standardize the ordinal features and apply one-hot encoding to the categorical features.

Ordinal features:

In [3]:
# Ordinal columns are all feature indices that are not listed as categorical:
# ['Age', 'Hours per week']
ordinal_features = [
    col
    for col, _ in enumerate(dataset.feature_names)
    if col not in dataset.categorical_features
]

# Median-impute missing values first, then standardize each ordinal column.
ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

Categorical features:

In [4]:
# Column indices of the categorical features, as reported by the dataset.
categorical_features = list(dataset.categorical_features)

# Impute missing categorical values with the most frequent category of each
# column, then one-hot encode. The values are treated as nominal (they are
# one-hot encoded next), so a median is not meaningful here: it could even
# produce a value that is not one of the observed categories. Categories not
# seen while fitting are ignored at transform time (encoded as all zeros).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

Combine:

In [5]:
# Route each group of columns through its own pipeline:
#   'num' -> ordinal columns (impute + standardize)
#   'cat' -> categorical columns (impute + one-hot encode)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

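Fit the pre-processor on all data (train + test). This is done only to keep the example simple; in a real evaluation, fit it on the training split alone so that no information from the test set leaks into the pre-processing.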

In [6]:
# Fit all transformers of the ColumnTransformer on the complete data matrix.
# Being the last expression of the cell, the fitted preprocessor is also
# displayed as the cell output.
preprocessor.fit(dataset.data)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), [0, 10]), ('cat', Pipeline(memory=None,
     steps=[('imput...>, handle_unknown='ignore',
       n_values=None, sparse=True))]), [1, 2, 3, 4, 5, 6, 7, 8, 9, 11])])

### Train Random Forest model

Fit the model on the pre-processed (imputed, one-hot encoded, standardized) training data.

In [7]:
# Seed NumPy's global RNG before training; the classifier is created without
# an explicit random_state.
np.random.seed(0)

# Train a 50-tree random forest on the pre-processed training split.
# fit() returns the estimator itself, so it can be bound to `c` directly.
c = sklearn.ensemble.RandomForestClassifier(n_jobs=5, n_estimators=50).fit(
    preprocessor.transform(dataset.train),
    dataset.labels_train,
)

# Display the fitted classifier as the cell output.
c

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=5,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Define predict function

In [8]:
def predict_fn(x):
    """Predict class labels for raw (not yet pre-processed) feature rows.

    Parameters
    ----------
    x : 2-D array-like of raw feature rows, in the same column layout as
        ``dataset.train``.

    Returns
    -------
    The output of ``c.predict`` on the pre-processed rows.
    """
    # `preprocessor` and `c` are looked up at call time, so re-fitting either
    # one in an earlier cell is picked up without redefining this function.
    return c.predict(preprocessor.transform(x))


# Accuracy of the full pipeline (pre-processing + model) on both splits.
print('Train', sklearn.metrics.accuracy_score(dataset.labels_train, predict_fn(dataset.train)))
print('Test', sklearn.metrics.accuracy_score(dataset.labels_test, predict_fn(dataset.test)))

Train 0.9847748106815465
Test 0.8852772466539197


### Initiate explainer

In [9]:
# Build the tabular anchor explainer from the prediction function, the
# training split and the dataset metadata; 'quartile' is passed as discretizer.
explainer = anchor_tabular.AnchorTabularExplainer(
    predict_fn,
    dataset.train,
    dataset.labels_train,
    dataset.class_names,
    dataset.feature_names,
    dataset.categorical_names,
    discretizer='quartile',
)

### Getting an anchor

Below, we get an anchor for prediction number 0. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance.

In [10]:
# Index of the test instance to explain.
idx = 0
np.random.seed(0)

# predict_fn expects a 2-D batch, so the single row is reshaped to (1, n_features).
instance = dataset.test[idx].reshape(1, -1)
predicted_label = predict_fn(instance)[0]
print('Prediction: ', explainer.class_names[predicted_label])

Prediction:  b'>50K'


In [11]:
# Compute the anchor explanation for the selected test instance.
exp = explainer.explain_instance(dataset.test[idx])

# Report the anchor's clauses together with its precision and coverage.
print(
    'Anchor: %s' % ' AND '.join(exp.names()),
    'Precision: %.2f' % exp.precision(),
    'Coverage: %.2f' % exp.coverage(),
    sep='\n',
)

Anchor: Education = Bachelors AND Relationship = Husband AND Occupation = Sales
Precision: 0.95
Coverage: 0.02


Note that the precision threshold is 0.95 (no threshold is passed to `explain_instance` above, so this is its default value), so we guarantee (with high probability) that precision will be above 0.95 - that is, that predictions on instances where the anchor holds will be the same as the original prediction at least 95% of the time. Let's try it out on the test set.

In [12]:
# Select the test rows where the anchor applies, i.e. rows whose values in the
# anchor's feature columns equal those of the explained instance.
anchor_cols = exp.features()
anchor_holds = (dataset.test[:, anchor_cols] == dataset.test[idx][anchor_cols]).all(axis=1)
fit_anchor = np.flatnonzero(anchor_holds)

# Coverage: fraction of test rows satisfying the anchor.
print('Anchor test coverage: %.2f' % (len(fit_anchor) / float(len(dataset.test))))

# Precision: fraction of those rows predicted the same as the explained instance.
same_prediction = predict_fn(dataset.test[fit_anchor]) == predict_fn(dataset.test[idx].reshape(1, -1))
print('Anchor test precision: %.2f' % np.mean(same_prediction))

Anchor test coverage: 0.02
Anchor test precision: 0.97


### Looking at a partial anchor
You can look at just part of the anchor - for example, the first two clauses. Note how these do not have enough precision, which is why the explainer added a third one.

In [13]:
# Report the partial anchor (clauses up to index 1) with its precision and coverage.
print(
    'Partial anchor: %s' % ' AND '.join(exp.names(1)),
    'Partial precision: %.2f' % exp.precision(1),
    'Partial coverage: %.2f' % exp.coverage(1),
    sep='\n',
)

Partial anchor: Education = Bachelors AND Relationship = Husband
Partial precision: 0.86
Partial coverage: 0.12


In [14]:
# Select the test rows where the partial anchor applies, i.e. rows whose values
# in the partial anchor's feature columns equal those of the explained instance.
partial_cols = exp.features(1)
partial_holds = (dataset.test[:, partial_cols] == dataset.test[idx][partial_cols]).all(axis=1)
fit_partial = np.flatnonzero(partial_holds)

# Precision: fraction of those rows predicted the same as the explained instance.
same_prediction = predict_fn(dataset.test[fit_partial]) == predict_fn(dataset.test[idx].reshape(1, -1))
print('Partial anchor test precision: %.2f' % np.mean(same_prediction))

# Coverage: fraction of test rows satisfying the partial anchor.
print('Partial anchor test coverage: %.2f' % (len(fit_partial) / float(len(dataset.test))))

Partial anchor test precision: 0.95
Partial anchor test coverage: 0.12
