In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

# Classifiers under comparison, keyed by the short label that is reused for
# the pickle file names and the explainer labels. Insertion order matters:
# the other cells pair these entries with `clfs_list` by position, and the
# first entry ("CART") is the model every other one is compared against.
clfs = {
    "CART": DecisionTreeClassifier(random_state=1234, max_depth=1000),
    "RNF": RandomForestClassifier(random_state=1234),
    "XGB": XGBClassifier(use_label_encoder=False),
    "CAT": CatBoostClassifier(random_state=1234),
    # The ensemble wrappers use their own random_state (row resampling for
    # bagging, and the seeds handed to the trees they build), so seeding only
    # the inner tree does not make them reproducible. Seed the wrapper too.
    "ADA": AdaBoostClassifier(
        DecisionTreeClassifier(random_state=1234, max_depth=1000),
        random_state=1234,
    ),
    "BAG": BaggingClassifier(
        DecisionTreeClassifier(random_state=1234, max_depth=1000),
        random_state=1234,
    ),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2", random_state=1234),
    "GNB": GaussianNB(),
    # NOTE(review): "none" is the string spelling of "no regularisation";
    # newer scikit-learn releases expect penalty=None instead. Confirm
    # against the installed version before changing it.
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234),
}

import importlib
import worstcase_helper

# Pick up any edits made to the helper module since the kernel first imported it.
importlib.reload(worstcase_helper)

# Name of the dataset handed to the helper's loader.
dataset = 'breast'

# The loader returns three objects: `preprocess` becomes the first step of
# every pipeline built below, and `X`, `y` are what those pipelines are fit on.
loaded = worstcase_helper.load_dataset_with_preprocess(dataset)
preprocess, X, y = loaded

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    """Build a two-step pipeline: `preprocess`, then the classifier named `clf_name`.

    Parameters
    ----------
    clf_name : key of the module-level `clfs` dict selecting the estimator.

    Returns
    -------
    The object produced by `make_pipeline(preprocess, clfs[clf_name])`;
    this function does not call `fit` on it.

    Raises
    ------
    KeyError
        If `clf_name` is not a key of `clfs`.

    Notes
    -----
    Every call passes the same module-level `preprocess` object, so all
    pipelines built here share that one instance.
    """
    return make_pipeline(preprocess, clfs[clf_name])

# Fit one pipeline per registered classifier on the full (X, y).
# The list follows the iteration order of `clfs`, which is what lets the
# other cells look a pipeline up by its position.
clfs_list = []
for clf_name in clfs:
    fitted = make_pipeline_clf(clf_name)
    fitted.fit(X, y)
    clfs_list.append(fitted)

import pickle

# Persist every fitted pipeline as ./new_models/breast_<label>.p.
# `clfs_list` was filled in the iteration order of `clfs`, so position
# `clf_id` in the list belongs to the label `clf_name`.
for clf_id, clf_name in enumerate(clfs):
    # The context manager flushes and closes the file as soon as the dump is
    # done (or fails), instead of leaving one open handle per model behind.
    with open(f"./new_models/breast_{clf_name}.p", 'wb') as model_file:
        pickle.dump(clfs_list[clf_id], model_file)


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.008098
0:	learn: 0.6800996	total: 140ms	remaining: 2m 19s
1:	learn: 0.6650387	total: 257ms	remaining: 2m 8s
2:	learn: 0.6514931	total: 317ms	remaining: 1m 45s
3:	learn: 0.6380976	total: 351ms	remaining: 1m 27s
4:	learn: 0.6255931	total: 394ms	remaining: 1m 18s
5:	learn: 0.6125230	total: 438ms	remaining: 1m 12s
6:	learn: 0.6038672	total: 482ms	remaining: 1m 8s
7:	learn: 0.5925656	total: 530ms	remaining: 1m 5s
8:	learn: 0.5822920	total: 586ms	remaining: 1m 4s
9:	learn: 0.5704427	total: 626ms	remaining: 1m 1s
10:	learn: 0.5613287	total: 687ms	remaining: 1m 1s
11:	learn: 0.5524407	total: 786ms	remaining: 1m 4s
12:	learn: 0.5428621	total: 831ms	remaining: 1m 3s
13:	learn: 0.5318166	total: 868ms	remaining: 1m 1s
14:	learn: 0.5221519	total: 903ms	remaining: 59.3s
15:	learn: 0.5120747	total: 948ms	remaining: 58.3s
16:	learn: 0.5031776	total: 1.01s	remaining: 58.3s
17:	learn: 0.4939190	total: 1.05s	remaining: 57.5s
18:	learn: 0.4843503	total: 1.09s	remaining: 56.5s
19:	le

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
import dalex as dx

# One dalex Explainer per fitted pipeline, built on the same (X, y) the
# pipelines were trained on and labelled with the classifier's short name.
# Kept in the iteration order of `clfs`, matching `clfs_list`.
clfs_explainers = [
    dx.Explainer(clfs_list[clf_id], X, y, label=clf_name)
    for clf_id, clf_name in enumerate(clfs)
]

Preparation of a new explainer is initiated

  -> data              : 569 rows 30 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 569 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f35c2188ee0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.373, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 569 rows 30 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a nump

In [3]:
# Names of the continuous features passed to model_profile. No categorical
# feature list is defined in this notebook. The 30 names are ten measurements,
# each with a "_mean", "_se" and "_worst" variant, listed one statistic at a time.
_mean_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
]
_se_features = [
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
]
_worst_features = [
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]

cont_feat = _mean_features + _se_features + _worst_features

In [4]:
# Model-level profiles of every feature in `cont_feat`, one result object
# per explainer, stored in the same order as `clfs` / `clfs_explainers`.
clfs_pd_cont = [
    clfs_explainers[clf_id].model_profile(variables=cont_feat)
    for clf_id in range(len(clfs))
]

Calculating ceteris paribus: 100%|██████████| 30/30 [00:03<00:00,  8.47it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:14<00:00,  2.09it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:04<00:00,  7.17it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:06<00:00,  4.40it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:04<00:00,  7.11it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:07<00:00,  4.27it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:06<00:00,  4.48it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:04<00:00,  6.03it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:03<00:00,  7.53it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:04<00:00,  6.47it/s]
Calculating ceteris paribus: 100%|██████████| 30/30 [00:20<00:00,  1.49it/s]


In [5]:
# Seed handed to every model_parts call. The importance scores come from
# random permutations of the data, so without a fixed seed the numbers (and
# the CSV derived from them) change on every run. Using the same seed for
# every model also keeps the comparison between models on equal footing.
MP_RANDOM_STATE = 1234

# Model-level variable importance, one result object per explainer, stored
# in the same order as `clfs` / `clfs_explainers`.
clfs_mp = []
for clf_id, clf_name in enumerate(clfs):
    clf_mp = clfs_explainers[clf_id].model_parts(random_state=MP_RANDOM_STATE)
    clfs_mp.append(clf_mp)

In [6]:
import numpy as np

# Rows that dalex lists next to the real features in a model_parts table:
# the loss of the untouched model and the loss of the fully permuted baseline.
_MP_NON_FEATURE_ROWS = ["_full_model_", "_baseline_"]


def _dropout_loss_by_variable(importance):
    """Return the `dropout_loss` column of `importance.result` keyed by `variable`.

    The two bookkeeping rows are removed by name rather than by position, so
    the outcome does not depend on where they end up in the table.
    """
    table = importance.result
    losses = table.set_index("variable")["dropout_loss"]
    return losses.drop(labels=_MP_NON_FEATURE_ROWS, errors="ignore")


# L1 distance between each model's variable importances and those of the
# first model in `clfs` (so entry 0 is the distance of that model to itself).
breast_mp_results = np.zeros(shape=len(clfs))

for i in range(len(clfs)):
    reference_losses = _dropout_loss_by_variable(clfs_mp[0])
    model_losses = _dropout_loss_by_variable(clfs_mp[i])
    # Each result table is ordered by its own losses, so row k of one model
    # need not be the same feature as row k of another. Subtracting Series
    # that are indexed by variable name compares like with like.
    breast_mp_results[i] = (reference_losses - model_losses).abs().sum()

In [7]:
import pandas as pd

# Store the importance distances as a one-column table. The row index is the
# position of the classifier in `clfs`.
mp_frame = pd.DataFrame(breast_mp_results)
mp_frame.to_csv("./results/mp_breast.csv")

In [8]:
# Summed absolute difference between each model's profile predictions
# (`_yhat_`) and those of the first model in `clfs`; entry 0 is that model
# compared with itself.
# NOTE(review): the rows are compared as they come, which assumes every
# profile table lists the same features and grid points in the same order.
# TODO confirm.
breast_pd_cont_results = np.zeros(shape=len(clfs))
for i in range(len(clfs)):
    reference_yhat = clfs_pd_cont[0].result._yhat_
    model_yhat = clfs_pd_cont[i].result._yhat_
    breast_pd_cont_results[i] = np.abs(np.subtract(reference_yhat, model_yhat)).sum()

In [10]:
# Store the profile distances as a one-column table. The row index is the
# position of the classifier in `clfs`.
pd_cont_frame = pd.DataFrame(breast_pd_cont_results)
pd_cont_frame.to_csv("./results/pd_breast.csv")