In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

# Model zoo: short label -> unfitted estimator.
# Insertion order matters: later cells address models by position and treat
# entry 0 ("CART") as the reference the other models are compared against.
# NOTE(review): `use_label_encoder` (xgboost) and `penalty="none"` (scikit-learn)
# are spellings tied to older library releases — confirm against the pinned
# versions before upgrading.
clfs = {
    "CART": DecisionTreeClassifier(random_state=1234, max_depth=1000),
    "RNF": RandomForestClassifier(random_state=1234),
    "XGB": XGBClassifier(use_label_encoder=False),
    "CAT": CatBoostClassifier(random_state=1234),
    # The ensemble wrappers have their own random_state (resampling /
    # per-member seeds), independent of the seed given to the base tree, so it
    # is pinned as well to keep refits of this cell reproducible.
    "ADA": AdaBoostClassifier(
        DecisionTreeClassifier(random_state=1234, max_depth=1000),
        random_state=1234,
    ),
    "BAG": BaggingClassifier(
        DecisionTreeClassifier(random_state=1234, max_depth=1000),
        random_state=1234,
    ),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2", random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234),
}

# Name of the dataset this run works on; handed to the loader below.
dataset = 'german'

import worstcase_helper
import importlib
# Re-execute the helper module so edits made to it after the first import are
# picked up without restarting the kernel.
importlib.reload(worstcase_helper)

# The loader returns three objects: `preprocess` is used as the first step of
# every pipeline built below, and `X`, `y` are the data the pipelines are fit on.
# NOTE(review): the loader's internals are not visible here — what `preprocess`
# does and the exact types of X / y should be confirmed in worstcase_helper.
preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    """Build the two-step pipeline for one entry of ``clfs``.

    Parameters
    ----------
    clf_name : str
        Key into the module-level ``clfs`` dict.

    Returns
    -------
    Pipeline
        Result of ``make_pipeline`` applied to the module-level ``preprocess``
        step followed by ``clfs[clf_name]``. Both objects are passed through
        as-is; this function does not copy them.

    Raises
    ------
    KeyError
        If ``clf_name`` is not a key of ``clfs``.
    """
    steps = (preprocess, clfs[clf_name])
    return make_pipeline(*steps)

# Fit one preprocessing + classifier pipeline per entry of `clfs` on the full
# (X, y) and collect the fitted pipelines in the dict's iteration order, so
# that clfs_list[k] corresponds to the k-th key of `clfs`.
clfs_list = []
for clf_name in clfs:
    clf = make_pipeline_clf(clf_name)
    # Pipeline.fit returns the pipeline itself, so the fitted object is stored.
    clfs_list.append(clf.fit(X, y))

import pickle

# Persist every fitted pipeline as ./new_models/<dataset>_<label>.p.
# The file name is derived from `dataset` instead of a hard-coded prefix, and
# each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails part-way through.
for clf_id, clf_name in enumerate(clfs):
    with open(f"./new_models/{dataset}_{clf_name}.p", "wb") as model_file:
        pickle.dump(clfs_list[clf_id], model_file)


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.010298
0:	learn: 0.6893013	total: 18.1ms	remaining: 18.1s
1:	learn: 0.6852891	total: 42.7ms	remaining: 21.3s
2:	learn: 0.6810563	total: 65.9ms	remaining: 21.9s
3:	learn: 0.6771276	total: 93.6ms	remaining: 23.3s
4:	learn: 0.6731918	total: 100ms	remaining: 19.9s
5:	learn: 0.6687944	total: 149ms	remaining: 24.7s
6:	learn: 0.6652476	total: 160ms	remaining: 22.7s
7:	learn: 0.6614817	total: 187ms	remaining: 23.2s
8:	learn: 0.6575609	total: 211ms	remaining: 23.2s
9:	learn: 0.6541293	total: 226ms	remaining: 22.4s
10:	learn: 0.6509054	total: 231ms	remaining: 20.7s
11:	learn: 0.6476905	total: 239ms	remaining: 19.6s
12:	learn: 0.6441217	total: 269ms	remaining: 20.4s
13:	learn: 0.6416342	total: 273ms	remaining: 19.2s
14:	learn: 0.6388796	total: 277ms	remaining: 18.2s
15:	learn: 0.6355795	total: 290ms	remaining: 17.8s
16:	learn: 0.6325726	total: 299ms	remaining: 17.3s
17:	learn: 0.6295537	total: 333ms	remaining: 18.2s
18:	learn: 0.6264594	total: 342ms	remaining: 17.6s
19:	lea

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
# clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
import dalex as dx
# Wrap each fitted pipeline in a dalex Explainer over the same (X, y), labelled
# with the model's key from `clfs`. Position k in the list matches position k
# in `clfs` / `clfs_list`.
clfs_explainers = [
    dx.Explainer(clfs_list[idx], X, y, label=model_label)
    for idx, model_label in enumerate(clfs)
]

Preparation of a new explainer is initiated

  -> data              : 999 rows 20 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 999 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7fd214440040> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.7, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 999 rows 20 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.

In [11]:
# Feature columns are named "att<k>". The two index tuples below split
# att1..att20 into the categorical and the continuous group (13 + 7 columns,
# no overlap).
cat_feat = [
    f"att{k}" for k in (1, 3, 4, 6, 7, 9, 10, 12, 14, 15, 17, 19, 20)
]
cont_feat = [
    f"att{k}" for k in (2, 5, 8, 11, 13, 16, 18)
]

In [12]:
# import dalex as dx
# Model profiles (dalex `model_profile` with its default profile type) for
# every explainer: first over the categorical features for all models, then
# over the continuous features for all models. Position k in each list matches
# position k in `clfs`.
clfs_pd_cat = [
    clfs_explainers[idx].model_profile(variables=cat_feat,
                                       variable_type='categorical')
    for idx in range(len(clfs))
]

clfs_pd_cont = [
    clfs_explainers[idx].model_profile(variables=cont_feat)
    for idx in range(len(clfs))
]

Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 20.16it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00, 10.39it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00, 11.31it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00,  9.55it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 27.29it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 18.61it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:04<00:00,  3.21it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 13.68it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 25.68it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 15.02it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:06<00:00,  2.05it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:02<00:00,  3.28it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:05<00:00,  1.30it/s]
Cal

In [13]:
# import dalex as dx
# One `model_parts()` explanation per explainer, computed with the library's
# default settings; position k matches position k in `clfs`.
clfs_mp = [
    clfs_explainers[idx].model_parts()
    for idx in range(len(clfs))
]

In [14]:
import numpy as np 
# For every model, the summed absolute difference between its dropout-loss
# column and that of the first model (index 0). Entry 0 compares the first
# model with itself.
# The [1:-1] slice drops the first and last row of each result table —
# presumably dalex's full-model / baseline rows; confirm.
# NOTE(review): the subtraction pairs rows by their index label in each result
# table, not by variable name. If dalex orders that table by dropout loss, row
# k can refer to different variables in the two models — confirm this is the
# intended comparison.
diabetes_mp_results = np.zeros(shape=len(clfs))

for idx in range(len(clfs)):
    reference_loss = clfs_mp[0].result.dropout_loss[1:-1]
    candidate_loss = clfs_mp[idx].result.dropout_loss[1:-1]
    loss_gap = np.subtract(reference_loss, candidate_loss)
    diabetes_mp_results[idx] = np.abs(loss_gap).sum()

In [15]:
import pandas as pd
# Write the per-model variable-importance distances to ./results/mp_<dataset>.csv.
# The file name follows `dataset` so runs on different datasets do not
# overwrite each other's results; the `diabetes_` variable prefix is kept
# because other cells refer to it.
pd.DataFrame(diabetes_mp_results).to_csv(f"./results/mp_{dataset}.csv")

In [16]:
# For every model, the summed absolute difference between its profile
# predictions (`_yhat_`) and those of the first model (index 0), computed
# separately for the categorical and the continuous profiles. Entry 0 compares
# the first model with itself.
diabetes_pd_cat_results = np.zeros(shape=len(clfs))
diabetes_pd_cont_results = np.zeros(shape=len(clfs))
for idx in range(len(clfs)):
    cat_gap = np.subtract(clfs_pd_cat[0].result._yhat_,
                          clfs_pd_cat[idx].result._yhat_)
    diabetes_pd_cat_results[idx] = np.abs(cat_gap).sum()

    cont_gap = np.subtract(clfs_pd_cont[0].result._yhat_,
                           clfs_pd_cont[idx].result._yhat_)
    diabetes_pd_cont_results[idx] = np.abs(cont_gap).sum()

In [17]:
# Total profile distance per model: categorical and continuous parts added
# element-wise.
diabetes_pd_results = np.add(diabetes_pd_cat_results, diabetes_pd_cont_results)

In [18]:
# Write the per-model profile distances to ./results/pd_<dataset>.csv.
# The file name follows `dataset` so runs on different datasets do not
# overwrite each other's results; the `diabetes_` variable prefix is kept
# because other cells refer to it.
pd.DataFrame(diabetes_pd_results).to_csv(f"./results/pd_{dataset}.csv")