In [41]:
import dalex as dx
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

dataset = 'bank'
clf_name = "LR"

import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(clf_name, dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

clf_cart = make_pipeline_clf("CART")
clf_cart.fit(X, y)

clf_ebm = make_pipeline_clf("EBM")
clf_ebm.fit(X, y)

clf_lr_l2 = make_pipeline_clf("LR_l2")
clf_lr_l2.fit(X, y)

clf_gnb = make_pipeline_clf("GNB")
clf_gnb.fit(X, y)

clf_lr = make_pipeline_clf("LR")
clf_lr.fit(X, y)

clf_dl = make_pipeline_clf("DL")
clf_dl.fit(X, y)

clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
clf_ebm_exp = dx.Explainer(clf_ebm, X, y, label="EBM")
clf_lr_l2_exp = dx.Explainer(clf_lr_l2, X, y, label="LR_l2")
clf_gnb_exp = dx.Explainer(clf_gnb, X, y, label="GNB")
clf_lr_exp = dx.Explainer(clf_lr, X, y, label="LR")
clf_dl_exp = dx.Explainer(clf_dl, X, y, label="DL")




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Preparation of a new explainer is initiated

  -> data              : 45211 rows 15 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 45211 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f859cd1be50> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.062, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 45211 rows 15 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to 

### Model Partial Dependence profile

In [42]:
preprocess

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['job', 'marital', 'education', 'default',
                                  'housing', 'loan', 'contact', 'month',
                                  'poutcome']),
                                ('standardscaler', StandardScaler(),
                                 ['age', 'balance', 'day', 'duration',
                                  'campaign', 'previous'])])

In [43]:
cat_feat = ['job', 'marital', 'education', 'default',
                                  'housing', 'loan', 'contact', 'month',
                                  'poutcome']
cont_feat = ['age', 'balance', 'day', 'duration',
                                  'campaign', 'previous']

In [44]:
#categorical pd profiles
pd_cart_cat = clf_cart_exp.model_profile(variable_type="categorical", variables=cat_feat)

pd_ebm_cat = clf_ebm_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_l2_cat = clf_lr_l2_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_gnb_cat = clf_gnb_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_cat = clf_lr_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_dl_cat = clf_dl_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

#continous pd profiles
pd_cart = clf_cart_exp.model_profile(variables= cont_feat)

pd_ebm = clf_ebm_exp.model_profile(variables= cont_feat)

pd_lr_l2 = clf_lr_l2_exp.model_profile(variables= cont_feat)

pd_gnb = clf_gnb_exp.model_profile(variables= cont_feat)

pd_lr = clf_lr_exp.model_profile(variables= cont_feat)

pd_dl = clf_dl_exp.model_profile(variables= cont_feat)

Calculating ceteris paribus: 100%|██████████| 9/9 [00:00<00:00, 12.28it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:04<00:00,  1.81it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:01<00:00,  7.48it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:00<00:00, 11.18it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:01<00:00,  8.22it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:04<00:00,  2.18it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:02<00:00,  2.63it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:54<00:00,  9.10s/it]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:03<00:00,  1.97it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:02<00:00,  2.04it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:02<00:00,  2.17it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:05<00:00,  1.08it/s]


### Plot Aggregated Profiles for Categorical

In [45]:
# pd_cart_cat.plot([pd_ebm_cat, pd_gnb_cat, pd_lr_cat, pd_lr_l2_cat, pd_dl_cat], variables=cat_feat)

### Plot Aggregated Profiles for Continous

In [46]:
# pd_cart.plot([pd_ebm, pd_gnb, pd_lr, pd_lr_l2, pd_dl], variables=cont_feat)

### FEATURES ANALYSIS

In [47]:
import pandas as pd
import numpy as np

def calc(feat_gen, feat_comp):
    return np.abs(np.subtract(feat_gen["_yhat_"], feat_comp["_yhat_"])).sum()

def select_feat_result_cont(feat_name):
    feat_cart = pd_cart.result[pd_cart.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm.result[pd_ebm.result["_vname_"] == feat_name]
    feat_lr = pd_lr.result[pd_lr.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb.result[pd_gnb.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2.result[pd_lr_l2.result["_vname_"] == feat_name]
    feat_dl = pd_dl.result[pd_dl.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

def select_feat_result_cat(feat_name):
    feat_cart = pd_cart_cat.result[pd_cart_cat.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm_cat.result[pd_ebm_cat.result["_vname_"] == feat_name]
    feat_lr = pd_lr_cat.result[pd_lr_cat.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb_cat.result[pd_gnb_cat.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2_cat.result[pd_lr_l2_cat.result["_vname_"] == feat_name]
    feat_dl = pd_dl_cat.result[pd_dl_cat.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

In [48]:
#**Continous results**
cont_results = []
for param in cont_feat:
    param_result = select_feat_result_cont(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cont_results.append(param_result)
cont_final_res = pd.concat(cont_results)
print(cont_final_res)

#Categorical results
cat_results = []
for param in cat_feat:
    param_result = select_feat_result_cat(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cat_results.append(param_result)
cat_final_res = pd.concat(cat_results)
print(cat_final_res)
pd.concat([cont_final_res, cat_final_res]).to_csv(f"../worst-case_results/{dataset}_{clf_name}.csv",index=False)

          Param       CART        EBM   LR        GNB     LR_l2         DL
Param       age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
CART        age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
EBM         age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
LR          age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
GNB         age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
LR_l2       age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
DL          age   3.285423   1.102898  0.0   8.698937  3.482423   4.643256
Param   balance   2.307880   3.546941  0.0  63.123988  4.204960   2.935449
CART    balance   2.307880   3.546941  0.0  63.123988  4.204960   2.935449
EBM     balance   2.307880   3.546941  0.0  63.123988  4.204960   2.935449
LR      balance   2.307880   3.546941  0.0  63.123988  4.204960   2.935449
GNB     balance   2.307880   3.546941  0.0  63.123988  4.204960   2.935449
LR_l2   balance   2.30788