In [42]:
import dalex as dx
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

dataset = 'german'
clf_name = "LR"

import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(clf_name, dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

clf_cart = make_pipeline_clf("CART")
clf_cart.fit(X, y)

clf_ebm = make_pipeline_clf("EBM")
clf_ebm.fit(X, y)

clf_lr_l2 = make_pipeline_clf("LR_l2")
clf_lr_l2.fit(X, y)

clf_gnb = make_pipeline_clf("GNB")
clf_gnb.fit(X, y)

clf_lr = make_pipeline_clf("LR")
clf_lr.fit(X, y)

clf_dl = make_pipeline_clf("DL")
clf_dl.fit(X, y)

clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
clf_ebm_exp = dx.Explainer(clf_ebm, X, y, label="EBM")
clf_lr_l2_exp = dx.Explainer(clf_lr_l2, X, y, label="LR_l2")
clf_gnb_exp = dx.Explainer(clf_gnb, X, y, label="GNB")
clf_lr_exp = dx.Explainer(clf_lr, X, y, label="LR")
clf_dl_exp = dx.Explainer(clf_dl, X, y, label="DL")




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Preparation of a new explainer is initiated

  -> data              : 999 rows 20 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 999 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7fe26e9a5e50> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.759, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 999 rows 20 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a nump

### Model Partial Dependence profile

In [43]:
preprocess

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['att1', 'att3', 'att4', 'att6', 'att7',
                                  'att9', 'att10', 'att12', 'att14', 'att15',
                                  'att17', 'att19', 'att20']),
                                ('standardscaler', StandardScaler(),
                                 ['att2', 'att5', 'att8', 'att11', 'att13',
                                  'att16', 'att18'])])

In [44]:
cat_feat = ['att1', 'att3', 'att4', 'att6', 'att7',
                                  'att9', 'att10', 'att12', 'att14', 'att15',
                                  'att17', 'att19', 'att20']
cont_feat = ['att2', 'att5', 'att8', 'att11', 'att13',
                                  'att16', 'att18']

In [45]:
#categorical pd profiles
pd_cart_cat = clf_cart_exp.model_profile(variable_type="categorical", variables=cat_feat)

pd_ebm_cat = clf_ebm_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_l2_cat = clf_lr_l2_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_gnb_cat = clf_gnb_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_cat = clf_lr_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_dl_cat = clf_dl_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

#continous pd profiles
pd_cart = clf_cart_exp.model_profile(variables= cont_feat)

pd_ebm = clf_ebm_exp.model_profile(variables= cont_feat)

pd_lr_l2 = clf_lr_l2_exp.model_profile(variables= cont_feat)

pd_gnb = clf_gnb_exp.model_profile(variables= cont_feat)

pd_lr = clf_lr_exp.model_profile(variables= cont_feat)

pd_dl = clf_dl_exp.model_profile(variables= cont_feat)

Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00, 10.76it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:06<00:00,  1.88it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00,  8.53it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 13.48it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:01<00:00,  7.60it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:06<00:00,  2.02it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:03<00:00,  2.25it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [01:10<00:00, 10.00s/it]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:03<00:00,  2.11it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:03<00:00,  2.03it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:03<00:00,  2.05it/s]
Calculating ceteris paribus: 100%|██████████| 7/7 [00:08<00:00,  1.21s/it]


### Plot Aggregated Profiles for Categorical

In [46]:
# pd_cart_cat.plot([pd_ebm_cat, pd_gnb_cat, pd_lr_cat, pd_lr_l2_cat, pd_dl_cat], variables=cat_feat)

### Plot Aggregated Profiles for Continous

In [47]:
# pd_cart.plot([pd_ebm, pd_gnb, pd_lr, pd_lr_l2, pd_dl], variables=cont_feat)

### FEATURES ANALYSIS

In [48]:
import pandas as pd
import numpy as np

def calc(feat_gen, feat_comp):
    return np.abs(np.subtract(feat_gen["_yhat_"], feat_comp["_yhat_"])).sum()

def select_feat_result_cont(feat_name):
    feat_cart = pd_cart.result[pd_cart.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm.result[pd_ebm.result["_vname_"] == feat_name]
    feat_lr = pd_lr.result[pd_lr.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb.result[pd_gnb.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2.result[pd_lr_l2.result["_vname_"] == feat_name]
    feat_dl = pd_dl.result[pd_dl.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

def select_feat_result_cat(feat_name):
    feat_cart = pd_cart_cat.result[pd_cart_cat.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm_cat.result[pd_ebm_cat.result["_vname_"] == feat_name]
    feat_lr = pd_lr_cat.result[pd_lr_cat.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb_cat.result[pd_gnb_cat.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2_cat.result[pd_lr_l2_cat.result["_vname_"] == feat_name]
    feat_dl = pd_dl_cat.result[pd_dl_cat.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

In [49]:
#**Continous results**
cont_results = []
for param in cont_feat:
    param_result = select_feat_result_cont(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cont_results.append(param_result)
cont_final_res = pd.concat(cont_results)
print(cont_final_res)

#Categorical results
cat_results = []
for param in cat_feat:
    param_result = select_feat_result_cat(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cat_results.append(param_result)
cat_final_res = pd.concat(cat_results)
print(cat_final_res)
pd.concat([cont_final_res, cat_final_res]).to_csv(f"../worst-case_results/{dataset}_{clf_name}.csv",index=False)

       Param       CART       EBM   LR        GNB     LR_l2         DL
Param   att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
CART    att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
EBM     att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
LR      att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
GNB     att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
LR_l2   att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
DL      att2   6.108433  4.684365  0.0  15.348068  3.211787  14.381691
Param   att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
CART    att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
EBM     att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
LR      att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
GNB     att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
LR_l2   att5  14.099630  6.864481  0.0  10.938970  6.093666  25.776194
DL    