In [169]:
import dalex as dx
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

dataset = 'heart'
clf_name = "CART"

In [170]:
import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(clf_name, dataset)

In [171]:
preprocess

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['Sex', 'cp', 'Place']),
                                ('standardscaler', StandardScaler(),
                                 ['Age', 'trestbps', 'chol', 'fbs', 'restecg',
                                  'thalach', 'exang', 'oldpeak'])])

## Make Pipelines

In [172]:
from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

In [173]:
clf_cart = make_pipeline_clf("CART")
clf_cart.fit(X, y)

clf_ebm = make_pipeline_clf("EBM")
clf_ebm.fit(X, y)

clf_lr_l2 = make_pipeline_clf("LR_l2")
clf_lr_l2.fit(X, y)

clf_gnb = make_pipeline_clf("GNB")
clf_gnb.fit(X, y)

clf_lr = make_pipeline_clf("LR")
clf_lr.fit(X, y)

clf_dl = make_pipeline_clf("DL")
clf_dl.fit(X, y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['Sex', 'cp', 'Place']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'trestbps', 'chol',
                                                   'fbs', 'restecg', 'thalach',
                                                   'exang', 'oldpeak'])])),
                ('decisionlistclassifier',
                 <interpret.glassbox.skoperules.DecisionListClassifier object at 0x7f16d53db910>)])

## Add Explainers

In [174]:
clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
clf_ebm_exp = dx.Explainer(clf_ebm, X, y, label="EBM")
clf_lr_l2_exp = dx.Explainer(clf_lr_l2, X, y, label="LR_l2")
clf_gnb_exp = dx.Explainer(clf_gnb, X, y, label="GNB")
clf_lr_exp = dx.Explainer(clf_lr, X, y, label="LR")
clf_dl_exp = dx.Explainer(clf_dl, X, y, label="DL")

Preparation of a new explainer is initiated

  -> data              : 457 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 457 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f16e2023dc0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.46, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 457 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy

## Model partial dependence profiles

In [175]:
cat_feat = ['Sex', 'cp', 'Place']
cont_feat = ['Age', 'trestbps', 'chol',
             'fbs', 'restecg', 'thalach',
             'exang', 'oldpeak']

In [176]:
#categorical pd profiles
pd_cart_cat = clf_cart_exp.model_profile(variable_type="categorical", variables=cat_feat)

pd_ebm_cat = clf_ebm_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_l2_cat = clf_lr_l2_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_gnb_cat = clf_gnb_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_cat = clf_lr_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_dl_cat = clf_dl_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")


Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00, 38.60it/s]
Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00, 15.52it/s]
Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00, 39.12it/s]
Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00, 41.24it/s]
Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00, 40.97it/s]
Calculating ceteris paribus: 100%|██████████| 3/3 [00:00<00:00,  4.56it/s]


In [177]:
#continous pd profiles
pd_cart = clf_cart_exp.model_profile(variables= cont_feat)

pd_ebm = clf_ebm_exp.model_profile(variables= cont_feat)

pd_lr_l2 = clf_lr_l2_exp.model_profile(variables= cont_feat)

pd_gnb = clf_gnb_exp.model_profile(variables= cont_feat)

pd_lr = clf_lr_exp.model_profile(variables= cont_feat)

pd_dl = clf_dl_exp.model_profile(variables= cont_feat)

Calculating ceteris paribus: 100%|██████████| 8/8 [00:00<00:00,  9.85it/s]
Calculating ceteris paribus: 100%|██████████| 8/8 [00:11<00:00,  1.49s/it]
Calculating ceteris paribus: 100%|██████████| 8/8 [00:00<00:00,  8.04it/s]
Calculating ceteris paribus: 100%|██████████| 8/8 [00:01<00:00,  7.10it/s]
Calculating ceteris paribus: 100%|██████████| 8/8 [00:01<00:00,  7.13it/s]
Calculating ceteris paribus: 100%|██████████| 8/8 [00:02<00:00,  3.40it/s]


### Plot Aggregated Profiles for Categorical

In [178]:
pd_cart_cat.plot([pd_ebm_cat, pd_gnb_cat, pd_lr_cat, pd_lr_l2_cat, pd_dl_cat], variables=cat_feat)

### Plot Aggregated Profiles for Continous

In [179]:
pd_cart.plot([pd_ebm, pd_gnb, pd_lr, pd_lr_l2, pd_dl], variables=cont_feat)

### FEATURES ANALYSIS

In [180]:
import pandas as pd

In [181]:
cat_params = ['Sex', 'cp', 'Place']
cont_params = ['Age', 'trestbps', 'chol',
             'fbs', 'restecg', 'thalach',
             'exang', 'oldpeak']

In [182]:
def calc(feat_gen, feat_comp):
    return np.abs(np.subtract(feat_gen["_yhat_"], feat_comp["_yhat_"])).sum()

In [183]:
def select_feat_result_cont(feat_name):
    feat_cart = pd_cart.result[pd_cart.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm.result[pd_ebm.result["_vname_"] == feat_name]
    feat_lr = pd_lr.result[pd_lr.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb.result[pd_gnb.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2.result[pd_lr_l2.result["_vname_"] == feat_name]
    feat_dl = pd_dl.result[pd_dl.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_cart

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

In [184]:
def select_feat_result_cat(feat_name):
    feat_cart = pd_cart_cat.result[pd_cart_cat.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm_cat.result[pd_ebm_cat.result["_vname_"] == feat_name]
    feat_lr = pd_lr_cat.result[pd_lr_cat.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb_cat.result[pd_gnb_cat.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2_cat.result[pd_lr_l2_cat.result["_vname_"] == feat_name]
    feat_dl = pd_dl_cat.result[pd_dl_cat.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_cart

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

**Continous results**

In [185]:
cont_results = []
for param in cont_feat:
    param_result = select_feat_result_cont(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cont_results.append(param_result)

In [186]:
cont_final_res = pd.concat(cont_results)
cont_final_res

Unnamed: 0,Param,CART,EBM,LR,GNB,LR_l2,DL
Param,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
CART,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
EBM,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
LR,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
GNB,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
LR_l2,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
DL,Age,0.0,6.866657,6.430196,7.009807,6.108174,14.136835
Param,trestbps,0.0,5.733917,4.731534,4.962601,5.934755,14.999044
CART,trestbps,0.0,5.733917,4.731534,4.962601,5.934755,14.999044
EBM,trestbps,0.0,5.733917,4.731534,4.962601,5.934755,14.999044


**Categorical results**

In [187]:
cat_results = []
for param in cat_feat:
    param_result = select_feat_result_cat(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cat_results.append(param_result)

In [188]:
cat_final_res = pd.concat(cat_results)
cat_final_res

Unnamed: 0,Param,CART,EBM,LR,GNB,LR_l2,DL
Param,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
CART,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
EBM,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
LR,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
GNB,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
LR_l2,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
DL,Sex,0.0,0.031532,0.039022,0.112552,0.025217,0.42246
Param,cp,0.0,0.313618,0.221388,0.458507,0.242411,0.401694
CART,cp,0.0,0.313618,0.221388,0.458507,0.242411,0.401694
EBM,cp,0.0,0.313618,0.221388,0.458507,0.242411,0.401694


In [189]:
pd.concat([cont_final_res, cat_final_res]).to_csv(f"../worst-case_results/{dataset}_{clf_name}.csv",index=False)