In [38]:
import dalex as dx
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

dataset = 'climate'
clf_name = "LR_l2"

import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(clf_name, dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

clf_cart = make_pipeline_clf("CART")
clf_cart.fit(X, y)

clf_ebm = make_pipeline_clf("EBM")
clf_ebm.fit(X, y)

clf_lr_l2 = make_pipeline_clf("LR_l2")
clf_lr_l2.fit(X, y)

clf_gnb = make_pipeline_clf("GNB")
clf_gnb.fit(X, y)

clf_lr = make_pipeline_clf("LR")
clf_lr.fit(X, y)

clf_dl = make_pipeline_clf("DL")
clf_dl.fit(X, y)

clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
clf_ebm_exp = dx.Explainer(clf_ebm, X, y, label="EBM")
clf_lr_l2_exp = dx.Explainer(clf_lr_l2, X, y, label="LR_l2")
clf_gnb_exp = dx.Explainer(clf_gnb, X, y, label="GNB")
clf_lr_exp = dx.Explainer(clf_lr, X, y, label="LR")
clf_dl_exp = dx.Explainer(clf_dl, X, y, label="DL")




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Preparation of a new explainer is initiated

  -> data              : 540 rows 18 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 540 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f9b76b80e50> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.941, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 540 rows 18 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a nump

### Model Partial Dependence profile

In [39]:
preprocess

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 []),
                                ('standardscaler', StandardScaler(),
                                 ['vconst_corr', 'vconst_2', 'vconst_3',
                                  'vconst_4', 'vconst_5', 'vconst_7', 'ah_corr',
                                  'ah_bolus', 'slm_corr', 'efficiency_factor',
                                  'tidal_mix_max', 'vertical_decay_scale',
                                  'convect_corr', 'bckgrnd_vdc1',
                                  'bckgrnd_vdc_ban', 'bckgrnd_vdc_eq',
                                  'bckgrnd_vdc_psim', 'Prandtl'])])

In [40]:
cont_feat = ['vconst_corr', 'vconst_2', 'vconst_3',
                                  'vconst_4', 'vconst_5', 'vconst_7', 'ah_corr',
                                  'ah_bolus', 'slm_corr', 'efficiency_factor',
                                  'tidal_mix_max', 'vertical_decay_scale',
                                  'convect_corr', 'bckgrnd_vdc1',
                                  'bckgrnd_vdc_ban', 'bckgrnd_vdc_eq',
                                  'bckgrnd_vdc_psim', 'Prandtl']

In [41]:


#continous pd profiles
pd_cart = clf_cart_exp.model_profile(variables= cont_feat)

pd_ebm = clf_ebm_exp.model_profile(variables= cont_feat)

pd_lr_l2 = clf_lr_l2_exp.model_profile(variables= cont_feat)

pd_gnb = clf_gnb_exp.model_profile(variables= cont_feat)

pd_lr = clf_lr_exp.model_profile(variables= cont_feat)

pd_dl = clf_dl_exp.model_profile(variables= cont_feat)

Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 13.37it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:03<00:00,  5.03it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:02<00:00,  8.03it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 10.36it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:02<00:00,  6.69it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:18<00:00,  1.01s/it]


### Plot Aggregated Profiles for Continous

In [42]:
pd_cart.plot([pd_ebm, pd_gnb, pd_lr, pd_lr_l2, pd_dl], variables=cont_feat)

### FEATURES ANALYSIS

In [43]:
import pandas as pd
import numpy as np

def calc(feat_gen, feat_comp):
    return np.abs(np.subtract(feat_gen["_yhat_"], feat_comp["_yhat_"])).sum()

def select_feat_result_cont(feat_name):
    feat_cart = pd_cart.result[pd_cart.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm.result[pd_ebm.result["_vname_"] == feat_name]
    feat_lr = pd_lr.result[pd_lr.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb.result[pd_gnb.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2.result[pd_lr_l2.result["_vname_"] == feat_name]
    feat_dl = pd_dl.result[pd_dl.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr_l2

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

In [44]:
#**Continous results**
cont_results = []
for param in cont_feat:
    param_result = select_feat_result_cont(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cont_results.append(param_result)
cont_final_res = pd.concat(cont_results)
print(cont_final_res)

cont_final_res.to_csv(f"../worst-case_results/{dataset}_{clf_name}.csv",index=False)

             Param      CART       EBM        LR       GNB  LR_l2        DL
Param  vconst_corr  1.651872  2.331451  1.401976  1.230381    0.0  5.468151
CART   vconst_corr  1.651872  2.331451  1.401976  1.230381    0.0  5.468151
EBM    vconst_corr  1.651872  2.331451  1.401976  1.230381    0.0  5.468151
LR     vconst_corr  1.651872  2.331451  1.401976  1.230381    0.0  5.468151
GNB    vconst_corr  1.651872  2.331451  1.401976  1.230381    0.0  5.468151
...            ...       ...       ...       ...       ...    ...       ...
EBM        Prandtl  0.407467  0.459805  1.067185  0.540151    0.0  5.246932
LR         Prandtl  0.407467  0.459805  1.067185  0.540151    0.0  5.246932
GNB        Prandtl  0.407467  0.459805  1.067185  0.540151    0.0  5.246932
LR_l2      Prandtl  0.407467  0.459805  1.067185  0.540151    0.0  5.246932
DL         Prandtl  0.407467  0.459805  1.067185  0.540151    0.0  5.246932

[126 rows x 7 columns]
