In [42]:
import dalex as dx
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

dataset = 'student'
clf_name = "LR"

import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(clf_name, dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

clf_cart = make_pipeline_clf("CART")
clf_cart.fit(X, y)

clf_ebm = make_pipeline_clf("EBM")
clf_ebm.fit(X, y)

clf_lr_l2 = make_pipeline_clf("LR_l2")
clf_lr_l2.fit(X, y)

clf_gnb = make_pipeline_clf("GNB")
clf_gnb.fit(X, y)

clf_lr = make_pipeline_clf("LR")
clf_lr.fit(X, y)

clf_dl = make_pipeline_clf("DL")
clf_dl.fit(X, y)

clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
clf_ebm_exp = dx.Explainer(clf_ebm, X, y, label="EBM")
clf_lr_l2_exp = dx.Explainer(clf_lr_l2, X, y, label="LR_l2")
clf_gnb_exp = dx.Explainer(clf_gnb, X, y, label="GNB")
clf_lr_exp = dx.Explainer(clf_lr, X, y, label="LR")
clf_dl_exp = dx.Explainer(clf_dl, X, y, label="DL")




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Preparation of a new explainer is initiated

  -> data              : 395 rows 30 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 395 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f6ff8356e50> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.58, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 395 rows 30 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy

### Model Partial Dependence profile

In [43]:
preprocess

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['school', 'sex', 'address', 'famsize',
                                  'Pstatus', 'Mjob', 'Fjob', 'reason',
                                  'guardian', 'schoolsup', 'famsup', 'paid',
                                  'activities', 'nursery', 'higher', 'internet',
                                  'romantic']),
                                ('standardscaler', StandardScaler(),
                                 ['age', 'Medu', 'Fedu', 'traveltime',
                                  'studytime', 'failures', 'famrel', 'freetime',
                                  'goout', 'Dalc', 'Walc', 'health',
                                  'absences'])])

In [44]:
cat_feat = ['school', 'sex', 'address', 'famsize',
                                  'Pstatus', 'Mjob', 'Fjob', 'reason',
                                  'guardian', 'schoolsup', 'famsup', 'paid',
                                  'activities', 'nursery', 'higher', 'internet',
                                  'romantic']
cont_feat = ['age', 'Medu', 'Fedu', 'traveltime',
                                  'studytime', 'failures', 'famrel', 'freetime',
                                  'goout', 'Dalc', 'Walc', 'health',
                                  'absences']

In [45]:
#categorical pd profiles
pd_cart_cat = clf_cart_exp.model_profile(variable_type="categorical", variables=cat_feat)

pd_ebm_cat = clf_ebm_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_l2_cat = clf_lr_l2_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_gnb_cat = clf_gnb_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_lr_cat = clf_lr_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

pd_dl_cat = clf_dl_exp.model_profile(variables= cat_feat,
                                         variable_type="categorical")

#continous pd profiles
pd_cart = clf_cart_exp.model_profile(variables= cont_feat)

pd_ebm = clf_ebm_exp.model_profile(variables= cont_feat)

pd_lr_l2 = clf_lr_l2_exp.model_profile(variables= cont_feat)

pd_gnb = clf_gnb_exp.model_profile(variables= cont_feat)

pd_lr = clf_lr_exp.model_profile(variables= cont_feat)

pd_dl = clf_dl_exp.model_profile(variables= cont_feat)

Calculating ceteris paribus: 100%|██████████| 17/17 [00:01<00:00, 14.86it/s]
Calculating ceteris paribus: 100%|██████████| 17/17 [00:05<00:00,  3.24it/s]
Calculating ceteris paribus: 100%|██████████| 17/17 [00:02<00:00,  7.87it/s]
Calculating ceteris paribus: 100%|██████████| 17/17 [00:01<00:00, 14.30it/s]
Calculating ceteris paribus: 100%|██████████| 17/17 [00:02<00:00,  6.19it/s]
Calculating ceteris paribus: 100%|██████████| 17/17 [00:08<00:00,  1.95it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:07<00:00,  1.84it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [01:48<00:00,  8.32s/it]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:08<00:00,  1.62it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:08<00:00,  1.57it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:05<00:00,  2.31it/s]
Calculating ceteris paribus: 100%|██████████| 13/13 [00:08<00:00,  1.48it/s]


### Plot Aggregated Profiles for Categorical

In [46]:
# pd_cart_cat.plot([pd_ebm_cat, pd_gnb_cat, pd_lr_cat, pd_lr_l2_cat, pd_dl_cat], variables=cat_feat)

### Plot Aggregated Profiles for Continous

In [47]:
# pd_cart.plot([pd_ebm, pd_gnb, pd_lr, pd_lr_l2, pd_dl], variables=cont_feat)

### FEATURES ANALYSIS

In [48]:
import pandas as pd
import numpy as np

def calc(feat_gen, feat_comp):
    return np.abs(np.subtract(feat_gen["_yhat_"], feat_comp["_yhat_"])).sum()

def select_feat_result_cont(feat_name):
    feat_cart = pd_cart.result[pd_cart.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm.result[pd_ebm.result["_vname_"] == feat_name]
    feat_lr = pd_lr.result[pd_lr.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb.result[pd_gnb.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2.result[pd_lr_l2.result["_vname_"] == feat_name]
    feat_dl = pd_dl.result[pd_dl.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

def select_feat_result_cat(feat_name):
    feat_cart = pd_cart_cat.result[pd_cart_cat.result["_vname_"] == feat_name]
    feat_ebm = pd_ebm_cat.result[pd_ebm_cat.result["_vname_"] == feat_name]
    feat_lr = pd_lr_cat.result[pd_lr_cat.result["_vname_"] == feat_name]
    feat_gnb = pd_gnb_cat.result[pd_gnb_cat.result["_vname_"] == feat_name]
    feat_lr_l2 = pd_lr_l2_cat.result[pd_lr_l2_cat.result["_vname_"] == feat_name]
    feat_dl = pd_dl_cat.result[pd_dl_cat.result["_vname_"] == feat_name]
    
    #define generator clf
    gen_res = feat_lr

    cart_res = calc(gen_res, feat_cart)
    ebm_res = calc(gen_res, feat_ebm)
    lr_res = calc(gen_res, feat_lr)
    gnb_res = calc(gen_res, feat_gnb)
    lr_l2_res = calc(gen_res, feat_lr_l2)
    dl_res = calc(gen_res, feat_dl)

    dict_result = {
        "Param": feat_name,
        "CART": cart_res,
        "EBM": ebm_res,
        "LR": lr_res,
        "GNB": gnb_res,
        "LR_l2": lr_l2_res,
        "DL": dl_res
    }

    return dict_result

In [49]:
#**Continous results**
cont_results = []
for param in cont_feat:
    param_result = select_feat_result_cont(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cont_results.append(param_result)
cont_final_res = pd.concat(cont_results)
print(cont_final_res)

#Categorical results
cat_results = []
for param in cat_feat:
    param_result = select_feat_result_cat(param)
    param_result = pd.DataFrame(param_result, index =list(param_result.keys()))
    cat_results.append(param_result)
cat_final_res = pd.concat(cat_results)
print(cat_final_res)
pd.concat([cont_final_res, cat_final_res]).to_csv(f"../worst-case_results/{dataset}_{clf_name}.csv",index=False)

          Param      CART       EBM   LR        GNB     LR_l2         DL
Param       age  5.669493  4.554415  0.0   3.265673  5.591855  18.297975
CART        age  5.669493  4.554415  0.0   3.265673  5.591855  18.297975
EBM         age  5.669493  4.554415  0.0   3.265673  5.591855  18.297975
LR          age  5.669493  4.554415  0.0   3.265673  5.591855  18.297975
GNB         age  5.669493  4.554415  0.0   3.265673  5.591855  18.297975
...         ...       ...       ...  ...        ...       ...        ...
EBM    absences  5.104586  4.616018  0.0  19.212098  5.777106  21.693291
LR     absences  5.104586  4.616018  0.0  19.212098  5.777106  21.693291
GNB    absences  5.104586  4.616018  0.0  19.212098  5.777106  21.693291
LR_l2  absences  5.104586  4.616018  0.0  19.212098  5.777106  21.693291
DL     absences  5.104586  4.616018  0.0  19.212098  5.777106  21.693291

[91 rows x 7 columns]
          Param      CART       EBM   LR       GNB     LR_l2        DL
Param    school  0.028467  0.0