In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

clfs = {
    "CART": DecisionTreeClassifier(random_state=1234, max_depth=1000),
    "RNF": RandomForestClassifier(random_state=1234),
    "XGB": XGBClassifier(use_label_encoder=False),
    "CAT": CatBoostClassifier(random_state=1234),
    "ADA": AdaBoostClassifier(DecisionTreeClassifier(random_state=1234, max_depth=1000)),
    "BAG": BaggingClassifier(DecisionTreeClassifier(random_state=1234, max_depth=1000)),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
  
}

dataset = 'climate'

import worstcase_helper
import importlib
importlib.reload(worstcase_helper)

preprocess, X, y = worstcase_helper.load_dataset_with_preprocess(dataset)

from sklearn.pipeline import make_pipeline
def make_pipeline_clf(clf_name):
    clf = make_pipeline(
        preprocess,
        clfs[clf_name]
    )
    return clf

clfs_list = []
for clf_id, clf_name in enumerate(clfs):
    clf = make_pipeline_clf(clf_name)
    clf.fit(X, y)
    clfs_list.append(clf)

import pickle

for clf_id, clf_name in enumerate(clfs):
    pickle.dump(clfs_list[clf_id], open(f"./new_models/climate_{clf_name}.p", 'wb'))


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.007919
0:	learn: 0.6828603	total: 91.5ms	remaining: 1m 31s
1:	learn: 0.6727308	total: 109ms	remaining: 54.3s
2:	learn: 0.6627139	total: 128ms	remaining: 42.5s
3:	learn: 0.6524652	total: 147ms	remaining: 36.7s
4:	learn: 0.6451967	total: 163ms	remaining: 32.4s
5:	learn: 0.6370684	total: 181ms	remaining: 30s
6:	learn: 0.6276676	total: 216ms	remaining: 30.6s
7:	learn: 0.6189083	total: 244ms	remaining: 30.2s
8:	learn: 0.6095272	total: 260ms	remaining: 28.6s
9:	learn: 0.6016644	total: 283ms	remaining: 28s
10:	learn: 0.5937911	total: 314ms	remaining: 28.2s
11:	learn: 0.5869008	total: 356ms	remaining: 29.3s
12:	learn: 0.5800963	total: 405ms	remaining: 30.7s
13:	learn: 0.5709766	total: 441ms	remaining: 31s
14:	learn: 0.5633487	total: 456ms	remaining: 29.9s
15:	learn: 0.5576049	total: 482ms	remaining: 29.6s
16:	learn: 0.5516256	total: 493ms	remaining: 28.5s
17:	learn: 0.5456959	total: 517ms	remaining: 28.2s
18:	learn: 0.5381734	total: 544ms	remaining: 28.1s
19:	learn: 0.53

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [2]:
# clf_cart_exp = dx.Explainer(clf_cart, X, y, label="CART")
import dalex as dx
clfs_explainers = []
for clf_id, clf_name in enumerate(clfs):
    clf_exp = dx.Explainer(clfs_list[clf_id], X, y, label=clf_name)
    clfs_explainers.append(clf_exp)

Preparation of a new explainer is initiated

  -> data              : 540 rows 18 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 540 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : CART
  -> predict function  : <function yhat_proba_default at 0x7f4e294f1310> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.915, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 540 rows 18 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a nump

In [3]:
cont_feat = ['vconst_corr', 'vconst_2', 'vconst_3',
                                  'vconst_4', 'vconst_5', 'vconst_7', 'ah_corr',
                                  'ah_bolus', 'slm_corr', 'efficiency_factor',
                                  'tidal_mix_max', 'vertical_decay_scale',
                                  'convect_corr', 'bckgrnd_vdc1',
                                  'bckgrnd_vdc_ban', 'bckgrnd_vdc_eq',
                                  'bckgrnd_vdc_psim', 'Prandtl']

In [4]:
# import dalex as dx
# clfs_pd_cat = []
clfs_pd_cont = []
# for clf_id, clf_name in enumerate(clfs):
#     clf_pd_cat = clfs_explainers[clf_id].model_profile( variables = cat_feat,
#                                                         variable_type='categorical')
#     clfs_pd_cat.append(clf_pd_cat)

for clf_id, clf_name in enumerate(clfs):
    clf_pd_cont = clfs_explainers[clf_id].model_profile( variables = cont_feat)
    clfs_pd_cont.append(clf_pd_cont)

Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 17.41it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:07<00:00,  2.46it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:03<00:00,  5.27it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:02<00:00,  7.03it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:00<00:00, 20.99it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:02<00:00,  7.59it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00,  9.10it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 14.59it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 15.17it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:01<00:00, 10.48it/s]
Calculating ceteris paribus: 100%|██████████| 18/18 [00:09<00:00,  1.85it/s]


In [5]:
# import dalex as dx
clfs_mp = []
for clf_id, clf_name in enumerate(clfs):
    clf_mp = clfs_explainers[clf_id].model_parts()
    clfs_mp.append(clf_mp)

In [6]:
import numpy as np 
climate_mp_results = np.zeros(shape=len(clfs))

for i in range(len(clfs)):
    climate_mp_results[i] = np.abs(np.subtract(clfs_mp[0].result.dropout_loss[1:-1], clfs_mp[i].result.dropout_loss[1:-1])).sum()

In [7]:
import pandas as pd
pd.DataFrame(climate_mp_results).to_csv(f"./results/mp_climate.csv")

In [8]:
# adult_pd_cat_results = np.zeros(shape=len(clfs))
climate_pd_cont_results = np.zeros(shape=len(clfs))
for i in range(len(clfs)):
    # adult_pd_cat_results[i] = np.abs(np.subtract(clfs_pd_cat[0].result._yhat_, clfs_pd_cat[i].result._yhat_)).sum()
    climate_pd_cont_results[i] = np.abs(np.subtract(clfs_pd_cont[0].result._yhat_, clfs_pd_cont[i].result._yhat_)).sum()

In [9]:
pd.DataFrame(climate_pd_cont_results).to_csv(f"./results/pd_climate.csv")