In [1]:
import copy
import os
import sys
import warnings
from os.path import join as oj

import numpy as np
import openml
import pandas as pd
import shap
import sklearn
from sklearn import model_selection
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# sys.path.append(".")
# sys.path.append("..")
# sys.path.append("../..")
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *

warnings.filterwarnings("ignore", message="Bins whose width")

In [2]:
def apply_splitting_strategy(X: np.ndarray,
                             y: np.ndarray,
                             splitting_strategy: str,
                             split_seed: int):
    """Split ``X``/``y`` into train, optional tune, and test partitions.

    The held-out test fraction depends on ``splitting_strategy``:

    * ``'train-test-lowdata'`` / ``'train-tune-test-lowdata'`` -> 0.90
    * ``'train-test'`` -> 0.33
    * anything else (including ``'train-tune-test'``) -> 0.2

    For ``'train-tune-test'`` and ``'train-tune-test-lowdata'`` a further 20%
    of the remaining training rows is carved out as a tuning set.

    :param X: design matrix
    :param y: response
    :param splitting_strategy: name of the strategy (see above)
    :param split_seed: value forwarded as ``random_state`` to both
        ``train_test_split`` calls
    :return: ``(X_train, X_tune, X_test, y_train, y_tune, y_test)``;
        ``X_tune`` and ``y_tune`` are ``None`` when no tuning split is made
    """
    if splitting_strategy in {'train-test-lowdata', 'train-tune-test-lowdata'}:
        test_size = 0.90
    elif splitting_strategy == "train-test":
        test_size = 0.33
    else:
        # Unrecognised strategy names silently fall back to a 20% test split.
        test_size = 0.2

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=split_seed)
    X_tune = None
    y_tune = None

    if splitting_strategy in {'train-tune-test', 'train-tune-test-lowdata'}:
        # The tuning set is taken from the training portion only, reusing the same seed.
        X_train, X_tune, y_train, y_tune = model_selection.train_test_split(
            X_train, y_train, test_size=0.2, random_state=split_seed)

    return X_train, X_tune, X_test, y_train, y_tune, y_test

def mask_unimportant_features(X, sorted_feature, mask, mask_values):
    """Return a copy of ``X`` with each row's lower-ranked features overwritten.

    For every row, the first ``ceil(n_features * mask)`` column indices listed
    in that row of ``sorted_feature`` are left untouched; every remaining
    column index in the row's ranking is overwritten using ``mask_values``:

    * scalar entry -> the cell is set to that scalar;
    * sequence with two or more entries -> the cell becomes the second entry
      if it currently equals the first, otherwise the first entry;
    * sequence with one entry -> the cell is set to that entry (so a constant
      column is left as it is instead of raising ``IndexError``);
    * empty sequence -> the cell is left unchanged.

    :param X: 2-D array indexed as ``X[row, col]``; it is not modified
    :param sorted_feature: 2-D array of column indices, one ranking per row of ``X``
    :param mask: fraction of the ranked columns (from the front of each
        ranking) that is left untouched
    :param mask_values: mapping from column index to a scalar or a sequence of values
    :return: ``(num_masked, array)``. NOTE(review): despite its name,
        ``num_masked`` is the number of leading ranked columns that were *not*
        overwritten; the name is kept for compatibility with callers.
    """
    array = copy.deepcopy(X)
    num_features = array.shape[1]
    num_masked = int(np.ceil(num_features * mask))
    # Everything after the first `num_masked` ranked positions gets overwritten.
    selected_indices = sorted_feature[:, num_masked:]
    for row_idx in range(array.shape[0]):
        for col_idx in selected_indices[row_idx]:
            replacement = mask_values[col_idx]
            if isinstance(replacement, (int, float, np.integer, np.floating)):
                array[row_idx, col_idx] = replacement
            elif len(replacement) > 1 and array[row_idx, col_idx] == replacement[0]:
                # Two-valued feature currently at its first value: flip to the second.
                array[row_idx, col_idx] = replacement[1]
            elif len(replacement) > 0:
                # Either flip back to the first value, or (single-valued column)
                # keep the only value available.
                array[row_idx, col_idx] = replacement[0]

    return num_masked, array

def LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=None, mode="absolute"):
    """
    Compute local feature-importance scores from an RF+ model via ``RFPlusMDI``.

    :param X_train: training design matrix (explained together with ``y_train``)
    :param y_train: training response
    :param X_test: test design matrix (explained with ``y=None``)
    :param fit: fitted ``RandomForestPlusRegressor`` or ``RandomForestPlusClassifier``
    :param mode: ``"absolute"`` returns element-wise absolute scores; any other
        value returns the scores as produced by the explainer
    :return: tuple ``(train_scores, test_scores)``
    """
    assert isinstance(fit, (RandomForestPlusRegressor, RandomForestPlusClassifier))
    explainer = RFPlusMDI(fit, mode='only_k', evaluate_on="all")
    train_scores = explainer.explain_linear_partial(X=X_train, y=y_train, ranking=True)
    test_scores = explainer.explain_linear_partial(X=X_test, y=None, ranking=True)
    if mode != "absolute":
        return train_scores, test_scores
    return np.abs(train_scores), np.abs(test_scores)

def tree_shap_evaluation_RF_retrain(X_train, y_train, X_test, fit=None, mode="absolute"):
    """
    Compute per-observation TreeSHAP values on the train and test design matrices.

    For classifiers other than ``GradientBoostingClassifier`` only the values
    for output index 1 are returned; for every other model the SHAP output is
    returned as produced by the explainer.

    :param X_train: training design matrix
    :param y_train: unused; kept so the signature matches
        ``LFI_evaluation_MDIRFPlus_all_ranking_retrain``
    :param X_test: test design matrix
    :param fit: fitted model of interest (tree-based)
    :param mode: ``"absolute"`` returns element-wise absolute values; any other
        value returns the signed SHAP values
    :return: tuple ``(train_scores, test_scores)``
    """
    explainer = shap.TreeExplainer(fit)
    # Same branching as before: GradientBoostingClassifier is treated like a
    # single-output model, every other classifier gets the index-1 slice.
    pick_class_one = sklearn.base.is_classifier(fit) and not isinstance(fit, GradientBoostingClassifier)

    def _local_scores(X):
        values = explainer.shap_values(X, check_additivity=False)
        if pick_class_one:
            # Older SHAP releases return a list with one array per output;
            # newer ones return a single array with the outputs on the last axis.
            values = values[1] if isinstance(values, list) else values[:, :, 1]
        return np.abs(values) if mode == "absolute" else values

    return _local_scores(X_train), _local_scores(X_test)

In [3]:
# Fetch OpenML task 43 and pull its dataset as arrays.
# NOTE(review): the recorded output of this cell shows warnings pointing at the
# get_data(..., dataset_format="array") call -- presumably a deprecation; confirm
# against the installed openml version before changing the format.
task = openml.tasks.get_task(43)
dataset = task.get_dataset()
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")

# Subsample at most 1000 rows without replacement. A seeded generator keeps the
# subsample (and therefore every downstream number) reproducible across
# Restart & Run All, in line with the random_state=0 used everywhere else.
keep_idx = np.random.default_rng(0).choice(X.shape[0], min(1000, X.shape[0]), replace=False)
X = X[keep_idx, :]
y = y[keep_idx]
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", 0)

# Standardise using statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  exec(code_obj, self.user_global_ns, self.user_ns)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)
  X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")


In [4]:
# Per-column replacement used when masking a feature:
#   * exactly two distinct training values -> keep both, so masking can flip
#     the cell to the other value;
#   * otherwise -> the training mean of the column. This includes constant
#     columns (one distinct value), which previously produced a one-element
#     list that mask_unimportant_features could not flip.
mask_values = {}
for i in range(X_train.shape[1]):
    unique_values = np.unique(X_train[:, i])
    if len(unique_values) == 2:
        mask_values[i] = list(unique_values)
    else:
        mask_values[i] = np.mean(X_train[:, i])

In [5]:
# Base random forest; the same fitted instance is handed to RF+ below and to
# the TreeSHAP explainer later on.
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features="sqrt", random_state=0)
est.fit(X_train, y_train)

# RF+ with a cross-validated elastic-net logistic regression as prediction model.
# LogisticRegressionCV must be imported explicitly in the import cell rather
# than picked up through a star import.
elastic_net_logit = LogisticRegressionCV(
    penalty='elasticnet',
    l1_ratios=[0.1,0.5,0.99],
    solver='saga',
    cv=3,
    n_jobs=-1,
    tol=5e-4,
    max_iter=2000,
    random_state=0,
)
rf_plus_elastic = RandomForestPlusClassifier(rf_model=est, prediction_model=elastic_net_logit)
rf_plus_elastic.fit(X_train, y_train)

# Regression variant, kept for reference. NOTE(review): RandomForestRegressor is
# not imported explicitly in the import cell -- add the import before enabling.
# est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=0)
# est.fit(X_train, y_train)

# rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
# rf_plus_elastic.fit(X_train, y_train)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.8min finished


In [6]:
# Absolute local importance scores on the training rows for both methods;
# the test-set scores returned alongside them are discarded here.
local_fi_score_train_lmdi, _ = LFI_evaluation_MDIRFPlus_all_ranking_retrain(
    X_train, y_train, X_test,
    fit=rf_plus_elastic,
    mode="absolute",
)
local_fi_score_train_shap, _ = tree_shap_evaluation_RF_retrain(
    X_train, y_train, X_test,
    fit=est,
    mode="absolute",
)

# Per-row feature rankings: argsort of the negated scores puts the largest
# score first in each row.
sorted_feature_train_lmdi = np.argsort(np.negative(local_fi_score_train_lmdi))
sorted_feature_train_shap = np.argsort(np.negative(local_fi_score_train_shap))

# Models that get refit on the masked training data in the ablation cells.
ablation_models = {
    "RF_Classifier": RandomForestClassifier(random_state=0),
}
#ablation_models = {"RF_Regressor": RandomForestRegressor(random_state=0)}

In [None]:
# Ablation driven by the LMDI rankings. Each ratio is handed to
# mask_unimportant_features as `mask`; every ablation model is refit on the
# masked training matrix and scored (AUROC) on the unmodified X_test.
mask_ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
auroc_lmdi = []
for mask in mask_ratio:
    num_features_masked, X_train_masked = mask_unimportant_features(
        X_train, sorted_feature_train_lmdi, mask, mask_values
    )
    for a_model, ablation_estimator in ablation_models.items():
        ablation_estimator.fit(X_train_masked, y_train)
        # Column 1 of predict_proba is used as the score for roc_auc_score.
        y_pred = ablation_estimator.predict_proba(X_test)[:, 1]
        auroc_lmdi.append(roc_auc_score(y_test, y_pred))
        # Regression variant:
        # y_pred = ablation_models[a_model].predict(X_test)
        # print(r2_score(y_test, y_pred))
auroc_lmdi = np.array(auroc_lmdi)

0.9771047307812013
0.9859165888577652
0.9908574540927483
0.9915771864301276
0.9830765639589168
0.9877840024898847
0.9873949579831932
0.9934056956115779
0.9929193899782135
0.9914799253034547


In [None]:
# Ablation driven by the TreeSHAP rankings. Each ratio is handed to
# mask_unimportant_features as `mask`; every ablation model is refit on the
# masked training matrix and scored (AUROC) on the unmodified X_test.
mask_ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
auroc_shap = []
for mask in mask_ratio:
    num_features_masked, X_train_masked = mask_unimportant_features(
        X_train, sorted_feature_train_shap, mask, mask_values
    )
    for a_model, ablation_estimator in ablation_models.items():
        ablation_estimator.fit(X_train_masked, y_train)
        # Column 1 of predict_proba is used as the score for roc_auc_score.
        y_pred = ablation_estimator.predict_proba(X_test)[:, 1]
        auroc_shap.append(roc_auc_score(y_test, y_pred))
        # print(roc_auc_score(y_test, y_pred))
        # Regression variant:
        # y_pred = ablation_models[a_model].predict(X_test)
        # print(r2_score(y_test, y_pred))
auroc_shap = np.array(auroc_shap)

0.9861694677871149
0.9784080298786181
0.9239612511671333
0.9347961406784936
0.9627100840336135
0.91328197945845
0.9708605664488018
0.9781551509492685
0.9937947401182695
0.9914799253034547
