In [1]:
import copy
import os
from os.path import join as oj
import glob
import argparse
import pickle as pkl
import time
import warnings
from scipy import stats
import dask
from dask.distributed import Client
import numpy as np
from tqdm import tqdm
import sys
from collections import defaultdict
from typing import Callable, List, Tuple
import itertools
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error
import sys
sys.path.append(".")
sys.path.append("..")
sys.path.append("../../imodels/")

warnings.filterwarnings("ignore", message="Bins whose width")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score, mean_squared_error

from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier, \
    RidgeRegressorPPM, LassoRegressorPPM, IdentityTransformer
from imodels.importance.rf_plus import _fast_r2_score
import seaborn as sns
from util import ModelConfig, FIModelConfig, tp, fp, neg, pos, specificity_score, auroc_score, auprc_score, compute_nsg_feat_corr_w_sig_subspace, apply_splitting_strategy
from scripts.competing_methods_local import *
from scripts.simulations_util import *

ModuleNotFoundError: No module named 'imodels.importance'

In [2]:
def generate_random_shuffle(data, seed):
    """
    Build an array whose columns are independent random permutations of the
    columns of ``data``.

    The global NumPy random state is re-seeded with ``seed`` on every call, so
    the same ``(data, seed)`` pair always produces the same result.

    :param data: 2-D array, indexed column-wise as ``data[:, i]``.
    :param seed: value handed to ``np.random.seed``.
    :return: new array; column ``i`` is a permutation of ``data[:, i]``.
    """
    np.random.seed(seed)
    shuffled_columns = []
    for col in range(data.shape[1]):
        shuffled_columns.append(np.random.permutation(data[:, col]))
    # The permuted columns were collected as rows, so transpose them back.
    return np.array(shuffled_columns).T


def ablation(data, feature_importance, mode, num_features, seed):
    """
    For every sample, overwrite ``num_features`` of its features with values
    taken from a column-wise random shuffle of ``data``.

    Which features are overwritten is decided per sample from that sample's row
    of ``feature_importance``: the largest scores when ``mode == "max"``, the
    smallest scores when ``mode == "min"``.

    :param data: 2-D array of samples x features; it is not modified.
    :param feature_importance: object exposing ``to_numpy()`` that yields a
        2-D score array aligned with ``data``.
    :param mode: ``"max"`` or ``"min"``.
    :param num_features: how many features to replace in each sample.
    :param seed: seed forwarded to ``generate_random_shuffle``.
    :return: copy of ``data`` with the selected entries replaced.
    """
    assert mode in ["max", "min"]
    importance = feature_importance.to_numpy()
    shuffled = generate_random_shuffle(data.copy(), seed)
    # argsort is ascending, so negate the scores to rank the largest first.
    ranking = np.argsort(-importance) if mode == "max" else np.argsort(importance)
    ablated = data.copy()
    for row in range(data.shape[0]):
        for rank in range(num_features):
            col = ranking[row, rank]
            ablated[row, col] = shuffled[row, col]
    return ablated


#### Demo: computing local feature importance (LFI) scores on a synthetic dataset

In [3]:
# Seeds and problem size: n samples, d features, the first s of which carry signal.
seed, split_seed = 0, 0
n, d, s = 200, 10, 5

# Two groups of per-feature means: all zeros, and 10 on the first s features.
# How the groups are sampled is up to sample_normal_X_subgroups.
mean = [[0] * d, [10] * s + [0] * (d - s)]
scale = [[1] * d for _ in range(2)]
X = sample_normal_X_subgroups(n, d, mean, scale)

# Noise-free linear response that uses only the first s features.
beta = np.concatenate((np.ones(s), np.zeros(d - s)))
y = np.matmul(X, beta)

X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", split_seed)

In [4]:
# Fit RF+ on top of a random forest with include_raw=False, then display the
# per-sample scores stored under the "lfi" key.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_plus_model = RandomForestPlusRegressor(
    rf_model=copy.deepcopy(rf_regressor),
    include_raw=False,
)
rf_plus_model.fit(X_train, y_train)
score = rf_plus_model.get_mdi_plus_scores(
    X_test,
    y_test,
    lfi=True,
    lfi_abs="none",
    sample_split=None,
    train_or_test="test",
)
score["lfi"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-2.802318,-4.855034,-6.534856,-6.303020,-5.666409,0.003367,-0.115584,-0.005208,-0.054205,-0.105709
1,4.235663,4.058804,7.247494,4.568692,3.723577,0.240077,-0.173623,0.056478,-0.103587,-0.024177
2,3.427314,4.358826,7.265487,4.922750,4.078967,0.022970,-0.032465,-0.048909,0.003921,-0.033833
3,-6.390700,-4.386212,-8.782353,-2.667173,-2.487387,-0.038786,-0.009141,-0.010059,0.058067,-0.004545
4,3.855409,4.314597,7.098136,4.671592,3.572662,-0.121405,0.386412,-0.018923,-0.095826,-0.019706
...,...,...,...,...,...,...,...,...,...,...
61,-4.678110,-4.993524,-7.039339,-6.902901,-1.957546,0.027181,-0.043465,0.000381,0.014384,0.062693
62,4.235663,4.058804,7.085392,4.551293,4.100747,-0.109350,0.015101,0.056478,-0.103587,-0.087515
63,-2.773373,-5.546139,-5.455163,-6.975617,-5.273687,0.089041,-0.023757,-0.001291,0.095483,-0.144661
64,3.250866,4.433513,7.279385,4.623573,4.017007,-0.134517,0.015101,-0.021426,0.024746,0.013218


In [5]:
# Same experiment as the previous cell, but with include_raw=True.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_plus_model = RandomForestPlusRegressor(
    rf_model=copy.deepcopy(rf_regressor),
    include_raw=True,
)
rf_plus_model.fit(X_train, y_train)
score = rf_plus_model.get_mdi_plus_scores(
    X_test,
    y_test,
    lfi=True,
    lfi_abs="none",
    sample_split=None,
    train_or_test="test",
)
score["lfi"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-3.666780,-4.573905,-6.975479,-6.084630,-6.224678,0.006328,-0.035066,0.005080,-0.014931,-0.016010
1,3.237603,3.885691,6.807192,4.577892,6.923594,0.014024,-0.011022,-0.013137,0.013587,-0.007547
2,4.138463,5.116393,7.990031,4.345799,3.765652,0.011890,0.007317,0.003906,0.018744,-0.016710
3,-4.701403,-4.654211,-7.689046,-3.572961,-5.217060,-0.022063,0.010813,0.006639,-0.010876,-0.019078
4,3.779464,4.193490,6.045288,5.470785,4.547649,-0.018626,0.028545,0.010906,0.019167,-0.007963
...,...,...,...,...,...,...,...,...,...,...
61,-3.969411,-6.326621,-6.232090,-6.912097,-4.267484,0.026549,0.001136,0.010596,-0.010539,-0.010633
62,4.086002,3.851585,6.274630,3.956496,6.276763,-0.008485,-0.008097,-0.017143,0.022414,-0.011498
63,-3.666791,-5.238759,-5.150104,-6.139017,-5.913825,0.024400,-0.015721,0.013183,0.003138,0.013493
64,3.123538,3.543525,6.164842,3.341763,6.544530,0.036032,-0.009216,0.000239,-0.006303,-0.014579


#### Demo: computing local feature importance (LFI) scores on a real dataset

In [6]:
# Load the Diabetes classification features and labels from CSV.
X = sample_real_data(
    X_fpath="../data/classification_data/Diabetes/X_diabetes.csv",
    return_data="X",
)
# With return_data="y" the result is unpacked as a 3-tuple; only the first item is kept.
y, _, _ = sample_real_data(
    y_fpath="../data/classification_data/Diabetes/y_diabetes.csv",
    return_data="y",
)

split_seed = 0
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", split_seed)

In [45]:
# Fit an RF+ classifier (include_raw=False) on the current training split.
# NOTE(review): the base model is a classifier but is stored under the name
# `rf_regressor`; the name is kept here so other cells are unaffected.
rf_regressor = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)
rf_plus_model = RandomForestPlusClassifier(
    rf_model=copy.deepcopy(rf_regressor),
    include_raw=False,
)
rf_plus_model.fit(X_train, y_train)

In [8]:
# Run the kernel-SHAP evaluation helper with the current rf_plus_model.
# NOTE(review): the saved output of this cell ends in
# "ValueError: Must pass 2-d input. shape=(2, 254, 8)". The helper appears to
# receive a stack of score arrays rather than a single 2-D array — confirm and
# fix in the helper before relying on this cell.
kernel_shap_test_evaluation_RF_plus(X_train, y_train, X_test, y_test, rf_plus_model)

(514, 8) (254, 8)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Com

  0%|          | 0/254 [00:00<?, ?it/s]

ValueError: Must pass 2-d input. shape=(2, 254, 8)

In [25]:
# Kernel-SHAP scores for the first 10 test rows. X_train is passed as the first
# argument (presumably the background/reference data — TODO confirm).
kernel_shap_scores = rf_plus_model.get_kernel_shap_scores(X_train, X_test[:10])

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.
The `ipykernel.comm.Com

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# Sanity check: kernel_shap_scores[0] should equal the negation of kernel_shap_scores[1].
# NOTE(review): the saved outputs of the inspection cells further down show first
# rows for index 0 and index 1 that are not negatives of each other, and a third
# entry at index 2. They may come from a different kernel state — confirm by
# re-running the notebook top to bottom.
assert np.allclose(kernel_shap_scores[0], -1*kernel_shap_scores[1])

In [47]:
# Display the predicted probabilities for the first five test rows.
rf_plus_model.predict_proba(X_test[:5])

array([[0.10403001, 0.89596999],
       [0.28298555, 0.71701445],
       [0.22117321, 0.77882679],
       [0.41769101, 0.58230899],
       [0.81401824, 0.18598176]])

In [40]:
# First row of the score array stored at index 0 (presumably one array per model output — TODO confirm).
kernel_shap_scores[0][:1]

array([[ 0.00059938, -0.00273772, -0.18238558, -0.20979838,  0.00102012,
         0.00128874, -0.00096814, -0.03520979,  0.        , -0.0027872 ]])

In [41]:
# First row of the score array stored at index 1 (presumably one array per model output — TODO confirm).
kernel_shap_scores[1][:1]

array([[-0.0014229 , -0.02722811, -0.13815625, -0.16263109, -0.00464371,
        -0.00148758,  0.00354948,  0.04096752,  0.        ,  0.00128256]])

In [32]:
# First row of the score array stored at index 2 (presumably one array per model output — TODO confirm).
kernel_shap_scores[2][:1]

array([[ 0.00127492,  0.03643567,  0.35294341,  0.41919286,  0.00479383,
         0.00234458, -0.00275117, -0.00450702,  0.        ,  0.00211675]])

In [None]:
# Keep only the magnitudes of the scores. This rebinds kernel_shap_scores, so the
# signed values are no longer available after this cell runs.
kernel_shap_scores = np.abs(kernel_shap_scores)

In [None]:
# Fit an RF+ regressor (include_raw=False) on whatever X_train / y_train currently
# hold and display the per-sample scores stored under the "lfi" key.
# NOTE(review): in notebook order the current split is the Diabetes data loaded
# from the classification_data folder — confirm a regressor is intended here.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_plus_model = RandomForestPlusRegressor(
    rf_model=copy.deepcopy(rf_regressor),
    include_raw=False,
)
rf_plus_model.fit(X_train, y_train)
score = rf_plus_model.get_mdi_plus_scores(
    X_test,
    y_test,
    lfi=True,
    lfi_abs="none",
    sample_split=None,
    train_or_test="test",
)
score["lfi"]

In [None]:
# Fit an RF+ regressor with include_raw=True on whatever X_train / y_train
# currently hold and display the per-sample scores stored under the "lfi" key.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_plus_model = RandomForestPlusRegressor(
    rf_model=copy.deepcopy(rf_regressor),
    include_raw=True,
)
rf_plus_model.fit(X_train, y_train)
score = rf_plus_model.get_mdi_plus_scores(
    X_test,
    y_test,
    lfi=True,
    lfi_abs="none",
    sample_split=None,
    train_or_test="test",
)
score["lfi"]

#### Demo of Ablation

In [44]:
# Define the data
# X= sample_real_data(X_fpath="../data/regression_data/Diabetes_regression/X_diabetes_regression.csv", return_data="X")
# y,_,_ = sample_real_data(y_fpath="../data/regression_data/Diabetes_regression/y_diabetes_regression.csv", return_data="y")

# The notebook never imports `sklearn` (or `sklearn.datasets`) by name, so the
# generator is imported explicitly here instead of relying on a star import
# happening to expose `sklearn`.
from sklearn.datasets import make_classification

# Synthetic binary classification problem: 200 samples, 10 features.
X, y = make_classification(
    n_samples=200,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=42,
)
split_seed = 0
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", split_seed)

# Define the model and fit.
# A regressor is fit on the class labels; the ablation cells below score it with MSE.
rf_regressor = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=331)
rf_regressor.fit(X_train, y_train)

# Seed used by the ablation cells for the random replacement values.
seed = 0

In [None]:
# Baseline test error of the fitted forest, before any feature is ablated.
y_pred = rf_regressor.predict(X_test)
metric_results_LFI = {'MSE_before_ablation': mean_squared_error(y_test, y_pred)}

# Per-sample importance scores from the LFI evaluation helper.
local_fi_score = LFI_ablation_test_evaluation(X_train, y_train, X_test, y_test, rf_regressor, include_raw=False)
ascending = True

# Work on a copy and replace infinite scores with large finite values so they can be ranked.
imp_vals = copy.deepcopy(local_fi_score)
imp_vals[imp_vals == -np.inf] = -sys.maxsize - 1
imp_vals[imp_vals == np.inf] = sys.maxsize - 1

# Ablate 1, 2, ..., all features per sample and record the MSE after each step.
# ascending=True selects the "max" mode of ablation(), otherwise "min".
for i in range(X_test.shape[1]):
    ablation_X_test = ablation(X_test, imp_vals, "max" if ascending else "min", i+1, seed)
    metric_results_LFI[f'MSE_after_ablation_{i+1}'] = mean_squared_error(y_test, rf_regressor.predict(ablation_X_test))

In [None]:
# Baseline test error of the fitted forest, before any feature is ablated.
metric_results_shap = {}
y_pred = rf_regressor.predict(X_test)
metric_results_shap['MSE_before_ablation'] = mean_squared_error(y_test, y_pred)

# Per-sample importance scores from the tree-SHAP helper.
local_fi_score = tree_shap_local(X_test, y_test, rf_regressor)
ascending = True

# Work on a copy and replace infinite scores with large finite values so they can be ranked.
imp_vals = copy.deepcopy(local_fi_score)
imp_vals[imp_vals == float("-inf")] = -sys.maxsize - 1
imp_vals[imp_vals == float("inf")] = sys.maxsize - 1

# Reuse the notebook-level `seed` instead of drawing a new random one. Drawing a
# fresh seed here made the SHAP curve use different replacement values than the
# LFI curve and overwrote `seed` for any later re-run of the LFI cell.
mode = "max" if ascending else "min"
for i in range(X_test.shape[1]):
    # Ablate the i+1 top-ranked features of every sample and record the MSE.
    ablation_X_test = ablation(X_test, imp_vals, mode, i+1, seed)
    metric_results_shap[f'MSE_after_ablation_{i+1}'] = mean_squared_error(y_test, rf_regressor.predict(ablation_X_test))

In [None]:
# MSE curves: index 0 is the un-ablated baseline, index k is the MSE after
# ablating k features per sample. Each curve reads its baseline from its own
# results dict (the SHAP curve previously took it from metric_results_LFI).
mse_LFI = [metric_results_LFI['MSE_before_ablation']] + [metric_results_LFI[f'MSE_after_ablation_{i+1}'] for i in range(X_test.shape[1])]
mse_SHAP = [metric_results_shap['MSE_before_ablation']] + [metric_results_shap[f'MSE_after_ablation_{i+1}'] for i in range(X_test.shape[1])]

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(mse_LFI)), mse_LFI, label='LFI')
ax.plot(range(len(mse_SHAP)), mse_SHAP, label='SHAP')
ax.set_xlabel('Number of Features Ablated')
ax.set_ylabel('MSE')
ax.set_title('MSE After Ablation')
ax.legend()
plt.show()