In [1]:
import os
import sys
sys.path.append(".")
sys.path.append("..")
import imodels
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score, average_precision_score
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from feature_importance.scripts.simulations_util import *
from scripts.competing_methods_local import *
from rbo_implementation import rbo_dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def ground_truth_fi_derivation(X, support, dgp):
    """Build the per-sample reference feature-importance matrix for a simulated DGP.

    Column ``j`` is paired with a "partner" column: ``j + 1`` when ``j`` is
    0, 2 or 4, otherwise ``j - 1``. With ``x`` the feature column, ``z`` its
    partner and ``lss = (x > 0) * (z > 0) - 0.5 * (z > 0)``:

    * ``"linear"``     -> ``|x|`` (columns where ``support == 0`` are zeroed)
    * ``"polynomial"`` -> ``|x + x*z|`` for j in {0, 2, 4}, else ``|x*z|``
    * ``"lss"``        -> ``|lss|``
    * ``"linear_lss"`` -> ``|x + x*z + lss|`` for j in {0, 2, 4}, else ``|x + lss|``

    For the non-linear DGPs only columns with ``support == 1`` are filled;
    every other column stays 0.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    support : array-like of length n_features
        0/1 indicator of the signal features (lists are accepted).
    dgp : {"linear", "polynomial", "lss", "linear_lss"}

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as ``X``. ``X`` is not modified.

    Raises
    ------
    ValueError
        If ``dgp`` is not one of the supported names (previously this
        silently returned an all-zero matrix).
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.floating):
        # Integer input would otherwise truncate the 0.5-valued LSS terms to 0.
        X = X.astype(float)
    # A plain list compared with `== 0` yields a single bool, not a mask.
    support = np.asarray(support)

    if dgp == "linear":
        fi = np.abs(X)  # new array, so X itself is left untouched
        fi[:, support == 0] = 0
        return fi

    if dgp not in ("polynomial", "lss", "linear_lss"):
        raise ValueError(
            f"Unknown dgp {dgp!r}; expected 'linear', 'polynomial', 'lss' or 'linear_lss'."
        )

    fi = np.zeros_like(X)
    for j in np.flatnonzero(support == 1):
        leads_pair = j in (0, 2, 4)
        x_j = X[:, j]
        partner = X[:, j + 1] if leads_pair else X[:, j - 1]
        lss_term = (x_j > 0) * (partner > 0) - 0.5 * (partner > 0)

        if dgp == "polynomial":
            raw = x_j + x_j * partner if leads_pair else x_j * partner
        elif dgp == "lss":
            raw = lss_term
        else:  # "linear_lss"
            raw = x_j + x_j * partner + lss_term if leads_pair else x_j + lss_term
        fi[:, j] = np.abs(raw)
    return fi

##### Debug the differences yielded by AUROC and RBO

In [3]:
X = sample_real_data_X(source="openml", task_id = 3917, normalize=True)
# y, support, beta = linear_model(X, sigma=None, s=5, beta=1, heritability=0.999999999999, return_support=True, seed=42)
# make y 0/1
# y = (y > 0).astype(int)
# y, support, beta = lss_model(X, m=3, r=2, beta=1, sigma=None, tau=0.5, heritability=0.99999999, return_support=True)
# y, support, beta = hierarchical_poly(X, m=3, r=2, beta=1, heritability=0.999999, return_support=True)
#y, support, beta = partial_linear_lss_model(X, s=1, m=3, r=2, beta=1, sigma=None, tau=0.5, heritability=0.99999999, return_support=True)

  X = sample_real_data_X(source="openml", task_id = 3917, normalize=True)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)
  X, _, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")


In [5]:
y, support, _ = logistic_linear_model_random_feature(X, beta=1, s=5, frac_label_corruption=0.15, return_support=True, error_seed=1, feature_seed=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Base random forest; the same fitted object is handed to both RF+ models below.
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)
est.fit(X_train, y_train)

# RF+ with an L2-penalised logistic prediction model.
ridge_prediction_model = LogisticRegressionCV(penalty='l2', cv=5, max_iter=10000, random_state=0)
rf_plus_base_ridge = RandomForestPlusClassifier(rf_model=est, prediction_model=ridge_prediction_model)
rf_plus_base_ridge.fit(X_train, y_train)

# RF+ with an L1-penalised logistic prediction model (saga solver, parallel CV).
lasso_prediction_model = LogisticRegressionCV(penalty='l1', solver = 'saga', cv=5, n_jobs=-1, tol=5e-4, max_iter=5000, random_state=0)
rf_plus_base_lasso = RandomForestPlusClassifier(rf_model=est, prediction_model=lasso_prediction_model)
rf_plus_base_lasso.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.8min finished


In [9]:
def LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=None, mode="absolute"):
    """Compute RFPlusMDI local scores for the train and test sets.

    ``fit`` must be a RandomForestPlusRegressor or RandomForestPlusClassifier
    (asserted). Scores come from ``RFPlusMDI(...).explain_linear_partial`` with
    ``ranking=True``; the test set is explained with ``y=None``.

    Returns a ``(train_scores, test_scores)`` pair; both are passed through
    ``np.abs`` when ``mode == "absolute"`` and returned as-is otherwise.
    """
    is_rf_plus = isinstance(fit, RandomForestPlusRegressor) or isinstance(fit, RandomForestPlusClassifier)
    assert is_rf_plus

    explainer = RFPlusMDI(fit, mode = 'only_k', evaluate_on="all")
    train_scores = explainer.explain_linear_partial(X=X_train, y=y_train, ranking = True)
    test_scores = explainer.explain_linear_partial(X=X_test, y=None, ranking = True)

    if mode != "absolute":
        return train_scores, test_scores
    return np.abs(train_scores), np.abs(test_scores)

In [10]:
ridge_train, ridge_test = LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_base_ridge, mode="absolute")

In [11]:
lasso_train, lasso_test = LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_base_lasso, mode="absolute")

In [12]:
# AUROC of each test row's ridge-based scores against `support`, then the mean over rows.
temp = [roc_auc_score(support, ridge_test[i, :]) for i in range(ridge_test.shape[0])]
print(np.mean(temp))

0.8321718749999999


In [13]:
# AUROC of each test row's lasso-based scores against `support`, then the mean over rows.
temp = [roc_auc_score(support, lasso_test[i, :]) for i in range(lasso_test.shape[0])]
print(np.mean(temp))

0.8668437499999999


In [4]:
# FIXME(review): this cell held an unfinished assignment (`lasso_train, lasso_test =`),
# which is a SyntaxError. The intended right-hand side cannot be recovered from the
# notebook, so the statement is parked as a comment until it is completed or removed.
# lasso_train, lasso_test = ...

array([ 1.0045231 ,  1.20691848, -1.588879  , ..., -0.09489214,
       -0.83254277,  0.12159551])

In [5]:
y

array([ 1.0045231 ,  1.20691848, -1.588879  , ..., -0.09489214,
       -0.83254277,  0.12159551])

In [6]:
X.shape

(2000, 57)

In [None]:
y

In [None]:
np.shape(y)

In [None]:
math.ceil(0.05*len(y))

In [None]:
support

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=0)
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
est.fit(X_train, y_train)
rf_plus_base = RandomForestPlusClassifier(rf_model=est)
rf_plus_base.fit(X_train, y_train)
# rf_plus_base_oob = RandomForestPlusRegressor(rf_model=est, fit_on="oob")
# rf_plus_base_oob.fit(X_train, y_train)

In [None]:
explainer = RFPlusLime(rf_plus_base)
local_fi_score_train_subset = explainer.explain(X_train, X_train[:20])

In [None]:
def ground_truth_fi_derivation(X, support, dgp):
    """Linear-only reference importances: ``|X|`` with unsupported columns zeroed.

    Only ``dgp == "linear"`` is accepted (enforced with an assertion). Columns
    where ``support == 0`` are set to 0 in the returned copy; ``X`` itself is
    not modified.
    """
    assert dgp == "linear"
    importance = np.abs(X)
    unsupported = support == 0
    importance[:, unsupported] = 0
    return importance

In [None]:
X_train[0]

In [None]:
temp = ground_truth_fi_derivation(X_train, support, "linear")[0]
temp

In [None]:
def encode_largest_k(arr, k):
    """Return a 0/1 array marking the positions of the ``k`` largest entries of ``arr``.

    Parameters
    ----------
    arr : 1-D array-like of scores.
    k : int
        Number of positions to mark. ``k == 0`` marks nothing.

    Returns
    -------
    numpy.ndarray
        Same shape and dtype as ``arr``, with 1 at the top-``k`` positions and 0 elsewhere.

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by ``np.argpartition``) larger than the array.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    encoded_array = np.zeros_like(arr)
    if k == 0:
        # `[-0:]` is the whole array, so without this guard k=0 marked every position.
        return encoded_array
    indices = np.argpartition(arr, -k)[-k:]
    encoded_array[indices] = 1
    return encoded_array

In [None]:
encode_largest_k(temp, 1)

In [None]:
local_fi_score_train_subset

In [None]:
local_fi_score_train_subset[:,:,1].shape

In [None]:
local_fi_score_train_subset[:,:,0][0]

In [None]:
local_fi_score_train_subset.shape

In [None]:
local_fi_score_train_subset[:,:,1][1]*2

In [None]:
np.sum(np.abs(local_fi_score_train_subset[1]),axis=-1)

In [None]:
rf_plus_kernel_shap = RFPlusKernelSHAP(rf_plus_base)
local_fi_score_train = None
local_fi_score_train_subset = rf_plus_kernel_shap.explain(X_train=X_train, X_test=X_train)

In [None]:
local_fi_score_train_subset.shape

In [None]:
np.sum(np.abs(local_fi_score_train_subset),axis=-1).shape

In [None]:
local_fi_score_train_subset[:,:,0][0]

In [None]:
local_fi_score_train_subset[:,:,1][0]

In [None]:
alo_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_lime = RFPlusLime(rf_plus_base)

In [None]:
local_fi_score_train_l2_norm_sign = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True))
local_fi_score_train_l2_norm = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True))
local_fi_score_train = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=False))
lime_train = np.abs(rf_plus_lime.explain(X_train=X_train, X_test=X_train).values)

In [None]:
# Labels used for every row: five 1s followed by five 0s (hard-coded).
# NOTE(review): presumably the support of a 10-feature simulation; confirm it matches `support`.
SIGNAL_LABELS = [1]*5 + [0]*5


def rowwise_auroc(scores, labels):
    """Return one AUROC per row of ``scores``, each computed against ``labels``.

    The loop bound comes from ``scores`` itself; the original cells all used
    ``local_fi_score_train.shape[0]`` regardless of which array was scored.
    """
    return [roc_auc_score(labels, scores[i]) for i in range(scores.shape[0])]


auroc_lmdi_norm_sign = rowwise_auroc(local_fi_score_train_l2_norm_sign, SIGNAL_LABELS)
print(np.mean(auroc_lmdi_norm_sign))

auroc_lmdi_norm = rowwise_auroc(local_fi_score_train_l2_norm, SIGNAL_LABELS)
print(np.mean(auroc_lmdi_norm))

auroc_lmdi = rowwise_auroc(local_fi_score_train, SIGNAL_LABELS)
print(np.mean(auroc_lmdi))

auroc_lime = rowwise_auroc(lime_train, SIGNAL_LABELS)
print(np.mean(auroc_lime))

In [None]:
### LIME assessment
# Rows whose per-row LIME AUROC is exactly 1 (computed once; it does not depend on i).
indices_correct = np.argwhere(np.array(auroc_lime) == 1.0).flatten()
# lime_ranking[r, i] is the column of the feature with the i-th largest LIME score in row r.
lime_ranking = (-1 * lime_train).argsort()
temp_lime = []
for i in range(5):
    # Column of the i-th ranked feature in each selected row. The earlier
    # `np.argwhere(ranking == i)[:, 1]` gave the rank *of feature i* (the inverse
    # permutation) and then used that rank as a column index into X_train.
    indices = lime_ranking[indices_correct, i]
    values = X_train[indices_correct, indices]
    mean_abs_values = np.mean(np.abs(values))
    temp_lime.append(mean_abs_values)
print(temp_lime)

In [None]:
# Rows whose per-row AUROC for the l2-normalised LMDI scores is exactly 1
# (computed once; it does not depend on i).
indices_correct = np.argwhere(np.array(auroc_lmdi_norm) == 1.0).flatten()
# lmdi_ranking[r, i] is the column of the feature with the i-th largest score in row r.
lmdi_ranking = (-1 * local_fi_score_train_l2_norm).argsort()
temp_lmdi = []
for i in range(5):
    # Column of the i-th ranked feature in each selected row. The earlier
    # `np.argwhere(ranking == i)[:, 1]` gave the rank *of feature i* (the inverse
    # permutation) and then used that rank as a column index into X_train.
    indices = lmdi_ranking[indices_correct, i]
    values = X_train[indices_correct, indices]
    mean_abs_values = np.mean(np.abs(values))
    temp_lmdi.append(mean_abs_values)
print(temp_lmdi)

In [None]:
# plot temp_lmdi and temp_lime
import matplotlib.pyplot as plt
plt.plot(temp_lmdi, label="lmdi")
plt.plot(temp_lime, label="lime")
plt.legend()
plt.show()

In [None]:
np.argwhere((-1 * local_fi_score_train).argsort() == 0)[:,1]

In [None]:
indices_correct

In [None]:
# Mean |X_train| of the feature ranked 0..4 by the LMDI scores, restricted to rows with
# a perfect per-row AUROC. Replaces five copy-pasted cells that differed only in the rank.
#
# Two defects of those cells are fixed here:
#   * `np.argwhere` on a 1-D mask returns shape (m, 1), so `tuple(indices[index])` was a
#     one-element tuple and `X_train[...]` selected whole rows instead of single entries;
#   * `np.argwhere(ranking == k)` locates the rank of feature k, not the feature at rank k.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)  # kept in argwhere's (m, 1) layout
correct_rows = indices_correct.ravel()
lmdi_ranking = (-1*local_fi_score_train).argsort()
row_ids = np.arange(lmdi_ranking.shape[0])

mean_abs_by_rank = []
for rank in range(5):
    # (row, column) coordinates of the feature ranked `rank`-th in every row.
    indices = np.column_stack([row_ids, lmdi_ranking[:, rank]])
    values = X_train[tuple(indices[correct_rows].T)]
    mean_abs_by_rank.append(np.mean(np.abs(values)))
mean_abs_by_rank

In [None]:
for index in indices_correct:
    print(tuple(indices[index]))

In [None]:
indices = np.argwhere(np.array(auroc_lmdi) == 1.0)

In [None]:
indices.shape

In [None]:
def mean_abs_feature_value_by_rank(scores, X, ranks):
    """Mean |X| of the feature that ``scores`` ranks at each requested position.

    For every row the features are ordered by descending score; for each
    ``rank`` in ``ranks`` the value of the feature at that position is looked
    up in ``X`` and the absolute values are averaged over all rows.

    The cells this replaces used ``np.argwhere(argsort == rank)``, which finds
    the rank of feature ``rank`` (the inverse permutation) rather than the
    feature at that rank, and looped over rows in Python.
    """
    ranking = (-1*scores).argsort()
    rows = np.arange(ranking.shape[0])
    return [np.mean(np.abs(X[rows, ranking[:, rank]])) for rank in ranks]


# l2-normalised LMDI scores: ranks 1-4 (the original cells did not include rank 0).
mean_abs_lmdi_l2_norm = mean_abs_feature_value_by_rank(local_fi_score_train_l2_norm, X_train, range(1, 5))
# LIME scores: ranks 0-4.
mean_abs_lime = mean_abs_feature_value_by_rank(lime_train, X_train, range(5))
mean_abs_lmdi_l2_norm, mean_abs_lime

In [None]:
# Find mean of X_train of all index with 0 in lime_train


In [None]:
local_fi_score_train

In [None]:
# NOTE(review): `rf_plus_base_oob` is only assigned in commented-out code in the visible
# part of this notebook, so this cell raises NameError on a fresh kernel unless that
# model is fitted first.
# Train-set scores use AloRFPlusMDI, test-set scores use RFPlusMDI; both evaluate on "all".
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="all")
rf_plus_mdi_test = RFPlusMDI(rf_plus_base_oob, evaluate_on="all")
# Absolute local scores, without and with `l2norm=True`. The train calls also pass
# `leaf_average=False`; the test calls pass `y=None`.
local_fi_score_train_lmdi_plus_method2 = np.abs(rf_plus_mdi_train.explain_linear_partial(X=X_train, y=y_train, leaf_average=False))
local_fi_score_test_lmdi_plus_method2 = np.abs(rf_plus_mdi_test.explain_linear_partial(X=X_test, y=None))
local_fi_score_train_lmdi_plus_method2_l2_norm = np.abs(rf_plus_mdi_train.explain_linear_partial(X=X_train, y=y_train, l2norm=True, leaf_average=False))
local_fi_score_test_lmdi_plus_method2_l2_norm = np.abs(rf_plus_mdi_test.explain_linear_partial(X=X_test, y=None, l2norm=True))

In [None]:
explainer = shap.TreeExplainer(est)
local_fi_score_train_shap = np.abs(explainer.shap_values(X_train, check_additivity=False))
local_fi_score_test_shap = np.abs(explainer.shap_values(X_test, check_additivity=False))

In [None]:
local_fi_score_train_shap

In [None]:
def local_fi_metrics(fi_scores, X, support, p=0.9):
    """Score per-sample feature importances against the support.

    Returns three lists with one entry per row of ``fi_scores``:

    * AUROC of the row's scores against ``support``;
    * element ``[2]`` of ``rbo_dict`` comparing the ground-truth importances
      (``|X|`` with unsupported columns zeroed) with the row's scores;
    * how many support features are among the row's top ``int(sum(support))`` scores.

    Replaces three copy-pasted loops. ``|X|`` and the number of signal features
    are computed once instead of once per row.
    """
    ground_truth = np.abs(X)  # new array, so X is not modified
    ground_truth[:, support == 0] = 0
    num_signal_features = int(np.sum(support))

    auroc, rbo_scores, num_captured = [], [], []
    for i in range(fi_scores.shape[0]):
        fi_data_i = fi_scores[i]
        dict_predictions = dict(enumerate(fi_data_i))
        dict_ground_truth = dict(enumerate(ground_truth[i]))
        auroc.append(roc_auc_score(support, fi_data_i))
        rbo_scores.append(rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p)[2])
        top_indices = np.argsort(-fi_data_i)[:num_signal_features]
        num_captured.append(np.sum(support[top_indices]))
    return auroc, rbo_scores, num_captured


auroc_shap, rbo_lst_09_shap, num_captured_shap = local_fi_metrics(
    local_fi_score_train_shap, X_train, support)
auroc_lmdi_plus, rbo_lst_09_lmdi_plus, num_captured_lmdi_plus = local_fi_metrics(
    local_fi_score_train_lmdi_plus_method2, X_train, support)
auroc_lmdi_plus_avg, rbo_lst_09_lmdi_plus_avg, num_captured_lmdi_plus_avg = local_fi_metrics(
    local_fi_score_train_lmdi_plus_method2_l2_norm, X_train, support)

In [None]:
print(np.array(auroc_shap).mean(), np.array(rbo_lst_09_shap).mean(), np.array(num_captured_shap).mean())
print(np.array(auroc_lmdi_plus).mean(), np.array(rbo_lst_09_lmdi_plus).mean(), np.array(num_captured_lmdi_plus).mean())
print(np.array(auroc_lmdi_plus_avg).mean(), np.array(rbo_lst_09_lmdi_plus_avg).mean(), np.array(num_captured_lmdi_plus_avg).mean())

In [None]:
auroc_shap[5], rbo_lst_09_shap[5]

In [None]:
auroc_lmdi_plus[5], rbo_lst_09_lmdi_plus[5]

In [None]:
ground_truth_fi_i = np.abs(X_test)[5]
ground_truth_fi_i[support == 0] = 0
ground_truth_fi_i

In [None]:
p = 0.9

In [None]:
dict_predictions = dict(enumerate(local_fi_score_test_shap[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

In [None]:
# `local_fi_score_test_lmdi_plus` is never assigned in the visible notebook (NameError on a
# fresh kernel); the test-set LMDI+ scores that are computed carry the `_method2` suffix.
# NOTE(review): confirm `_method2` (rather than `_method2_l2_norm`) is the intended array.
local_fi_score_test_lmdi_plus_method2[5]

In [None]:
# `local_fi_score_test_lmdi_plus` is never assigned in the visible notebook (NameError on a
# fresh kernel); the test-set LMDI+ scores that are computed carry the `_method2` suffix.
# NOTE(review): confirm `_method2` (rather than `_method2_l2_norm`) is the intended array.
dict_predictions = dict(enumerate(local_fi_score_test_lmdi_plus_method2[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

In [None]:
temp = np.array([0.30647695  , 0.17410994, 0.816055, 0.17842848, 0.10012125,
       0.26276102, 0.26671546, 0.28039733, 0.23719995, 0.25739759])

In [None]:
dict_predictions = dict(enumerate(temp))#local_fi_score_test_lmdi_plus[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

##### Debug two group setting with intercept

In [None]:
X = sample_normal_X_subgroups(n = 500, d=10, mean= [[0]*10,[0]*5+[0]*5], scale =[[1]*10,[1]*10])
temp = linear_model(X, beta=1, sigma=None, heritability=0.6, s=5, return_support=True)
y = temp[0]
support = temp[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
linear_model(X, beta=1, sigma=None, heritability=0.2, s=5, return_support=True)[1]

In [None]:
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
rf.fit(X_train, y_train)

In [None]:
rf_plus_base = RandomForestPlusRegressor(rf_model=rf)
rf_plus_base.fit(X_train, y_train)

# rf_plus_base_oob = RandomForestPlusRegressor(rf_model=rf, fit_on="oob")
# rf_plus_base_oob.fit(X_train, y_train)

# rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=rf, include_raw=False, fit_on="inbag", prediction_model=Ridge(alpha=1e-6))
# rf_plus_base_inbag.fit(X_train, y_train)

In [None]:
# Seed NumPy's global RNG so both subsamples below are reproducible. The train draw
# happens before the test draw, so reordering these blocks would change the samples.
np.random.seed(42)
# Training subset: 100 rows sampled without replacement when more than 100 are
# available, otherwise every row (with `indices_train` as the identity index).
if X_train.shape[0] > 100:
    indices_train = np.random.choice(X_train.shape[0], 100, replace=False)
    X_train_subset = X_train[indices_train]
    y_train_subset = y_train[indices_train]
else:
    indices_train = np.arange(X_train.shape[0])
    X_train_subset = X_train
    y_train_subset = y_train

# Test subset: same rule applied to the test split.
if X_test.shape[0] > 100:
    indices_test = np.random.choice(X_test.shape[0], 100, replace=False)
    X_test_subset = X_test[indices_test]
    y_test_subset = y_test[indices_test]
else:
    indices_test = np.arange(X_test.shape[0])
    X_test_subset = X_test
    y_test_subset = y_test

In [None]:
indices_train_0 = np.where(X_train_subset[:, -1] == 0)[0]
indices_test_0 = np.where(X_test_subset[:, -1] == 0)[0]

In [None]:
indices_train_1 = np.where(X_train_subset[:, -1] == 1)[0]
indices_test_1 = np.where(X_test_subset[:, -1] == 1)[0]

In [None]:
local_fi_score_train, local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = tree_shap_evaluation_RF(X_train=X_train, y_train=y_train, X_train_subset = X_train_subset, y_train_subset=y_train_subset,X_test=X_test, y_test=y_test, X_test_subset=X_test_subset, y_test_subset=y_test_subset,fit=rf, mode="absolute")

In [None]:
new_support_train = np.abs(X_train_subset)
new_support_test = np.abs(X_test)
new_support_train[:, -5:] = 0
new_support_test[:, -5:] = 0

In [None]:
data = local_fi_score_train_subset
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print("Treeshap Trainsubset")
print(np.array(auroc).mean())
print(np.array(auprc).mean())

In [None]:
data = local_fi_score_test
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print("Treeshap Test")
print(np.array(auroc).mean())
print(np.array(auprc).mean())

In [None]:
def rank_biased_overlap(list1, list2, p=0.9):
    """
    Compute the truncated, normalised Rank-Biased Overlap (RBO) of two score vectors.

    Each input holds one score per item (item = position in the vector). Items
    are ranked by descending score and the two rankings are compared on their
    first ``k = min(len(list1), len(list2))`` positions:

        RBO = sum_{d=1..k} p**(d-1) * A_d  /  sum_{d=1..k} p**(d-1)

    where ``A_d`` is the number of items shared by the two top-``d`` prefixes
    divided by ``d``. Dividing by the total weight makes identical rankings
    score exactly 1; the previous version paired weights ``p**d`` with the
    normaliser ``(1 - p) / (1 - p**(k + 1))``, so identical rankings scored
    below 1.

    Parameters:
    - list1: numpy array or list of scores defining the first ranking
    - list2: numpy array or list of scores defining the second ranking
    - p: the discount factor in [0, 1] (default is 0.9); ``p = 1`` weights all
      depths equally

    Returns:
    - rbo: the Rank-Biased Overlap score in [0, 1]; 0.0 when either input is empty

    Raises:
    - ValueError: if ``p`` is outside [0, 1]
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must lie in [0, 1], got {p}")

    # Item indices ordered from highest to lowest score.
    ranked_list1 = np.argsort(-np.asarray(list1))
    ranked_list2 = np.argsort(-np.asarray(list2))

    min_len = min(len(ranked_list1), len(ranked_list2))
    if min_len == 0:
        return 0.0

    # Grow both prefixes one item at a time and update the shared-item count
    # incrementally instead of rebuilding two sets at every depth.
    seen1, seen2 = set(), set()
    shared = 0
    weighted_agreement = 0.0
    total_weight = 0.0
    weight = 1.0  # p ** (depth - 1)
    for depth in range(1, min_len + 1):
        item1 = int(ranked_list1[depth - 1])
        item2 = int(ranked_list2[depth - 1])
        if item1 == item2:
            shared += 1
        else:
            # Each new item adds an overlap if the other ranking already contains it.
            shared += (item1 in seen2) + (item2 in seen1)
        seen1.add(item1)
        seen2.add(item2)

        weighted_agreement += weight * shared / depth
        total_weight += weight
        weight *= p

    return weighted_agreement / total_weight

In [None]:
data = local_fi_score_train_subset
# RBO between each row's reference importances and its scores, averaged over rows.
rbo = [rank_biased_overlap(new_support_train[i], data[i]) for i in range(data.shape[0])]
print("Treeshap Trainsubset")
print(np.array(rbo).mean())

In [None]:
data = local_fi_score_test
# RBO between each test row's reference importances and its scores, averaged over rows.
rbo = [rank_biased_overlap(new_support_test[i], data[i]) for i in range(data.shape[0])]
# This cell scores the test set; the label previously read "Treeshap Trainsubset".
print("Treeshap Test")
print(np.array(rbo).mean())

In [None]:
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_mdi_test = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
local_fi_score_train = np.abs(rf_plus_mdi_train.explain_subtract_intercept(X=X_train, y=y_train))
local_fi_score_test = np.abs(rf_plus_mdi_test.explain_subtract_intercept(X=X_test, y=None))
local_fi_score_test_subset = np.abs(rf_plus_mdi_test.explain_subtract_intercept(X=X_test_subset, y=None))
local_fi_score_train_subset = local_fi_score_train[indices_train]

In [None]:
data = local_fi_score_train_subset
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print(np.array(auroc).mean())
print(np.array(auprc).mean())

In [None]:
data = local_fi_score_test
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print(np.array(auroc).mean())
print(np.array(auprc).mean())

In [None]:
new_support_train = np.abs(X_train_subset)
new_support_test = np.abs(X_test)
new_support_train[:, -5:] = 0
new_support_test[:, -5:] = 0

In [None]:
data = local_fi_score_train_subset
# RBO between each row's reference importances and its scores, averaged over rows.
# NOTE(review): the printed label still says "Treeshap"; when the cells run top to bottom
# `local_fi_score_train_subset` was last assigned from AloRFPlusMDI, so confirm the label.
rbo = [rank_biased_overlap(new_support_train[i], data[i]) for i in range(data.shape[0])]
print("Treeshap Trainsubset")
print(np.array(rbo).mean())

In [None]:
data = local_fi_score_test
# Per-row RBO on the test set. The earlier body called `rbo.RankingSimilarity(...)`, but
# `rbo` is the list assigned by the preceding RBO cell (AttributeError), and it printed
# the mean of that stale `rbo` list instead of the `rbo_lst` filled here. The notebook's
# own `rank_biased_overlap` takes score vectors directly, so it is used instead.
rbo_lst = []
for i in range(data.shape[0]):
    rbo_lst.append(rank_biased_overlap(new_support_test[i], data[i]))
# NOTE(review): label kept as-is; it names Treeshap/Trainsubset although `data` is the
# test-set score array, so confirm the intended wording.
print("Treeshap Trainsubset")
print(np.array(rbo_lst).mean())

In [None]:
new_support_test[0]

In [None]:
data[0]

In [None]:
rbo.RankingSimilarity(S, T).rbo()

In [None]:
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_mdi_test = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
local_fi_score_train = np.abs(rf_plus_mdi_train.explain(X=X_train, y=y_train)[1])
local_fi_score_test = np.abs(rf_plus_mdi_test.explain(X=X_test, y=None)[1])
local_fi_score_test_subset = np.abs(rf_plus_mdi_test.explain(X=X_test_subset, y=None)[1])
local_fi_score_train_subset = local_fi_score_train[indices_train]

In [None]:
data = local_fi_score_train_subset
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print(np.array(auroc).mean())
print(np.array(auprc).mean())

In [None]:
data = local_fi_score_test
# One AUROC and one average-precision value per row of `data`, scored against `support`.
auroc = [roc_auc_score(support, data[i]) for i in range(data.shape[0])]
auprc = [average_precision_score(support, data[i]) for i in range(data.shape[0])]
print(np.array(auroc).mean())
print(np.array(auprc).mean())