In [1]:
import os
import sys
sys.path.append(".")
sys.path.append("..")
import imodels
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score, average_precision_score
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from feature_importance.scripts.simulations_util import *
from scripts.competing_methods_local import *
from rbo_implementation import rbo_dict

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def ground_truth_fi_derivation(X, support, dgp):
    """Compute per-sample ground-truth local feature importances for a simulated DGP.

    NOTE: a later cell in this notebook redefines this name with a linear-only
    version; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix. Non-floating inputs are converted to float so that the
        0.5-valued threshold terms are not truncated to 0.
    support : array-like of length n_features
        Signal indicator per feature. For "linear", columns with ``support == 0``
        are zeroed; for the other DGPs only columns with ``support == 1`` are filled.
    dgp : {"linear", "polynomial", "lss", "linear_lss"}
        Name of the data-generating process.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X`` holding non-negative importances.
        ``X`` itself is never modified.

    Raises
    ------
    ValueError
        If ``dgp`` is not one of the supported names (previously this case
        silently returned an all-zero array).
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(float)
    support = np.asarray(support)

    if dgp == "linear":
        fi = np.abs(X)  # np.abs allocates a new array, so X is left untouched
        fi[:, support == 0] = 0
        return fi

    if dgp not in ("polynomial", "lss", "linear_lss"):
        raise ValueError(
            f"Unknown dgp {dgp!r}; expected 'linear', 'polynomial', 'lss' or 'linear_lss'."
        )

    fi = np.zeros_like(X)
    for j in range(X.shape[1]):
        if support[j] != 1:
            continue
        # Features 0, 2 and 4 are paired with their right neighbour; every other
        # supported feature is paired with its left neighbour.
        leads_pair = j in (0, 2, 4)
        partner = j + 1 if leads_pair else j - 1
        x_j, x_partner = X[:, j], X[:, partner]
        threshold_term = (x_j > 0) * (x_partner > 0) - 0.5 * (x_partner > 0)

        if dgp == "polynomial":
            # Only the pair-leading feature carries the additive x_j term.
            contribution = (x_j + x_j * x_partner) if leads_pair else (x_j * x_partner)
        elif dgp == "lss":
            contribution = threshold_term
        else:  # "linear_lss"
            # Only the pair-leading feature carries the x_j * x_partner product.
            contribution = (
                (x_j + x_j * x_partner + threshold_term)
                if leads_pair
                else (x_j + threshold_term)
            )
        fi[:, j] = np.abs(contribution)
    return fi

##### Debug the differences yielded by AUROC and RBO

In [2]:
X = sample_normal_X(n_train=100, n_test=100, d=10, seed=42)
y, support, beta = linear_model(X, sigma=None, s=5, beta=1, heritability=0.999999999999, return_support=True, seed=42)
# make y 0/1
y = (y > 0).astype(int)
# y, support, beta = lss_model(X, m=3, r=2, beta=1, sigma=None, tau=0.5, heritability=0.99999999, return_support=True)
# y, support, beta = hierarchical_poly(X, m=3, r=2, beta=1, heritability=0.999999, return_support=True)
#y, support, beta = partial_linear_lss_model(X, s=1, m=3, r=2, beta=1, sigma=None, tau=0.5, heritability=0.99999999, return_support=True)

In [10]:
# Hold out 100 rows for testing, then fit a random forest classifier and wrap the
# fitted forest in an RF+ classifier trained on the same training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=0)
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
est.fit(X_train, y_train)
rf_plus_base = RandomForestPlusClassifier(rf_model=est)
rf_plus_base.fit(X_train, y_train)
# NOTE(review): a later cell uses `rf_plus_base_oob`, but its only definition is the
# commented-out code below (which also wraps the classifier `est` in a *Regressor*).
# On a fresh kernel that later cell raises NameError -- confirm the intended model.
# rf_plus_base_oob = RandomForestPlusRegressor(rf_model=est, fit_on="oob")
# rf_plus_base_oob.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.3s finished


In [11]:
explainer = RFPlusLime(rf_plus_base)
local_fi_score_train_subset = explainer.explain(X_train, X_train[:20])

In [12]:
def ground_truth_fi_derivation(X, support, dgp):
    """Ground-truth local importances for the linear DGP only.

    Returns a new array equal to ``|X|`` with every column where
    ``support == 0`` set to 0. Any ``dgp`` other than ``"linear"`` trips the
    assertion. ``X`` is not modified.
    """
    assert dgp == "linear"
    importance = np.abs(X)
    noise_columns = support == 0
    importance[:, noise_columns] = 0
    return importance

In [14]:
X_train[0]

array([ 0.2005692 ,  1.14863735, -1.01582182,  0.06167985,  0.4288165 ,
        0.69310561,  0.17644156, -0.36702784, -0.82759022,  0.08614388])

In [16]:
temp = ground_truth_fi_derivation(X_train, support, "linear")[0]
temp

array([0.2005692 , 1.14863735, 1.01582182, 0.06167985, 0.4288165 ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [28]:
def encode_largest_k(arr, k):
    """Return a 0/1 mask marking the ``k`` largest entries of a 1-D array.

    Parameters
    ----------
    arr : array-like
        Scores to threshold (written for 1-D input).
    k : int
        Number of entries to mark; must satisfy ``0 <= k <= len(arr)``.

    Returns
    -------
    numpy.ndarray
        Array shaped and typed like ``arr`` with 1 at the positions of the ``k``
        largest values and 0 elsewhere. Ties at the cut-off are broken
        arbitrarily by ``np.argpartition``.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of entries.
    """
    arr = np.asarray(arr)
    n_entries = arr.shape[-1]
    if not 0 <= k <= n_entries:
        raise ValueError(f"k must be between 0 and {n_entries}, got {k}")

    encoded_array = np.zeros_like(arr)
    # k == 0 needs its own path: -0 == 0, so `[-k:]` would select *every* index.
    if k > 0:
        indices = np.argpartition(arr, -k)[-k:]
        encoded_array[indices] = 1
    return encoded_array

In [30]:
encode_largest_k(temp, 1)

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [27]:
local_fi_score_train_subset

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9
0,0.042939,0.313872,-0.278004,-0.072298,0.071858,0.001796,-0.002037,0.000888,0.051954,0.016916
1,0.128044,0.30936,0.298556,0.088216,-0.196329,0.003211,0.001649,-0.005707,0.002099,0.004099
2,0.128975,0.300358,-0.286637,0.092595,-0.216878,-0.000919,-0.0145,-0.011689,-0.015093,-0.002838
3,0.031202,-0.092956,0.300951,0.264616,-0.073096,-0.003058,0.014221,-0.021825,0.013697,0.002225
4,-0.142722,0.127125,0.29774,-0.052867,0.205676,-0.022843,-0.025235,-0.003849,0.013926,0.002184
5,0.128965,-0.379115,0.282897,-0.05319,0.216716,-0.009724,0.027204,-0.003624,0.018411,0.017139
6,-0.016295,0.129708,-0.310517,0.25241,-0.061308,0.00062,0.025644,0.006252,0.001251,-0.004184
7,-0.031261,0.323186,-0.079017,-0.276703,-0.062329,0.010259,-0.021847,-4.9e-05,0.017011,0.002881
8,-0.024168,-0.066039,-0.105093,-0.292647,0.212371,-0.012728,0.030731,0.002272,-0.046477,-0.018832
9,0.119871,-0.376929,-0.281069,0.250905,0.056492,-0.016494,0.008817,0.018102,-0.051789,0.011734


In [24]:
local_fi_score_train_subset[:,:,1].shape

(20, 10)

In [23]:
local_fi_score_train_subset[:,:,0][0]

array([-0.01645106, -0.2746186 ,  0.18108225,  0.0119518 , -0.068272  ,
        0.00145062, -0.01007098, -0.00809212, -0.03299942, -0.00196042])

In [5]:
local_fi_score_train_subset.shape

(100, 10, 2)

In [13]:
local_fi_score_train_subset[:,:,1][1]*2

array([ 0.10618995,  0.32264464,  0.21552725,  0.0988917 , -0.14106001,
        0.046378  ,  0.01789238,  0.02558619,  0.03749313, -0.01384323])

In [12]:
np.sum(np.abs(local_fi_score_train_subset[1]),axis=-1)

array([0.10618995, 0.32264464, 0.21552725, 0.0988917 , 0.14106001,
       0.046378  , 0.01789238, 0.02558619, 0.03749313, 0.01384323])

In [15]:
rf_plus_kernel_shap = RFPlusKernelSHAP(rf_plus_base)
local_fi_score_train = None
local_fi_score_train_subset = rf_plus_kernel_shap.explain(X_train=X_train, X_test=X_train)

100%|██████████| 100/100 [00:39<00:00,  2.50it/s]


In [16]:
local_fi_score_train_subset.shape

(100, 10, 2)

In [18]:
np.sum(np.abs(local_fi_score_train_subset),axis=-1).shape

(100, 10)

In [20]:
local_fi_score_train_subset[:,:,0][0]

array([-0.01645106, -0.2746186 ,  0.18108225,  0.0119518 , -0.068272  ,
        0.00145062, -0.01007098, -0.00809212, -0.03299942, -0.00196042])

In [22]:
local_fi_score_train_subset[:,:,1][0]

array([ 0.01645106,  0.2746186 , -0.18108225, -0.0119518 ,  0.068272  ,
       -0.00145062,  0.01007098,  0.00809212,  0.03299942,  0.00196042])

In [None]:
alo_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_lime = RFPlusLime(rf_plus_base)

In [None]:
local_fi_score_train_l2_norm_sign = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True))
local_fi_score_train_l2_norm = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True))
local_fi_score_train = np.abs(alo_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=False))
lime_train = np.abs(rf_plus_lime.explain(X_train=X_train, X_test=X_train).values)

In [None]:
# Per-sample AUROC of the LMDI scores computed with l2norm=True, sign=True.
# Labels mark the first five of the ten features as signal.
signal_labels = [1]*5+[0]*5
auroc_lmdi_norm_sign = [
    roc_auc_score(signal_labels, local_fi_score_train_l2_norm_sign[i])
    # Size the loop from the array being scored, not from an unrelated one.
    for i in range(local_fi_score_train_l2_norm_sign.shape[0])
]
print(np.mean(auroc_lmdi_norm_sign))

In [None]:
# Per-sample AUROC of the LMDI scores computed with l2norm=True.
# Labels mark the first five of the ten features as signal.
signal_labels = [1]*5+[0]*5
auroc_lmdi_norm = [
    roc_auc_score(signal_labels, local_fi_score_train_l2_norm[i])
    # Size the loop from the array being scored, not from an unrelated one.
    for i in range(local_fi_score_train_l2_norm.shape[0])
]
print(np.mean(auroc_lmdi_norm))

In [None]:
# Per-sample AUROC of the LMDI scores computed with l2norm=False.
# Labels mark the first five of the ten features as signal.
signal_labels = [1]*5+[0]*5
auroc_lmdi = [
    roc_auc_score(signal_labels, local_fi_score_train[i])
    for i in range(local_fi_score_train.shape[0])
]
print(np.mean(auroc_lmdi))

In [None]:
# Per-sample AUROC of the absolute LIME scores.
# Labels mark the first five of the ten features as signal.
signal_labels = [1]*5+[0]*5
auroc_lime = [
    roc_auc_score(signal_labels, lime_train[i])
    # Size the loop from the array being scored, not from an unrelated one.
    for i in range(lime_train.shape[0])
]
print(np.mean(auroc_lime))

In [None]:
### LIME assessment
temp_lime = []
for i in range(5):
    indices_correct = np.argwhere(np.array(auroc_lime) == 1.0).flatten()
    indices = np.argwhere((-1 * lime_train).argsort() == i)[:,1][indices_correct]
    values = X_train[indices_correct][np.arange(indices_correct.shape[0]), indices]
    mean_abs_values = np.mean(np.abs(values))
    temp_lime.append(mean_abs_values)
print(temp_lime)

In [None]:
# For each rank position i (0 = most important), average |x| of the feature that the
# L2-normalised LMDI scores place at that rank, restricted to samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi_norm) == 1.0).flatten()
# argsort of the negated scores lists FEATURE INDICES in decreasing importance, so
# column i is the feature ranked i-th. (The previous `argwhere(argsort == i)[:, 1]`
# returned the rank position of feature i and then used it as a column of X_train.)
lmdi_ranked_features = (-1 * local_fi_score_train_l2_norm).argsort()
temp_lmdi = []
for i in range(5):
    indices = lmdi_ranked_features[indices_correct, i]
    values = X_train[indices_correct, indices]
    mean_abs_values = np.mean(np.abs(values))
    temp_lmdi.append(mean_abs_values)
print(temp_lmdi)

In [None]:
# plot temp_lmdi and temp_lime
import matplotlib.pyplot as plt
plt.plot(temp_lmdi, label="lmdi")
plt.plot(temp_lime, label="lime")
plt.legend()
plt.show()

In [None]:
np.argwhere((-1 * local_fi_score_train).argsort() == 0)[:,1]

In [None]:
indices_correct

In [None]:
# Mean |x| of the feature LMDI ranks at position 0, over samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)
# argsort yields feature indices in rank order, so column 0 is the rank-0 feature;
# pair it with the sample index to get one (row, col) coordinate per sample.
ranked_features = (-1*local_fi_score_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 0]))
# indices_correct has shape (m, 1); iterate over it flattened so each lookup is a single
# (row, col) element instead of a fancy index that pulls whole rows of X_train.
values = [X_train[tuple(indices[index])] for index in indices_correct.ravel()]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LMDI ranks at position 1, over samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)
# argsort yields feature indices in rank order, so column 1 is the rank-1 feature;
# pair it with the sample index to get one (row, col) coordinate per sample.
ranked_features = (-1*local_fi_score_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 1]))
# indices_correct has shape (m, 1); iterate over it flattened so each lookup is a single
# (row, col) element instead of a fancy index that pulls whole rows of X_train.
values = [X_train[tuple(indices[index])] for index in indices_correct.ravel()]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LMDI ranks at position 2, over samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)
# argsort yields feature indices in rank order, so column 2 is the rank-2 feature;
# pair it with the sample index to get one (row, col) coordinate per sample.
ranked_features = (-1*local_fi_score_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 2]))
# indices_correct has shape (m, 1); iterate over it flattened so each lookup is a single
# (row, col) element instead of a fancy index that pulls whole rows of X_train.
values = [X_train[tuple(indices[index])] for index in indices_correct.ravel()]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LMDI ranks at position 3, over samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)
# argsort yields feature indices in rank order, so column 3 is the rank-3 feature;
# pair it with the sample index to get one (row, col) coordinate per sample.
ranked_features = (-1*local_fi_score_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 3]))
# indices_correct has shape (m, 1); iterate over it flattened so each lookup is a single
# (row, col) element instead of a fancy index that pulls whole rows of X_train.
values = [X_train[tuple(indices[index])] for index in indices_correct.ravel()]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LMDI ranks at position 4, over samples with AUROC == 1.
indices_correct = np.argwhere(np.array(auroc_lmdi) == 1.0)
# argsort yields feature indices in rank order, so column 4 is the rank-4 feature;
# pair it with the sample index to get one (row, col) coordinate per sample.
ranked_features = (-1*local_fi_score_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 4]))
# indices_correct has shape (m, 1); iterate over it flattened so each lookup is a single
# (row, col) element instead of a fancy index that pulls whole rows of X_train.
values = [X_train[tuple(indices[index])] for index in indices_correct.ravel()]
np.mean(np.abs(values))

In [None]:
for index in indices_correct:
    print(tuple(indices[index]))

In [None]:
indices = np.argwhere(np.array(auroc_lmdi) == 1.0)

In [None]:
indices.shape

In [None]:
# Mean |x| of the feature the L2-normalised LMDI scores rank at position 1 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-1 feature of every sample is column 1 of the ordering.
ranked_features = (-1*local_fi_score_train_l2_norm).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 1]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature the L2-normalised LMDI scores rank at position 2 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-2 feature of every sample is column 2 of the ordering.
ranked_features = (-1*local_fi_score_train_l2_norm).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 2]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature the L2-normalised LMDI scores rank at position 3 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-3 feature of every sample is column 3 of the ordering.
ranked_features = (-1*local_fi_score_train_l2_norm).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 3]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature the L2-normalised LMDI scores rank at position 4 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-4 feature of every sample is column 4 of the ordering.
ranked_features = (-1*local_fi_score_train_l2_norm).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 4]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LIME ranks at position 0 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-0 feature of every sample is column 0 of the ordering.
ranked_features = (-1*lime_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 0]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LIME ranks at position 1 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-1 feature of every sample is column 1 of the ordering.
ranked_features = (-1*lime_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 1]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LIME ranks at position 2 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-2 feature of every sample is column 2 of the ordering.
ranked_features = (-1*lime_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 2]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LIME ranks at position 3 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-3 feature of every sample is column 3 of the ordering.
ranked_features = (-1*lime_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 3]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# Mean |x| of the feature LIME ranks at position 4 (all samples).
# argsort returns feature indices in rank order (not the rank of each feature), so the
# rank-4 feature of every sample is column 4 of the ordering.
ranked_features = (-1*lime_train).argsort()
indices = np.column_stack((np.arange(ranked_features.shape[0]), ranked_features[:, 4]))
values = [X_train[tuple(index)] for index in indices]
np.mean(np.abs(values))

In [None]:
# TODO: compute the mean of |X_train| over the entries that LIME ranks first (rank position 0).


In [None]:
local_fi_score_train

In [None]:
# Absolute LMDI+ scores for train (ALO explainer, leaf_average=False) and test
# (plain RFPlusMDI explainer), each with and without l2norm=True.
# NOTE(review): `rf_plus_base_oob` is only defined in commented-out code earlier in the
# notebook, so this cell raises NameError on Restart & Run All -- confirm which fitted
# model is meant before relying on these results.
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="all")
rf_plus_mdi_test = RFPlusMDI(rf_plus_base_oob, evaluate_on="all")
local_fi_score_train_lmdi_plus_method2 = np.abs(rf_plus_mdi_train.explain_linear_partial(X=X_train, y=y_train, leaf_average=False))
local_fi_score_test_lmdi_plus_method2 = np.abs(rf_plus_mdi_test.explain_linear_partial(X=X_test, y=None))
local_fi_score_train_lmdi_plus_method2_l2_norm = np.abs(rf_plus_mdi_train.explain_linear_partial(X=X_train, y=y_train, l2norm=True, leaf_average=False))
local_fi_score_test_lmdi_plus_method2_l2_norm = np.abs(rf_plus_mdi_test.explain_linear_partial(X=X_test, y=None, l2norm=True))

In [None]:
# Absolute TreeSHAP values of the fitted forest `est` on the train and test splits.
# NOTE(review): `shap` is not imported by name in the import cell; presumably it arrives
# through one of the star imports -- verify. This also rebinds `explainer`, which an
# earlier cell used for an RFPlusLime object.
explainer = shap.TreeExplainer(est)
local_fi_score_train_shap = np.abs(explainer.shap_values(X_train, check_additivity=False))
local_fi_score_test_shap = np.abs(explainer.shap_values(X_test, check_additivity=False))

In [None]:
local_fi_score_train_shap

In [None]:
# Per-sample evaluation of the TreeSHAP scores on the training set:
#   * AUROC against the binary support,
#   * element [2] of rbo_dict's result (p=0.9) against the |x|-based ground truth,
#   * number of true signal features among the top-|support| ranked features.
auroc_shap = []
rbo_lst_09_shap = []
num_captured_shap = []
# Loop-invariant pieces, computed once instead of once per sample.
ground_truth_fi = np.abs(X_train)  # np.abs allocates, so X_train stays untouched
ground_truth_fi[:, support == 0] = 0
num_signal_features = int(np.sum(support))
for i in range(local_fi_score_train_shap.shape[0]):
    fi_data_i = local_fi_score_train_shap[i]
    ground_truth_fi_i = ground_truth_fi[i]
    dict_predictions = dict(enumerate(fi_data_i))
    dict_ground_truth = dict(enumerate(ground_truth_fi_i))
    auroc_shap.append(roc_auc_score(support, fi_data_i))
    rbo_lst_09_shap.append(rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=0.9)[2])
    top_indices = np.argsort(-fi_data_i)[:num_signal_features]
    num_captured_shap.append(np.sum(support[top_indices]))

In [None]:
# Per-sample evaluation of the LMDI+ (method 2) scores on the training set:
#   * AUROC against the binary support,
#   * element [2] of rbo_dict's result (p=0.9) against the |x|-based ground truth,
#   * number of true signal features among the top-|support| ranked features.
auroc_lmdi_plus = []
rbo_lst_09_lmdi_plus = []
num_captured_lmdi_plus = []
# Loop-invariant pieces, computed once instead of once per sample.
ground_truth_fi = np.abs(X_train)  # np.abs allocates, so X_train stays untouched
ground_truth_fi[:, support == 0] = 0
num_signal_features = int(np.sum(support))
for i in range(local_fi_score_train_lmdi_plus_method2.shape[0]):
    fi_data_i = local_fi_score_train_lmdi_plus_method2[i]
    ground_truth_fi_i = ground_truth_fi[i]
    dict_predictions = dict(enumerate(fi_data_i))
    dict_ground_truth = dict(enumerate(ground_truth_fi_i))
    auroc_lmdi_plus.append(roc_auc_score(support, fi_data_i))
    rbo_lst_09_lmdi_plus.append(rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=0.9)[2])
    top_indices = np.argsort(-fi_data_i)[:num_signal_features]
    num_captured_lmdi_plus.append(np.sum(support[top_indices]))

In [None]:
# Per-sample evaluation of the L2-normalised LMDI+ (method 2) scores on the training set:
#   * AUROC against the binary support,
#   * element [2] of rbo_dict's result (p=0.9) against the |x|-based ground truth,
#   * number of true signal features among the top-|support| ranked features.
auroc_lmdi_plus_avg = []
rbo_lst_09_lmdi_plus_avg = []
num_captured_lmdi_plus_avg = []
# Loop-invariant pieces, computed once instead of once per sample.
ground_truth_fi = np.abs(X_train)  # np.abs allocates, so X_train stays untouched
ground_truth_fi[:, support == 0] = 0
num_signal_features = int(np.sum(support))
for i in range(local_fi_score_train_lmdi_plus_method2_l2_norm.shape[0]):
    fi_data_i = local_fi_score_train_lmdi_plus_method2_l2_norm[i]
    ground_truth_fi_i = ground_truth_fi[i]
    dict_predictions = dict(enumerate(fi_data_i))
    dict_ground_truth = dict(enumerate(ground_truth_fi_i))
    auroc_lmdi_plus_avg.append(roc_auc_score(support, fi_data_i))
    rbo_lst_09_lmdi_plus_avg.append(rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=0.9)[2])
    top_indices = np.argsort(-fi_data_i)[:num_signal_features]
    num_captured_lmdi_plus_avg.append(np.sum(support[top_indices]))

In [None]:
print(np.array(auroc_shap).mean(), np.array(rbo_lst_09_shap).mean(), np.array(num_captured_shap).mean())
print(np.array(auroc_lmdi_plus).mean(), np.array(rbo_lst_09_lmdi_plus).mean(), np.array(num_captured_lmdi_plus).mean())
print(np.array(auroc_lmdi_plus_avg).mean(), np.array(rbo_lst_09_lmdi_plus_avg).mean(), np.array(num_captured_lmdi_plus_avg).mean())

In [None]:
auroc_shap[5], rbo_lst_09_shap[5]

In [None]:
auroc_lmdi_plus[5], rbo_lst_09_lmdi_plus[5]

In [None]:
ground_truth_fi_i = np.abs(X_test)[5]
ground_truth_fi_i[support == 0] = 0
ground_truth_fi_i

In [None]:
p = 0.9

In [None]:
dict_predictions = dict(enumerate(local_fi_score_test_shap[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

In [None]:
# `local_fi_score_test_lmdi_plus` is never defined in this notebook; the LMDI+ test
# scores are stored under the `_method2` name.
local_fi_score_test_lmdi_plus_method2[5]

In [None]:
# rbo_dict result (element [2]) for test sample 5: LMDI+ scores vs. the ground truth.
# `local_fi_score_test_lmdi_plus` is never defined in this notebook; the LMDI+ test
# scores are stored under the `_method2` name.
dict_predictions = dict(enumerate(local_fi_score_test_lmdi_plus_method2[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

In [None]:
temp = np.array([0.30647695  , 0.17410994, 0.816055, 0.17842848, 0.10012125,
       0.26276102, 0.26671546, 0.28039733, 0.23719995, 0.25739759])

In [None]:
dict_predictions = dict(enumerate(temp))#local_fi_score_test_lmdi_plus[5]))
dict_ground_truth = dict(enumerate(ground_truth_fi_i))
rbo_dict(dict1=dict_ground_truth, dict2=dict_predictions, p=p, verbose=True)[2]

##### Debug the two-group setting with an intercept

In [None]:
X = sample_normal_X_subgroups(n = 500, d=10, mean= [[0]*10,[0]*5+[0]*5], scale =[[1]*10,[1]*10])
temp = linear_model(X, beta=1, sigma=None, heritability=0.6, s=5, return_support=True)
y = temp[0]
support = temp[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
linear_model(X, beta=1, sigma=None, heritability=0.2, s=5, return_support=True)[1]

In [None]:
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
rf.fit(X_train, y_train)

In [None]:
rf_plus_base = RandomForestPlusRegressor(rf_model=rf)
rf_plus_base.fit(X_train, y_train)

# rf_plus_base_oob = RandomForestPlusRegressor(rf_model=rf, fit_on="oob")
# rf_plus_base_oob.fit(X_train, y_train)

# rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=rf, include_raw=False, fit_on="inbag", prediction_model=Ridge(alpha=1e-6))
# rf_plus_base_inbag.fit(X_train, y_train)

In [None]:
# Cap the train and test sets at 100 rows each for the (slow) local explainers.
# The train draw must stay ahead of the test draw so the seeded RNG stream is unchanged.
np.random.seed(42)

n_train_rows = X_train.shape[0]
if n_train_rows <= 100:
    # Small enough already: keep every row.
    indices_train = np.arange(n_train_rows)
    X_train_subset, y_train_subset = X_train, y_train
else:
    indices_train = np.random.choice(n_train_rows, 100, replace=False)
    X_train_subset, y_train_subset = X_train[indices_train], y_train[indices_train]

n_test_rows = X_test.shape[0]
if n_test_rows <= 100:
    indices_test = np.arange(n_test_rows)
    X_test_subset, y_test_subset = X_test, y_test
else:
    indices_test = np.random.choice(n_test_rows, 100, replace=False)
    X_test_subset, y_test_subset = X_test[indices_test], y_test[indices_test]

In [None]:
indices_train_0 = np.where(X_train_subset[:, -1] == 0)[0]
indices_test_0 = np.where(X_test_subset[:, -1] == 0)[0]

In [None]:
indices_train_1 = np.where(X_train_subset[:, -1] == 1)[0]
indices_test_1 = np.where(X_test_subset[:, -1] == 1)[0]

In [None]:
local_fi_score_train, local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = tree_shap_evaluation_RF(X_train=X_train, y_train=y_train, X_train_subset = X_train_subset, y_train_subset=y_train_subset,X_test=X_test, y_test=y_test, X_test_subset=X_test_subset, y_test_subset=y_test_subset,fit=rf, mode="absolute")

In [None]:
# Ground-truth local importance for the linear DGP: |x| on signal features, 0 elsewhere.
# Mask with `support` (the labels the AUROC/AUPRC cells already use) rather than
# assuming that the noise features are exactly the last five columns.
noise_features = np.asarray(support) == 0
new_support_train = np.abs(X_train_subset)
new_support_test = np.abs(X_test)
new_support_train[:, noise_features] = 0
new_support_test[:, noise_features] = 0

In [None]:
# Mean per-sample AUROC and AUPRC of the TreeSHAP scores on the training subset,
# using the binary `support` vector as labels.
data = local_fi_score_train_subset
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
print("Treeshap Trainsubset")
for metric in (auroc, auprc):
    print(np.array(metric).mean())

In [None]:
# Mean per-sample AUROC and AUPRC of the TreeSHAP scores on the full test set,
# using the binary `support` vector as labels.
data = local_fi_score_test
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
print("Treeshap Test")
for metric in (auroc, auprc):
    print(np.array(metric).mean())

In [None]:
def rank_biased_overlap(list1, list2, p=0.9):
    """
    Compute a truncated, normalised Rank-Biased Overlap (RBO) between two score vectors.

    Each input is a vector of scores; position ``j`` is item ``j``. Items are ranked
    by decreasing score, and the agreement (fraction of shared items) of the two
    rankings at every depth ``d`` is averaged with geometric weights ``p**(d-1)``.
    The weights are normalised over the evaluated depths, so two identical rankings
    score exactly 1.

    Parameters:
    - list1: numpy array or list of scores defining the first ranking
    - list2: numpy array or list of scores defining the second ranking
    - p: the discount factor in [0, 1] (default is 0.9); smaller values put more
      weight on the top of the ranking

    Returns:
    - rbo: the Rank-Biased Overlap score in [0, 1]; 0.0 when either input is empty

    Notes:
    - Tied scores are ordered arbitrarily by ``np.argsort``, so rankings with many
      ties (for example a ground truth with several zeros) are not uniquely defined.
    - If the inputs differ in length, only the first ``min(len)`` ranks are compared.
    """
    # Item indices ordered from highest to lowest score.
    ranking1 = np.argsort(-np.asarray(list1)).tolist()
    ranking2 = np.argsort(-np.asarray(list2)).tolist()

    depth = min(len(ranking1), len(ranking2))
    if depth == 0:
        return 0.0

    seen1, seen2 = set(), set()
    common = 0                 # size of the intersection of the two current prefixes
    weighted_agreement = 0.0
    total_weight = 0.0
    weight = 1.0               # p ** (d - 1)
    for d in range(1, depth + 1):
        item1, item2 = ranking1[d - 1], ranking2[d - 1]
        # Update the prefix intersection incrementally instead of rebuilding both
        # prefix sets at every depth.
        if item1 == item2:
            common += 1
        else:
            common += (item1 in seen2) + (item2 in seen1)
        seen1.add(item1)
        seen2.add(item2)

        weighted_agreement += weight * common / d
        total_weight += weight
        weight *= p

    # Dividing by the summed weights (rather than a closed form) keeps the weights and
    # the normaliser consistent and stays well-defined for p == 1.
    return weighted_agreement / total_weight

In [None]:
# Mean per-sample rank_biased_overlap between the |x|-based ground truth and the
# TreeSHAP scores on the training subset.
data = local_fi_score_train_subset
rbo = [
    rank_biased_overlap(new_support_train[row], data[row])
    for row in range(data.shape[0])
]
print("Treeshap Trainsubset")
print(np.array(rbo).mean())

In [None]:
# Mean per-sample rank_biased_overlap between the |x|-based ground truth and the
# TreeSHAP scores on the test set.
data = local_fi_score_test
rbo = []
for i in range(data.shape[0]):
    rbo.append(rank_biased_overlap(new_support_test[i], data[i]))
# The label previously read "Treeshap Trainsubset" although the test scores are evaluated.
print("Treeshap Test")
print(np.array(rbo).mean())

In [None]:
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_mdi_test = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
local_fi_score_train = np.abs(rf_plus_mdi_train.explain_subtract_intercept(X=X_train, y=y_train))
local_fi_score_test = np.abs(rf_plus_mdi_test.explain_subtract_intercept(X=X_test, y=None))
local_fi_score_test_subset = np.abs(rf_plus_mdi_test.explain_subtract_intercept(X=X_test_subset, y=None))
local_fi_score_train_subset = local_fi_score_train[indices_train]

In [None]:
# Mean per-sample AUROC and AUPRC of the scores currently held in
# `local_fi_score_train_subset`, using the binary `support` vector as labels.
data = local_fi_score_train_subset
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
for metric in (auroc, auprc):
    print(np.array(metric).mean())

In [None]:
# Mean per-sample AUROC and AUPRC of the scores currently held in
# `local_fi_score_test`, using the binary `support` vector as labels.
data = local_fi_score_test
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
for metric in (auroc, auprc):
    print(np.array(metric).mean())

In [None]:
# Ground-truth local importance for the linear DGP: |x| on signal features, 0 elsewhere.
# Mask with `support` (the labels the AUROC/AUPRC cells already use) rather than
# assuming that the noise features are exactly the last five columns.
noise_features = np.asarray(support) == 0
new_support_train = np.abs(X_train_subset)
new_support_test = np.abs(X_test)
new_support_train[:, noise_features] = 0
new_support_test[:, noise_features] = 0

In [None]:
# Mean per-sample rank_biased_overlap between the |x|-based ground truth and the
# RF+ MDI scores on the training subset.
data = local_fi_score_train_subset
rbo = []
for i in range(data.shape[0]):
    rbo.append(rank_biased_overlap(new_support_train[i], data[i]))
# The label previously read "Treeshap Trainsubset", a leftover from the TreeSHAP cell;
# at this point the scores come from the AloRFPlusMDI explainer.
print("RF+ MDI Trainsubset")
print(np.array(rbo).mean())

In [None]:
# Mean per-sample rank_biased_overlap between the |x|-based ground truth and the
# RF+ MDI scores on the test set.
data = local_fi_score_test
rbo_lst = []
for i in range(data.shape[0]):
    # `rbo` is a plain list in this notebook (no `rbo` package is imported), so
    # `rbo.RankingSimilarity` cannot work here; use the notebook's own RBO helper.
    rbo_lst.append(rank_biased_overlap(new_support_test[i], data[i]))
print("RF+ MDI Test")
# Report the list filled above (the old code printed the stale `rbo` list instead).
print(np.array(rbo_lst).mean())

In [None]:
new_support_test[0]

In [None]:
data[0]

In [None]:
# NOTE(review): leftover scratch call. Neither `S` nor `T` is defined anywhere in this
# notebook, no `rbo` module is imported, and earlier cells bind `rbo` to a list, so this
# line fails on Restart & Run All -- confirm whether it can be dropped.
rbo.RankingSimilarity(S, T).rbo()

In [None]:
rf_plus_mdi_train = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")
rf_plus_mdi_test = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
local_fi_score_train = np.abs(rf_plus_mdi_train.explain(X=X_train, y=y_train)[1])
local_fi_score_test = np.abs(rf_plus_mdi_test.explain(X=X_test, y=None)[1])
local_fi_score_test_subset = np.abs(rf_plus_mdi_test.explain(X=X_test_subset, y=None)[1])
local_fi_score_train_subset = local_fi_score_train[indices_train]

In [None]:
# Mean per-sample AUROC and AUPRC of the scores currently held in
# `local_fi_score_train_subset`, using the binary `support` vector as labels.
data = local_fi_score_train_subset
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
for metric in (auroc, auprc):
    print(np.array(metric).mean())

In [None]:
# Mean per-sample AUROC and AUPRC of the scores currently held in
# `local_fi_score_test`, using the binary `support` vector as labels.
data = local_fi_score_test
row_ids = range(data.shape[0])
auroc = [roc_auc_score(support, data[row]) for row in row_ids]
auprc = [average_precision_score(support, data[row]) for row in row_ids]
for metric in (auroc, auprc):
    print(np.array(metric).mean())