In [1]:
import numpy as np
import pandas as pd
import tqdm
from time import time
import os

import configs
import datasets
from ensemble_DV_core_subset import RandomForestClassifierDV_subset
from ensemble_DV_core_original import RandomForestClassifierDV_original
from data_valuation import DataValuation
import utils_eval

# Toy example

In [17]:
# prepare the data: synthetic logistic-regression problem
SEED = 0  # fixed seed so Restart & Run All reproduces the same toy dataset
rng = np.random.default_rng(SEED)

n, input_dim = 1000, 20
data = rng.normal(size=(n, input_dim))
beta_true = rng.normal(size=(input_dim, 1))

# Compute the logits once and apply a numerically stable sigmoid:
# sigmoid(z) = exp(-log(1 + exp(-z))); unlike exp(z) / (1 + exp(z)) this
# cannot evaluate to inf / inf = nan for large logits.
logits = data.dot(beta_true)
p_true = np.exp(-np.logaddexp(0.0, -logits))

# One Bernoulli draw per row; flatten the (n, 1) draws into a 1-D label vector.
target = rng.binomial(n=1, p=p_true).reshape(-1)

In [25]:
# Fit the forest on the toy data, then lay the joint valuation out as one
# value per (row, column) of `data`.
rf = RandomForestClassifierDV_subset(n_estimators=2000)
rf.fit(data, target, subset_ratio='varying')
n_rows, n_cols = data.shape
df_oob = rf.evaluate_dfoob_accuracy(data, target).values.reshape(n_rows, n_cols)
df_oob

array([[0.47665848, 0.47215496, 0.46958637, ..., 0.46352941, 0.46875   ,
        0.48067633],
       [0.55524079, 0.46321526, 0.47008547, ..., 0.45325779, 0.48275862,
        0.42204301],
       [0.71637427, 0.64516129, 0.66285714, ..., 0.63687151, 0.66759003,
        0.62021858],
       ...,
       [0.50555556, 0.51742627, 0.49736842, ..., 0.5399449 , 0.55699482,
        0.57519789],
       [0.6452514 , 0.70994475, 0.68834688, ..., 0.71232877, 0.69505495,
        0.70114943],
       [0.67385445, 0.69086022, 0.68644068, ..., 0.66321244, 0.64850136,
        0.68169014]])

In [26]:
# marginalizations: average the joint values across columns (axis=1, one
# score per row of df_oob) and across rows (axis=0, one score per column)
df_oob_data = df_oob.mean(axis=1)
df_oob_feature = df_oob.mean(axis=0)
(df_oob_data, df_oob_feature)

(array([0.47489853, 0.46314402, 0.65143008, 0.59880235, 0.59060084,
        0.50262106, 0.60149128, 0.6257931 , 0.57449424, 0.6982938 ,
        0.69763516, 0.51189636, 0.72526595, 0.55943197, 0.43733887,
        0.57190211, 0.59186916, 0.78561381, 0.62602158, 0.56409467,
        0.55028926, 0.5104387 , 0.60779274, 0.54852429, 0.68663726,
        0.37105651, 0.74924546, 0.63922358, 0.69201908, 0.59219158,
        0.4385657 , 0.59241403, 0.57514673, 0.64468189, 0.57462043,
        0.56885103, 0.59884343, 0.69347816, 0.65558606, 0.72305649,
        0.62497226, 0.57396499, 0.42225879, 0.60581305, 0.69872738,
        0.65332824, 0.64819824, 0.41353772, 0.60065163, 0.61596271,
        0.41616767, 0.63824261, 0.61138612, 0.63323368, 0.48815509,
        0.72897482, 0.68460407, 0.7952557 , 0.62706293, 0.72784662,
        0.78232987, 0.69573682, 0.50194811, 0.5725462 , 0.53390079,
        0.67886819, 0.67370648, 0.79932086, 0.47972057, 0.52044167,
        0.69104985, 0.6614892 , 0.51748305, 0.74

# Experiment

In [33]:
# Select the experiment and pull its configuration.
experiment = "feature_removal"
eval_experiments = ["feature_removal"]
config = configs.config006CR(experiment)[1][0]
problem = config['problem']
dataset = config['dataset']
dargs_list = config['dargs_list']
dargs = dargs_list[0]  # only the first set of dataset arguments is run here

# Load the train / validation / test splits plus the auxiliary objects that
# the evaluation step consumes.
(
    (X, y),
    (X_val, y_val),
    (X_test, y_test),
    noisy_index,
    beta_true,
    error_index,
    error_row_index,
    X_original,
) = datasets.load_data(problem, dataset, **dargs)

data_valuation_engine = DataValuation(
    X=X, y=y,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    problem=problem, dargs=dargs,
)

# Data-level SHAP is only requested for the 'noisy' experiment.
if experiment == 'noisy':
    data_valuation_engine.compute_data_shap()

data_valuation_engine.compute_feature_shap(subset_ratio_list=['varying'])

# A baseline is only prepared for the 'feature_removal' and 'error' experiments.
if experiment in ('feature_removal', 'error'):
    data_valuation_engine.prepare_baseline(SHAP_size=1000)

data_valuation_engine.evaluate_data_values(
    noisy_index, beta_true, error_index, X_test, y_test,
    experiments=eval_experiments,
    error_row_index=error_row_index,
)

------------------------------
{'experiment': 'feature_removal', 'n_train': 1000, 'n_val': 100, 'n_test': 3000, 'n_trees': 2000, 'clf_path': 'C:\\Users\\yf-su\\Desktop\\XAI\\df_oob\\openml_dataset', 'openml_clf_path': 'C:\\Users\\yf-su\\Desktop\\XAI\\df_oob\\openml_dataset', 'openml_reg_path': 'C:\\Users\\yf-su\\Desktop\\XAI\\df_oob\\openml_dataset', 'is_noisy': None, 'model_family': 'Tree', 'run_id': 0}
--------------------------------------------------
electricity
--------------------------------------------------
Train X: (1000, 6)
Val X: (100, 6)
Test X: (3000, 6)
------------------------------
Start: Data-OOB computation
Done: Data-OOB computation
Start: DF-OOB computation
Done: DF-OOB computation
Start: SHAP computation


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Done: SHAP computation


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.49s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.46s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:21<00:00,  4.31s/it]


In [34]:
# Show the values the engine stored under the 'Df-OOB-varying' key.
df_oob_varying = data_valuation_engine.df_value_dict['Df-OOB-varying']
df_oob_varying

array([[0.80952381, 0.68965517, 0.74022346, 0.70437018, 0.6840796 ,
        0.65648855],
       [0.80569948, 0.97927461, 0.80381471, 0.75794621, 0.74559194,
        0.72544081],
       [0.08461538, 0.05785124, 0.11111111, 0.02710027, 0.07948718,
        0.05675676],
       ...,
       [0.57105943, 0.79521277, 0.6027027 , 0.6442577 , 0.67272727,
        0.69293478],
       [0.82816901, 0.86404834, 0.6442577 , 0.8245614 , 0.77363897,
        0.77777778],
       [0.36118598, 0.556231  , 0.41498559, 0.38109756, 0.36414566,
        0.48305085]])

In [35]:
# Check the AUC of each method (the smaller the better); the AUC is the
# third entry of each method's result.
removal_results = data_valuation_engine.feature_removal_dict['removal']
for method_name, method_result in removal_results.items():
    print(method_name, method_result[2])

Df-OOB-varying 2.4012222222222217
Base 2.437444444444444
Random 3.0706666666666664
