In [1]:
import numpy as np
import pandas as pd
import tqdm
from time import time
import os

import datasets
from ensemble_DV_core import RandomForestClassifierDV, RandomForestRegressorDV
from ensemble_DV_core_original import RandomForestClassifierDV_original, RandomForestRegressorDV_original
from data_valuation import DataValuation
import utils_eval
import configs

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import sage
import shap
import xgboost as xgb
from scipy.stats import rankdata, ttest_ind

import configs
# Experiment configuration: take entry [1][0] of what configs.config000CR()
# returns and unpack the three fields this notebook reads from it.
config = configs.config000CR()[1][0]
problem, dataset, dargs_list = config['problem'], config['dataset'], config['dargs_list']

# Start from the first dataset-argument set; the simulation loop further down
# re-selects `dargs` on every iteration.
dargs_ind = 0
dargs = dargs_list[dargs_ind]

# Method switches handed to the DataValuation engine later in the notebook.
# All of them are enabled together for any dataset other than 'gaussian' and
# disabled together for 'gaussian'; `simple_run` is off in both cases.
loo_run = betashap_run = AME_run = lasso_run = boosting_run = \
    treeshap_run = removal_run = (dataset != 'gaussian')
simple_run = False

print(len(dargs_list))
# NOTE(review): the problem/dataset arguments are hard-coded here rather than
# taken from `problem` / `dataset` above -- confirm this is intentional.
(X, y), (X_val, y_val), (X_test, y_test), noisy_index, beta_true = datasets.load_data('clf','gaussian',**dargs)

36
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.9, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 20)
Val X: (500, 20)
Test X: (3000, 20)
------------------------------


# Simulation loop: run every configuration in `dargs_list` for each of the `n_sim` rounds

In [2]:
# Number of independent repetitions of the full configuration sweep.
n_sim = 10

# Work list of (round, configuration index) pairs in round-major order: all
# configurations of round 0 first, then all of round 1, and so on.
# A comprehension is used instead of nested `for` statements so that no loop
# variables leak into the notebook namespace; the statement form overwrote
# the `dargs_ind` chosen in the configuration cell, leaving it out of sync
# with `dargs`.
n_dargs_ind_list = [
    (sim_round, config_idx)
    for sim_round in range(n_sim)
    for config_idx in range(len(dargs_list))
]

In [None]:
# Main experiment loop: for each (round, configuration) pair, generate a fresh
# dataset, compute data/feature values, evaluate them and save the results
# into a per-round directory. From the second round onward, the new feature
# values are compared with the ones saved for the same configuration in the
# previous round, and the run is aborted if they are identical.
for idx, (n, dargs_ind) in enumerate(n_dargs_ind_list):
    print('*'*50)
    print("round:%s"%n)

    # One output directory per round; exist_ok avoids the separate
    # exists()/makedirs() check.
    runpath = r'C:\Users\yf-su\Desktop\XAI\run_path_%s'%n
    os.makedirs(runpath, exist_ok=True)

    print('-'*50)
    dargs = dargs_list[dargs_ind]
    print("current dargs:",dargs_ind, dargs)

    # Called without an argument, so NumPy reseeds from fresh entropy rather
    # than a fixed value: rounds are meant to differ from one another (the
    # replicate check below enforces that).
    np.random.seed()

    # NOTE(review): problem/dataset are hard-coded here although `problem`
    # and `dataset` from the configuration cell are used below -- confirm.
    (X, y), (X_val, y_val), (X_test, y_test), noisy_index, beta_true = datasets.load_data('clf','gaussian',**dargs)

    # engine initialization
    data_valuation_engine = DataValuation(X=X, y=y,
                                          X_val=X_val, y_val=y_val,
                                          problem=problem, dargs=dargs)

    # data_shap, feature_shap
    data_valuation_engine.compute_data_shap(loo_run=loo_run,
                                            betashap_run=betashap_run)
    data_valuation_engine.compute_feature_shap(AME_run=AME_run,
                                               lasso_run=lasso_run,
                                               boosting_run=boosting_run,
                                               treeshap_run=treeshap_run,
                                               simple_run=simple_run)

    data_valuation_engine.evaluate_data_values(noisy_index, beta_true, X_test, y_test, removal_run=removal_run)
    data_valuation_engine.save_results(runpath, dataset, dargs_ind, noisy_index, beta_true)

    # check for replicate
    # The work list is round-major, so the entry len(dargs_list) positions
    # back is the same configuration in the previous round. Separate names
    # are used so the current iteration's `n` / `dargs_ind` are not clobbered.
    if idx >= len(dargs_list):
        prev_n, prev_dargs_ind = n_dargs_ind_list[idx - len(dargs_list)]
        # allow_pickle=True unpickles the file; presumably it was written by
        # save_results in an earlier iteration -- do not point this at
        # untrusted files.
        past = np.load(r"C:\Users\yf-su\Desktop\XAI\run_path_%d\run_id0_%d.pkl"%(prev_n,prev_dargs_ind), allow_pickle = True)
        if (data_valuation_engine.feature_value_dict['Base'] == past['feature_value']['Base']).all():
            # A bare `raise` here has no active exception and only produces
            # "RuntimeError: No active exception to reraise"; raise the same
            # exception type with a message that says what went wrong.
            raise RuntimeError(
                "Replicate detected: 'Base' feature values for round %d, dargs %d "
                "are identical to those saved for round %d."
                % (n, dargs_ind, prev_n)
            )

    # Drop the references to this iteration's arrays before the next one.
    del X, y, X_val, y_val, X_test, y_test

**************************************************
round:0
--------------------------------------------------
current dargs: 0 {'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0}
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 20)
Val X: (500, 20)
Test X: (3000, 20)
------------------------------
Start: Data-OOB computation
Done: Data-OOB computation
Start: DF-OOB computation
Done: DF-OOB computation
Start: SHAP computation




Done: SHAP computation
--------------------------------------------------
Save results
--------------------------------------------------
Done! path: C:\Users\yf-su\Desktop\XAI\run_path_0, run_id: 0.
**************************************************
round:0
--------------------------------------------------
current dargs: 1 {'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0.2}
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0.2}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 20)
Val X: (500, 20)
Test X: (3000, 20)
------------------------------
Start: Data-OOB computation
Done: Data-OOB computation
Start: DF-OOB computation
D



Done: SHAP computation
--------------------------------------------------
Save results
--------------------------------------------------
Done! path: C:\Users\yf-su\Desktop\XAI\run_path_0, run_id: 0.
**************************************************
round:0
--------------------------------------------------
current dargs: 2 {'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0.6}
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 20, 'run_id': 0, 'rho': 0.6}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 20)
Val X: (500, 20)
Test X: (3000, 20)
------------------------------
Start: Data-OOB computation
Done: Data-OOB computation
Start: DF-OOB computation
D

