In [1]:
import numpy as np
import pandas as pd
import tqdm
from time import time
import os

import datasets
from ensemble_DV_core import RandomForestClassifierDV, RandomForestRegressorDV
from ensemble_DV_core_original import RandomForestClassifierDV_original, RandomForestRegressorDV_original
from data_valuation import DataValuation
import utils_eval
from utils import *
import configs

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import sage
import shap
import xgboost as xgb
from scipy.stats import rankdata, ttest_ind

import configs
# Take the first experiment configuration and unpack the fields used below.
config = configs.config000CR()[1][0]
problem = config['problem']
dataset = config['dataset']
dargs_list = config['dargs_list']

# Start from the first data-generation setting.
dargs_ind = 0
dargs = dargs_list[dargs_ind]

# Every optional baseline is switched on for any dataset other than
# 'gaussian' and switched off for 'gaussian'; simple_run is off in both cases.
run_baselines = bool(dataset != 'gaussian')
loo_run = run_baselines
betashap_run = run_baselines
AME_run = run_baselines
lasso_run = run_baselines
boosting_run = run_baselines
treeshap_run = run_baselines
removal_run = run_baselines
simple_run = False

print(len(dargs_list))
(X, y), (X_val, y_val), (X_test, y_test), noisy_index, beta_true = datasets.load_data(
    'clf', 'gaussian', **dargs
)

24
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 25, 'run_id': 0, 'rho': 0}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 25)
Val X: (500, 25)
Test X: (3000, 25)
------------------------------


# Simulation loop: run the data settings for n_sim rounds

In [3]:
# Number of times the whole grid of settings is repeated.
n_sim = 10

# The last 6 settings are left out because they are too time-consuming.
n_settings_to_run = len(dargs_list) - 6

# Flat, round-major schedule of (round, setting index) pairs.
n_dargs_ind_list = []
for n in range(n_sim):
    for dargs_ind in range(n_settings_to_run):
        n_dargs_ind_list.append((n, dargs_ind))

In [None]:
# Run every scheduled (round, setting) pair: generate data, compute data and
# feature values, fit the Learn-OOB models, evaluate, and save the results.
for idx, (n, dargs_ind) in enumerate(n_dargs_ind_list):
    print('*'*50)
    print("round:%s"%n)
    runpath = r'C:\Users\yf-su\Desktop\XAI\run_path_%s'%n
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(runpath, exist_ok=True)
    print('-'*50)
    dargs = dargs_list[dargs_ind]
    print("current dargs:",dargs_ind, dargs)
    # No argument: NumPy reseeds the global RNG from fresh entropy, so runs are
    # deliberately not reproducible (the replicate check below relies on that).
    np.random.seed()

    # data generation
    (X, y), (X_val, y_val), (X_test, y_test), noisy_index, beta_true = datasets.load_data(
        'clf', 'gaussian', **dargs)

    # engine initialization
    data_valuation_engine = DataValuation(X=X, y=y,
                                          X_val=X_val, y_val=y_val,
                                          problem=problem, dargs=dargs)

    # rf evaluation, data_shap, feature_shap
    data_valuation_engine.evalute_rf_models(X_test, y_test)
    data_valuation_engine.compute_data_shap(loo_run=loo_run,
                                            betashap_run=betashap_run)
    data_valuation_engine.compute_feature_shap(AME_run=AME_run,
                                               lasso_run=lasso_run,
                                               boosting_run=boosting_run,
                                               treeshap_run=treeshap_run,
                                               simple_run=simple_run)

    # learn oob: features and label stacked column-wise, label in the last column
    X_y = np.concatenate((X, y.reshape(-1,1)), axis=1)
    oob = data_valuation_engine.data_value_dict['Data-OOB']

    learn = learn_oob(X_y, oob, global_method = 'SHAP')
    base_learn = base_learn_oob(learn['X_y_split'], global_method = 'SHAP')

    # store values
    data_valuation_engine.feature_value_dict['Learn-OOB'] = learn['learn_feature_importance']
    data_valuation_engine.learn_dict['Learn-OOB-y'] = learn['learn_feature_importance_y']
    data_valuation_engine.feature_value_dict['Base-Learn-OOB'] = base_learn['learn_feature_importance']

    data_valuation_engine.learn_dict['mape'] = learn['score_mape']
    data_valuation_engine.learn_dict['mse'] = learn['score_mse']
    data_valuation_engine.learn_dict['acc(base)'] = base_learn['score_acc']

    # attribution difference (1:A<B 2:A>B)
    attrA = data_valuation_engine.feature_value_dict['Learn-OOB']
    attrB = data_valuation_engine.feature_value_dict['Base-Learn-OOB']
    attr_diff = (preprocessing.normalize(attrA.reshape(1,-1))
                 - preprocessing.normalize(attrB.reshape(1,-1))).reshape(-1)

    # Ordinal rank of |beta_true| (1 = largest); features with beta == 0 get -1.
    rank_true = rankdata(-np.abs(beta_true), method='ordinal')
    rank_true[beta_true.reshape(1,-1)[0] == 0] = -1
    # Computed once instead of once per stored key.
    outliers = detect_outlier(attr_diff)
    data_valuation_engine.learn_dict['attr_diff_outlier_1'] = rank_true[outliers[0]]
    data_valuation_engine.learn_dict['attr_diff_outlier_2'] = rank_true[outliers[1]]
    data_valuation_engine.learn_dict['non_masked_feature'] = (beta_true != 0).sum()

    rank_true = rankdata(-np.abs(beta_true), method='dense')
    # np.argsort returns the indices that sort the array, not ranks, so the
    # three extreme features are the first three entries of the ordering.
    # (The original `np.argsort(x) <= 2` mask selected unrelated positions.)
    # Key naming is kept from the original: 'top3' = three smallest attr_diff
    # (A<B side), 'bottom3' = three largest (A>B side).
    smallest3 = np.argsort(attr_diff)[:3]
    largest3 = np.argsort(-attr_diff)[:3]
    data_valuation_engine.learn_dict['attr_diff_top3'] = np.mean(rank_true[smallest3])
    data_valuation_engine.learn_dict['attr_diff_bottom3'] = np.mean(rank_true[largest3])

    # Index 0 is stored as 'pearson' and index 1 as 'tau'; computed once.
    corr = rcorr(attrA, attrB)
    data_valuation_engine.learn_dict['pearson'] = corr[0]
    data_valuation_engine.learn_dict['tau'] = corr[1]

    data_valuation_engine.evaluate_data_values(noisy_index, beta_true, X_test, y_test, removal_run=removal_run)
    data_valuation_engine.new_evaluation_dict = utils_eval.evalution_new(oob, noisy_index, learn['model'], X_y, beta_true, dargs['rho'])
    data_valuation_engine.save_results(runpath, dataset, dargs_ind, noisy_index, beta_true)

    # check for replicate: from the second round on, compare against the result
    # saved for the same setting in the previous round.
    n_settings = len(dargs_list) - 6
    if idx >= n_settings:
        # Separate names so the loop variables n / dargs_ind are not clobbered.
        prev_n, prev_dargs_ind = n_dargs_ind_list[idx - n_settings]
        # NOTE(review): allow_pickle=True unpickles the file; only safe because
        # the file was written by this notebook's own save_results call.
        past = np.load(r"C:\Users\yf-su\Desktop\XAI\run_path_%d\run_id0_%d.pkl"%(prev_n,prev_dargs_ind), allow_pickle = True)
        # array_equal also copes with shape mismatches, unlike (a == b).all().
        if np.array_equal(data_valuation_engine.feature_value_dict['Learn-OOB'],
                          past['feature_value']['Learn-OOB']):
            # The original bare `raise` had no active exception to re-raise and
            # only produced an uninformative RuntimeError.
            raise RuntimeError(
                "Replicated result: round %s, setting %s has the same Learn-OOB "
                "values as round %s" % (n, dargs_ind, prev_n))
    # Release the large per-iteration objects before the next setting.
    del X, y, X_val, y_val, X_test, y_test, X_y, oob
    del data_valuation_engine, learn, base_learn

**************************************************
round:0
--------------------------------------------------
current dargs: 0 {'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 25, 'run_id': 0, 'rho': 0}
------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 25, 'run_id': 0, 'rho': 0}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 25)
Val X: (500, 25)
Test X: (3000, 25)
------------------------------
RF 0.782
RF_original 0.797
gap 0.015
Start: Data-OOB computation
Done: Data-OOB computation
Start: DF-OOB computation


# Important analysis

In [2]:
def count_balance(X_y):
    """Return the fraction of rows that carry the most frequent label.

    The last column of ``X_y`` is cast to int and treated as the label.
    """
    labels = X_y[:, -1].astype(int)
    per_class = np.bincount(labels)
    return per_class.max() / per_class.sum()
def generate_X_y_oob(dargs):
    """Generate one dataset for ``dargs`` and return ``(X_y, oob)``.

    ``X_y`` is the training features with the label appended as the last
    column; ``oob`` is the engine's ``data_value_dict['Data-OOB']`` entry.
    Reads the notebook globals ``problem`` and the ``*_run`` flags, and
    reseeds NumPy's global RNG without a fixed seed.
    """
    np.random.seed()

    # Data generation; only the three splits are used here.
    train, val, test, noisy_index, beta_true = datasets.load_data('clf', 'gaussian', **dargs)
    X, y = train
    X_val, y_val = val
    X_test, y_test = test

    # Engine initialization.
    engine = DataValuation(X=X, y=y, X_val=X_val, y_val=y_val,
                           problem=problem, dargs=dargs)

    # RF evaluation, data SHAP, feature SHAP (DF-OOB is switched off here).
    engine.evalute_rf_models(X_test, y_test)
    engine.compute_data_shap(loo_run=loo_run, betashap_run=betashap_run)
    feature_shap_flags = dict(
        AME_run=AME_run,
        lasso_run=lasso_run,
        boosting_run=boosting_run,
        treeshap_run=treeshap_run,
        simple_run=simple_run,
        df_oob_run=False,
    )
    engine.compute_feature_shap(**feature_shap_flags)

    # Stack features and label column-wise, then fetch the Data-OOB values.
    label_column = y.reshape(-1, 1)
    X_y = np.concatenate((X, label_column), axis=1)
    oob = engine.data_value_dict['Data-OOB']
    return X_y, oob

def f(f):
    """Round ``f`` to four decimal places and return the result as a string."""
    rounded = round(f, 4)
    return str(rounded)

In [4]:
# Single run on the first setting: generate the data, get its Data-OOB values,
# and fit the Learn-OOB model with SHAP as the global attribution method.
dargs_ind = 0
dargs = dargs_list[dargs_ind]
X_y, oob = generate_X_y_oob(dargs)
learn = learn_oob(X_y, oob, global_method = 'SHAP')

------------------------------
{'n_data_to_be_valued': 5000, 'n_val': 500, 'n_test': 3000, 'n_trees': 800, 'masked_ratio': 0.5, 'is_noisy': 0.1, 'model_family': 'Tree', 'input_dim': 25, 'run_id': 0, 'rho': 0}
--------------------------------------------------
GAUSSIAN-C
--------------------------------------------------
Train X: (5000, 25)
Val X: (500, 25)
Test X: (3000, 25)
------------------------------
RF 0.772
RF_original 0.779
gap 0.007
Start: Data-OOB computation
Done: Data-OOB computation


In [None]:
# For each setting: t-test the label's local importance between high- and
# low-OOB points, scatter it against OOB, then refit Learn-OOB on shrinking
# top-OOB subsets and plot how label order, class balance and MAPE evolve.
# NOTE(review): 18 presumably equals len(dargs_list) - 6 (24 settings) — confirm.
for dargs_ind in range(18):

    runpath = r'C:\Users\yf-su\Desktop\XAI\Y\experiment_%s\\'%dargs_ind
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(runpath, exist_ok=True)

    dargs = dargs_list[dargs_ind]
    X_y, oob = generate_X_y_oob(dargs)
    learn = learn_oob(X_y, oob, global_method = 'SHAP')

    end_point = 25 if dargs['n_data_to_be_valued'] == 5000 else 10

    # NOTE(review): index 2 of 'oob_split' is presumably the test split — confirm.
    oob_split_2 = learn['oob_split'][2]
    # Last column of the local importances (the label, which is the last column of X_y).
    y_importance = learn['local_importance'][:,-1]

    with open(runpath+'log.txt','w') as file:
        file.write('quantile\tvalue\tt\tp')
        for decile in range(9,0,-1):
            top = decile/10
            # Threshold comes from the full oob vector; computed once per row.
            threshold = np.quantile(oob,1-top)
            top_ind_test = np.where(oob_split_2>=threshold)
            bottom_ind_test = np.where(oob_split_2<threshold)
            t_test = ttest_ind(y_importance[top_ind_test],y_importance[bottom_ind_test])
            # Same byte layout as before: newline, row, newline.
            file.write("\n")
            file.write('\t'.join([str(top),f(threshold),f(t_test[0]),f(t_test[1])]))
            file.write("\n")

    plt.figure(figsize=(6,6))
    plt.scatter(oob_split_2,y_importance)
    plt.xlabel("OOB")
    plt.ylabel("y_importance")
    plt.savefig(runpath+"scatter_plot.jpg")
    plt.show()


    results_order = []
    results_balance = []
    results_mape = []

    tops = []
    for pct in tqdm.tqdm(range(100,end_point,-5)):
        top = pct/100
        tops.append(top)

        # Keep only the points whose OOB value is in the top `top` fraction.
        top_ind = np.where(oob>=np.quantile(oob,1-top))
        X_y_sub = X_y[top_ind]
        oob_sub = oob[top_ind]

        # Separate name so the full-data `learn` above is not overwritten.
        learn_sub = learn_oob(X_y_sub, oob_sub, global_method = 'SHAP')

        results_order.append(learn_sub['learn_feature_importance_y_order'])
        results_balance.append(count_balance(X_y_sub))
        results_mape.append(learn_sub['score_mape'])

    plt.plot(tops,results_order,label = 'y_order')
    plt.plot(tops,results_balance,label = 'balance')
    plt.xlabel("keep_percentage")
    plt.legend(loc="upper left")
    plt.savefig(runpath+"trend_1.jpg")
    plt.show()
    # The plotted series is score_mape, so the legend says 'mape' (was 'mse').
    plt.plot(tops,np.array(results_mape)/100, label = 'mape')
    plt.xlabel("keep_percentage")
    plt.legend(loc="upper left")
    plt.savefig(runpath+"trend_2.jpg")
    plt.show()