# Simulation: how often the accuracy and probability (likelihood) utilities remain unchanged

In [18]:
# Standard-library, NumPy, and project-local imports used by this notebook.
import os, sys, argparse
from time import time
import numpy as np
# Seed NumPy's global RNG so that NumPy-based randomness (e.g. the permutation
# shuffles in the simulation cell) is repeatable across fresh runs.
np.random.seed(2023)
# Extend the module search path BEFORE the imports below; the order matters.
# NOTE(review): `shap`, `utils`, and `data` presumably resolve from ../pshap, in
# which case `shap` is the project module, not the PyPI package of the same
# name — confirm.
sys.path.append('../pshap')
from shap import ShapEngine
import utils, data

## Load data

- We use the Wind dataset and suppose that 10% of the data points in the training dataset are mislabeled.
- The function `data.load_data` loads the train, validation, and test datasets, as well as the index of the noisy data points.

In [19]:
# Task type and dataset name handed to data.load_data.
problem = 'classification'
dataset = 'wind'

# Split sizes, forwarded to data.load_data as keyword arguments.
dargs = dict(n_data_to_be_valued=200, n_val=200, n_test=1000)

In [20]:
# Fetch the train / validation / test splits together with the noisy index.
(X, y), (X_val, y_val), (X_test, y_test), noisy_index = data.load_data(
    problem,
    dataset,
    **dargs,
)

# Report the shape of each feature matrix between two separator rules ...
print(
    '-'*30,
    f'Train X: {X.shape}',
    f'Val X: {X_val.shape}',
    f'Test X: {X_test.shape}',
    '-'*30,
    sep='\n',
)
# ... followed by the per-class counts of the training labels.
print(np.bincount(y))

--------------------------------------------------
Wind
--------------------------------------------------
------------------------------
Train X: (200, 14)
Val X: (200, 14)
Test X: (1000, 14)
------------------------------
[100 100]


## Check the marginal utility from accuracy and probability

In [21]:
# Settings read by the accuracy-based ShapEngine built in the next cell.
model_family = 'SVC'
metric = 'accuracy'
GR_threshold = 1.05
weights_list = [
    (1, 16),
    (1, 4),
    (1, 1),
    (4, 1),
    (16, 1),
]

In [22]:
# Construct the engine scored with the current `metric` setting
# ('accuracy' when the cells are executed in order).
shap_engine_acc = ShapEngine(
    X=X,
    y=y,
    X_val=X_val,
    y_val=y_val,
    problem=problem,
    model_family=model_family,
    metric=metric,
    GR_threshold=GR_threshold,
    max_iters=1,
)

Source is initialized. A unit of sample is one data point


In [23]:
# Rebind the metric to 'likelihood' for the probability-based engine built in
# the next cell; the threshold and weights are restated with the same values.
metric = 'likelihood'
GR_threshold = 1.05
weights_list = [
    (1, 16),
    (1, 4),
    (1, 1),
    (4, 1),
    (16, 1),
]

In [24]:
# Construct the engine scored with the current `metric` setting
# ('likelihood' when the cells are executed in order).
shap_engine_proba = ShapEngine(
    X=X,
    y=y,
    X_val=X_val,
    y_val=y_val,
    problem=problem,
    model_family=model_family,
    metric=metric,
    GR_threshold=GR_threshold,
    max_iters=1,
)

Source is initialized. A unit of sample is one data point


In [25]:
from tqdm import trange

n = len(y)

# Tallies accumulated over every (permutation, prefix size) step in which both
# engines could be fitted and scored.
num_valid = 0
num_same_acc, num_same_proba, num_da_cond_sp, num_dp_cond_sa, num_all_same = 0, 0, 0, 0, 0

for _ in trange(1000):
    # Fresh random ordering of the training points for this pass.
    perm = np.arange(n)
    np.random.shuffle(perm)

    # Starting utilities, before any training point has been added.
    old_au = shap_engine_acc.init_score()
    old_pu = shap_engine_proba.init_score()

    for i in range(1, n+1):
        # Refit both engines' models on the first i points of the permutation
        # and read off the accuracy / probability utilities. A ValueError from
        # either engine skips this prefix and leaves the previous utilities
        # untouched.
        subset = perm[:i]
        try:
            shap_engine_acc.model.fit(X[subset], y[subset])
            au = shap_engine_acc.value()
            shap_engine_proba.model.fit(X[subset], y[subset])
            pu = shap_engine_proba.value()
        except ValueError:
            continue

        num_valid += 1

        # Zero marginal contribution <=> the utility equals the previous one.
        same_acc = old_au == au
        same_proba = old_pu == pu

        if same_acc and same_proba:
            # Both utilities unchanged.
            num_same_acc += 1
            num_same_proba += 1
            num_all_same += 1
        elif same_acc:
            # Accuracy unchanged while the probability utility moved.
            num_same_acc += 1
            num_dp_cond_sa += 1
        elif same_proba:
            # Probability utility unchanged while accuracy moved.
            num_same_proba += 1
            num_da_cond_sp += 1

        old_au, old_pu = au, pu

print(f'num_valid: {num_valid}')
print(f'num_same_acc: {num_same_acc}')
print(f'num_same_proba: {num_same_proba}')
print(f'num_da_cond_sp: {num_da_cond_sp}')
print(f'num_dp_cond_sa: {num_dp_cond_sa}')
print(f'num_all_same: {num_all_same}')

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [49:26<00:00,  2.97s/it]

num_valid: 198062
num_same_acc: 109462
num_same_proba: 103
num_da_cond_sp: 102
num_dp_cond_sa: 109461
num_all_same: 1



