forked from amiratag/DataShapley
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example.py
81 lines (72 loc) · 2.4 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#%load_ext autoreload
#%autoreload 2
import os
import sys
import time
import numpy as np
from Shapley import ShapNN
from DShap import DShap
import matplotlib.pyplot as plt
import sklearn
from shap_utils import *
#%matplotlib inline
# ---------------------------------------------------------------------------
# Experiment configuration for the synthetic classification example.
# ---------------------------------------------------------------------------
MEM_DIR = './'  # NOTE(review): not referenced in the visible part of this script.

# Task type and model family; both are handed to helpers further down.
problem = 'classification'
model = 'logistic'
hidden_units = []  # Left empty in the case of logistic regression.

# Number of leading rows used for fitting, and number of extra rows drawn
# for scoring, when the data set is generated.
train_size = 200
test_size = 50

# Feature dimension of the Gaussian inputs, plus the `difficulty` and
# `important` arguments forwarded to label_generator.
d = 50
difficulty = 1
important_dims = 5

# NOTE(review): num_classes and tol are not read in the visible script.
num_classes = 2
tol = 0.03

# Test accuracy the data-generation loop has to exceed before it stops early.
target_accuracy = 0.7

# Classifier built by return_model; in the visible script it is only fitted
# and scored while the synthetic data set is being generated.
clf = return_model(model, solver='liblinear', hidden_units=tuple(hidden_units))
# Draw synthetic data until the classifier's test accuracy exceeds
# target_accuracy, trying at most 100 times. After every attempt that falls
# short, the `param` value handed to label_generator is multiplied by 1.1 and
# a fresh data set is drawn.
_param = 1.0
for _attempt in range(100):
    # Standard-normal features: zero mean, identity covariance.
    X_raw = np.random.multivariate_normal(
        mean=np.zeros(d),
        cov=np.eye(d),
        size=train_size + test_size,
    )
    # Only the second of the four returned values (the labels) is kept.
    _, y_raw, _, _ = label_generator(
        problem,
        X_raw,
        param=_param,
        difficulty=difficulty,
        important=important_dims,
    )
    # Fit on the first train_size rows, score on everything after them.
    clf.fit(X_raw[:train_size], y_raw[:train_size])
    test_acc = clf.score(X_raw[train_size:], y_raw[train_size:])
    if test_acc > target_accuracy:
        break
    _param *= 1.1
# Reports the accuracy of the last attempt, whether or not the target was met.
print('Performance using the whole training set = {0:.2f}'.format(test_acc))
# First DShap run (seed 0).
# The first train_size rows form the training set; the rest is passed as the
# test set.
X = X_raw[:train_size]
y = y_raw[:train_size]
X_test = X_raw[train_size:]
y_test = y_raw[train_size:]

model, problem = 'logistic', 'classification'
num_test, directory = 1000, './temp'

# This is the only run that passes overwrite=True.
# NOTE(review): presumably that clears earlier results under `directory`
# -- confirm against DShap.
dshap = DShap(
    X, y, X_test, y_test, num_test,
    sources=None,
    sample_weight=None,
    model_family=model,
    metric='accuracy',
    overwrite=True,
    directory=directory,
    seed=0,
)
# g_run is switched off for this run only; the later runs use DShap.run's default.
dshap.run(100, 0.1, g_run=False)
# Second DShap run (seed 1), writing to the same directory as the seed-0 run.
# The split uses train_size instead of a hard-coded 100: with train_size = 200
# the old slice X_raw[100:] put rows 100-199 (training rows of the seed-0 run)
# into the test set and handed this run a different partition from the one
# whose results it is later merged with.
X, y = X_raw[:train_size], y_raw[:train_size]
X_test, y_test = X_raw[train_size:], y_raw[train_size:]
model = 'logistic'
problem = 'classification'
num_test = 1000
directory = './temp'
# No overwrite flag here, unlike the seed-0 run.
# NOTE(review): presumably DShap then keeps what is already stored under
# `directory` -- confirm against DShap.
dshap = DShap(X, y, X_test, y_test, num_test, model_family=model, metric='accuracy',
              directory=directory, seed=1)
dshap.run(100, 0.1)
# Third DShap run (seed 2), writing to the same directory as the earlier runs.
# The split uses train_size instead of a hard-coded 100: with train_size = 200
# the old slice X_raw[100:] put rows 100-199 (training rows of the seed-0 run)
# into the test set and handed this run a different partition from the one
# whose results it is later merged with.
X, y = X_raw[:train_size], y_raw[:train_size]
X_test, y_test = X_raw[train_size:], y_raw[train_size:]
model = 'logistic'
problem = 'classification'
num_test = 1000
directory = './temp'
# No overwrite flag here, unlike the seed-0 run.
# NOTE(review): presumably DShap then keeps what is already stored under
# `directory` -- confirm against DShap.
dshap = DShap(X, y, X_test, y_test, num_test, model_family=model, metric='accuracy',
              directory=directory, seed=2)
dshap.run(100, 0.1)
# Merge results on the most recent DShap instance, then plot.
# NOTE(review): presumably merge_results() combines the runs saved under the
# shared directory -- confirm against DShap.
dshap.merge_results()

# One convergence plot per set of marginals.
tmc_marginals = dshap.marginals_tmc
convergence_plots(tmc_marginals)
g_marginals = dshap.marginals_g
convergence_plots(g_marginals)

# Performance plot over the three value estimates held by the instance.
value_estimates = [dshap.vals_tmc, dshap.vals_g, dshap.vals_loo]
dshap.performance_plots(value_estimates, num_plot_markers=20,
                        sources=dshap.sources)