$$ \phi_i = C \sum_{S \subseteq D \setminus \{i\}} \frac{1}{\binom{n-1}{|S|}} [V(S \cup \{i\})-V(S)]$$

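$$ \phi_i = \mathbb{E}_{\pi \sim \Pi} [V(S^i_{\pi} \cup {\{i\}})-V(S^i_{\pi})]$$

where $\Pi$ is the uniform distribution over permutations of the training set $D$ and $S^i_{\pi}$ is the set of samples that precede $i$ in the permutation $\pi$.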

1. n = number of training samples
1. values = -np.inf * np.ones(n)
1. scores = [[] for _ in range(n)]
1. For i in 1..n:
    1. iteration = 0
    1. while iteration < max_iterations:
        1. Draw an n-permutation
        1. model.fit(samples(permutation(:index(i))))
        1. score_without = model.score(test set)
        1. model.fit(samples(permutation(:index(i)+1)))
        1. score_with = model.score(test set)
        1. old_moving_average = mean(scores(i))
        1. scores(i).push(score_with - score_without)
        1. new_moving_average = mean(scores(i))
        1. if abs(new_moving_average - old_moving_average) < eps then break
        1. old_moving_average = new_moving_average 
        1. iteration += 1
    1. values(i) = mean(scores(i))

In [None]:
%load_ext autoreload
import matplotlib.pyplot as plt

from functools import partial
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from valuation.utils import Dataset
import numpy as np

In [None]:
# Load the Boston housing data through the project's Dataset wrapper and fit a
# baseline gradient-boosting regressor on its training split.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still ships it.
data = Dataset.from_sklearn(datasets.load_boston())
model = GradientBoostingRegressor().fit(data.x_train, data.y_train)

# Predictions on the test split; compared against the true targets in the
# plotting cell that follows.
predictions = model.predict(data.x_test)

In [None]:
# Predicted-vs-true scatter for the test split. Points lying on the dashed
# black diagonal are predicted exactly.
diagonal = [0, 50]
plt.figure(figsize=(9, 6))
plt.xlabel('True')
plt.ylabel('Predicted')
plt.scatter(data.y_test, predictions)
# Trailing semicolon keeps the notebook from echoing the returned artists.
plt.plot(diagonal, diagonal, '--k');

In [None]:
# Invoke the autoreload extension (loaded in the first cell) before importing
# from the local `valuation` package below.
%autoreload

In [None]:
from valuation.shapley import permutation_exact_shapley, truncated_montecarlo_shapley
from valuation.utils import map_reduce, Utility
from valuation.reporting.scores import compute_fb_scores
from valuation.reporting.plots import shapley_results

# Naive Shapley

We want to examine how the Shapley values change with the train and test data. In particular, we want to examine how robust data Shapley values are to out-of-distribution input data. To do so, we will progressively add more and more outliers to our training dataset.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from valuation.utils.dataset import polynomial_dataset, polynomial

# Build a polynomial dataset from three random integer coefficients drawn
# from [-3, 3) and fit a polynomial regression of matching degree.
d, coeffs = polynomial_dataset(np.random.randint(-3, 3, size=3))
degree = len(coeffs) - 1
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Size of the clean dataset; later cells use it to slice the original
# training points out of the enlarged arrays.
n = len(d)
model.fit(d.x_train, d.y_train)
# predicted[k] holds the test predictions after k contaminating points have
# been appended to the training data (k = 0 is the clean fit).
predicted = [model.predict(d.x_test)]

# Contaminating points: every second training input, slightly jittered, with
# targets generated from a perturbed copy of the true coefficients.
jitter = np.random.uniform(-0.05, 0.05, size=len(d))
x_cont = (d.x_train.reshape(-1,) + jitter)[::2]
noisy_coeffs = np.random.normal(loc=coeffs, scale=0.3)
y_cont = polynomial(noisy_coeffs, x_cont)
xtrain = np.concatenate([d.x_train, x_cont.reshape(-1, 1)], axis=0)
ytrain = np.concatenate([d.y_train, y_cont.reshape(-1,)], axis=0)

# Refit once per added contaminating point, each time on the clean data plus
# the first k contaminating points, and record the test predictions.
for stop in range(len(d) + 1, len(xtrain) + 1):
    model.fit(xtrain[:stop], ytrain[:stop])
    predicted.append(model.predict(d.x_test))

# Ordering of the test inputs, so that line plots are drawn left to right.
test_indices = np.argsort(d.x_test, axis=0).reshape(-1, )
# Reference curve (labelled "True" below): the polynomial with the original
# coefficients evaluated on a regular grid over [-1, 1).
xx = np.arange(-1, 1, 0.1)
yy = polynomial(coeffs, xx)

from matplotlib import pyplot as plt

# The y-limits and the sorted test inputs do not change between figures, so
# compute them once.
y_low = min(d.y_train[:n].min(), y_cont.min()) - 1
y_high = max(d.y_train[:n].max(), y_cont.max()) + 1
x_test_sorted = d.x_test[test_indices]

# One figure per refit: entry `step` of `predicted` was fitted with `step`
# contaminating points, so exactly those points are shown.
for step, y_hat in enumerate(predicted):
    plt.figure(dpi=300)
    plt.scatter(d.x_train[:n], d.y_train[:n], label="Training (in-dist)")
    plt.scatter(d.x_test, d.y_test, label="Test")
    if step:
        plt.scatter(x_cont[:step], y_cont[:step],
                    label="Training (out of dist)")
    plt.plot(xx, yy, label="True")
    plt.plot(x_test_sorted, y_hat[test_indices], label="Predicted")
    plt.ylim(y_low, y_high)
    plt.legend()
    plt.title(d.description)
    plt.show()

# Replace the dataset's training split with the contaminated one (clean
# points followed by the contaminating points) before valuing it.
d.x_train = xtrain
d.y_train = ytrain

from valuation.shapley import combinatorial_exact_shapley
from valuation.utils import Utility
# Utility built from the model, the contaminated dataset and the
# "neg_median_absolute_error" scorer name.
u = Utility(model, d, "neg_median_absolute_error")
values = combinatorial_exact_shapley(u, progress=False)
# NOTE(review): assumes iterating `values` yields training indices in
# ascending order of value, so that reversing puts the most valuable first —
# confirm against combinatorial_exact_shapley's return type.
high_to_low = list(reversed(values))

# Number of top-ranked training points to highlight and refit on.
take = 5
# The `take` first entries of the `high_to_low` ranking, selected once and
# reused for both the scatter markers and the refit below.
top_x = d.x_train[high_to_low][:take]
top_y = d.y_train[high_to_low][:take]

plt.figure(dpi=300)
plt.scatter(d.x_train[:n], d.y_train[:n], label="Training (in-dist)")
plt.scatter(d.x_test, d.y_test, label="Test")
plt.scatter(top_x, top_y, marker='x', label='High value')
plt.scatter(x_cont, y_cont, label="Training (out of dist)")
plt.plot(xx, yy, label="True")
# Predictions of the model fitted on the fully contaminated training data.
plt.plot(d.x_test[test_indices], predicted[-1][test_indices], label="Predicted")

# Refit using only the highlighted points and overlay the resulting curve.
# NOTE: this refits the shared `model` object in place.
model.fit(top_x, top_y)
ypred = model.predict(d.x_test)
# Plot in sorted-x order, like the other prediction curves; plotting the
# unsorted test inputs joins points in arbitrary order and draws a zig-zag
# instead of a curve.
plt.plot(d.x_test[test_indices], ypred[test_indices], label='High: prediction')
plt.title(d.description)

plt.legend()
plt.show()

# MCShapley

In [None]:
# Only attached to the reported scores in a later cell; it is not passed to
# the valuation call in this cell.
max_iterations = 200

# NOTE(review): when the notebook is run top to bottom, `model` is by now the
# polynomial pipeline from the "Naive Shapley" section (refitted there), not
# the GradientBoostingRegressor fitted on `data` — confirm this is intended.
utility_options = dict(scoring=None, enable_cache=False)
utility = Utility(model, data, **utility_options)

# Bind the utility, then let map_reduce drive the repeated runs.
fun = partial(permutation_exact_shapley, utility=utility, progress=True)
values_nmcs, hist_nmcs = map_reduce(fun, num_runs=10, num_jobs=160)

In [None]:
# Score the model/data pair using the values from the permutation run; the
# result is annotated and plotted in the next cell.
scores_nmcs = compute_fb_scores(model=model, data=data, values=values_nmcs)

In [None]:
# Attach the run metadata to the scores before handing them to the plotting
# helper.
scores_nmcs['max_iterations'] = max_iterations
scores_nmcs['score_name'] = "$R^2$"
shapley_results(scores_nmcs)

# Truncated MC Shapley

In [None]:
# Keyword arguments forwarded to truncated_montecarlo_shapley.
# NOTE(review): `max_iterations` evaluates to a float (0.5 * len(data)) —
# confirm the callee accepts a non-integer iteration bound.
params = dict(
    bootstrap_iterations=200,
    min_scores=10,
    score_tolerance=0.1,
    min_values=10,
    value_tolerance=1e-2,
    max_iterations=0.5 * len(data),
)

# Pre-bind the model, the data and all options; the resulting callable is
# passed to map_reduce in the next cell.
fun = partial(
    truncated_montecarlo_shapley,
    model,
    data,
    num_workers=160,
    worker_progress=False,
    **params,
)

In [None]:
# Run the pre-bound truncated Monte Carlo callable through map_reduce,
# unpacking the values and a second result stored as `hist_mcs`.
values_mcs, hist_mcs = map_reduce(fun, data=data, num_runs=10)

In [None]:
# Score the model/data pair using the truncated Monte Carlo values; the
# result is annotated and plotted in the next cell.
scores_mcs = compute_fb_scores(model=model, data=data, values=values_mcs)

In [None]:
# Attach the run metadata to the scores before handing them to the plotting
# helper.
scores_mcs['max_iterations'] = params['max_iterations']
scores_mcs['score_name'] = "$R^2$"
shapley_results(scores_mcs)