# Dval - Shapley for data valuation

This notebook introduces Dval, a library for evaluating how much individual data points contribute to the performance of machine learning models.
We will go through the foundations of the library and its main entry points and capabilities, working with a real dataset of music tracks.
We will also highlight the advantages of using our library over vanilla data-Shapley calculations, explicitly showing the gains in runtime and efficiency on large datasets.

In [None]:
%load_ext autoreload
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the Spotify tracks table through the library helper and preview its first rows.
# NOTE(review): min_year=2014 presumably restricts the data to recent tracks -
# confirm against load_spotify_dataset.
from valuation.utils import load_spotify_dataset

data = load_spotify_dataset(min_year=2014)
data.head(5)

In [None]:
# Separate the regression target from the features, then partition the rows
# into train / validation / test with two successive seeded splits.
target_column = 'popularity'
X = data.drop(columns=target_column)
y = data[target_column]

# First split: 30% of all rows are held out as the test set.
# X and y are rebound here to the remaining 70%.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# Second split: 30% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=24)

In [None]:
# Set the track title and artist aside, then remove both columns from every
# feature matrix so that only the remaining columns are fed to the model.
# `artist` is reused later as the grouping key and for row filtering.
metadata_columns = ['song', 'artist']

# The cell rebinds X_train / X_test / X_val, so on a second execution the
# metadata columns are already gone. The guard keeps the previously extracted
# `song_name` / `artist` in that case instead of raising a KeyError.
if set(metadata_columns).issubset(X_train.columns):
    song_name = X_train['song']
    artist = X_train['artist']

# errors='ignore' makes the drop a no-op for columns that were already removed.
X_train = X_train.drop(columns=metadata_columns, errors='ignore')
X_test = X_test.drop(columns=metadata_columns, errors='ignore')
X_val = X_val.drop(columns=metadata_columns, errors='ignore')

Note: Before running the next cells, make sure memcached is running: restart it (or simply start it if it is not already running). In a terminal, type

`sudo service memcached restart`

In [None]:
from valuation.shapley import create_utility, shapley_dval
from sklearn.ensemble import GradientBoostingRegressor

# Build the utility from a small gradient-boosting regressor, the training
# split, the validation split (passed as x_test / y_test) and the scoring name.
# NOTE(review): data_groups=artist presumably makes the valuation per artist
# rather than per track - the result is later read through an 'artist' column.
utility = create_utility(
    model=GradientBoostingRegressor(n_estimators=3),
    x_train=X_train,
    y_train=y_train,
    x_test=X_val,
    y_test=y_val,
    scoring='neg_mean_absolute_error',
    data_groups=artist,
)

# First run: 20 jobs with a single iteration each.
dval_df = shapley_dval(utility, iterations_per_job=1, num_jobs=20)

In [None]:
from valuation.shapley import create_utility, shapley_dval
from sklearn.ensemble import GradientBoostingRegressor

# Same utility definition as for the one-iteration run: small gradient-boosting
# regressor, training split, validation split as the scoring set, grouped by artist.
utility = create_utility(
    model=GradientBoostingRegressor(n_estimators=3),
    x_train=X_train,
    y_train=y_train,
    x_test=X_val,
    y_test=y_val,
    scoring='neg_mean_absolute_error',
    data_groups=artist,
)

# Second run: 20 jobs with 10 iterations each. The resulting dval_df is the one
# used by the plots and the filtering that follow.
dval_df = shapley_dval(utility, iterations_per_job=10, num_jobs=20)

In [None]:
# Error-bar plot (value +/- dval_std) for the first 30 rows of dval_df.
# NOTE(review): treating the first rows as the *lowest* values assumes
# shapley_dval returns its result sorted by ascending value - confirm.
low_dval = dval_df.iloc[:30]

fig, ax = plt.subplots(figsize=(20, 5))
ax.errorbar(
    x=low_dval['artist'],
    y=low_dval['shapley_dval'],
    yerr=low_dval['dval_std'],
    fmt='o',
)
# Title and axis labels let the figure stand alone next to the other
# error-bar plots in this notebook.
ax.set(
    title='Lowest-valued artists (first 30 rows of dval_df)',
    xlabel='artist',
    ylabel='shapley_dval (error bars: dval_std)',
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Build a reduced training set that excludes every track by one of the
# artists listed in the first 30 rows of dval_df.
low_dval_artists = dval_df['artist'].iloc[:30].tolist()

# True for the training rows that are kept.
artist_filter = ~artist.isin(low_dval_artists)

X_train_new = X_train.loc[artist_filter]
y_train_new = y_train.loc[artist_filter]

In [None]:
from sklearn.metrics import mean_absolute_error

# Fit the regressor on the reduced training set (low-value artists removed)
# and report its mean absolute error on the held-out test split.
# It is named `pruned_model` because it is NOT trained on the full data; the
# full-data model is fitted separately for comparison.
pruned_model = GradientBoostingRegressor(n_estimators=3).fit(X_train_new, y_train_new)

# Arguments follow the documented (y_true, y_pred) order.
mean_absolute_error(y_test, pruned_model.predict(X_test))

In [None]:
# Reference point: the same regressor fitted on the complete training split,
# scored with mean absolute error on the held-out test split.
full_model = GradientBoostingRegressor(n_estimators=3).fit(X_train, y_train)

# Arguments follow the documented (y_true, y_pred) order.
mean_absolute_error(y_test, full_model.predict(X_test))

## Evaluation of anomalous data

In [None]:
# Error-bar plot (value +/- dval_std) for the last 30 rows of dval_df.
# NOTE(review): treating the last rows as the *highest* values assumes
# shapley_dval returns its result sorted by ascending value - confirm.
high_dval = dval_df.iloc[-30:]

fig, ax = plt.subplots(figsize=(20, 5))
ax.errorbar(
    x=high_dval['artist'],
    y=high_dval['shapley_dval'],
    yerr=high_dval['dval_std'],
    fmt='o',
)
# Title and axis labels let the figure stand alone next to the other
# error-bar plots in this notebook.
ax.set(
    title='Highest-valued artists (last 30 rows of dval_df)',
    xlabel='artist',
    ylabel='shapley_dval (error bars: dval_std)',
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Inject an anomaly: overwrite the target of every training track by Rihanna with 0.
rihanna_mask = artist == 'Rihanna'

# Fail loudly if the artist is absent from the training split; otherwise the
# assignment would silently change nothing and the re-valuation would be pointless.
assert rihanna_mask.any(), "no training rows found for artist 'Rihanna'"

# Write into an explicit copy rather than into the object returned by the
# split (which may trigger pandas' SettingWithCopyWarning).
# The name `y_train` is kept on purpose: every later cell that uses y_train
# sees the corrupted labels.
y_train = y_train.copy()
y_train.loc[rihanna_mask] = 0

In [None]:
# Recompute the values with the current y_train (the labels of Rihanna's
# tracks were set to 0 before this cell), using the same model, validation
# split, scoring and artist grouping as before.
utility = create_utility(
    model=GradientBoostingRegressor(n_estimators=3),
    x_train=X_train,
    y_train=y_train,
    x_test=X_val,
    y_test=y_val,
    scoring='neg_mean_absolute_error',
    data_groups=artist,
)
dval_df = shapley_dval(utility, iterations_per_job=10, num_jobs=20)

In [None]:
# Error-bar plot (value +/- dval_std) for the first 30 rows of the dval_df
# recomputed after the labels of Rihanna's tracks were set to 0.
# NOTE(review): treating the first rows as the *lowest* values assumes
# shapley_dval returns its result sorted by ascending value - confirm.
low_dval = dval_df.iloc[:30]

fig, ax = plt.subplots(figsize=(20, 5))
ax.errorbar(
    x=low_dval['artist'],
    y=low_dval['shapley_dval'],
    yerr=low_dval['dval_std'],
    fmt='o',
)
# Title and axis labels distinguish this figure from the earlier low-value
# plot, which was drawn before the labels were corrupted.
ax.set(
    title="Lowest-valued artists after zeroing the labels of Rihanna's tracks (first 30 rows of dval_df)",
    xlabel='artist',
    ylabel='shapley_dval (error bars: dval_std)',
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Advanced: Dval cache configuration

In [None]:
from valuation.utils import MemcachedConfig

# Explicit cache settings handed to the utility through cache_options.
# NOTE(review): the meaning of each threshold is defined by MemcachedConfig
# and is not visible here - consult its documentation before tuning.
memcache_config = MemcachedConfig(
    cache_threshold=0.3,
    allow_repeated_training=False,
    rtol_threshold=0.1,
    min_repetitions=3,
)

# Same utility as in the earlier runs, with caching switched on and configured.
# y_train still carries the zeroed labels for Rihanna's tracks at this point.
utility = create_utility(
    model=GradientBoostingRegressor(n_estimators=3),
    x_train=X_train,
    y_train=y_train,
    x_test=X_val,
    y_test=y_val,
    scoring='neg_mean_absolute_error',
    data_groups=artist,
    enable_cache=True,
    cache_options=memcache_config,
)
dval_df = shapley_dval(utility, iterations_per_job=10, num_jobs=20)

In [None]:
# Same valuation with the cache explicitly switched off (enable_cache=False),
# for comparison with the cached run.
utility = create_utility(
    model=GradientBoostingRegressor(n_estimators=3),
    x_train=X_train,
    y_train=y_train,
    x_test=X_val,
    y_test=y_val,
    scoring='neg_mean_absolute_error',
    data_groups=artist,
    enable_cache=False,
)
dval_df = shapley_dval(utility, iterations_per_job=10, num_jobs=20)