### Quantifying the prominence of prediction discrepancies between ML models in data science practice

In [None]:
import logging
import os

logger = logging.getLogger()
logger.setLevel(logging.WARNING)

import numpy as np
import pandas as pd

from pyemd import emd_samples

from sklearn.metrics import f1_score, plot_precision_recall_curve, RocCurveDisplay, plot_confusion_matrix, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import appdirs

import openml
from mltasks import openml_tasks

%matplotlib inline
%load_ext line_profiler

# Per-user cache directory resolved by appdirs for the "mltasks" application.
PATH_DATA = appdirs.user_cache_dir(appname="mltasks", appauthor="mltasks")
# Sub-directory holding the HDF5 stores read by the cells below (the trailing
# slash is kept: later cells append '/<file>.h5' to this prefix).
PATH_OPENML = f"{PATH_DATA}/openml/"

### Retrieve data from OpenML
##### Input/output data, predictions from best models submitted to OpenML and computation of prediction discrepancies between those models

In [None]:
import warnings
from tables import NaturalNameWarning

# Silence the PyTables NaturalNameWarning for the rest of the session.
warnings.filterwarnings('ignore', category=NaturalNameWarning)

# OpenML-CC18 Curated Classification benchmark
suite = openml_tasks.get_suite(suite='OpenML-CC18')

# NOTE(review): only the tasks from index 28 onwards are processed here --
# presumably a resume point after an interrupted run; confirm before reuse.
tasks_to_fetch = suite.tasks[28:]

for task_id in tasks_to_fetch:
    # The returned values are not used afterwards in this cell; the calls
    # are presumably made for their caching side effects (TODO confirm).
    discr, preds, scores = openml_tasks.get_discrepancies(
        task=task_id,
        metric='predictive_accuracy',
        n_runs=100,
        epsilon_runs=0.02,
    )
    data, target = openml_tasks.get_dataset(task=task_id)

In [None]:
# Report how many entries the discrepancy store holds, i.e. how many datasets
# of the benchmark have been retrieved from OpenML so far.
with pd.HDFStore(PATH_OPENML+'/openml-discr.h5') as store:
    n_retrieved = len(store.keys())
    print(n_retrieved)

### Descriptive analysis of prediction discrepancies

In [None]:
# Build one row of summary statistics per dataset found in the discrepancy
# store. Both HDF5 stores are opened through context managers so that their
# file handles are released as soon as the table is built (the discrepancy
# store is opened again later in this notebook).
df_dataset_properties = {}

with pd.HDFStore(PATH_OPENML+'/openml-discr.h5') as store_discr, \
        pd.HDFStore(PATH_OPENML+'/openml-accuracies.h5') as store_accuracies:

    for dataset_id in store_discr.keys():

        discr = store_discr[dataset_id]
        prediction_error = 1-store_accuracies[dataset_id]

        # Store keys look like '/<id>': the part after the slash is the
        # numeric identifier passed to OpenML.
        dataset = openml.datasets.get_dataset(int(dataset_id.split('/')[1]))

        n_instances = dataset.qualities['NumberOfInstances']
        n_features = dataset.qualities['NumberOfFeatures']

        df_dataset_properties[dataset.name] = {
            'Proportion of discrepancies': discr.sum()/discr.shape[0],
            'Prediction error of the worst model': prediction_error.max(),
            'Number of instances': n_instances,
            'Number of features': n_features,
            'Ratio features/instances': n_features/float(n_instances),
        }

# Datasets were collected as columns; transpose to get one row per dataset.
df_dataset_properties = pd.DataFrame(df_dataset_properties).T

In [None]:
# List the available summary statistics, then plot the worst model's
# prediction error against the proportion of discrepancies (one point per
# dataset).
print(df_dataset_properties.columns)

df_dataset_properties.plot.scatter(
    x='Proportion of discrepancies',
    y='Prediction error of the worst model',
)

##### Distribution of the number of prediction discrepancies by dataset

In [None]:
from matplotlib.ticker import PercentFormatter

# Box plot of the per-dataset proportion of discrepancies; the whiskers span
# the full range (0th to 100th percentile).
ax = sns.boxplot(data=df_dataset_properties, y='Proportion of discrepancies',
            whis=[0, 100], width=.6, palette="vlag")

# Overlay every dataset as an individual point on top of the box.
sns.stripplot(data=df_dataset_properties, y='Proportion of discrepancies',
              size=4, color=".3", linewidth=0)
plt.ylim([-0.1,1.1])
# The backslashes are doubled so that mathtext receives '\ ' (a forced space)
# without relying on an invalid Python escape sequence.
plt.title('Proportion of instances $\\bf{with\\ prediction\\ discrepancies}$\n over the 72 datasets of OpenML-CC18')
plt.ylabel('')
# Proportions are stored in [0, 1]; display them as percentages.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

plt.tight_layout()

# Make sure the output directory exists before writing the figure.
figures_dir = os.path.expanduser('~')+'/Desktop/discrepancies/figures'
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(figures_dir+'/proportion_with_discr.pdf')

In [None]:
# Export the proportion of discrepancies of every dataset as a LaTeX table.
s = df_dataset_properties.loc[:,'Proportion of discrepancies']

# Format as percentages with one decimal in a single pass. Formatting the
# number directly avoids floating-point artefacts such as
# '12.300000000000001%' and avoids writing strings into a float Series.
s = s.mul(100).map('{:.1f}%'.format)
s.name = "Proportion of predictions with discrepancies"

print(s.to_latex())

### Explore the structure of instances with prediction discrepancies with the Wasserstein distance

The dataset must first be prepared before the Wasserstein distance between instances can be computed.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

def feature_encoder(X, feature_to_encode):
    """
    Replace one column of a dataframe by its dummy (one-hot) encoding.

    The input dataframe is left untouched; the encoded columns are appended
    after the remaining original columns.

    Args:
        X (pandas.DataFrame): dataset
        feature_to_encode (string or int): feature to encode (name of the dataframe column)

    Returns:
        pandas.DataFrame: copy of the dataset without the original column and
        with its dummy columns concatenated on the right
    """
    encoded = pd.get_dummies(X[[feature_to_encode]])
    remaining = X.drop(columns=feature_to_encode)

    return pd.concat([remaining, encoded], axis=1)

def get_and_prepare_openML_dataset(dataset_id):
    """
    Load a dataset and turn it into a fully numeric, standardized feature table.

    Nominal features are dummy-encoded, missing values are imputed from the
    nearest neighbour, features are standardized and the target is
    label-encoded. The pre-computed discrepancies of the dataset are read
    from the local 'openml-discr.h5' store.

    Args:
        dataset_id (int): OpenML dataset identifier, also used as the key of
            the discrepancy store

    Returns:
        tuple: (X, labels, mask_instances_with_discrepancies) where X is a
        dataframe of standardized features indexed like the raw data, labels
        is a dataframe with the columns 'Label' and 'Discrepancies', and the
        mask is a boolean array that is True where the stored discrepancy
        equals 1
    """

    # Get OpenML dataset properties
    dataset = openml.datasets.get_dataset(dataset_id)

    # Retrieve dataset
    # NOTE(review): a dataset id is passed as `task` here, while other cells
    # call openml_tasks.get_dataset with a task id -- confirm this is intended.
    (data, y) = openml_tasks.get_dataset(task=dataset_id)
    X = data

    # Encode non-numeric features
    # The last nominal feature is skipped -- presumably it is the target
    # column (TODO confirm).
    nominal_positions = dataset.get_features_by_type('nominal')
    features_to_encode = [X.columns[position] for position in nominal_positions[:-1]]
    for feature in features_to_encode:
        X = feature_encoder(X, feature)

    # Complete missing values
    X = KNNImputer(n_neighbors=1).fit_transform(X)

    # Standardize features
    X = StandardScaler().fit_transform(X)
    X = pd.DataFrame(X, index=data.index)

    # Encode target
    # LabelEncoder expects a 1-D input: passing the Series directly avoids
    # the column-vector conversion warning and the extra flatten step.
    y = LabelEncoder().fit_transform(y)
    y = pd.Series(y, index=X.index, name='Label')

    # Retrieve pre-computed discrepancies
    with pd.HDFStore(PATH_OPENML+'/openml-discr.h5') as store:
        y_discr = store[str(dataset_id)]
    mask_instances_with_discrepancies = (y_discr==1).values

    y_discr.name = 'Discrepancies'

    labels = pd.concat((y, y_discr), axis=1)

    return (X, labels, mask_instances_with_discrepancies)

In [None]:
# Sanity check of the dataset's cleaning & preprocessing: a default random
# forest is cross-validated on the prepared data and its mean score shown.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

dataset_id = 3
X, labels, mask_instances_with_discrepancies = get_and_prepare_openML_dataset(dataset_id)

clf = RandomForestClassifier()
cv_scores = cross_val_score(clf, X, labels['Label'])
cv_scores.mean()

In [None]:
# Compute, for every binary classification dataset of the benchmark, the
# Wasserstein (earth mover's) distance between several groups of instances.
res = []

# The results are checkpointed to this file after every dataset; make sure
# its directory exists before starting the (long) computation.
stats_path = os.path.expanduser('~')+'/Desktop/discrepancies/stats_discr.csv'
os.makedirs(os.path.dirname(stats_path), exist_ok=True)

# OpenML-CC18 Curated Classification benchmark
suite = openml_tasks.get_suite(suite='OpenML-CC18')

for task_id in suite.tasks:

    task = openml.tasks.get_task(task_id)
    dataset_id = task.dataset_id

    X, labels, mask_instances_with_discrepancies = get_and_prepare_openML_dataset(dataset_id)

    # Only binary classification datasets are analysed.
    if len(labels.Label.unique())>2:
        continue

    print('#########')
    print(dataset_id)

    # Row masks reused below. They are combined with `&` instead of chaining
    # two boolean selections, which made pandas re-align the second mask on
    # the already filtered frame (and warn about it).
    with_discr = labels.Discrepancies==1
    without_discr = labels.Discrepancies==0

    # Both classes need at least one instance with discrepancies.
    c = labels.Label.unique()[0]
    in_class = labels.Label==c
    out_class = labels.Label!=c
    if X[in_class & with_discr].empty or X[out_class & with_discr].empty:
        print("not enough discrepancies")
        continue

    # Control: distance between two random halves of the whole dataset.
    n_samples = int(X.shape[0]/2)
    print(n_samples)
    dist0 = emd_samples(X.sample(n_samples), X.sample(n_samples))
    res.append({'Dataset':dataset_id, 'Distance':dist0, 'Comparison':'Entire dataset', 'Label':None})

    # Distance computations are made 1 class versus another class

    for c in labels.Label.unique():

        in_class = labels.Label==c
        out_class = labels.Label!=c
        comparison_label = str(c)+' vs all'

        dist1 = emd_samples(X[in_class].values, X[out_class].values)
        res.append({'Dataset':dataset_id, 'Distance':dist1, 'Comparison':'Between classes', 'Label':comparison_label})

        dist2 = emd_samples(X[in_class & without_discr].values,
                            X[out_class & without_discr].values)
        res.append({'Dataset':dataset_id, 'Distance':dist2, 'Comparison':'Between classes - Instances without discrepancies', 'Label':comparison_label})

        dist3 = emd_samples(X[in_class & with_discr].values,
                            X[out_class & with_discr].values)
        res.append({'Dataset':dataset_id, 'Distance':dist3, 'Comparison':'Between classes - Instances with discrepancies', 'Label':comparison_label})

        # A null distance would make the ratio undefined: record NaN rather
        # than aborting the whole run.
        ratio = dist2/dist3 if dist3 else np.nan
        res.append({'Dataset':dataset_id, 'Distance':ratio, 'Comparison':'Ratio', 'Label':comparison_label})

    # Checkpoint everything computed so far after each dataset.
    df = pd.DataFrame(res)
    df.to_csv(stats_path)


In [None]:
# Reload the distances saved by the computation cell; the first CSV column
# holds the row index.
stats_csv = os.path.expanduser('~')+'/Desktop/discrepancies/stats_discr.csv'
res = pd.read_csv(stats_csv, index_col=0)

In [None]:
# Mean distance for each kind of comparison.
res.groupby('Comparison')['Distance'].mean()

In [None]:
# Box plot of the distances for each comparison (the 'Ratio' rows are left
# out). The category order is stated explicitly so that the hand-written tick
# labels below are guaranteed to match the boxes they describe.
comparison_order = ['Entire dataset',
                    'Between classes',
                    'Between classes - Instances without discrepancies',
                    'Between classes - Instances with discrepancies']

fig, ax = plt.subplots(figsize=(9,5))
sns.boxplot(data=res[res.Comparison!='Ratio'], x='Distance', y='Comparison',
            order=comparison_order, palette="vlag", ax=ax)

# One label per entry of comparison_order, in the same order. Backslashes are
# doubled so that mathtext receives '\ ' (a forced space) without relying on
# an invalid Python escape sequence.
ax.set_yticklabels(['Distance between 2 random samples\n from the entire dataset - $\\bf{for\\ control}$',
                     'Distances between instances of\n opposite classes - $\\bf{for\\ control}$',
                     'Distances between instances of\n opposite classes $\\bf{without\\ discrepancies}$',
                     'Distances between instances of\n opposite classes $\\bf{with\\ discrepancies}$'], rotation=0, horizontalalignment='right')
ax.set_title("Comparison of the closeness of instances\n with and without prediction discrepancies")
ax.set_ylabel('')
ax.set_xlabel('Distribution of Wasserstein distances over \nOpenML-CC18 binary classification datasets')

plt.tight_layout()

# Make sure the output directory exists before writing the figure.
figures_dir = os.path.expanduser('~')+'/Desktop/discrepancies/figures'
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(figures_dir+'/wasserstein.pdf')

In [None]:
from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

# Project the prepared features onto their first two principal components.
# X and labels are whatever dataset was prepared last by the previous cells.
projection = PCA(n_components=2).fit_transform(X)
X_embedded = pd.DataFrame(projection, index=X.index)

# Put the 2-D coordinates side by side with the class label and the
# discrepancy flag of each instance.
df_tmp = pd.concat((X_embedded, labels), axis=1)

# scatterplot: colour encodes discrepancies, marker style encodes the class
sns.scatterplot(data=df_tmp, x=0, y=1, hue="Discrepancies", style="Label")