### Quantification of the prominence of discrepancies in ML models in data science practice

In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

import numpy as np
import pandas as pd

from pyemd import emd_samples

from sklearn.metrics import f1_score, plot_precision_recall_curve, RocCurveDisplay, plot_confusion_matrix, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import openml
import openml_fetcher

%matplotlib inline
%load_ext line_profiler

# Root directory for the cached OpenML stores and the exported figures: the
# cells below read/write under path_data + '/openml/' and path_data + '/figures/'.
# NOTE(review): hard-coded, machine-specific absolute path - adapt before running.
path_data = '/Users/---/Desktop/discrepancies'

ModuleNotFoundError: No module named 'mltasks'

In [None]:
# Populate the local cache under `path_data`: for each task of the benchmark,
# `openml_fetcher.get_discr` is called (per the original note: accuracy scores,
# predictions and the derived prediction discrepancies).

# Suite 99 = OpenML-CC18 Curated Classification benchmark
suite = openml.study.get_suite(99)

for task_id in suite.tasks:
    openml_fetcher.get_discr(task_id, path=path_data, get_data=False)

In [None]:
# Check how many datasets were retrieved from OpenML: one key per dataset is
# expected in the discrepancy store.
discr_store_path = path_data+'/openml/openml-discr.h5'
with pd.HDFStore(discr_store_path) as store:
    n_cached_datasets = len(store.keys())
print(n_cached_datasets)

In [None]:
# Build one row of summary properties per dataset found in the discrepancy store.
# The intermediate dict gets its own name so that `df_dataset_properties` is
# only ever a DataFrame.
dataset_properties = {}

# Both stores are opened in a `with` block so that the HDF5 file handles are
# closed once the loop is done (they were previously left open).
with pd.HDFStore(path_data+'/openml/openml-discr.h5') as store_discr, \
        pd.HDFStore(path_data+'/openml/openml-accuracies.h5') as store_accuracies:

    for dataset_id in list(store_discr.keys()):

        discr = store_discr[dataset_id]
        prediction_error = 1-store_accuracies[dataset_id]

        # The OpenML dataset id is the second '/'-separated component of the
        # store key (presumably keys look like '/<id>' - TODO confirm).
        dataset = openml.datasets.get_dataset(int(dataset_id.split('/')[1]))
        n_instances = dataset.qualities['NumberOfInstances']
        n_features = dataset.qualities['NumberOfFeatures']

        # NOTE(review): rows are keyed by dataset name, so two datasets sharing
        # a name would overwrite each other.
        dataset_properties[dataset.name] = {
            'Proportion of discrepancies': discr.sum()/discr.shape[0],
            'Prediction error of the worst model': prediction_error.max(),
            'Number of instances': n_instances,
            'Number of features': n_features,
            'Ratio features/instances': n_features/float(n_instances),
        }

# One row per dataset, one column per property
df_dataset_properties = pd.DataFrame(dataset_properties).T

In [None]:
# List the available properties, then plot the worst model's prediction error
# against the proportion of discrepancies (one point per dataset).
print(df_dataset_properties.columns)

df_dataset_properties.plot.scatter(x='Proportion of discrepancies', y='Prediction error of the worst model')

In [None]:
# Number of datasets in which at least 20% of the instances have discrepancies
tmp = df_dataset_properties['Proportion of discrepancies']
(tmp >= 0.2).sum()

In [None]:
from matplotlib.ticker import PercentFormatter

# Box plot (whiskers spanning the full 0-100 percentile range) of the proportion
# of discrepancies per dataset, with the individual datasets overlaid as points.
ax = sns.boxplot(data=df_dataset_properties, y='Proportion of discrepancies',
            whis=[0, 100], width=.6, palette="vlag")

# Draw the points explicitly on the same axes as the box plot
sns.stripplot(data=df_dataset_properties, y='Proportion of discrepancies',
              size=4, color=".3", linewidth=0, ax=ax)
ax.set_ylim([-0.1,1.1])
# The mathtext spaces are written `\\ `: a bare `\ ` is an invalid escape
# sequence in a normal string literal (the resulting text is unchanged).
# NOTE(review): the dataset count (72) is hard-coded in the title.
ax.set_title('Proportion of instances $\\bf{with\\ prediction\\ discrepancies}$\n over the 72 datasets of OpenML-CC18')
ax.set_ylabel('')
# Values are fractions in [0, 1]; display them as percentages
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

plt.tight_layout()
plt.savefig(path_data+'/figures/proportion_with_discr.pdf')

In [None]:
# Export the per-dataset proportion of discrepancies as a LaTeX table of
# percentage strings.
s = df_dataset_properties.loc[:,'Proportion of discrepancies']

# Round the fraction to 3 decimals, convert to percent, then format every value
# in one vectorised pass. This avoids writing strings one by one into a float
# Series (a dtype-incompatible assignment) and strips float artefacts such as
# '12.300000000000001%' left by the multiplication.
s = (s.astype(float).round(3)*100).round(1).map('{:.1f}%'.format)
s.name = "Proportion of predictions with discrepancies"

print(s.to_latex())

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

def feature_encoder(X, feature_to_encode):
    """
    Dummy-encode one column of a dataframe.

    The selected column is passed through `pd.get_dummies`, removed from the
    dataframe, and the resulting column(s) are appended on the right-hand side.

    Args:
        X (pd.DataFrame): dataset
        feature_to_encode (string or int): label of the column to encode

    Returns:
        pd.DataFrame: a new dataframe; `X` itself is not modified.
    """
    remaining_columns = X.drop(columns=feature_to_encode)
    encoded_columns = pd.get_dummies(X[[feature_to_encode]])

    return pd.concat((remaining_columns, encoded_columns), axis=1)

def get_and_prepare_openML_dataset(dataset_id):
    """
    Download an OpenML dataset, preprocess it and attach its cached discrepancies.

    Steps: dummy-encode nominal features, impute missing values with the
    nearest neighbour, standardise the features, label-encode the target and
    read the pre-computed discrepancies from the local HDF store under
    `path_data`.

    Args:
        dataset_id (int): OpenML dataset id

    Returns:
        tuple: `(X, labels, mask_instances_with_discrepancies)` where
            - `X` is the preprocessed feature dataframe (positional column
              names, index of the downloaded data),
            - `labels` is a dataframe with columns 'Label' and 'Discrepancies',
            - `mask_instances_with_discrepancies` is a boolean array, True
              where the stored discrepancy value equals 1.
    """

    # Get OpenML dataset properties
    dataset = openml.datasets.get_dataset(dataset_id)

    # Retrieve dataset
    (data, y) = fetch_openml(data_id=dataset_id, return_X_y=True)
    X = data

    # Encode non-numeric features
    # NOTE(review): the last nominal feature is skipped - presumably because it
    # is the target - and positions reported by OpenML are used to index
    # `X.columns`; both assume the target is the last feature. TODO confirm.
    nominal_positions = dataset.get_features_by_type('nominal')
    features_to_encode = [X.columns[position] for position in nominal_positions[:-1]]
    for feature in features_to_encode:
        X = feature_encoder(X, feature)

    # Complete missing values
    X = KNNImputer(n_neighbors=1).fit_transform(X)

    # Standardize features
    X = StandardScaler().fit_transform(X)
    X = pd.DataFrame(X, index=data.index)

    # Encode target. LabelEncoder expects a 1-D input, so `y` is passed as is
    # (wrapping it in a single-column frame triggers a DataConversionWarning
    # and requires flattening the result afterwards).
    y = pd.Series(LabelEncoder().fit_transform(y), index=X.index, name='Label')

    # Retrieve pre-computed discrepancies
    with pd.HDFStore(path_data+'/openml/openml-discr.h5') as store:
        y_discr = store[str(dataset_id)]
    mask_instances_with_discrepancies = (y_discr==1).values
    y_discr.name = 'Discrepancies'

    # Rows of `y` and `y_discr` are aligned on their index by the concatenation
    labels = pd.concat((y, y_discr), axis=1)

    return (X, labels, mask_instances_with_discrepancies)

In [None]:
# Sanity check of the dataset's cleaning & preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Prepare one dataset (OpenML id 15) and check that a default random forest
# reaches a sensible cross-validated score on the preprocessed features.
dataset_id = 15
X, labels, mask_instances_with_discrepancies = get_and_prepare_openML_dataset(dataset_id)

clf = RandomForestClassifier()
cv_scores = cross_val_score(clf, X, labels['Label'])
cv_scores.mean()

In [None]:
# For every binary dataset of the benchmark, compute earth mover's distances
# (`emd_samples`) between groups of instances: two random halves of the dataset
# (control), the two classes, and the two classes restricted to instances
# without / with prediction discrepancies.
res = []

# OpenML-CC18 Curated Classification benchmark
suite = openml.study.get_suite(99)

for task_id in suite.tasks:

    task = openml.tasks.get_task(task_id)
    dataset_id = task.dataset_id

    X, labels, mask_instances_with_discrepancies = get_and_prepare_openML_dataset(dataset_id)

    # Datasets with more than two classes are skipped
    if len(labels.Label.unique())>2:
        continue

    print('#########')
    print(dataset_id)

    # Boolean masks are combined with `&` rather than chained (`X[m1][m2]`):
    # chaining relies on pandas re-aligning the second mask on the already
    # filtered frame, which emits a warning.
    with_discr = labels.Discrepancies==1
    without_discr = labels.Discrepancies==0

    # Both classes need at least one instance with discrepancies
    c = labels.Label.unique()[0]
    if len(X[(labels.Label==c) & with_discr])==0 or len(X[(labels.Label!=c) & with_discr])==0:
        print("not enough discrepancies")
        continue

    # Control: distance between two random samples of half the dataset each.
    # NOTE(review): the samples are not seeded, so this value changes between runs.
    n_samples = int(X.shape[0]/2)
    print(n_samples)
    dist0 = emd_samples(X.sample(n_samples), X.sample(n_samples))
    res.append({'Dataset':dataset_id, 'Distance':dist0, 'Comparison':'Entire dataset', 'Label':None})

    # Distance computations are made 1 class versus another class

    for c in labels.Label.unique():

        is_c = labels.Label==c
        is_not_c = labels.Label!=c
        comparison_label = str(c)+' vs all'

        dist1 = emd_samples(X[is_c].values, X[is_not_c].values)
        res.append({'Dataset':dataset_id, 'Distance':dist1, 'Comparison':'Between classes', 'Label':comparison_label})

        dist2 = emd_samples(X[is_c & without_discr].values, X[is_not_c & without_discr].values)
        res.append({'Dataset':dataset_id, 'Distance':dist2, 'Comparison':'Between classes - Instances without discrepancies', 'Label':comparison_label})

        dist3 = emd_samples(X[is_c & with_discr].values, X[is_not_c & with_discr].values)
        res.append({'Dataset':dataset_id, 'Distance':dist3, 'Comparison':'Between classes - Instances with discrepancies', 'Label':comparison_label})

        # A zero distance between the instances with discrepancies would make
        # the ratio raise ZeroDivisionError; record NaN instead.
        ratio = dist2/dist3 if dist3 != 0 else np.nan
        res.append({'Dataset':dataset_id, 'Distance':ratio, 'Comparison':'Ratio', 'Label':comparison_label})

    # The CSV is rewritten after each analysed dataset, which keeps the partial
    # results on disk if the loop is interrupted.
    df = pd.DataFrame(res)
    df.to_csv(path_data+'/openml/stats_discr.csv')



In [None]:
# Reload the distance statistics written by the previous cell.
# NOTE(review): this overrides the `path_data` defined in the first cell with a
# different absolute path; every later cell (e.g. the figure export) uses this one.
path_data = '/Users/a435vv/Desktop/discrepancies'
stats_csv_path = path_data+'/openml/stats_discr.csv'
res = pd.read_csv(stats_csv_path, index_col=0)

In [None]:
# Average distance for each kind of comparison
res.groupby('Comparison')['Distance'].mean()

In [None]:
# Box plot of the distances for each kind of comparison (the 'Ratio' rows are
# left out).
# The mapping below fixes both the order of the boxes and their tick labels, so
# the labels cannot get out of sync with the order in which the comparisons
# happen to appear in `res`. Mathtext spaces are written `\\ `: a bare `\ ` is
# an invalid escape sequence in a normal string literal.
comparison_ticklabels = {
    'Entire dataset': 'Distance between 2 random samples\n from the entire dataset - $\\bf{for\\ control}$',
    'Between classes': 'Distances between instances of\n opposite classes - $\\bf{for\\ control}$',
    'Between classes - Instances without discrepancies': 'Distances between instances of\n opposite classes $\\bf{without\\ discrepancies}$',
    'Between classes - Instances with discrepancies': 'Distances between instances of\n opposite classes $\\bf{with\\ discrepancies}$',
}

fig, ax = plt.subplots(figsize=(9,5))
sns.boxplot(data=res[res.Comparison!='Ratio'], x='Distance', y='Comparison',
            order=list(comparison_ticklabels.keys()), palette="vlag", ax=ax)
ax.set_yticklabels(list(comparison_ticklabels.values()), rotation=0, horizontalalignment='right')
#ax.set_title("Distributions of normalized Wasserstein distances for various\n configurations over OpenML-CC18 binary classification datasets")
ax.set_title("Comparison of the closeness of instances\n with and without prediction discrepancies")
ax.set_ylabel('')
ax.set_xlabel('Distribution of Wasserstein distances over \nOpenML-CC18 binary classification datasets')

plt.tight_layout()
plt.savefig(path_data+'/figures/wasserstein.pdf')

In [None]:
from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

# Project the features of the current dataset (`X`) onto their first two
# principal components, keeping the original row index.
pca = PCA(n_components=2)
X_embedded = pd.DataFrame(pca.fit_transform(X), index=X.index)

# Attach the label / discrepancy columns to the 2-D coordinates
df_tmp = pd.concat([X_embedded, labels], axis=1)

# Scatter plot: colour = discrepancy flag, marker = class label
sns.scatterplot(data=df_tmp, x=0, y=1, hue="Discrepancies", style="Label")

In [None]:
## Look at which labels / discrepancies the nearest neighbours of points with discrepancies have

from sklearn.neighbors import NearestNeighbors

# Query the 11 nearest neighbours of every instance of `X` within `X` itself
nbrs = NearestNeighbors(n_neighbors=11)
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)

In [None]:
from sklearn.metrics import pairwise_distances

def count_discr(x):
    """Count the instances with discrepancies among the rows at positions `x`.

    Args:
        x: integer row positions into `labels` (here: one row of neighbour
            positions taken from `indices`).

    Returns:
        Sum of the 'Discrepancies' column over those rows.
    """
    # Read from `labels`, not `df`: at this point of the notebook `df` holds the
    # distance statistics table, which has no 'Discrepancies' column.
    return labels.Discrepancies.iloc[x].sum()

def count_label(x):
    """Count how often each class label occurs among the rows at positions `x`.

    Args:
        x: integer row positions into `labels`.

    Returns:
        pd.Series: number of occurrences, indexed by label value.
    """
    return labels.Label.iloc[x].value_counts()

# Column 0 of `indices` is left out everywhere below (presumably each point's
# own entry in its neighbour list - TODO confirm). The frame is indexed like
# `X` so that every series built from it aligns with `labels` when concatenated.
neighbour_positions = pd.DataFrame(indices[:,1:], index=X.index)
n_neighbours = neighbour_positions.shape[1]

nbrs_discr = neighbour_positions.apply(count_discr, axis=1)
nbrs_discr = nbrs_discr/n_neighbours
nbrs_discr.name = 'Proportion of discrepancies among neighbours'

nbrs_labels = neighbour_positions.apply(count_label, axis=1).fillna(0)
nbrs_labels = nbrs_labels/n_neighbours
nbrs_labels.columns = ['Label '+str(c) for c in nbrs_labels.columns]

# Class labels of the current dataset (no top-level `y` exists in the notebook:
# the encoded target is only available as `labels.Label`).
label_col = labels.Label

nbrs_prop_opposite_labels = [
    (label_col.iloc[indices[i,1:]] != label_col.iloc[i]).sum()
    for i in range(X.shape[0])
]
nbrs_prop_opposite_labels = pd.Series(nbrs_prop_opposite_labels, index=X.index, name='Proportion of neighbours with different labels')
nbrs_prop_opposite_labels = nbrs_prop_opposite_labels/n_neighbours

# Mean distance to points of opposite class / vs / same class
# NOTE(review): the full pairwise distance matrix is n x n - memory heavy for
# large datasets.
N_CLOSEST = 20  # the mean is taken over the N_CLOSEST smallest distances
X_dist = pairwise_distances(X)
X_dist = pd.DataFrame(X_dist, index=X.index, columns=X.index)

mean_dist_same_label, mean_dist_diff_label = [], []
for idx in X.index:
    same_label_index = label_col[label_col==label_col.loc[idx]].index
    diff_label_index = label_col[label_col!=label_col.loc[idx]].index
    # The point itself is dropped so that its zero self-distance is not counted
    mean_dist_same_label.append( X_dist.loc[idx, same_label_index].drop(idx, errors='ignore').sort_values().iloc[:N_CLOSEST].mean() )
    mean_dist_diff_label.append( X_dist.loc[idx, diff_label_index].drop(idx, errors='ignore').sort_values().iloc[:N_CLOSEST].mean() )
mean_dist_same_label = pd.Series(mean_dist_same_label, index=X.index, name='Mean distance to instances with same labels')
mean_dist_diff_label = pd.Series(mean_dist_diff_label, index=X.index, name='Mean distance to instances with different labels')

df2 = pd.concat((labels, nbrs_discr, nbrs_labels, nbrs_prop_opposite_labels, mean_dist_same_label, mean_dist_diff_label), axis=1)

# One distribution plot per neighbourhood statistic, split by discrepancy flag.
# `sns.displot` is figure-level and creates its own figure, so no `plt.figure()`
# call is needed between plots (it would only produce empty figures).
for column in ("Proportion of discrepancies among neighbours",
               "Proportion of neighbours with different labels",
               "Mean distance to instances with same labels",
               "Mean distance to instances with different labels"):
    sns.displot(data=df2, hue="Discrepancies", x=column, palette="vlag", kde=True, stat="probability", common_norm=False)

In [None]:
from matplotlib.ticker import PercentFormatter

# Worst-model prediction error against the proportion of discrepancies, one
# point per dataset. Both columns belong to `df_dataset_properties`; `X` holds
# the features of a single dataset and does not contain them.
ax = sns.scatterplot(data=df_dataset_properties, x="Proportion of discrepancies", y="Prediction error of the worst model")
ax.set_ylim([0,1])
ax.set_xlim([0,1])
# Dashed diagonal: points where both quantities are equal
ax.plot([0,1],[0,1], 'r--')
# Values are fractions in [0, 1]; display them as percentages
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1))
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
