In [1]:
import numpy as np
import scipy.stats as ss
import pickle 
import shap
from scipy.stats import mannwhitneyu

In [2]:
def pickle_load(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

## Load data

In [11]:
# Toggle for the reference distribution:
#   True  -> use the real Shapley values of the whole model
#   False -> use the Shapley values of a cluster as a stand-in
real_shapley_values_for_whole_model = True


# adjust for correct model: pick the file that backs the reference values
filename_full_model = (
    './shap_values_per_feature_dict.txt'
    if real_shapley_values_for_whole_model
    else './shap_values_per_cluster.txt'
)
shaps_full_model = pickle_load(filename=filename_full_model)
if not real_shapley_values_for_whole_model:
    # fake the Shapley values of the full model with the values of cluster_0
    shaps_full_model = shaps_full_model['cluster_0']


# Per-cluster and per-feature Shapley values that get compared to the reference.
filename_per_cluster = './shap_values_per_cluster.txt'
shaps_per_cluster = pickle_load(filename=filename_per_cluster)

filename_per_feature = './shap_values_per_feature.txt'
shaps_per_feature = pickle_load(filename=filename_per_feature)

In [12]:
# Display the reference ("full model") Shapley values selected in the load cell.
shaps_full_model

{'Pclass': array([ 0.07185714,  0.1767619 ,  0.23540476,  0.00066667,  0.07497619,
         0.1397619 ,  0.1397619 , -0.08369048, -0.08338095, -0.26066667,
         0.30221429, -0.12207143, -0.04278571,  0.36564286,  0.06690476,
         0.00066667,  0.06669048, -0.07364286,  0.31845238,  0.31845238,
        -0.079     ,  0.00066667, -0.07364286, -0.26247619,  0.27897619,
         0.37814286, -0.07364286,  0.259     ,  0.13216667, -0.26066667,
         0.20754762, -0.07364286,  0.20147619, -0.14116667, -0.07364286,
         0.21154762, -0.19378571,  0.06669048, -0.04847619,  0.26311905,
        -0.07364286, -0.14116667, -0.26066667,  0.24280952, -0.07364286,
        -0.0675    ,  0.03019048,  0.32866667, -0.07364286, -0.08138095,
        -0.14830952,  0.14880952,  0.06147619, -0.11980952, -0.26066667,
         0.03959524, -0.14116667,  0.13216667,  0.18442857,  0.39747619,
         0.31583333,  0.14395238, -0.226     , -0.11980952, -0.28516667,
        -0.12119048,  0.22411905,  0.303 

In [13]:
# Nesting is shaps_per_feature[feature][cluster] -> Shapley values;
# preview the first 10 'Age' values stored under cluster index 0.
shaps_per_feature['Age'][0][:10]

[0.7952834261698375,
 -0.2200954610953728,
 -0.22040439656518754,
 -0.22023397829206226,
 -0.21965845268609993,
 -0.7226681096908588,
 -0.7251288564297415,
 1.3029350604478802,
 -0.22673789665883476,
 -0.2194870985753481]

In [14]:
# Nesting is shaps_per_cluster[cluster][feature] -> Shapley values;
# preview the first 10 'Age' values stored under 'cluster_0'.
shaps_per_cluster['cluster_0']['Age'][:10]

[-0.23653411725163442,
 0.007121666402334254,
 0.010174802504834646,
 0.016543612745546152,
 0.017361911268461313,
 0.12104088200415887,
 0.09400855419110685,
 -0.10352762969902579,
 0.014475127621775583,
 0.009159684503362237]

## Transform data

In [15]:
# Target layout when transforming the full-model Explainer into a dict.
# This cell only documents the format (the string below is echoed as the
# cell output); it contains no transformation code.
'''
cluster_0 : feat1 : [shap1, shap2, ..., shapn]
            feat2 : [shap1, shap2, ..., shapn]
            ...
            featn : [shap1, shap2, ..., shapn]
cluster_1 : feat1 : [shap1, shap2, ..., shapn]
            feat2 : [shap1, shap2, ..., shapn]
            ...
            featn : [shap1, shap2, ..., shapn]
'''

'\ncluster_0 : feat1 : [shap1, shap2, ..., shapn]\n            feat2 : [shap1, shap2, ..., shapn]\n            ...\n            featn : [shap1, shap2, ..., shapn]\ncluster_1 : feat1 : [shap1, shap2, ..., shapn]\n            feat2 : [shap1, shap2, ..., shapn]\n            ...\n            featn : [shap1, shap2, ..., shapn]\n'

## Find p_values and significance

In [16]:
def find_pvalues_per_feature(shaps_full_model, dict_of_features_with_shaps, decimals=4):
    '''
    Run a two-sided Mann-Whitney U test for every feature against the full model.

    Args:
        shaps_full_model: mapping of feature name -> Shapley values used as the
            reference sample. Must contain every feature name that occurs in
            ``dict_of_features_with_shaps``.
        dict_of_features_with_shaps: mapping of feature name -> Shapley values
            that are compared against the reference sample.
        decimals: number of decimals each p-value is rounded to. The default
            (4) reproduces the previous hard-coded behaviour. Pass ``None`` to
            keep the unrounded p-value; rounding can move a value such as
            0.05004 onto a significance threshold or collapse a very small
            p-value to 0.0.

    Returns: dict with one p-value per feature of ``dict_of_features_with_shaps``

    Raises:
        KeyError: if a feature is missing from ``shaps_full_model``.
    '''
    dict_of_pvalues_per_feature = {}

    # loop over available features
    for feature_name, shapley_values in dict_of_features_with_shaps.items():

        # two-sided test: are the two samples drawn from the same distribution or not?
        _, p_value = mannwhitneyu(shaps_full_model[feature_name], shapley_values, alternative='two-sided')

        # rounding is optional so callers can threshold on the exact p-value
        if decimals is not None:
            p_value = round(p_value, decimals)

        dict_of_pvalues_per_feature[feature_name] = p_value

    return dict_of_pvalues_per_feature

In [17]:
# For every cluster, test each feature's Shapley values against the full model.
p_values_per_cluster_per_feature = {
    cluster_name: find_pvalues_per_feature(
        shaps_full_model=shaps_full_model,
        dict_of_features_with_shaps=shaps_of_cluster,
    )
    for cluster_name, shaps_of_cluster in shaps_per_cluster.items()
}

p_values_per_cluster_per_feature

{'cluster_0': {'Pclass': 0.9459,
  'Sex': 0.0109,
  'Age': 0.0031,
  'Fare': 0.0087,
  'Embarked': 0.0558,
  'Alone': 0.3412,
  'Title': 0.635},
 'cluster_1': {'Pclass': 0.5179,
  'Sex': 0.547,
  'Age': 0.7715,
  'Fare': 0.0,
  'Embarked': 0.0782,
  'Alone': 0.9766,
  'Title': 0.362},
 'cluster_2': {'Pclass': 0.918,
  'Sex': 0.0878,
  'Age': 0.6117,
  'Fare': 0.0224,
  'Embarked': 0.0123,
  'Alone': 0.3127,
  'Title': 0.3491}}

### Store results

In [18]:
# Persist the p-values; opening with 'wb' overwrites any existing file.
with open('p_values_per_cluster_per_feature.pkl', mode='wb') as pickle_file:
    pickle.dump(
        p_values_per_cluster_per_feature,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### Find the significant differences

In [19]:
#what is significantly different?

def filter_significant_differences(p_values_per_cluster_per_feature, signicance_level):
    """List, for every cluster, the features whose p-value is significant.

    Args:
        p_values_per_cluster_per_feature: mapping of cluster name ->
            mapping of feature name -> p-value.
        signicance_level: threshold; a feature is kept when its p-value is
            less than or equal to this value.

    Returns:
        dict mapping every cluster name to the list of its significant
        feature names (an empty list when none qualify).
    """
    return {
        cluster: [
            feature
            for feature, p_value in feature_p_values.items()
            if p_value <= signicance_level
        ]
        for cluster, feature_p_values in p_values_per_cluster_per_feature.items()
    }

# Report, per cluster, the features whose p-value is at or below 0.05.
filter_significant_differences(p_values_per_cluster_per_feature, 0.05)

{'cluster_0': ['Sex', 'Age', 'Fare'],
 'cluster_1': ['Fare'],
 'cluster_2': ['Fare', 'Embarked']}