In [56]:
import numpy as np
import scipy.stats as ss
import pickle 
import shap
from scipy.stats import mannwhitneyu

In [57]:
def pickle_load(filename):
    '''Open `filename` in binary mode and return the object unpickled from it.'''
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(filename, mode='rb') as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

## Load data

In [58]:
# Switch: True  -> use the real Shapley values of the whole model,
#         False -> use the Shapley values of a cluster instead.
real_shapley_values_for_whole_model = False

# Pickle files with the Shapley values grouped per cluster and per feature.
filename_per_cluster = './shap_values_per_cluster.txt'
filename_per_feature = './shap_values_per_feature.txt'

# Reference ("full model") Shapley values - adjust for the correct model.
if not real_shapley_values_for_whole_model:
    # Fake the Shapley values of the full model with the values of cluster_0.
    filename_full_model = './shap_values_per_cluster.txt'
    shaps_full_model = pickle_load(filename=filename_full_model)['cluster_0']
else:
    filename_full_model = './shap_values_full_model.txt'
    shaps_full_model = pickle_load(filename=filename_full_model)

shaps_per_cluster = pickle_load(filename=filename_per_cluster)
shaps_per_feature = pickle_load(filename=filename_per_feature)

In [59]:
# Layout of shaps_per_feature: [feature][cluster] -> Shapley values.
# Show the first ten values stored for feature 'Age' under cluster key 0.
shaps_per_feature['Age'][0][:10]

[0.7952834261698375,
 -0.2200954610953728,
 -0.22040439656518754,
 -0.22023397829206226,
 -0.21965845268609993,
 -0.7226681096908588,
 -0.7251288564297415,
 1.3029350604478802,
 -0.22673789665883476,
 -0.2194870985753481]

In [60]:
# Layout of shaps_per_cluster: [cluster][feature] -> Shapley values.
# Show the first ten values stored for 'cluster_0' and feature 'Age'.
shaps_per_cluster['cluster_0']['Age'][:10]

[-0.23653411725163442,
 0.007121666402334254,
 0.010174802504834646,
 0.016543612745546152,
 0.017361911268461313,
 0.12104088200415887,
 0.09400855419110685,
 -0.10352762969902579,
 0.014475127621775583,
 0.009159684503362237]

## Transform data

In [None]:
# Transform the full-model Explainer into a nested dict with the format shown below (this cell contains no transformation code yet):
'''
cluster_0 : feat1 : [shap1, shap2, ..., shapn]
            feat2 : [shap1, shap2, ..., shapn]
            ...
            featn : [shap1, shap2, ..., shapn]
cluster_1 : feat1 : [shap1, shap2, ..., shapn]
            feat2 : [shap1, shap2, ..., shapn]
            ...
            featn : [shap1, shap2, ..., shapn]
'''

## Find p-values and significance

In [61]:
def find_pvalues_per_feature(shaps_full_model, dict_of_features_with_shaps, decimals=4):
    '''
    Run a two-sided Mann-Whitney U test for every feature against the full model.

    For each feature in `dict_of_features_with_shaps`, its Shapley values are
    compared with the values stored under the same feature name in
    `shaps_full_model`.

    Parameters:
        shaps_full_model: mapping feature name -> Shapley values of the
            reference (full) model. It is indexed with every feature name of
            `dict_of_features_with_shaps`; a missing name raises KeyError.
        dict_of_features_with_shaps: mapping feature name -> Shapley values
            that are compared against the reference.
        decimals: number of decimals the p-values are rounded to. The default
            (4) keeps the original behaviour. Pass None to get the unrounded
            p-values, so that very small p-values are not reported as 0.0 and
            values slightly above a significance level are not rounded onto it.

    Returns: dict with p-values per feature
    '''
    dict_of_pvalues_per_feature = {}

    # loop over available features
    for feature_name, shapley_values in dict_of_features_with_shaps.items():

        # calc M-W two sided (same or different distribution)
        _, p_value = mannwhitneyu(shaps_full_model[feature_name], shapley_values, alternative='two-sided')

        # rounding is optional: it is convenient for display but loses precision
        if decimals is not None:
            p_value = round(p_value, decimals)

        dict_of_pvalues_per_feature[feature_name] = p_value

    return dict_of_pvalues_per_feature

In [62]:
# Build {cluster name -> {feature name -> p-value}} by testing every cluster
# against the full-model Shapley values.
p_values_per_cluster_per_feature = {}

for cluster_name, shaps_of_cluster in shaps_per_cluster.items():
    p_values_per_cluster_per_feature[cluster_name] = find_pvalues_per_feature(
        shaps_full_model=shaps_full_model,
        dict_of_features_with_shaps=shaps_of_cluster,
    )

p_values_per_cluster_per_feature

{'cluster_0': {'Pclass': 1.0,
  'Sex': 1.0,
  'Age': 1.0,
  'Fare': 1.0,
  'Embarked': 1.0,
  'Alone': 1.0,
  'Title': 1.0},
 'cluster_1': {'Pclass': 0.4361,
  'Sex': 0.0,
  'Age': 0.2312,
  'Fare': 0.0008,
  'Embarked': 0.1587,
  'Alone': 0.4049,
  'Title': 0.3152},
 'cluster_2': {'Pclass': 0.8493,
  'Sex': 0.0717,
  'Age': 0.293,
  'Fare': 0.3592,
  'Embarked': 0.8641,
  'Alone': 0.7268,
  'Title': 0.5152}}

### Store results

In [63]:
# Persist the p-values as a pickle; mode 'wb' overwrites any existing file.
with open('p_values_per_cluster_per_feature.pkl', mode='wb') as pickle_file:
    pickle.dump(p_values_per_cluster_per_feature, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Find the significant differences

In [64]:
#what is significantly different?

def filter_significant_differences(p_values_per_cluster_per_feature, signicance_level):
    '''
    List, for every cluster, the features whose p-value is at or below `signicance_level`.

    Returns: dict mapping each cluster name to a list of feature names
             (an empty list when no feature of that cluster qualifies).
    '''
    return {
        cluster: [
            feature
            for feature, p_value in pvalues_of_cluster.items()
            if p_value <= signicance_level
        ]
        for cluster, pvalues_of_cluster in p_values_per_cluster_per_feature.items()
    }

# Per cluster, list the features whose p-value is <= 0.05.
filter_significant_differences(p_values_per_cluster_per_feature, 0.05)

{'cluster_0': [], 'cluster_1': ['Sex', 'Fare'], 'cluster_2': []}