In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xgboost 
import shap
from interpret import show


import interpret.glassbox

# Root directory that holds one sub-directory of pickled GA results per dataset.
# NOTE(review): os.listdir() does not expand the '*' wildcard, so this looks like a
# placeholder that has to be replaced by the real parent path -- confirm.
file_loc = '*/GA_result/'

# Sub-directories of `file_loc`, one per dataset/configuration (each ends with '/').
file_dir = ['BPIC2011_1_100/','BPIC2011_2_100/','BPIC2011_3_100/','BPIC2011_4_100/', 'BPIC2012_O_ACCEPTED_100/','BPIC2012_O_CANCELLED_100/','BPIC2012_O_DECLINED_100/',
            'BPIC2015_1_100/','BPIC2015_2_100/','BPIC2015_3_100/','BPIC2015_4_100/','BPIC2015_5_100/','BPIC2017_O_Accepted_20/','BPIC2017_O_Accepted_30/','BPIC2017_O_Accepted_40/']

# Legend labels, positionally aligned with `file_dir`.
file_tag = ['BPIC2011_1','BPIC2011_2','BPIC2011_3','BPIC2011_4','BPIC2012_1','BPIC2012_2','BPIC2012_3',
            'BPIC2015_1','BPIC2015_2','BPIC2015_3','BPIC2015_4','BPIC2015_5','BPIC2017_20','BPIC2017_30','BPIC2017_40']

# Tick labels for the mean-|SHAP| bar chart. They are applied positionally to the
# per-feature SHAP vectors, so their order MUST follow the feature columns selected
# in shap_val():
#   bucketing, drop_act, encoding_index, models_Decision Tree, models_LightGBM,
#   models_Random Forest, models_Xgboost
# (the previous order put e.g. the 'Decision Tree' label under the drop_act bars).
indep_var = ['Bucketing', 'Drop act', 'Encoding',
             'Decision Tree', 'LightGBM',
             'Random Forest', 'Xgboost']

# One colour per dataset directory from seaborn's colour-blind-safe palette.
colors_bin = sns.color_palette("colorblind", len(file_dir))


In [None]:
def shap_val(df_c):
    """Fit an EBM on the configuration columns of ``df_c`` and explain it with SHAP.

    Parameters
    ----------
    df_c : pandas.DataFrame
        Frame that provides the seven feature columns listed below and a ``fit``
        column used as the regression target.

    Returns
    -------
    The object returned by calling ``shap.Explainer(model.predict, X)`` on the
    same feature frame; its columns follow the order of ``feature_cols``.
    """
    feature_cols = [
        'bucketing', 'drop_act', 'encoding_index',
        'models_Decision Tree', 'models_LightGBM',
        'models_Random Forest', 'models_Xgboost',
    ]
    features = df_c.loc[:, feature_cols]
    # Attribute access: this is the column named 'fit', not a method.
    target = df_c.fit

    # interactions=0 -> additive model without pairwise terms; the seed is fixed.
    ebm = interpret.glassbox.ExplainableBoostingRegressor(interactions=0, random_state=42)
    ebm.fit(features, target)

    # The feature frame is handed to the explainer both at construction and at call time.
    explain = shap.Explainer(ebm.predict, features)
    return explain(features)

In [None]:
def df_concat_maker(fd_):
    """Load all pickled result frames of one dataset directory and one-hot encode them.

    Parameters
    ----------
    fd_ : str
        Sub-directory of the module-level ``file_loc`` to read (e.g. an entry of
        ``file_dir``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every pickled object found in the directory
        (presumably DataFrames -- they are passed straight to ``pd.concat``), with
        the ``encoding`` and ``models`` columns replaced by ``encoding_*`` and
        ``models_*`` indicator columns.

    Raises
    ------
    ValueError
        If the directory contains no files.
    """
    # Use the argument, not whatever global `fd` a calling loop happens to define.
    dir_path = os.path.join(file_loc, str(fd_))

    # os.listdir() returns entries in arbitrary order; sort so the row order of the
    # result (and everything fitted on it) is reproducible across runs/machines.
    file_list = sorted(os.listdir(dir_path))
    if not file_list:
        raise ValueError('no result files found in ' + repr(dir_path))

    df_list = []
    for i in file_list:
        with open(file=os.path.join(dir_path, str(i)), mode='rb') as f:
            # SECURITY: pickle.load can execute arbitrary code -- only point this at
            # result files you produced yourself.
            df_list.append(pickle.load(f))

    # Concatenate once instead of growing a frame inside the loop (quadratic copying).
    df_ = pd.concat(df_list, sort=False)

    df_ = pd.get_dummies(data=df_, columns=['encoding'], prefix='encoding')
    df_ = pd.get_dummies(data=df_, columns=['models'], prefix='models')
    return df_

In [None]:
# Build one row per dataset directory holding the mean absolute SHAP value of every
# feature (columns follow the feature order used inside shap_val()).
mean_abs_rows = []
for fd in file_dir:
    df_concat = df_concat_maker(fd)
    shap_matrix = shap_val(df_concat).values
    # Average |SHAP| over the rows (axis 0) -> one number per feature.
    mean_abs_rows.append(np.abs(shap_matrix).mean(axis=0).tolist())

shap_bin = np.array(mean_abs_rows)

In [None]:

# Grouped bar chart: for every feature, one bar per dataset showing its mean |SHAP|.
fig, ax = plt.subplots(figsize=(32,9))
bar_width = 0.05
index = np.arange(len(indep_var))

# Dataset families as (first position in file_dir, extra x gap, hatch pattern).
# Colours restart at colors_bin[0] inside every family; the hatch tells families apart.
bar_families = [(0, 0.0, ''), (4, 0.02, '..'), (7, 0.04, '\\\\'), (12, 0.06, '//')]

for i in range(len(file_dir)):
    # Last family whose first position is not beyond i.
    first, x_gap, hatch_style = next(fam for fam in reversed(bar_families) if fam[0] <= i)
    plt.bar(index + bar_width*i + x_gap, shap_bin[i], bar_width, alpha=1,
            color=colors_bin[i - first], label=file_tag[i], hatch=hatch_style)

# Put one tick per feature roughly in the middle of its bar group.
tick_positions = np.arange(bar_width+0.35, 7+ bar_width, 1)
plt.xticks(tick_positions, indep_var, fontsize="22")

plt.xlabel('Features', size = 25)
plt.ylabel('Mean Absolute Shapley value', size = 25)
plt.legend(fontsize="18")
plt.show()

In [None]:
# Dependence plot: SHAP value of `bucketing` against the bucketing value itself,
# one scatter series per dataset.
plt.figure(figsize=(12, 7))
marker_bin = ['o','^','x','s','D']
# First position in file_dir of each dataset family; a family shares one colour and
# its members are told apart by marker.
family_starts = [0, 4, 7, 12]

for idx,fd in enumerate(file_dir):
    df_concat = df_concat_maker(fd)
    shap_value_ = shap_val(df_concat)

    # Slice the explanation once; .data are the feature values, .values the SHAP values.
    bucketing_part = shap_value_[:,'bucketing']
    family = max(k for k, start in enumerate(family_starts) if start <= idx)
    plt.scatter(bucketing_part.data, bucketing_part.values, s=18,
                color=colors_bin[family], label=file_tag[idx],
                marker=marker_bin[idx - family_starts[family]])

plt.legend(fontsize="11")
plt.xlabel('Bucketing', size = 20)
plt.ylabel('SHAP value for bucketing', size = 20)
plt.xlim([-2, 43])     
plt.ylim([-0.8, 0.3])
plt.show()

In [None]:
# For every dataset, print the mean SHAP value of each model indicator column,
# restricted to the rows where that indicator equals 1.
model_columns = ('models_Decision Tree', 'models_Random Forest',
                 'models_LightGBM', 'models_Xgboost')

for fd in file_dir:
    df_concat = df_concat_maker(fd)
    shap_value__ = shap_val(df_concat)

    print(fd)
    for col in model_columns:
        col_part = shap_value__[:, col]
        active_rows = np.where(col_part.data == 1)
        mean_active = round(np.mean(col_part.values[active_rows]), 5)
        # The last column carries an extra '\n' argument to separate datasets.
        trailer = ('\n',) if col == model_columns[-1] else ()
        print(col + ' : ', mean_active, *trailer)

    

In [None]:
# For every dataset, print the mean SHAP value of `encoding_index` separately for
# rows where the indicator is 1 (printed as "index") and 0 (printed as "aggregate").
encoding_cases = (('encoding index : ', 1, ()),
                  ('encoding aggregate : ', 0, ('\n',)))

for fd in file_dir:
    df_concat = df_concat_maker(fd)
    shap_value___ = shap_val(df_concat)

    print(fd)
    for caption, flag, trailer in encoding_cases:
        enc_part = shap_value___[:, 'encoding_index']
        picked = np.where(enc_part.data == flag)
        print(caption, round(np.mean(enc_part.values[picked]), 5), *trailer)

    

In [None]:
# Fit an EBM with pairwise interaction terms per dataset and render its global
# explanation with interpret's `show`.
ebm_feature_cols = ['bucketing', 'drop_act', 'encoding_index',
                    'models_Decision Tree', 'models_LightGBM',
                    'models_Random Forest', 'models_Xgboost']

for fd in file_dir:
    df_concat = df_concat_maker(fd)

    X = df_concat.loc[:, ebm_feature_cols]
    # Attribute access: this is the column named 'fit', used as regression target.
    Y = df_concat.fit

    # NOTE(review): 7 features allow at most 21 distinct pairs, fewer than the 45
    # interactions requested here -- confirm how the library treats the surplus.
    model = interpret.glassbox.ExplainableBoostingRegressor(interactions=45, random_state=42)
    model.fit(X, Y)

    print(fd)
    global_explanation = model.explain_global()
    show(global_explanation)
