In [1]:
## Importing libraries

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import math
import numpy as np
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from scipy.spatial.distance import pdist
from bokeh.models import HoverTool
from bokeh.io import output_notebook
from bokeh.models.glyphs import Text
from bokeh.layouts import row
from bokeh.io import export_png
output_notebook()

from sklearn.neighbors import NearestNeighbors

## Importing local Python files

sys.path.append('../') 
from src import Preprocessing, Distances, Explanation, DR_algorithms, genericMethods
from src.GAPS_tabular import GapsExplainer
from src.GAPS_Explanation import get_local_explanations_for_GAPS, compute_local_divergences

## Defining input paths (Update the Data_path attribute accordingly)

Data_path = "../data/"

### Step 1: Load and Pre-process Dataset

#### a. Load Dataset

In [2]:
df = pd.read_csv(Data_path+'BCancer.csv')
df = df.fillna(0)
#df = df.sample(100)
df.reset_index(inplace= True, drop= True)
df.head()

Unnamed: 0,radius_mean,texture_mean,perim_mean,area_mean,smooth_mean,compact_mean,concave_mean,concv_p_mean,symetry_mean,frac_dim_mean,...,texture_worst,perim_worst,area_worst,smooth_worst,compact_worst,concave_worst,concv_p_wrst,symetry_worst,frac_dim_worst,diagnosis
0,18.63,25.11,124.8,1088.0,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,...,34.01,160.5,1670.0,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782,1
1,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,...,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402,1
2,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,...,32.09,136.1,1344.0,0.1634,0.3559,0.5588,0.1847,0.353,0.08482,1
3,19.27,26.47,127.9,1162.0,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,...,30.9,161.4,1813.0,0.1509,0.659,0.6091,0.1785,0.3672,0.1123,1
4,16.13,17.88,107.0,807.2,0.104,0.1559,0.1354,0.07752,0.1998,0.06515,...,27.26,132.7,1261.0,0.1446,0.5804,0.5274,0.1864,0.427,0.1233,1


#### b. Identify Intrinsic Dimensionality

In [8]:
## Identify intrinsic dimensionality using: the Maximum likelihood intrinsic dimensionality estimator

int_dim = Preprocessing.repeated(Preprocessing.intrinsic_dim_scale_interval, 
                             df.values, 
                             mode='bootstrap', 
                             nb_iter=500, # nb_iter for bootstrapping
                             verbose=1, 
                             k1=10, k2=20)
print ("Intrinsic dimensionality:", int_dim)

100%|██████████| 500/500 [00:29<00:00, 17.63it/s]

Intrinsic dimensionality: 2





#### c. Set Features and Target

In [3]:
model_features, target = Preprocessing.set_features_and_target(df)

y = (df[target].values.reshape(-1, ))
X_df = pd.DataFrame(df, columns=model_features)

Features:  ['radius_mean', 'texture_mean', 'perim_mean', 'area_mean', 'smooth_mean', 'compact_mean', 'concave_mean', 'concv_p_mean', 'symetry_mean', 'frac_dim_mean', 'radius_se', 'texture_se', 'perim_se', 'area_se', 'smooth_se', 'compact_se', 'concavity_se', 'concave_p_se', 'symmetry_se', 'frac_dim_se', 'radius_worst', 'texture_worst', 'perim_worst', 'area_worst', 'smooth_worst', 'compact_worst', 'concave_worst', 'concv_p_wrst', 'symetry_worst', 'frac_dim_worst']
Target:  ['diagnosis']


#### d. Identify Categorical Features

In [4]:
X_transformed, categorical_features, numeric_features, categorical_names = Preprocessing.identify_and_transform_features(df, model_features)

### Step 2: Run Dimensionality Reduction

In [5]:
ld_embedding = DR_algorithms.run_DR_Algorithm("tSNE", X_transformed)

In [6]:
indexes = [i for i in range(0,len(y))]
colormap = {0: 'SteelBlue', 1: 'SandyBrown'}
colors = [colormap[z] for z in y.ravel()]
labels = {0: 'Defaulter', 1: 'Non-defaulter'}
annotations = [labels[z] for z in y.ravel()]

source = ColumnDataSource(
        data=dict(
            x=ld_embedding[:,0],
            y=ld_embedding[:,1],
            all_colors = colors,
            label = annotations,
            indexes = indexes
        )
    )

hover = HoverTool(
        tooltips=[
            ("index", "$index"),
            ("(x,y)", "($x, $y)"),
        ]
    )

p = figure(plot_width=700, plot_height=450, tools=[hover],
           title="Embedding")
glyph = Text(x="x", y="y", text='indexes', x_offset=7, y_offset=7, text_font_size="9pt", text_color="grey")
p.circle('x', 'y', fill_color='all_colors', line_color='white', legend='label', size=10, source=source)
p.add_glyph(source, glyph)
show(p)

### Step 3: Obtain Representative Data-subset and Select a Single Subset

In [7]:
rep_subset = genericMethods.generate_representative_subset(20, X_df)

In [8]:
Rep_order = ["Density and dissimilarity based subset", "Cluster based subset"]

for index in range(0,len(rep_subset)):
    print(Rep_order[index]+" : "+''.join(str(rep_subset[index])))

Density and dissimilarity based subset : [85, 51, 81, 50, 43, 48, 19, 47, 36, 45, 93, 40, 37, 35, 83, 34, 29, 8, 71, 52]
Cluster based subset : [44, 18, 49, 58, 85, 51, 54, 28, 39, 72, 12, 0, 57, 55, 15, 91, 47, 97, 48, 3]


In [9]:
## Calculate coverage of density and dissimilarity based subset
    
genericMethods.compute_coverage(X_df, rep_subset[0])

Coverage of density and dissimilarity based subset is: 86.87%


In [10]:
## user selects a subset

data_instance_numbers = rep_subset[0]

### Step 4: Initiate and Execute GAPS Explainer

#### a. Initiate Explainer

In [11]:
explainer = GapsExplainer(X_df.values,
                    feature_names=model_features,
                    class_names=target,
                    categorical_features=categorical_features,
                    categorical_names=categorical_names,
                    discretize_continuous=False,
                    discretizer='quartile',
                    random_state=42)

In [12]:
neighbors, neighbors_embd, oversampled_data, oversampled_data_embd, local_feature_contributions, local_feature_contributions_embd, neighbors_local, neighbors_embd_local = explainer.generate_perturbed_neighborhood_global(
                                                                                X_df.values,
                                                                                X_transformed,
                                                                                ld_embedding,
                                                                                data_instance_numbers,
                                                                                model_features,
                                                                                categorical_features,
                                                                                numeric_features,
                                                                                nbrs=5,
                                                                                num_features=5)

[85 49 79 44 18]
[51 17 75  9  6]
[81 83 58 76 69]
[50 36 76 31 73]
[43 93  9 18 49]
[48 78 92 12  2]
[19 10 63 60 49]
[47 15 59 87 78]
[36 50 67 31 73]
[45 61 40 64 97]
[93 43 49 18 63]
[40 97 45 24 70]
[37 39 22 25 20]
[35 34 27  2 13]
[83 81 80 41 76]
[34 35 13 17  6]
[29 16 84 68 90]
[ 8 62 70 60 63]
[71 29 36 16 30]
[52 42 88 32  0]
......................................
[85 79 18 90 84]
[51 17 38 75 13]
[81 83 86 41 76]
[50 36 73 31 67]
[43 93 14  9 58]
[48 92 78 12  0]
[19 10 60 24 96]
[47 15 64  2 42]
[36 67 50 31 80]
[45 56 61 40 97]
[93 43 14 94 85]
[40 45 97 89 61]
[37 22 39 25 20]
[35 27  4 34 13]
[83 81 86 41 80]
[34 13 35 17 27]
[29 16 84 71 68]
[ 8 60 62 10 19]
[71 74 67 30 29]
[52 53 32 88 42]
Neighbors for data-point 85 85
(1, 30)
(1, 30)
Neighbors for data-point 49 79
(2, 30)
(1, 30)
Neighbors for data-point 79 18
(4, 30)
(1, 30)
Neighbors for data-point 44 90
(6, 30)
(1, 30)
Neighbors for data-point 18 84
(8, 30)
(1, 30)
Neighbors for data-point 51 51
(1, 30)
(1, 30)

#### b. Explain the Point in Original Data and in Embedding

In [13]:
sorted_index_combinations, sorted_index_combinations_embd = get_local_explanations_for_GAPS(X_df, ld_embedding, data_instance_numbers, local_feature_contributions)
local_divergences = compute_local_divergences(neighbors_local, neighbors_embd_local, local_feature_contributions, local_feature_contributions_embd)

In [14]:
corr_feat_dist, feature_dict, sorted_index_combinations = Explanation.explain_point_global(oversampled_data, 
                                                                               model_features, 
                                                                               categorical_features, 
                                                                               numeric_features)

In [15]:
corr_feat_dist_embd, feature_dict_embd, sorted_index_combinations_embd = Explanation.explain_point_global(oversampled_data_embd, 
                                                                                              model_features, 
                                                                                              categorical_features, 
                                                                                              numeric_features)

In [16]:
from sklearn.manifold import Isomap
int_dim = 2
unsquareform = lambda a: a[np.nonzero(np.triu(a, 1))]

temp_data = np.zeros((len(data_instance_numbers),local_feature_contributions.shape[1]))
temp_data_embd = np.zeros((len(data_instance_numbers),int_dim))

for index in range(0,len(data_instance_numbers)):
    temp_data[index] = X_df.values[index]
for index in range(0,int_dim):
    temp_data_embd[index] = ld_embedding[index]

iso_embd = Isomap(n_components=2, n_neighbors =1)

X_trans = iso_embd.fit_transform(temp_data)        
distance_matrix = iso_embd.dist_matrix_
iso_embd.dist_matrix_[iso_embd.dist_matrix_ == 0] = -9999
distances = unsquareform(iso_embd.dist_matrix_)

embd_trans = iso_embd.fit_transform(temp_data_embd)        
distance_matrix_embd = iso_embd.dist_matrix_
iso_embd.dist_matrix_[iso_embd.dist_matrix_ == 0] = -9999
distances_embd = unsquareform(iso_embd.dist_matrix_)

index_combinations=[]
for row in range(0,len(distance_matrix)):
    for col in range(row+1,len(distance_matrix[0])):
        index_combinations.append([row,col])

sorted_indexes = np.argsort(distances)
sorted_index_combinations = []

sorted_indexes_embd = np.argsort(distances_embd)
sorted_index_combinations_embd = []

for index in sorted_indexes_embd:
    sorted_index_combinations_embd.append(index_combinations[index])

for index in sorted_indexes:
    sorted_index_combinations.append(index_combinations[index])

#print(distances)
#print(distances_embd)
print("Original Order of Geodesic Distances\n")
print(sorted_index_combinations)
print("#################################################################################################\n")
print("After Transformation Order of Geodesic Distances\n")
print(sorted_index_combinations_embd)

Original Order of Geodesic Distances

[[0, 1], [6, 16], [6, 18], [7, 8], [7, 9], [7, 10], [7, 12], [7, 13], [6, 15], [7, 14], [7, 17], [7, 19], [8, 11], [8, 12], [8, 13], [8, 15], [8, 16], [7, 15], [8, 18], [6, 13], [6, 11], [4, 18], [4, 19], [5, 6], [5, 7], [5, 8], [5, 9], [5, 10], [6, 12], [5, 11], [5, 13], [5, 14], [5, 16], [5, 17], [5, 18], [5, 19], [6, 7], [5, 12], [9, 11], [9, 12], [9, 13], [13, 14], [13, 15], [13, 16], [13, 17], [13, 18], [13, 19], [14, 15], [12, 19], [14, 16], [15, 16], [15, 17], [15, 18], [15, 19], [16, 17], [16, 19], [17, 18], [14, 18], [12, 18], [12, 17], [12, 16], [9, 15], [9, 16], [9, 18], [10, 11], [10, 12], [10, 13], [10, 15], [10, 16], [10, 18], [11, 12], [11, 13], [11, 14], [11, 15], [11, 17], [11, 19], [12, 13], [12, 14], [4, 17], [4, 16], [5, 15], [4, 14], [1, 9], [1, 10], [1, 11], [1, 12], [1, 14], [1, 15], [1, 16], [1, 17], [1, 18], [1, 19], [2, 3], [2, 6], [4, 15], [2, 7], [2, 8], [2, 9], [2, 10], [1, 8], [1, 7], [1, 6], [1, 5], [0, 2], [0, 4], [0

#### c. Plot Feature Contribution in Original Data and in Embedding

In [17]:
bar_color = []
y_offset_val = []
label = []
for item in feature_dict.values():
    label.append("{0:.2f}".format(item))
    if item<0:
        bar_color.append('#e34a33')
        y_offset_val.append(item+15)
    else:
        bar_color.append('#2ca25f')
        y_offset_val.append(item)
        
        
source = ColumnDataSource(
        data=dict(
            x_val = list(feature_dict.keys()),
            y_val = list(feature_dict.values()),
            y_offset = y_offset_val,
            labels =  label,
            color = bar_color
        )
    )

p = figure(x_range=list(feature_dict.keys()), plot_height=420, plot_width=980, title="Feature Influences", x_axis_label = "Features",
        y_axis_label = "Contribution", toolbar_location=None, tools="")

glyph = Text(x="x_val", y="y_val", text='labels', x_offset=-10, y_offset='y_offset', text_font_size="9pt", text_color="black")


p.vbar(x="x_val", top="y_val", color="color", width=0.9, source=source)
p.xaxis.major_label_orientation = math.pi/3
p.xgrid.grid_line_color = None
p.y_range.start = np.min(list(feature_dict.values()))-0.0001
p.add_glyph(source, glyph)

#export_png(p, filename="Feature-Influences-Original.png")
show(p)

In [18]:
bar_color = []
y_offset_val = []
label = []
for item in feature_dict_embd.values():
    label.append("{0:.2f}".format(item))
    if item<0:
        bar_color.append('#e34a33')
        y_offset_val.append(item+15)
    else:
        bar_color.append('#2ca25f')
        y_offset_val.append(item)
        
        
source = ColumnDataSource(
        data=dict(
            x_val = list(feature_dict_embd.keys()),
            y_val = list(feature_dict_embd.values()),
            y_offset = y_offset_val,
            labels =  label,
            color = bar_color
        )
    )

p = figure(x_range=list(feature_dict_embd.keys()), plot_height=420, plot_width=980, title="Feature Influences", x_axis_label = "Features",
        y_axis_label = "Contribution", toolbar_location=None, tools="")

glyph = Text(x="x_val", y="y_val", text='labels', x_offset=-10, y_offset='y_offset', text_font_size="9pt", text_color="black")


p.vbar(x="x_val", top="y_val", color="color", width=0.9, source=source)
p.xaxis.major_label_orientation = math.pi/3
p.xgrid.grid_line_color = None
p.y_range.start = np.min(list(feature_dict_embd.values()))-0.0001
p.add_glyph(source, glyph)
#export_png(p, filename="Feature-Influences-tSNE.png")
show(p)

#### d. Calculate Local Divergence

In [19]:
components, overall_divergence = Explanation.compute_global_divergence(corr_feat_dist, corr_feat_dist_embd, neighbors, neighbors_embd, local_divergences)

In [20]:
components = components.split(",")
print("Discrepancy in Feature Influence", components[0])
print("Discrepancy in Neighborhood Content", components[1])
print("Discrepancy in Neighborhood Order", components[2])
    

Discrepancy in Feature Influence 0.4870174473818492
Discrepancy in Neighborhood Content 0.41
Discrepancy in Neighborhood Order 0.61


In [21]:
print(overall_divergence)

0.09681515833457467
