In [None]:
## Importing libraries

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import math
import numpy as np
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook
from bokeh.models.glyphs import Text
from bokeh.layouts import row
from bokeh.io import export_png
output_notebook()

## Importing local Python files

sys.path.append('../') 
from src import Preprocessing, Distances, Explanation, DR_algorithms, genericMethods
from src.LAPS_tabular import LapsExplainer

## Defining input paths (Update the Data_path attribute accordingly)

Data_path = "../data/"

### Step 1: Load and Pre-process Dataset

#### a. Load Dataset

In [2]:
# Load the raw dataset and replace missing values with 0.
df = pd.read_csv(os.path.join(Data_path, 'breast_cancer.csv'))
df = df.fillna(0)

# Work on a 100-row subsample. The fixed seed pins the draw so that the
# positional row indexes used further down (e.g. the hand-picked
# data_instance_number) refer to the same records on every
# Restart & Run All. The index is rebuilt as 0..99 after sampling.
df = df.sample(100, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,17.29,22.13,114.4,947.8,0.08999,0.1273,0.09697,0.07507,0.2108,0.05464,...,27.24,137.9,1295.0,0.1134,0.2867,0.2298,0.1528,0.3067,0.07484,1
1,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,...,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769,0
2,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,...,31.69,117.7,1030.0,0.1389,0.2057,0.2712,0.153,0.2675,0.07873,1
3,15.08,25.74,98.0,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,...,33.22,121.2,1050.0,0.166,0.2356,0.4029,0.1526,0.2654,0.09438,1
4,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,...,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132,1


#### b. Identify Intrinsic Dimensionality

In [6]:
## Identify intrinsic dimensionality using: the Maximum likelihood intrinsic dimensionality estimator

# NOTE(review): df.values hands every column of df to the estimator, which
# includes the column later returned as the target — confirm this is intended.
int_dim = Preprocessing.repeated(
    Preprocessing.intrinsic_dim_scale_interval,
    df.values,
    mode='bootstrap',
    nb_iter=500,  # nb_iter for bootstrapping
    verbose=1,
    k1=10,
    k2=20,
)
print("Intrinsic dimensionality:", int_dim)

100%|██████████| 500/500 [00:12<00:00, 39.37it/s]

Intrinsic dimensionality: 2





#### c. Set Features and Target

In [3]:
# The helper returns the feature column names and the target column name(s).
model_features, target = Preprocessing.set_features_and_target(df)

# Feature frame restricted to the selected columns.
X_df = pd.DataFrame(df, columns=model_features)

# Target values flattened to a 1-D array.
y = df[target].values.reshape(-1)

Features:  ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Target:  ['diagnosis']


#### d. Identify Categorical Features

In [4]:
# Four values are returned, in this order; their exact contents are defined
# by Preprocessing.identify_and_transform_features.
(
    X_transformed,
    categorical_features,
    numeric_features,
    categorical_names,
) = Preprocessing.identify_and_transform_features(df, model_features)

### Step 2: Run Dimensionality Reduction

In [5]:
# Compute the low-dimensional embedding of the transformed data ("tSNE" selects the algorithm).
# NOTE(review): no seed is passed here; whether the wrapper fixes one
# internally is not visible from this notebook — confirm for reproducibility.
ld_embedding = DR_algorithms.run_DR_Algorithm(
    "tSNE",
    X_transformed,
)

In [6]:
# Per-point display attributes derived from the class labels in y.
indexes = list(range(len(y)))
colormap = {0: 'SteelBlue', 1: 'SandyBrown'}
labels = {0: 'Benign', 1: 'Malignant'}
colors = [colormap[cls] for cls in y.ravel()]
annotations = [labels[cls] for cls in y.ravel()]

# The glyphs below refer to these columns by name.
source = ColumnDataSource(data={
    "x": ld_embedding[:, 0],
    "y": ld_embedding[:, 1],
    "all_colors": colors,
    "label": annotations,
    "indexes": indexes,
})

hover = HoverTool(tooltips=[
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
])

# NOTE(review): newer Bokeh releases renamed plot_width/plot_height and the
# `legend=` glyph argument; confirm against the Bokeh version pinned for this project.
p = figure(title="Embedding", plot_width=700, plot_height=450, tools=[hover])

# One circle per embedded point, coloured by class.
p.circle('x', 'y', source=source, size=10,
         fill_color='all_colors', line_color='white', legend='label')

# Row-index annotation drawn next to each point (offsets are in screen units).
glyph = Text(x="x", y="y", text='indexes',
             text_font_size="9pt", text_color="grey",
             x_offset=7, y_offset=7)
p.add_glyph(source, glyph)
show(p)

### Step 3: Obtain Representative Data Points and Select a Single Point

In [7]:
# NOTE(review): the leading 20 presumably caps how many points are returned
# per category — confirm against genericMethods.generate_representative_points.
rep_points = genericMethods.generate_representative_points(
    20,
    X_df,
    X_transformed,
    y,
    ld_embedding,
)

In [8]:
# Display names, positionally matched to the entries of rep_points.
Rep_order = [
    "Density Based Outliers",
    "Points with Highest Density",
    "Points with Misplaced Neighborhoods",
    "Points Close to Decision Boundary",
    "Cluster Centres",
]

# NOTE(review): in the recorded output the first two categories list the same
# points — verify the ordering/contents returned by generate_representative_points.
for index in range(len(rep_points)):
    print(" : ".join((Rep_order[index], str(rep_points[index]))))

Density Based Outliers : [12, 88, 93, 29, 90, 86, 8, 10, 14, 71, 98, 9, 57, 53, 51, 74, 28, 27, 83, 48]
Points with Highest Density : [12, 88, 93, 29, 90, 86, 8, 10, 14, 71, 98, 9, 57, 53, 51, 74, 28, 27, 83, 48]
Points with Misplaced Neighborhoods : [74, 46, 92, 94, 36, 0, 28, 56, 22, 72, 2, 12, 9, 73, 54, 42, 40, 27, 59, 39]
Points Close to Decision Boundary : [20, 72, 39, 87, 69, 23, 95, 98, 73, 54, 26, 16, 59, 25, 29, 6, 10, 27, 41, 99]
Cluster Centres : [37, 56]


In [9]:
## user selects an instance

# Positional row index of the point to explain; used below as
# X_df.values[data_instance_number]. Because df is a random 100-row sample,
# which record sits at this position depends on that draw.
data_instance_number = 12

### Step 4: Initiate and Execute LAPS Explainer

#### a. Initiate Explainer

In [10]:
# Build the explainer on the raw (untransformed) feature matrix.
# NOTE(review): discretizer='quartile' is supplied while
# discretize_continuous=False — presumably ignored in that case; confirm.
explainer = LapsExplainer(
    X_df.values,
    feature_names=model_features,
    class_names=target,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
    discretize_continuous=False,
    discretizer='quartile',
    random_state=42,
)

In [11]:
# Returns neighbourhoods and oversampled data for the selected instance, once
# for the original data and once for the embedding (the *_embd results).
(
    neighbors,
    neighbors_embd,
    oversampled_data,
    oversampled_data_embd,
) = explainer.generate_perturbed_neighborhood(
    X_df.values,
    X_transformed,
    ld_embedding,
    data_instance_number,
    X_df.values[data_instance_number],
    nbrs=20,
    num_features=5,
)

point | point_embd
12     |     12
85     |     50
50     |     43
51     |     85
14     |     94
69     |     15
74     |     2
5     |     66
43     |     22
46     |     89
75     |     51
71     |     35
15     |     46
92     |     70
40     |     5
70     |     38
72     |     14
31     |     13
53     |     79
94     |     53


#### b. Explain the Point in Original Data and in Embedding

In [12]:
# Explain the selected instance using the neighbourhood found in the original data.
(
    corr_feat_dist,
    feature_dict,
    feature_distance_contribution,
    dvs_matrix,
    sorted_indexes,
) = Explanation.explain_point_local(
    X_df.values[data_instance_number],
    neighbors,
    oversampled_data,
    model_features,
    categorical_features,
    numeric_features,
)

In [13]:
# Same explanation call, but fed with the neighbourhood found in the embedding.
(
    corr_feat_dist_embd,
    feature_dict_embd,
    feature_distance_contribution_embd,
    dvs_matrix_embd,
    sorted_indexes_embd,
) = Explanation.explain_point_local(
    X_df.values[data_instance_number],
    neighbors_embd,
    oversampled_data_embd,
    model_features,
    categorical_features,
    numeric_features,
)

In [14]:
# Display the index ordering returned by explain_point_local for the
# original-data neighbourhood (the full array is shown).
sorted_indexes

array([  0,   1,  77,  52,  26,  51,  78, 103, 192, 104, 129, 363, 338,
       130, 155, 233, 208, 182, 207, 303, 415, 390, 312, 337, 299, 260,
       285, 492, 143, 181, 156, 364, 389, 286, 311, 494, 519, 180, 273,
       169, 125,  98, 518, 178,  86,  24, 157, 320, 394, 344, 440, 304,
       442, 467, 113, 199, 167, 434, 321,  79, 291, 202, 119,  33, 264,
       201, 245,  42, 101, 326, 508, 447, 443, 251, 438,  54, 274, 172,
       473, 318, 234, 259,  97, 230, 430,   4, 333, 131, 160, 424, 203,
       148, 117, 224, 431, 347, 126, 228, 381, 396,  32,   6, 372, 302,
       187, 222, 429, 481, 490, 445,  84, 141,  44, 220,  35, 339, 497,
       225,  25, 146, 195,  49, 373, 478, 219,  34, 314, 419,  12,  73,
       240,  45, 504,  91, 198,  80, 362, 297, 193, 226, 258, 322, 379,
       183, 480, 402, 426, 253, 517, 257,  68, 136, 151, 294, 360, 310,
        23, 374,  62, 292, 386, 108,  11, 107, 486, 507, 341, 293, 428,
       307, 475, 205, 210, 470, 301, 403, 327, 249, 513,  53, 12

#### c. Plot Feature Contribution in Original Data and in Embedding

In [15]:
# Per-bar label text, colour (red for negative, green otherwise) and label
# y-offset, all derived from the feature influences in feature_dict.
label = ["{0:.2f}".format(value) for value in feature_dict.values()]
bar_color = ['#e34a33' if value < 0 else '#2ca25f' for value in feature_dict.values()]
y_offset_val = [value + 15 if value < 0 else value for value in feature_dict.values()]

source = ColumnDataSource(data={
    "x_val": list(feature_dict.keys()),
    "y_val": list(feature_dict.values()),
    "y_offset": y_offset_val,
    "labels": label,
    "color": bar_color,
})

# Categorical x-axis: one slot per feature name, in dict order.
p = figure(x_range=list(feature_dict.keys()), plot_height=380, plot_width=980,
           toolbar_location=None, tools="")

# Bars first, value labels on top of them.
p.vbar(x="x_val", top="y_val", color="color", width=0.7, source=source)
glyph = Text(x="x_val", y="y_val", text='labels',
             x_offset=-15, y_offset='y_offset',
             text_font_size="9pt", text_font_style='bold', text_color="black")
p.add_glyph(source, glyph)

# Axis and grid cosmetics.
p.xaxis.major_label_orientation = math.pi/7
p.xaxis.major_label_text_font_size = '9pt'
p.xaxis.major_label_text_font_style = 'bold'
p.xaxis.axis_label_text_align = 'center'
p.yaxis.major_label_text_font_size = '9pt'
p.yaxis.major_label_text_font_style = 'bold'
p.xgrid.grid_line_color = None
p.min_border_left = 82

# Pad the y-range slightly beyond the smallest and largest influence.
p.y_range.start = np.min(list(feature_dict.values()))-0.05
p.y_range.end = np.max(list(feature_dict.values()))+0.01

#export_png(p, filename="Images/Fig4/Feature-Influences-85-Original.png")
show(p)

In [16]:
# Bar chart of the feature influences computed from the embedding
# neighbourhood (feature_dict_embd): one bar per feature, red for negative
# values and green otherwise, with the value printed as a label.
bar_color = []
y_offset_val = []
label = []
for item in feature_dict_embd.values():
    label.append("{0:.2f}".format(item))
    if item<0:
        bar_color.append('#e34a33')
        # Push the label of a negative bar further down (offset is in screen units).
        y_offset_val.append(item+15)
    else:
        bar_color.append('#2ca25f')
        y_offset_val.append(item)


source = ColumnDataSource(
        data=dict(
            x_val = list(feature_dict_embd.keys()),
            y_val = list(feature_dict_embd.values()),
            y_offset = y_offset_val,
            labels =  label,
            color = bar_color
        )
    )

p = figure(x_range=list(feature_dict_embd.keys()), plot_height=380, plot_width=980, toolbar_location=None, tools="")

glyph = Text(x="x_val", y="y_val", text='labels', x_offset=-15, y_offset='y_offset', text_font_size="9pt", text_font_style = 'bold', text_color="black")


p.vbar(x="x_val", top="y_val", color="color", width=0.7, source=source)
p.xaxis.major_label_orientation = math.pi/7
p.xaxis.major_label_text_font_size = '9pt'
p.yaxis.major_label_text_font_size = '9pt'
p.xaxis.major_label_text_font_style = 'bold'
p.yaxis.major_label_text_font_style = 'bold'
p.min_border_left = 80
p.xgrid.grid_line_color = None
p.xaxis.axis_label_text_align = 'center'
# Size the y-range from the values actually plotted in this figure
# (feature_dict_embd). Using the original-space dict here would clip or
# mis-scale the bars whenever the two sets of influences differ in range.
p.y_range.start = np.min(list(feature_dict_embd.values()))-0.05
p.y_range.end = np.max(list(feature_dict_embd.values()))+0.01
p.add_glyph(source, glyph)
#export_png(p, filename="Images/Fig4/Feature-Influences-85-Embedding.png")
show(p)

#### d. Calculate Local Divergence

In [17]:
# Compare the original-data explanation with the embedding explanation.
# Two values come back: the individual components and the overall divergence.
components, divergence = Explanation.compute_local_divergence(
    corr_feat_dist,
    corr_feat_dist_embd,
    neighbors,
    neighbors_embd,
)

In [18]:
# The components arrive as a single comma-separated string. Split it only
# while it is still a string, so re-running this cell does not fail on the
# list produced by a previous run.
if isinstance(components, str):
    components = components.split(",")

# Positional order: feature influence, neighbourhood content, neighbourhood order.
print("Discrepancy in Feature Influence:", components[0])
print("Discrepancy in Neighborhood Content:", components[1])
print("Discrepancy in Neighborhood Order:", components[2])
    

Discrepancy in Feature Influence: 0.795386355756614
Discrepancy in Neighborhood Content: 0.4
Discrepancy in Neighborhood Order: 0.95


In [19]:
# Show the second value returned by Explanation.compute_local_divergence.
print("Overall Divergence: ", divergence)

Overall Divergence:  0.7151287852522046
