In [1]:
import sys
import os

# Obtener el directorio actual
current_directory = os.getcwd()

# Subir dos niveles al directorio raíz, donde se encuentra la carpeta 'interpretml'
root_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..'))

sys.path.append(root_path)

In [2]:
import pandas as pd
import numpy as np
import interpret

Read the data

In [3]:
X_train = pd.read_csv('data/X_train.csv')
y_train = pd.read_csv('data/y_train.csv').to_numpy().ravel()

X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv').to_numpy().ravel()

In [4]:
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

ebm = ExplainableBoostingClassifier(interactions=0)
ebm.fit(X_train, y_train)

## Global explanations

In [5]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


EBM class use 'ebm.term_scores' to create the graph. Each term has a list of scores, each one representing a bin of the variable (or term). In the case of glucose, a low (0-120) value would mean a negative score, suggesting no-diabetes prediction; a high score (120-180) would likely mean that the person has diabetes.

The number of scores in the list is determined by the bins of the variable. And the number of bins depend on the distribution of the variable and the discretization algorithm used. For example, in the case of glucose would be the following bins:

In [6]:
print("Number of bins in glucose: ", len(ebm.bins_[1][0]))
ebm.bins_[1][0]

Number of bins in glucose:  127


array([ 22. ,  50. ,  56.5,  62. ,  67.5,  69.5,  72. ,  73.5,  74.5,
        75.5,  76.5,  77.5,  78.5,  79.5,  80.5,  81.5,  82.5,  83.5,
        84.5,  85.5,  86.5,  87.5,  88.5,  89.5,  90.5,  91.5,  92.5,
        93.5,  94.5,  95.5,  96.5,  97.5,  98.5,  99.5, 100.5, 101.5,
       102.5, 103.5, 104.5, 105.5, 106.5, 107.5, 108.5, 109.5, 110.5,
       111.5, 112.5, 113.5, 114.5, 115.5, 116.5, 117.5, 118.5, 119.5,
       120.5, 121.5, 122.5, 123.5, 124.5, 125.5, 126.5, 127.5, 128.5,
       129.5, 130.5, 131.5, 132.5, 133.5, 134.5, 135.5, 136.5, 137.5,
       138.5, 139.5, 140.5, 141.5, 142.5, 143.5, 144.5, 145.5, 146.5,
       147.5, 148.5, 149.5, 150.5, 151.5, 152.5, 153.5, 154.5, 155.5,
       156.5, 157.5, 158.5, 159.5, 160.5, 161.5, 162.5, 163.5, 164.5,
       165.5, 166.5, 167.5, 168.5, 169.5, 170.5, 172. , 173.5, 174.5,
       175.5, 176.5, 177.5, 178.5, 179.5, 180.5, 182. , 183.5, 185. ,
       186.5, 187.5, 188.5, 189.5, 190.5, 192.5, 194.5, 195.5, 196.5,
       197.5])

And the constant value associated to each bin:

In [7]:
ebm.term_scores_[1]

array([ 0.        , -0.63099025, -0.96340995, -1.07419818, -1.08696046,
       -1.09622566, -1.10274483, -1.10289811, -1.10208151, -1.09683749,
       -1.0895702 , -1.08023566, -1.06673348, -1.05791202, -1.05088763,
       -1.04238371, -1.03588866, -1.02504243, -1.01327663, -1.00053769,
       -0.99254221, -0.9866094 , -0.97700863, -0.96352772, -0.95094951,
       -0.87290783, -0.85902529, -0.82093015, -0.80191835, -0.79039522,
       -0.73511625, -0.74355387, -0.72844433, -0.72850191, -0.70666284,
       -0.58605972, -0.55097082, -0.48845591, -0.47596127, -0.43548435,
       -0.44000901, -0.42161198, -0.27850545, -0.26367686, -0.25517668,
       -0.24383747, -0.22785207, -0.2121048 , -0.20165836, -0.19118097,
       -0.17604766, -0.16344234, -0.15081739, -0.1416803 , -0.12985512,
       -0.120627  , -0.09882757, -0.08985672, -0.09918228,  0.11258764,
        0.11835203,  0.09493013,  0.10516306,  0.12794073,  0.14430878,
        0.34647529,  0.36365193,  0.37696104,  0.39750485,  0.40

So this means that we can modify the value on each bin to visualize, for example, the weights of another model:

#### 1. Constant score

In [8]:
ebm.term_scores_[1] = np.ones(ebm.term_scores_[1].shape)

In [9]:
ebm.term_scores_[1]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [10]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


#### 2. Random scores

In [11]:
ebm.term_scores_[1] = np.random.rand(ebm.term_scores_[1].shape[0])

In [12]:
ebm.term_scores_[1]

array([9.98645731e-01, 6.13323590e-01, 4.25733164e-01, 2.52192535e-01,
       7.39769632e-01, 6.16252962e-01, 6.03938261e-01, 5.26987643e-01,
       9.85731679e-01, 5.30057723e-01, 7.59101559e-01, 5.52280775e-01,
       7.09009569e-01, 7.66561173e-01, 5.05512363e-04, 6.75456784e-01,
       3.71809009e-01, 6.91865981e-01, 2.97693558e-02, 4.37018471e-01,
       5.63341390e-01, 3.09027373e-01, 7.68676887e-02, 6.83983413e-01,
       7.47897448e-01, 9.50811921e-02, 2.04644221e-01, 1.99329013e-01,
       7.40992363e-01, 4.41920360e-01, 1.05830930e-01, 3.41212396e-01,
       7.83833720e-01, 8.94191939e-01, 5.50573617e-01, 8.15798029e-01,
       5.26412475e-01, 3.00595459e-01, 7.54902476e-01, 6.16281680e-01,
       8.14523777e-01, 1.62874928e-01, 4.97889179e-01, 3.59984082e-01,
       2.65127566e-03, 1.20965394e-01, 7.48154761e-01, 8.15573071e-01,
       2.11336776e-01, 9.37857067e-01, 5.54249679e-01, 8.74290333e-01,
       5.65846315e-01, 6.21093773e-01, 1.50681732e-01, 8.72383556e-01,
      

In [13]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


So in this way we can use the same visualization for models like Naive Bayes.

We could also modify the histogram values using 'ebm.histogram_edges_' and 'ebm.histogram_weights_'.

In [14]:
ebm.histogram_edges_[1]

array([  0. ,  16.5,  33. ,  49.5,  66. ,  82.5,  99. , 115.5, 132. ,
       148.5, 165. , 181.5, 198. ])

In [15]:
ebm.histogram_weights_[1]

array([  0.,   4.,   0.,   1.,   3.,  36., 103., 145., 125.,  83.,  49.,
        38.,  27.,   0.])

## Local explanations

In [16]:
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

In [34]:
ebm_local._internal_obj['specific'][0]['meta']

{'label_names': [0, 1]}

In [17]:
ebm.term_scores_[0].shape

(19,)

In [18]:
X_test.iloc[0]

pregnancies      1.000
glucose        199.000
diastolic       76.000
triceps         43.000
insulin          0.000
bmi             42.900
dpf              1.394
age             22.000
Name: 0, dtype: float64

In [19]:
ebm.predict_proba(X_test)

array([[0.46421658, 0.53578342],
       [0.67431601, 0.32568399],
       [0.54809805, 0.45190195],
       [0.65967561, 0.34032439],
       [0.65513352, 0.34486648],
       [0.78666254, 0.21333746],
       [0.2996435 , 0.7003565 ],
       [0.32995922, 0.67004078],
       [0.69744126, 0.30255874],
       [0.30568031, 0.69431969],
       [0.53803049, 0.46196951],
       [0.31434524, 0.68565476],
       [0.5914088 , 0.4085912 ],
       [0.57280872, 0.42719128],
       [0.32632092, 0.67367908],
       [0.53914057, 0.46085943],
       [0.41958219, 0.58041781],
       [0.80081134, 0.19918866],
       [0.56097238, 0.43902762],
       [0.51171661, 0.48828339],
       [0.41555938, 0.58444062],
       [0.56343408, 0.43656592],
       [0.60328483, 0.39671517],
       [0.3691295 , 0.6308705 ],
       [0.83534007, 0.16465993],
       [0.63508857, 0.36491143],
       [0.81955172, 0.18044828],
       [0.09536241, 0.90463759],
       [0.81142334, 0.18857666],
       [0.83168569, 0.16831431],
       [0.

In [20]:
scores = ebm._predict_score(X_test)
scores

array([ 1.43378813e-01, -7.27771288e-01, -1.92988945e-01, -6.61848942e-01,
       -6.41681722e-01, -1.30492414e+00,  8.48996043e-01,  7.08369492e-01,
       -8.35142870e-01,  8.20392697e-01, -1.52416335e-01,  7.79882342e-01,
       -3.69792346e-01, -2.93319962e-01,  7.24872525e-01, -1.56883274e-01,
        3.24488760e-01, -1.39137300e+00, -2.45109342e-01, -4.68750102e-02,
        3.41029653e-01, -2.55110965e-01, -4.19170856e-01,  5.35953065e-01,
       -1.62395657e+00, -5.54109810e-01, -1.51331331e+00,  2.24984997e+00,
       -1.45928533e+00, -1.59762148e+00, -8.65723027e-01, -7.55184840e-01,
       -7.36116312e-01,  7.00776246e-01, -2.97816487e+00,  4.80959632e-02,
        9.47285363e-02, -2.06746127e+00, -8.10147974e-01, -8.57054862e-01,
       -9.29422107e-01, -8.24237247e-01, -2.74654715e-01, -2.25038675e-01,
       -2.11769822e-01, -2.24865924e+00, -2.32729010e+00, -1.27049384e+00,
       -4.41131142e-01, -1.21997447e-01,  4.55279050e-01, -9.05658836e-01,
        1.50506882e+00,  

In [21]:
ebm_local.data(2)

{'type': 'univariate',
 'names': ['pregnancies',
  'glucose',
  'diastolic',
  'triceps',
  'insulin',
  'bmi',
  'dpf',
  'age'],
 'scores': [-0.04484705147183339,
  0.5522807752172241,
  -0.01568311965900504,
  -0.030176061300163523,
  0.009020029244850623,
  0.260328294298641,
  0.03824722953561576,
  -0.15562985228560677],
 'values': [4.0, 76.0, 62.0, 0.0, 0.0, 34.0, 0.391, 25.0],
 'extra': {'names': ['Intercept'],
  'scores': [-0.8065291885261778],
  'values': [1]},
 'perf': {'is_classification': True,
  'actual': 0,
  'predicted': 0,
  'actual_score': 0.5480980455831383,
  'predicted_score': 0.5480980455831383},
 'meta': {'label_names': [0, 1]}}