In [1]:
import sys
import os

# Obtener el directorio actual
current_directory = os.getcwd()

# Subir dos niveles al directorio raíz, donde se encuentra la carpeta 'interpretml'
root_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..'))

sys.path.append(root_path)

In [2]:
import pandas as pd
import numpy as np
import interpret
from interpret import show

# IRIS Dataset

In [3]:
iris = pd.read_csv('iris.csv', header=None)
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# Create a binary problem
iris['species'] = np.where(iris['species'] == 'Iris-setosa', 1, 0)

X = iris.drop('species', axis=1)
y = iris['species']

In [4]:
# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gaussian NB (compared with LogisticRegression)

In [5]:
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
15,5.8,4.0,1.2,0.2
125,6.7,3.3,5.7,2.1
11,5.4,3.7,1.5,0.2
127,6.2,2.8,4.8,1.8
51,7.0,3.2,4.7,1.4
...,...,...,...,...
71,5.9,3.2,4.8,1.8
106,7.6,3.0,6.6,2.1
14,4.3,3.0,1.1,0.1
92,6.1,3.0,4.6,1.4


In [6]:
# import linear from interpret
from interpret.glassbox._naivebayes import GaussianNB
from interpret.glassbox._linear import LogisticRegression

nb = GaussianNB()
nb.fit(X_train, y_train)

lr = LogisticRegression()
lr.fit(X_train, y_train)

<interpret.glassbox._linear.LogisticRegression at 0x284331d2760>

In [7]:
print(X_test.shape)
X_test.sample(3)

(31, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
19,5.7,3.8,1.7,0.3
56,5.7,2.8,4.5,1.3
132,7.9,3.8,6.4,2.0


In [8]:
nb.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [9]:
grouped = iris.groupby('species').mean()
grouped

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.2,2.853465,4.877228,1.689109
1,5.006,3.418,1.464,0.244


In [10]:
print(nb._model().theta_)
print(nb._model().var_)

[[6.1962963  2.84074074 4.88271605 1.68395062]
 [5.01025641 3.43333333 1.44358974 0.24102564]]
[[0.9302332  0.1520439  0.84562719 0.20159427]
 [0.12809994 0.15606838 0.0270743  0.01267587]]


In [11]:
nb.predict_proba(X_test.iloc[[0]])

array([[1.00000000e+00, 1.04545725e-96]])

In [12]:
nb_local = nb.explain_local(X_test, y_test)
show(nb_local)

In [13]:
nb_local._internal_obj['specific'][0]

{'data_type': 'univariate',
 'perf': {'is_classification': True,
  'actual': 0,
  'predicted': 0,
  'actual_score': 1.0,
  'predicted_score': 1.0},
 'names': ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
 'scores': array([  -8.78559177,   -0.53124212, -159.55585807,  -51.40013512]),
 'values': array([6.6, 3.0, 4.4, 1.4], dtype=object),
 'extra': {'names': ['Intercept'],
  'scores': [0.7308875085427924],
  'values': [1]},
 'meta': {'label_names': [0, 1]}}

In [14]:
X0 = X_test.iloc[[20]]
X0

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
29,5.2,3.4,1.4,0.2


In [15]:
X0.iloc[:, 0]

29    5.2
Name: sepal_length, dtype: float64

In [16]:
nb_global = nb.explain_global()
show(nb_global)

In [17]:
lr.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [18]:
lr_local = lr.explain_local(X_test, y_test)
show(lr_local)

In [19]:
lr_global = lr.explain_global()
show(lr_global)

# Categorical NB with discretized dataset

In [20]:
from interpret.glassbox._naivebayes import CategoricalNB as CategoricalNaiveBayesClassifier

In [21]:
from sklearn.preprocessing import KBinsDiscretizer

kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', subsample=200)
X_train_discrete = pd.DataFrame(kbd.fit_transform(X_train).astype(int), columns=X_train.columns)
X_test_discrete = pd.DataFrame(kbd.transform(X_test).astype(int), columns=X_test.columns)

In [22]:
X_train_discrete.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,3,4,0,0
1,4,3,3,3
2,3,3,0,0
3,4,2,3,2
4,4,3,3,2


In [23]:
CNB = CategoricalNaiveBayesClassifier()
CNB.fit(X_train_discrete, y_train)

<interpret.glassbox._naivebayes.CategoricalNB at 0x28435207fa0>

In [24]:
CNB.model.category_count_

[array([[ 1.,  0.,  0., 36., 44.],
        [ 0.,  0.,  8., 31.,  0.]]),
 array([[ 1.,  7., 52., 20.,  1.],
        [ 0.,  1.,  5., 25.,  8.]]),
 array([[ 1.,  3., 27., 35., 15.],
        [39.,  0.,  0.,  0.,  0.]]),
 array([[ 0., 13., 41., 23.,  4.],
        [39.,  0.,  0.,  0.,  0.]])]

In [25]:
(CNB.predict(X_test_discrete) == y_test).sum() / len(y_test)

1.0

In [26]:
CNB.model.feature_log_prob_

[array([[-3.76120012, -4.4543473 , -4.4543473 , -0.84342938, -0.64768481],
        [-3.78418963, -3.78418963, -1.58696506, -0.31845373, -3.78418963]]),
 array([[-3.76120012, -2.37490575, -0.48405538, -1.40982486, -3.76120012],
        [-3.78418963, -3.09104245, -1.99243016, -0.5260931 , -1.58696506]]),
 array([[-3.76120012, -3.06805294, -1.12214279, -0.87082836, -1.68175857],
        [-0.09531018, -3.78418963, -3.78418963, -3.78418963, -3.78418963]]),
 array([[-4.4543473 , -1.81528997, -0.71667768, -1.27629347, -2.84490938],
        [-0.09531018, -3.78418963, -3.78418963, -3.78418963, -3.78418963]])]

In [27]:
CNBglobal = CNB.explain_global()
show(CNBglobal)

In [28]:
print(dir(CNBglobal))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_internal_obj', 'data', 'explanation_type', 'feature_names', 'feature_types', 'name', 'selector', 'visualize']


In [29]:
CNBglobal.explanation_type

'global'

In [30]:
densities = list(CNB.model.category_count_[0].sum(axis=0))

In [31]:
densities

[1.0, 0.0, 8.0, 67.0, 44.0]

In [32]:
scores = CNBglobal._internal_obj['specific'][0]['scores']
scores

[0.022989518224698996,
 -2.867382239671466,
 -0.5249756524907486,
 3.1365048274350733]

In [33]:
sum([(densities[i] / sum(densities))*scores[i] for i in range(len(scores))])

1.7164083978037386

In [34]:
CNBglobal._internal_obj['specific'][0]

{'type': 'univariate',
 'names': [0.0, 2.0, 3.0, 4.0],
 'scores': [0.022989518224698996,
  -2.867382239671466,
  -0.5249756524907486,
  3.1365048274350733],
 'scores_range': None,
 'upper_bounds': None,
 'lower_bounds': None,
 'density': {'names': [0.0, 2.0, 3.0, 4.0],
  'scores': [1.0, 0.0, 8.0, 67.0, 44.0]},
 'meta': {'label_names': [0, 1]}}

In [35]:
CNBlocal = CNB.explain_local(X_test_discrete, y_test)
show(CNBlocal)

If you compare Categorical NB explanations with Gaussian NB explanations, the length and orientation of the bars are very similar. Never the same, as we have lost information with the discretization, but pretty similar.