In [19]:
import sys
import os

# Obtener el directorio actual
current_directory = os.getcwd()

# Subir dos niveles al directorio raíz, donde se encuentra la carpeta 'interpretml'
root_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..'))

sys.path.append(root_path)

In [20]:
import pandas as pd
import numpy as np
import interpret
from interpret import show

# IRIS Dataset

In [21]:
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# Create a binary problem
iris['species'] = np.where(iris['species'] == 'Iris-setosa', 1, 0)

X = iris.drop('species', axis=1)
y = iris['species']

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gaussian NB (compared with LogisticRegression)

In [22]:
# import linear from interpret
from interpret.glassbox._naivebayes import NaiveBayesClassifier
from interpret.glassbox._linear import LogisticRegression

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

lr = LogisticRegression()
lr.fit(X_train, y_train)

<interpret.glassbox._linear.LogisticRegression at 0x21431678bb0>

In [23]:
print(X_test.shape)
X_test.sample(3)

(30, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
45,4.8,3.0,1.4,0.3
132,6.4,2.8,5.6,2.2
19,5.1,3.8,1.5,0.3


In [24]:
nb.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [25]:
print(nb._model().theta_)
print(nb._model().var_)

[[6.21875 2.86625 4.865   1.6525 ]
 [4.99    3.44    1.4525  0.2425 ]]
[[0.44427344 0.10923594 0.663775   0.17599375]
 [0.1239     0.1549     0.03299375 0.01144375]]


In [26]:
nb_local = nb.explain_local(X_test, y_test)
show(nb_local)

In [27]:
lr.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [28]:
lr_local = lr.explain_local(X_test, y_test)
show(lr_local)

# Categorical NB with discretized dataset

In [29]:
from interpret.glassbox._categoricalnaivebayes import NaiveBayesClassifier as CategoricalNaiveBayesClassifier

In [30]:
from sklearn.preprocessing import KBinsDiscretizer

kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', subsample=200)
X_train_discrete = pd.DataFrame(kbd.fit_transform(X_train).astype(int), columns=X_train.columns)
X_test_discrete = pd.DataFrame(kbd.transform(X_test).astype(int), columns=X_test.columns)

In [31]:
X_train_discrete.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0,3,0,0
1,2,4,0,0
2,3,2,2,2
3,0,2,0,0
4,0,2,0,0


In [32]:
CNB = CategoricalNaiveBayesClassifier()
CNB.fit(X_train_discrete, y_train)

None


<interpret.glassbox._categoricalnaivebayes.NaiveBayesClassifier at 0x21434667fd0>

In [33]:
CNB.model.category_count_

[array([[ 2., 14., 33., 21., 10.],
        [15., 23.,  2.,  0.,  0.]]),
 array([[ 9., 34., 35.,  2.,  0.],
        [ 1.,  1., 20., 14.,  4.]]),
 array([[ 0.,  1., 25., 36., 18.],
        [40.,  0.,  0.,  0.,  0.]]),
 array([[ 0.,  7., 33., 24., 16.],
        [39.,  1.,  0.,  0.,  0.]])]

In [34]:
(CNB.predict(X_test_discrete) == y_test).sum() / len(y_test)

1.0

In [35]:
CNBglobal = CNB.explain_global()
show(CNBglobal)

In [40]:
CNBglobal._internal_obj['specific'][0]

{'names': array([0.00000000e+00, 2.49624812e-03, 4.99249625e-03, ...,
        4.98500750e+00, 4.98750375e+00, 4.99000000e+00]),
 'scores': [-2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2.309965200291668,
  -2

In [36]:
CNBlocal = CNB.explain_local(X_test_discrete, y_test)
show(CNBlocal)

If you compare Categorical NB explanations with Gaussian NB explanations, the length and orientation of the bars are very similar. Never the same, as we have lost information with the discretization, but pretty similar.