In [1]:
import sys
import os

# Obtener el directorio actual
current_directory = os.getcwd()

# Subir dos niveles al directorio raíz, donde se encuentra la carpeta 'interpretml'
root_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..'))

sys.path.append(root_path)

In [2]:
import pandas as pd
import numpy as np
import interpret
from interpret import show

# IRIS Dataset

In [3]:
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# Create a binary problem
iris['species'] = np.where(iris['species'] == 'Iris-setosa', 1, 0)

X = iris.drop('species', axis=1)
y = iris['species']

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gaussian NB (compared with LogisticRegression)

In [4]:
# import linear from interpret
from interpret.glassbox._naivebayes import NaiveBayesClassifier
from interpret.glassbox._linear import LogisticRegression

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

lr = LogisticRegression()
lr.fit(X_train, y_train)

<interpret.glassbox._linear.LogisticRegression at 0x199267aca00>

In [5]:
print(X_test.shape)
X_test.sample(3)

(30, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
18,5.7,3.8,1.7,0.3
45,4.8,3.0,1.4,0.3
127,6.1,3.0,4.9,1.8


In [6]:
nb.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [7]:
print(nb._model().theta_)
print(nb._model().var_)

[[6.21875 2.86625 4.865   1.6525 ]
 [4.99    3.44    1.4525  0.2425 ]]
[[0.44427344 0.10923594 0.663775   0.17599375]
 [0.1239     0.1549     0.03299375 0.01144375]]


In [8]:
nb_local = nb.explain_local(X_test, y_test)
show(nb_local)

In [9]:
lr.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [10]:
lr_local = lr.explain_local(X_test, y_test)
show(lr_local)

# Categorical NB with discretized dataset

In [11]:
from interpret.glassbox._categoricalnaivebayes import NaiveBayesClassifier as CategoricalNaiveBayesClassifier

In [12]:
from sklearn.preprocessing import KBinsDiscretizer

kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', subsample=200)
X_train_discrete = pd.DataFrame(kbd.fit_transform(X_train).astype(int), columns=X_train.columns)
X_test_discrete = pd.DataFrame(kbd.transform(X_test).astype(int), columns=X_test.columns)

In [13]:
X_train_discrete.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0,3,0,0
1,2,4,0,0
2,3,2,2,2
3,0,2,0,0
4,0,2,0,0


In [14]:
CNB = CategoricalNaiveBayesClassifier()
CNB.fit(X_train_discrete, y_train)

<interpret.glassbox._categoricalnaivebayes.NaiveBayesClassifier at 0x19938ab5370>

In [15]:
CNB.model.category_count_

[array([[ 2., 14., 33., 21., 10.],
        [15., 23.,  2.,  0.,  0.]]),
 array([[ 9., 34., 35.,  2.,  0.],
        [ 1.,  1., 20., 14.,  4.]]),
 array([[ 0.,  1., 25., 36., 18.],
        [40.,  0.,  0.,  0.,  0.]]),
 array([[ 0.,  7., 33., 24., 16.],
        [39.,  1.,  0.,  0.,  0.]])]

In [16]:
(CNB.predict(X_test_discrete) == y_test).sum() / len(y_test)

1.0

In [17]:
CNB.model.feature_log_prob_

[array([[-3.34403897, -1.73460106, -0.91629073, -1.3516088 , -2.04475598],
        [-1.03407377, -0.62860866, -2.7080502 , -3.80666249, -3.80666249]]),
 array([[-2.14006616, -0.8873032 , -0.85913232, -3.34403897, -4.44265126],
        [-3.11351531, -3.11351531, -0.76214005, -1.09861229, -2.19722458]]),
 array([[-4.44265126, -3.74950408, -1.18455472, -0.83173334, -1.49821228],
        [-0.09309042, -3.80666249, -3.80666249, -3.80666249, -3.80666249]]),
 array([[-4.44265126, -2.36320971, -0.91629073, -1.22377543, -1.60943791],
        [-0.11778304, -3.11351531, -3.80666249, -3.80666249, -3.80666249]])]

In [18]:
CNBglobal = CNB.explain_global()
show(CNBglobal)

[0.0, 1.0, 2.0, 3.0, 4.0]
[17.0, 37.0, 35.0, 21.0, 10.0]

[0.0, 1.0, 2.0, 3.0, 4.0]
[10.0, 35.0, 55.0, 16.0, 4.0]

[0.0, 1.0, 2.0, 3.0, 4.0]
[40.0, 1.0, 25.0, 36.0, 18.0]

[0.0, 1.0, 2.0, 3.0, 4.0]
[39.0, 8.0, 33.0, 24.0, 16.0]

4
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
['nominal', 'nominal', 'nominal', 'nominal']
None

           Name         Type  # Unique  % Non-zero
0  sepal_length  categorical         5         NaN
1   sepal_width  categorical         5         NaN
2  petal_length  categorical         5         NaN
3   petal_width  categorical         5         NaN


In [21]:
CNBglobal._internal_obj['specific'][0]

{'type': 'univariate',
 'names': [0.0, 1.0, 2.0, 3.0, 4.0],
 'scores': [-2.309965200291668,
  -1.1059923959657325,
  1.7917594692280545,
  2.455053686638319,
  1.7619065060783734],
 'scores_range': None,
 'upper_bounds': None,
 'lower_bounds': None,
 'density': {'names': [0.0, 1.0, 2.0, 3.0, 4.0],
  'scores': [17.0, 37.0, 35.0, 21.0, 10.0]},
 'meta': {'label_names': [0, 1]}}

In [22]:
CNBlocal = CNB.explain_local(X_test_discrete, y_test)
show(CNBlocal)

ValueError: invalid literal for int() with base 10: 'medio'

If you compare Categorical NB explanations with Gaussian NB explanations, the length and orientation of the bars are very similar. Never the same, as we have lost information with the discretization, but pretty similar.