## 0. Imports

In [29]:
import pandas as pd
import numpy as np

import interpret_extension
from interpret_extension import show
from interpret_extension.glassbox import GaussianNB
from interpret_extension.glassbox import CategoricalNB as CategoricalNaiveBayesClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer

## 1. Loading Diabetes Dataset

We will use a diabetes dataset, which consists in several columns and one target feature: if the patient has diabetes or not.

In [30]:
X_train = pd.read_csv('data/diabetes/X_train.csv')
y_train = pd.read_csv('data/diabetes/y_train.csv').to_numpy().ravel()

X_test = pd.read_csv('data/diabetes/X_test.csv')
y_test = pd.read_csv('data/diabetes/y_test.csv').to_numpy().ravel()

In [31]:
print(X_train.columns.to_list())

['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'dpf', 'age']


In [32]:
X_train.head(3)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,7,150,78,29,126,35.2,0.692,54
1,4,97,60,23,0,28.2,0.443,22
2,0,165,90,33,680,52.3,0.427,23


In [33]:
print("Percentage of positive class in training data: ", y_train[y_train == 1].shape[0] / y_train.shape[0])
print("Percentage of positive class in test data: ", y_test[y_test == 1].shape[0] / y_test.shape[0])
print()
print("Percentage of negative class in training data: ", y_train[y_train == 0].shape[0] / y_train.shape[0])
print("Percentage of negative class in test data: ", y_test[y_test == 0].shape[0] / y_test.shape[0])

Percentage of positive class in training data:  0.35993485342019543
Percentage of positive class in test data:  0.3051948051948052

Percentage of negative class in training data:  0.6400651465798045
Percentage of negative class in test data:  0.6948051948051948


In [None]:
X_train['pregnancies'] = pd.qcut(X_train['pregnancies'], q=4, labels=False)
X_train['glucose'] = pd.qcut(X_train['glucose'], q=4, labels=False)
X_train['diastolic'] = pd.qcut(X_train['diastolic'], q=4, labels=False)
X_train['triceps'] = pd.qcut(X_train['triceps'], q=3, labels=False)
X_train['insulin'] = pd.qcut(X_train['insulin'], q=5, labels=False)
X_train['bmi'] = pd.qcut(X_train['bmi'], q=4, labels=False)
X_train['dpf'] = pd.qcut(X_train['dpf'], q=4, labels=False)
X_train['age'] = pd.qcut(X_train['age'], q=4, labels=False)

ValueError: Bin edges must be unique: Index([0.0, 0.0, 100.0, 846.0], dtype='float64', name='insulin').
You can drop duplicate edges by setting the 'duplicates' kwarg

## 2. Naive Bayes Models

Let's use both Gaussian NB and Categorical NB to solve this classification problem.

### 2.1 Gaussian Naive Bayes

In [7]:
X_train.sample(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
41,4.5,2.3,1.3,0.3
27,5.2,3.5,1.5,0.2
1,4.9,3.0,1.4,0.2


In [6]:
gaussian_nb = GaussianNB()
gaussian_nb.fit(X_train, y_train)

<interpret_extension.glassbox._naivebayes.GaussianNB at 0x1bc42353df0>

In [7]:
pred = gaussian_nb.predict(X_test)
pred

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [8]:
print(gaussian_nb.score(X_test, y_test))

0.7922077922077922


Predictions are OK, but how were they created? Let's see how the model can be interpreted.

These are the main params of the model. They give us information about the distribution of both classes, and are key for interpreting the model.

In [9]:
print(gaussian_nb._model().theta_)
print(gaussian_nb._model().var_)

[[  3.3740458  109.94910941  68.38167939  19.56234097  71.5826972
   30.40483461   0.42569211  31.44274809]
 [  4.76470588 140.36199095  70.72850679  22.51131222 100.90497738
   35.41085973   0.53898643  37.42081448]]
[[9.40972184e+00 6.63539409e+02 3.32917948e+02 2.28297018e+02
  1.05936986e+04 6.03149777e+01 8.92086259e-02 1.43091520e+02]
 [1.49220260e+01 1.03466536e+03 4.64686486e+02 3.03209162e+02
  1.89007828e+04 5.04391719e+01 1.41139216e-01 1.23166820e+02]]


Firstly, let's see global explanations:

In [10]:
gaussian_nb_global = gaussian_nb.explain_global()
show(gaussian_nb_global)

Looking at the local explanations we can obtain other conclusions:

In [11]:
gaussian_nb_local = gaussian_nb.explain_local(X_test, y_test)
show(gaussian_nb_local)

This way we can observe which variables are the most influential into making an individual prediction.

### 2.2 Categorical Naive Bayes

In order to use Categorical Naive Bayes, we need to discretize the continuous features. We can use the KBinsDiscretizer from scikit-learn to discretize the features.

In [12]:
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', subsample=200)
X_train_discrete = pd.DataFrame(kbd.fit_transform(X_train).astype(int), columns=X_train.columns)
X_test_discrete = pd.DataFrame(kbd.transform(X_test).astype(int), columns=X_test.columns)

In [13]:
X_train_discrete.sample(3)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
95,0,3,2,1,3,2,0,0
121,1,2,3,2,0,2,1,2
352,0,2,3,2,0,2,0,3


Let's fit the model:

In [14]:
categorical_nb = CategoricalNaiveBayesClassifier()
categorical_nb.fit(X_train_discrete, y_train)

<interpret_extension.glassbox._naivebayes.CategoricalNB at 0x1bc43713790>

In [15]:
pred = categorical_nb.predict(X_test_discrete)
pred

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [16]:
print(categorical_nb.score(X_test_discrete, y_test))

0.8246753246753247


These are the main params used to explain the model:

In [17]:
categorical_nb.model.feature_log_prob_

[array([[-0.49751428, -1.45385251, -2.09463171, -3.42150265, -5.98645201],
        [-0.83556752, -1.5287147 , -1.32619044, -2.78147767, -3.81109709]]),
 array([[-1.97911882, -0.80466845, -1.22427807, -2.29757255, -3.90701046],
        [-3.62877553, -1.75697335, -1.13007556, -1.32619044, -1.54933399]]),
 array([[-3.09608025, -3.68386691, -0.66833201, -0.9175478 , -4.04054186],
        [-2.78147767, -4.03424064, -0.94319818, -0.7110048 , -3.22331042]]),
 array([[-1.13442174, -1.30432078, -1.28597164, -2.15781061, -4.19469254],
        [-1.15785512, -1.86518694, -1.08980166, -1.75697335, -3.81109709]]),
 array([[-0.28935852, -1.72377213, -3.2784018 , -4.04054186, -4.04054186],
        [-0.55300055, -1.34299756, -2.32949255, -3.81109709, -3.11794991]]),
 array([[-3.90701046, -1.19066146, -0.5226202 , -2.5524648 , -5.29330482],
        [-4.72738782, -2.93562835, -0.30854721, -1.68286538, -4.03424064]]),
 array([[-0.3062794 , -1.52054389, -3.50154536, -4.60015764, -5.29330482],
        [-0.4

Let's see the global explanations:

In [18]:
categorical_nb_global = categorical_nb.explain_global()
show(categorical_nb_global)

In this case we don't have continuous functions as this model assumes categorical features. We can see the score of each bin of each variable, allowing us to interpret how it affects the model.

In [19]:
categorical_nb_local = categorical_nb.explain_local(X_test_discrete, y_test)
show(categorical_nb_local)

As before, the explanation of individual predictions.

If you compare Categorical NB explanations with Gaussian NB explanations, the length and orientation of the bars are very similar. Never the same, as we have lost information with the discretization, but pretty similar.