## 0. Imports

In [2]:
import pandas as pd
import numpy as np

from interpret_extension import show
from interpret_extension.glassbox import LinearDiscriminantAnalysisClassifier

from sklearn.model_selection import train_test_split

  from tqdm.autonotebook import tqdm


## 1. Loading Diabetes Dataset

We will use a diabetes dataset consisting of several feature columns and one binary target: whether or not the patient has diabetes.

In [3]:
# Feature tables are kept as DataFrames.
X_train = pd.read_csv('data/diabetes/X_train.csv')
X_test = pd.read_csv('data/diabetes/X_test.csv')

# Label files are read as DataFrames and flattened into 1-D NumPy arrays.
y_train = np.ravel(pd.read_csv('data/diabetes/y_train.csv').to_numpy())
y_test = np.ravel(pd.read_csv('data/diabetes/y_test.csv').to_numpy())

In [4]:
# List the feature column names of the training table.
print(list(X_train.columns))

['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'dpf', 'age']


In [5]:
# Preview the first three training rows as a quick sanity check of the load.
X_train.head(3)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,7,150,78,29,126,35.2,0.692,54
1,4,97,60,23,0,28.2,0.443,22
2,0,165,90,33,680,52.3,0.427,23


In [6]:
# Class balance per split.
# NOTE: the messages say "Percentage", but the printed values are fractions
# in [0, 1] (count of matching labels divided by the split size).
n_train = y_train.shape[0]
n_test = y_test.shape[0]

positive_train = np.count_nonzero(y_train == 1) / n_train
positive_test = np.count_nonzero(y_test == 1) / n_test
negative_train = np.count_nonzero(y_train == 0) / n_train
negative_test = np.count_nonzero(y_test == 0) / n_test

print("Percentage of positive class in training data: ", positive_train)
print("Percentage of positive class in test data: ", positive_test)
print()
print("Percentage of negative class in training data: ", negative_train)
print("Percentage of negative class in test data: ", negative_test)

Percentage of positive class in training data:  0.35993485342019543
Percentage of positive class in test data:  0.3051948051948052

Percentage of negative class in training data:  0.6400651465798045
Percentage of negative class in test data:  0.6948051948051948


## 2. Linear Discriminant Analysis Model

Let's now use an LDA model. As we know, LDA is a linear model that classifies data by generating a linear decision boundary.

In [7]:
# Instantiate the LDA glassbox classifier; no constructor arguments are passed.
LDA_model = LinearDiscriminantAnalysisClassifier()

Let's fit the model:

In [8]:
# Fit on the training split. The call's return value is left as the cell
# output, which is why the classifier's repr is displayed below.
LDA_model.fit(X_train, y_train)

<interpret_extension.glassbox._lineardiscriminantanalysis.LinearDiscriminantAnalysisClassifier at 0x1e652175ba0>

What do the predictions look like?

In [9]:
# Predict a class label for every row of the held-out test features.
pred = LDA_model.predict(X_test)
pred  # bare last expression so the notebook displays the predicted labels

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [10]:
# Evaluate on the held-out split.
# NOTE(review): presumably mean accuracy, following the scikit-learn
# classifier convention for `score` — confirm against the library.
test_score = LDA_model.score(X_test, y_test)
print(test_score)

0.8246753246753247


Some interesting LDA params used to make the predictions:

In [11]:
# Coefficients stored on the wrapped estimator (`.model`). The output shows a
# single row of eight values — presumably one weight per input feature.
LDA_model.model.coef_

array([[ 0.09608044,  0.03592395, -0.01526292,  0.00453258, -0.00141029,
         0.08375644,  0.87400423,  0.02189811]])

In [12]:
# Covariance matrix stored on the wrapped estimator (`.model`).
# NOTE(review): presumably the shared within-class covariance estimated
# during fitting — confirm against the underlying library.
LDA_model.model.covariance_

array([[ 1.13937785e+01,  4.21547955e+00,  8.75370600e+00,
        -5.61901610e+00, -4.20172600e+01, -1.52124753e+00,
        -9.03720624e-02,  2.03758931e+01],
       [ 4.21547955e+00,  7.97120558e+02,  7.52904773e+01,
         8.58687596e+00,  1.01453625e+03,  2.39489054e+01,
         4.59618776e-01,  6.42299508e+01],
       [ 8.75370600e+00,  7.52904773e+01,  3.80346024e+02,
         6.54956475e+01,  1.71994941e+02,  3.81081863e+01,
         2.79448483e-01,  5.06088545e+01],
       [-5.61901610e+00,  8.58687596e+00,  6.54956475e+01,
         2.55260496e+02,  7.83641631e+02,  4.80669453e+01,
         9.62168692e-01, -3.41700315e+01],
       [-4.20172600e+01,  1.01453625e+03,  1.71994941e+02,
         7.83641631e+02,  1.35837078e+04,  1.44504920e+02,
         6.62685541e+00, -1.30023701e+02],
       [-1.52124753e+00,  2.39489054e+01,  3.81081863e+01,
         4.80669453e+01,  1.44504920e+02,  5.67603172e+01,
         2.51281299e-01, -5.96180973e+00],
       [-9.03720624e-02,  4.596187

Now let's see the InterpretML visualizations:

In [13]:
# Build the model-level (global) explanation and render it with `show`.
LDA_global_explanation = LDA_model.explain_global()
show(LDA_global_explanation)

In [14]:
# Build the local explanation for the test rows, passing the true labels
# alongside the features, and render it with `show`.
LDA_local_explanation = LDA_model.explain_local(X_test, y_test)
show(LDA_local_explanation)