In [153]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from interpret.glassbox import ExplainableBoostingClassifier, LinearRegression, LogisticRegression
from interpret import show

In [154]:
df = pd.read_csv('archive/loan_data.csv')
df = df.iloc[:1000]
df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,female,Bachelor,57005.0,0,RENT,18000.0,DEBTCONSOLIDATION,10.71,0.32,3.0,554,No,1
996,24.0,female,High School,58556.0,0,RENT,18000.0,PERSONAL,6.54,0.31,2.0,634,No,1
997,23.0,male,High School,30483.0,0,OWN,8000.0,PERSONAL,5.79,0.26,3.0,574,Yes,0
998,25.0,female,Associate,67108.0,3,RENT,18000.0,EDUCATION,12.84,0.27,2.0,594,No,1


In [155]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

### Fast Preprocess

In [156]:
X.isna().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
dtype: int64

In [157]:
X['person_gender'].value_counts()

person_gender
male      553
female    447
Name: count, dtype: int64

In [158]:
X['person_gender'] = X['person_gender'].map({'male': 0, 'female': 1})

In [159]:
X['person_education'].value_counts()

person_education
Bachelor       328
High School    283
Associate      250
Master         138
Doctorate        1
Name: count, dtype: int64

In [160]:
X['person_education'] = X['person_education'].map({'High School': 1, 'Associate': 2, 'Bachelor': 3, 'Master': 4, 'Doctorate': 5})

In [161]:
X['previous_loan_defaults_on_file'].value_counts()

previous_loan_defaults_on_file
No     703
Yes    297
Name: count, dtype: int64

In [162]:
X['previous_loan_defaults_on_file'] = X['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': 0})

In [163]:
# As this two would require one-hot encoding, we will drop them
X = X.drop(columns=['person_home_ownership', 'loan_intent'])

Now that every variable is numerical, let's normalize

In [164]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [165]:
X

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22.0,1,4,71948.0,0,35000.0,16.02,0.49,3.0,561,0
1,21.0,1,1,12282.0,0,1000.0,11.14,0.08,2.0,504,1
2,25.0,1,1,12438.0,3,5500.0,12.87,0.44,3.0,635,0
3,23.0,1,3,79753.0,0,35000.0,15.23,0.44,2.0,675,0
4,24.0,0,4,66135.0,1,35000.0,14.27,0.53,4.0,586,0
...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,1,3,57005.0,0,18000.0,10.71,0.32,3.0,554,0
996,24.0,1,1,58556.0,0,18000.0,6.54,0.31,2.0,634,0
997,23.0,0,1,30483.0,0,8000.0,5.79,0.26,3.0,574,1
998,25.0,1,2,67108.0,3,18000.0,12.84,0.27,2.0,594,0


In [166]:
predictor_columns = ['person_age', 'person_gender', 'person_education', 'person_income', 'loan_amnt', 'previous_loan_defaults_on_file']

In [167]:
X[predictor_columns]

Unnamed: 0,person_age,person_gender,person_education,person_income,loan_amnt,previous_loan_defaults_on_file
0,22.0,1,4,71948.0,35000.0,0
1,21.0,1,1,12282.0,1000.0,1
2,25.0,1,1,12438.0,5500.0,0
3,23.0,1,3,79753.0,35000.0,0
4,24.0,0,4,66135.0,35000.0,0
...,...,...,...,...,...,...
995,22.0,1,3,57005.0,18000.0,0
996,24.0,1,1,58556.0,18000.0,0
997,23.0,0,1,30483.0,8000.0,1
998,25.0,1,2,67108.0,18000.0,0


### Prediction

In [168]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X[predictor_columns], y, test_size=0.1, random_state=0)

In [171]:
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

In [172]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


In [173]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


In [174]:
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

[[ 0.07210886 -0.12078173  0.30571065 ... -0.21104451  0.03762635
   0.42940794]
 [ 0.03345196  0.09404744 -0.16002787 ...  0.10837227 -0.05541169
  -1.84767391]
 [ 0.01577486  0.09404744 -0.16002787 ... -0.49918924  0.06253194
  -0.42629334]
 ...
 [-0.06695541  0.09404744  0.07335646 ...  0.09025045  0.16489076
   2.77730081]
 [ 0.01577486 -0.12078173 -0.16002787 ...  0.15300683 -0.0127113
   2.72120302]
 [ 0.03345196  0.09404744  0.30571065 ... -0.1715987   0.06592086
   0.37406595]]


## Categorical NB

In [175]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

cnb = CategoricalNB()
cnb.fit(X_train, y_train)

In [176]:
cnb.class_count_

array([397., 503.])

In [177]:
X_train

Unnamed: 0,person_age,person_gender,person_education,person_income,loan_amnt,previous_loan_defaults_on_file
785,24.0,0,2,27558.0,4000.0,1
873,21.0,1,3,23899.0,700.0,1
65,22.0,0,2,15013.0,1500.0,0
902,25.0,1,4,57272.0,19000.0,0
317,25.0,1,4,161894.0,5000.0,0
...,...,...,...,...,...,...
835,23.0,0,1,132975.0,20000.0,0
192,24.0,0,3,81537.0,25000.0,0
629,22.0,1,1,57193.0,20000.0,0
559,24.0,0,1,101708.0,21000.0,0


In [178]:
X_train['person_emp_exp'].value_counts()

KeyError: 'person_emp_exp'

In [None]:
X_test.iloc[0]

person_age                           22.00
person_gender                         1.00
person_education                      4.00
person_income                     63459.00
person_emp_exp                        2.00
loan_amnt                         18000.00
loan_int_rate                        13.85
loan_percent_income                   0.28
cb_person_cred_hist_length            4.00
credit_score                        683.00
previous_loan_defaults_on_file        0.00
Name: 993, dtype: float64

In [None]:
cnb.category_count_[4][0][int(X_test.iloc[0].iloc[1])] / cnb.class_count_[0]

0.15365239294710328

In [None]:
import numpy as np
cp_0 = np.zeros(X.iloc[0].shape[0])
for j in range(X.iloc[0].shape[0]):
    cp_0[j] = cnb.category_count_[j][0][int(X.iloc[0].iloc[j])] / cnb.class_count_[0]

In [None]:
cp_0

array([0.16876574, 0.44080605, 0.13098237, 0.        , 0.39042821,
       0.02015113, 0.03526448, 1.        , 0.33501259, 0.        ,
       0.34508816])

In [None]:
cnb.category_count_[4]

array([[155.,  61.,  53.,  50.,  38.,  22.,  10.,   2.,   2.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   1.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          1.,   0.,   0.,   0.,   1.],
       [203.,  91.,  72.,  64.,  41.,  14.,  13.,   3.,   2.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,

In [None]:
(cnb.predict(X_test) == y_test).sum() / len(y_test)

0.88

In [None]:
from interpret.glassbox._categoricalnaivebayes import NaiveBayesClassifier as CatNB

In [None]:
catnb = CatNB()
catnb.fit(X_train, y_train)

<interpret.glassbox._categoricalnaivebayes.NaiveBayesClassifier at 0x13b8539b460>

In [None]:
cnb.category_count_[2].shape[1]

6

In [None]:
catnb_local = catnb.explain_local(X_test, y_test)
show(catnb_local)

Instance 0
-0.2366538894125487
[1.25461255e-01 4.41102757e-01 1.31513648e-01 1.66309379e-06
 1.03250478e-01 1.41250918e-04 9.80861244e-02 1.00000000e+00
 3.50746269e-01 1.65975104e-03 3.45864662e-01] [1.68209877e-01 4.35643564e-01 1.49312377e-01 1.66280066e-06
 1.16057234e-01 7.04146012e-04 7.44274809e-02 1.00000000e+00
 3.20866142e-01 2.28832952e-03 9.98019802e-01]
[-2.93215482e-01  1.24534611e-02 -1.26929972e-01  1.76272405e-04
 -1.16925602e-01 -1.60644787e+00  2.76020672e-01  0.00000000e+00
  8.90390485e-02 -3.21154470e-01 -1.05972557e+00]

Instance 1
-0.2366538894125487
[1.49446494e-01 5.58897243e-01 2.40694789e-01 1.66309379e-06
 1.03250478e-01 5.65003673e-05 2.39234450e-03 1.00000000e+00
 3.33333333e-01 8.29875519e-04 3.45864662e-01] [1.63580247e-01 5.64356436e-01 2.98624754e-01 1.66280066e-06
 1.16057234e-01 8.44975214e-05 5.15267176e-02 1.00000000e+00
 3.24803150e-01 7.62776506e-04 9.98019802e-01]
[-9.03652445e-02 -9.72039642e-03 -2.15658088e-01  1.76272405e-04
 -1.16925602e-01

In [None]:
X = X_test.iloc[0]

cp_0 = np.zeros(X.shape[0])
cp_1 = np.zeros(X.shape[0])
for j in range(X.shape[0]):
    cp_0[j] = (cnb.category_count_[j][0][int(X[j])] + 1) / (cnb.class_count_[0] + 1 * cnb.category_count_[j].shape[1])
    cp_1[j] = (cnb.category_count_[j][1][int(X[j])] + 1) / (cnb.class_count_[1] + 1 * cnb.category_count_[j].shape[1])


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [None]:
X_test.iloc[0]

person_age                           22.00
person_gender                         1.00
person_education                      4.00
person_income                     63459.00
person_emp_exp                        2.00
loan_amnt                         18000.00
loan_int_rate                        13.85
loan_percent_income                   0.28
cb_person_cred_hist_length            4.00
credit_score                        683.00
previous_loan_defaults_on_file        0.00
Name: 993, dtype: float64