In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

In [2]:
df = pd.read_csv('archive/loan_data.csv')
df = df.iloc[:1000]
df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,female,Bachelor,57005.0,0,RENT,18000.0,DEBTCONSOLIDATION,10.71,0.32,3.0,554,No,1
996,24.0,female,High School,58556.0,0,RENT,18000.0,PERSONAL,6.54,0.31,2.0,634,No,1
997,23.0,male,High School,30483.0,0,OWN,8000.0,PERSONAL,5.79,0.26,3.0,574,Yes,0
998,25.0,female,Associate,67108.0,3,RENT,18000.0,EDUCATION,12.84,0.27,2.0,594,No,1


In [3]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

### Fast Preprocess

In [4]:
X.isna().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
dtype: int64

In [5]:
X['person_gender'].value_counts()

person_gender
male      553
female    447
Name: count, dtype: int64

In [6]:
X['person_gender'] = X['person_gender'].map({'male': 0, 'female': 1})

In [7]:
X['person_education'].value_counts()

person_education
Bachelor       328
High School    283
Associate      250
Master         138
Doctorate        1
Name: count, dtype: int64

In [8]:
X['person_education'] = X['person_education'].map({'High School': 1, 'Associate': 2, 'Bachelor': 3, 'Master': 4, 'Doctorate': 5})

In [9]:
X['previous_loan_defaults_on_file'].value_counts()

previous_loan_defaults_on_file
No     703
Yes    297
Name: count, dtype: int64

In [10]:
X['previous_loan_defaults_on_file'] = X['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': 0})

In [11]:
# As this two would require one-hot encoding, we will drop them
X = X.drop(columns=['person_home_ownership', 'loan_intent'])

Now that every variable is numerical, let's normalize

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [13]:
X

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22.0,1,4,71948.0,0,35000.0,16.02,0.49,3.0,561,0
1,21.0,1,1,12282.0,0,1000.0,11.14,0.08,2.0,504,1
2,25.0,1,1,12438.0,3,5500.0,12.87,0.44,3.0,635,0
3,23.0,1,3,79753.0,0,35000.0,15.23,0.44,2.0,675,0
4,24.0,0,4,66135.0,1,35000.0,14.27,0.53,4.0,586,0
...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,1,3,57005.0,0,18000.0,10.71,0.32,3.0,554,0
996,24.0,1,1,58556.0,0,18000.0,6.54,0.31,2.0,634,0
997,23.0,0,1,30483.0,0,8000.0,5.79,0.26,3.0,574,1
998,25.0,1,2,67108.0,3,18000.0,12.84,0.27,2.0,594,0


In [14]:
# scaler = MinMaxScaler()
# X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# X

### Prediction

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [16]:
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

In [17]:
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


In [18]:
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

[[ 0.05343145 -0.08366151  0.18004936 ...  0.28030289  0.03071651
  -0.11217586]
 [ 0.01960964  0.06514355 -0.04096746 ... -0.09533213  0.19608808
   0.08984029]
 [-0.02197137  0.06514355 -0.04096746 ...  0.11273568 -0.29195383
   0.43972341]
 ...
 [-0.01142354  0.06514355  0.05567625 ...  0.77939726  0.35285552
  -0.08310043]
 [-0.02197137 -0.08366151 -0.04096746 ...  0.73927019  0.37025668
  -0.14616986]
 [ 0.01960964  0.06514355  0.18004936 ...  0.36817215 -0.1715051
   0.36244684]]


## Categorical NB

In [19]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

cnb = CategoricalNB()
cnb.fit(X_train, y_train)

In [20]:
cnb.class_count_

array([397., 503.])

In [21]:
X_train

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
785,24.0,0,2,27558.0,0,4000.0,11.01,0.15,2.0,621,1
873,21.0,1,3,23899.0,1,700.0,12.29,0.03,2.0,658,1
65,22.0,0,2,15013.0,0,1500.0,14.84,0.10,3.0,688,0
902,25.0,1,4,57272.0,1,19000.0,11.99,0.33,2.0,648,0
317,25.0,1,4,161894.0,0,5000.0,16.49,0.03,2.0,609,0
...,...,...,...,...,...,...,...,...,...,...,...
835,23.0,0,1,132975.0,1,20000.0,12.42,0.15,3.0,630,0
192,24.0,0,3,81537.0,1,25000.0,10.99,0.31,3.0,674,0
629,22.0,1,1,57193.0,0,20000.0,16.77,0.35,3.0,532,0
559,24.0,0,1,101708.0,2,21000.0,10.75,0.21,4.0,593,0


In [22]:
X_train['person_emp_exp'].value_counts()

person_emp_exp
0      358
1      152
2      125
3      114
4       79
5       36
6       23
7        5
8        4
125      1
101      1
100      1
121      1
Name: count, dtype: int64

In [23]:
X_test.iloc[0]

person_age                           22.00
person_gender                         1.00
person_education                      4.00
person_income                     63459.00
person_emp_exp                        2.00
loan_amnt                         18000.00
loan_int_rate                        13.85
loan_percent_income                   0.28
cb_person_cred_hist_length            4.00
credit_score                        683.00
previous_loan_defaults_on_file        0.00
Name: 993, dtype: float64

In [24]:
cnb.category_count_[4][0][int(X_test.iloc[0].iloc[1])] / cnb.class_count_[0]

0.15365239294710328

In [25]:
import numpy as np
cp_0 = np.zeros(X.iloc[0].shape[0])
for j in range(X.iloc[0].shape[0]):
    cp_0[j] = cnb.category_count_[j][0][int(X.iloc[0].iloc[j])] / cnb.class_count_[0]

In [26]:
cp_0

array([0.16876574, 0.44080605, 0.13098237, 0.        , 0.39042821,
       0.02015113, 0.03526448, 1.        , 0.33501259, 0.        ,
       0.34508816])

In [27]:
cnb.category_count_[4]

array([[155.,  61.,  53.,  50.,  38.,  22.,  10.,   2.,   2.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   1.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          1.,   0.,   0.,   0.,   1.],
       [203.,  91.,  72.,  64.,  41.,  14.,  13.,   3.,   2.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,

In [28]:
(cnb.predict(X_test) == y_test).sum() / len(y_test)

0.88

In [29]:
from interpret.glassbox._categoricalnaivebayes import NaiveBayesClassifier as CatNB

In [30]:
catnb = CatNB()
catnb.fit(X_train, y_train)

<interpret.glassbox._categoricalnaivebayes.NaiveBayesClassifier at 0x1e9d256e4f0>

In [31]:
cnb.category_count_[2].shape[1]

6

In [32]:
catnb_local = catnb.explain_local(X_test, y_test)
show(catnb_local)

Instance 0
[1.25461255e-01 4.41102757e-01 1.31513648e-01 1.66309379e-06
 1.03250478e-01 1.41250918e-04 9.80861244e-02 1.00000000e+00
 3.50746269e-01 1.65975104e-03 3.45864662e-01] [0.21471173 0.43538767 0.14910537 0.         0.14314115 0.04771372
 0.07554672 1.         0.32206759 0.00397614 1.        ]
[-0.53729935  0.01304102 -0.12554259         inf -0.32667337 -5.82243635
  0.26109464  0.          0.08530164 -0.87364468 -1.06170773]

Instance 1
[1.49446494e-01 5.58897243e-01 2.40694789e-01 1.66309379e-06
 1.03250478e-01 5.65003673e-05 2.39234450e-03 1.00000000e+00
 3.33333333e-01 8.29875519e-04 3.45864662e-01] [0.20874751 0.56461233 0.30019881 0.         0.14314115 0.00397614
 0.05168986 1.         0.32604374 0.         1.        ]
[-0.33418703 -0.01017371 -0.22091525         inf -0.32667337 -4.25382043
 -3.0729878   0.          0.02211145         inf -1.06170773]

Instance 2
[1.42066421e-01 5.58897243e-01 2.40694789e-01 3.32618757e-06
 5.73613767e-03 8.47505509e-05 1.24401914e-01 1.


divide by zero encountered in divide



In [35]:
X = X_test.iloc[0]

cp_0 = np.zeros(X.shape[0])
cp_1 = np.zeros(X.shape[0])
for j in range(X.shape[0]):
    cp_0[j] = (cnb.category_count_[j][0][int(X[j])] + 1) / (cnb.class_count_[0] + 1 * cnb.category_count_[j].shape[1])
    cp_1[j] = (cnb.category_count_[j][1][int(X[j])] + 1) / (cnb.class_count_[1] + 1 * cnb.category_count_[j].shape[1])


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [42]:
X_test.iloc[0]

person_age                           22.00
person_gender                         1.00
person_education                      4.00
person_income                     63459.00
person_emp_exp                        2.00
loan_amnt                         18000.00
loan_int_rate                        13.85
loan_percent_income                   0.28
cb_person_cred_hist_length            4.00
credit_score                        683.00
previous_loan_defaults_on_file        0.00
Name: 993, dtype: float64