In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

In [2]:
# Read the loan dataset from disk and keep only its first 1000 rows.
loans_csv = 'archive/loan_data.csv'
df = pd.read_csv(loans_csv).head(1000)
df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,female,Bachelor,57005.0,0,RENT,18000.0,DEBTCONSOLIDATION,10.71,0.32,3.0,554,No,1
996,24.0,female,High School,58556.0,0,RENT,18000.0,PERSONAL,6.54,0.31,2.0,634,No,1
997,23.0,male,High School,30483.0,0,OWN,8000.0,PERSONAL,5.79,0.26,3.0,574,Yes,0
998,25.0,female,Associate,67108.0,3,RENT,18000.0,EDUCATION,12.84,0.27,2.0,594,No,1


In [3]:
# Target is the loan_status column; the features are every other column.
target_col = 'loan_status'
y = df[target_col]
X = df.drop(columns=[target_col])

### Quick Preprocessing

In [4]:
# Count the missing values in each feature column.
missing_per_column = X.isna().sum()
missing_per_column

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
dtype: int64

In [5]:
# List the distinct values of person_gender and how often each occurs.
gender_counts = X['person_gender'].value_counts()
gender_counts

person_gender
male      553
female    447
Name: count, dtype: int64

In [6]:
# Encode gender as a binary flag (male -> 0, female -> 1).
# The raw labels are read from `df` instead of `X` so the cell is idempotent:
# Series.map turns any value missing from the dict into NaN, so re-running the
# original `X[...] = X[...].map(...)` on the already-encoded 0/1 column would
# wipe it out.
gender_codes = {'male': 0, 'female': 1}
X['person_gender'] = df['person_gender'].map(gender_codes)
# Fail loudly if a category outside the mapping slipped through as NaN.
assert X['person_gender'].notna().all(), "unmapped person_gender category"

In [7]:
# List the distinct values of person_education and how often each occurs.
education_counts = X['person_education'].value_counts()
education_counts

person_education
Bachelor       328
High School    283
Associate      250
Master         138
Doctorate        1
Name: count, dtype: int64

In [8]:
# Ordinal-encode the education level, from High School (1) up to Doctorate (5).
# The raw labels are read from `df` instead of `X` so the cell is idempotent:
# Series.map turns any value missing from the dict into NaN, so re-running the
# original `X[...] = X[...].map(...)` on the already-encoded column would wipe
# it out.
education_rank = {
    'High School': 1,
    'Associate': 2,
    'Bachelor': 3,
    'Master': 4,
    'Doctorate': 5,
}
X['person_education'] = df['person_education'].map(education_rank)
# Fail loudly if a category outside the mapping slipped through as NaN.
assert X['person_education'].notna().all(), "unmapped person_education category"

In [9]:
# List the distinct values of previous_loan_defaults_on_file and their counts.
previous_default_counts = X['previous_loan_defaults_on_file'].value_counts()
previous_default_counts

previous_loan_defaults_on_file
No     703
Yes    297
Name: count, dtype: int64

In [10]:
# Encode the previous-default flag as binary (Yes -> 1, No -> 0).
# The raw labels are read from `df` instead of `X` so the cell is idempotent:
# Series.map turns any value missing from the dict into NaN, so re-running the
# original `X[...] = X[...].map(...)` on the already-encoded 0/1 column would
# wipe it out.
default_codes = {'Yes': 1, 'No': 0}
X['previous_loan_defaults_on_file'] = df['previous_loan_defaults_on_file'].map(default_codes)
# Fail loudly if a category outside the mapping slipped through as NaN.
assert X['previous_loan_defaults_on_file'].notna().all(), \
    "unmapped previous_loan_defaults_on_file category"

In [11]:
# These two columns are unordered categoricals that would require one-hot
# encoding, so they are dropped to keep the preprocessing minimal.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the columns are already gone.
# NOTE(review): ExplainableBoostingClassifier may accept categorical columns
# directly — confirm against the interpret docs before discarding this signal.
unencoded_cols = ['person_home_ownership', 'loan_intent']
X = X.drop(columns=unencoded_cols, errors='ignore')

Now that every variable is numerical, let's normalize the features.

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [13]:
# Display the feature matrix as it stands right before scaling.
X

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22.0,1,4,71948.0,0,35000.0,16.02,0.49,3.0,561,0
1,21.0,1,1,12282.0,0,1000.0,11.14,0.08,2.0,504,1
2,25.0,1,1,12438.0,3,5500.0,12.87,0.44,3.0,635,0
3,23.0,1,3,79753.0,0,35000.0,15.23,0.44,2.0,675,0
4,24.0,0,4,66135.0,1,35000.0,14.27,0.53,4.0,586,0
...,...,...,...,...,...,...,...,...,...,...,...
995,22.0,1,3,57005.0,0,18000.0,10.71,0.32,3.0,554,0
996,24.0,1,1,58556.0,0,18000.0,6.54,0.31,2.0,634,0
997,23.0,0,1,30483.0,0,8000.0,5.79,0.26,3.0,574,1
998,25.0,1,2,67108.0,3,18000.0,12.84,0.27,2.0,594,0


In [14]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on ALL rows of X, and the train/test split
# happens only afterwards, so the held-out rows influence the fitted min/max.
# Consider splitting first and fitting the scaler on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
# Rebuild the DataFrame with X's own index (not a fresh 0..n-1 one) so that the
# rows of X stay aligned with y even when the index is not the default range.
X = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

X

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,0.00813,1.0,0.75,0.101368,0.000,1.000000,0.727023,0.742424,0.5,0.316667,0.0
1,0.00000,1.0,0.00,0.000000,0.000,0.014493,0.392318,0.121212,0.0,0.158333,1.0
2,0.03252,1.0,0.00,0.000265,0.024,0.144928,0.510974,0.666667,0.5,0.522222,0.0
3,0.01626,1.0,0.50,0.114628,0.000,1.000000,0.672840,0.666667,0.0,0.633333,0.0
4,0.02439,0.0,0.75,0.091492,0.008,1.000000,0.606996,0.803030,1.0,0.386111,0.0
...,...,...,...,...,...,...,...,...,...,...,...
995,0.00813,1.0,0.50,0.075981,0.000,0.507246,0.362826,0.484848,0.5,0.297222,0.0
996,0.02439,1.0,0.00,0.078616,0.000,0.507246,0.076818,0.469697,0.0,0.519444,0.0
997,0.01626,0.0,0.00,0.030922,0.000,0.217391,0.025377,0.393939,0.5,0.352778,1.0
998,0.03252,1.0,0.25,0.093145,0.024,0.507246,0.508916,0.409091,0.0,0.408333,0.0


### Prediction

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [16]:
# Create an Explainable Boosting classifier with no arguments overridden and
# fit it on the training split.
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

In [17]:
# Build the model-wide (global) explanation object and render it with
# interpret's `show`.
ebm_global = ebm.explain_global()
show(ebm_global)

Utilizando librería modificada de interpret


In [18]:
# Build the local explanation object for the held-out rows, passing their true
# labels alongside, and render it with interpret's `show`.
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

[[ 0.05343145 -0.08366151  0.18004936 ...  0.28030289  0.03071651
  -0.11217586]
 [ 0.01960964  0.06514355 -0.04096746 ... -0.09533213  0.19608808
   0.08984029]
 [-0.02197137  0.06514355 -0.04096746 ...  0.11273568 -0.29195383
   0.43972341]
 ...
 [-0.01142354  0.06514355  0.05567625 ...  0.77939726  0.35285552
  -0.08310043]
 [-0.02197137 -0.08366151 -0.04096746 ...  0.73927019  0.37025668
  -0.14616986]
 [ 0.01960964  0.06514355  0.18004936 ...  0.36817215 -0.1715051
   0.36244684]]
