# Explicabilidade - Interpret - Decision Tree

Repositório do InterpretML: https://github.com/interpretml/interpret

Fonte: notebook de exemplo "Interpretable Classification Methods" do InterpretML — https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb

In [3]:
!pip install interpret

Collecting interpret
  Downloading interpret-0.1.22-py3-none-any.whl (1.4 kB)
Collecting interpret-core[dash,debug,decisiontree,ebm,lime,linear,notebook,plotly,required,sensitivity,shap,treeinterpreter]>=0.1.22
  Downloading interpret_core-0.1.22-py3-none-any.whl (8.3 MB)
[K     |████████████████████████████████| 8.3 MB 436 kB/s eta 0:00:01
[?25hCollecting dash>=1.0.0; extra == "dash"
  Downloading dash-1.13.4.tar.gz (67 kB)
[K     |████████████████████████████████| 67 kB 555 kB/s eta 0:00:011
[?25hCollecting dash-table>=4.1.0; extra == "dash"
  Downloading dash_table-4.8.1.tar.gz (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 6.2 MB/s eta 0:00:01
[?25hCollecting dash-cytoscape>=0.1.1; extra == "dash"
  Downloading dash_cytoscape-0.1.1.tar.gz (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 6.0 MB/s eta 0:00:01
Collecting lime>=0.1.1.33; extra == "lime"
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[K     |████████████████████████████████| 275 kB 4.6 MB/s eta 0

Collecting brotli
  Downloading Brotli-1.0.7-cp37-cp37m-manylinux1_x86_64.whl (352 kB)
[K     |████████████████████████████████| 352 kB 5.6 MB/s eta 0:00:01


Building wheels for collected packages: dash, dash-table, dash-cytoscape, lime, SALib, dill, shap, flask-compress, dash-renderer, dash-core-components, dash-html-components
  Building wheel for dash (setup.py) ... [?25ldone
[?25h  Created wheel for dash: filename=dash-1.13.4-py3-none-any.whl size=74951 sha256=2ad768ceeb2ff5233bcf56bb0cbf16fb0a95a5945b67dabb8476025418d89059
  Stored in directory: /home/barbara/.cache/pip/wheels/06/28/f7/a5ce9b564da2bb24f6c8fbe190c0458c6ff4a497a937815103
  Building wheel for dash-table (setup.py) ... [?25ldone
[?25h  Created wheel for dash-table: filename=dash_table-4.8.1-py3-none-any.whl size=1779391 sha256=807f33b46e317d96f2e68870d8864c237ceead02945b5aa944378493db14a056
  Stored in directory: /home/barbara/.cache/pip/wheels/66/81/95/b2774b227694b28cf1a6a7dcb92af258cebac354dfa67af6bb
  Building wheel for dash-cytoscape (setup.py) ... [?25ldone
[?25h  Created wheel for dash-cytoscape: filename=dash_cytoscape-0.1.1-py3-none-any.whl size=3430004 sha2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the UCI "Adult" file. It is read with header=None, so the column
# names are assigned by hand right after loading.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
)
df.columns = [
    "Age",
    "WorkClass",
    "fnlwgt",
    "Education",
    "EducationNum",
    "MaritalStatus",
    "Occupation",
    "Relationship",
    "Race",
    "Gender",
    "CapitalGain",
    "CapitalLoss",
    "HoursPerWeek",
    "NativeCountry",
    "Income",
]
# Optional: uncomment to work on a 10% subsample.
# df = df.sample(frac=0.1, random_state=1)

# Every column but the last is a feature; the last column is the target.
train_cols = df.columns[:-1]
label = df.columns[-1]
X = df[train_cols]

# Binary target: 0 where the label equals " <=50K" (note the leading space in
# the comparison string), 1 for anything else.
y = (df[label] != " <=50K").astype("int64")

# 80/20 train/test split with a fixed seed; `seed` is reused by later cells.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=seed,
)

In [6]:
# Preview the first rows of the loaded frame to sanity-check the assigned column names.
df.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# (rows, columns) of the full frame, before the train/test split.
df.shape

(32561, 15)

In [4]:
from interpret import show
from interpret.data import ClassHistogram

In [5]:
# Build a ClassHistogram data explanation from the training split and render it.
histogram_explainer = ClassHistogram()
hist = histogram_explainer.explain_data(X_train, y_train, name="Train Data")
show(hist)

In [8]:
from interpret.glassbox import (
    ExplainableBoostingClassifier,
    LogisticRegression,
    ClassificationTree,
    DecisionListClassifier,
)

# Explainable Boosting Machine, seeded with the notebook-wide `seed`.
ebm = ExplainableBoostingClassifier(random_state=seed)
# fit() accepts pandas DataFrames as well as NumPy arrays; leaving the call as
# the last expression makes the cell display the fitted estimator's repr.
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(binning_strategy='quantile', data_n_episodes=2000,
                              early_stopping_run_length=50,
                              early_stopping_tolerance=1e-05,
                              feature_names=['Age', 'WorkClass', 'fnlwgt',
                                             'Education', 'EducationNum',
                                             'MaritalStatus', 'Occupation',
                                             'Relationship', 'Race', 'Gender',
                                             'CapitalGain', 'CapitalLoss',
                                             'HoursPerWeek', 'NativeCountry'],
                              feature_step_n_inner_b...
                                             'continuous', 'categorical',
                                             'categorical', 'categorical',
                                             'categorical', 'categorical',
                                             'continuous', 

In [9]:
# Global explanation of the fitted EBM, rendered with interpret's show().
ebm_global = ebm.explain_global(name="EBM")
show(ebm_global)

In [19]:
# Local (per-row) explanations for the first 100 rows of the test split;
# .iloc makes the positional slicing explicit.
ebm_local = ebm.explain_local(
    X_test.iloc[:100],
    y_test.iloc[:100],
    name="EBM",
)
show(ebm_local)

In [20]:
from interpret.perf import ROC

# ROC performance explanation for the EBM, computed on the test split from
# the model's predict_proba output.
roc_explainer = ROC(ebm.predict_proba)
ebm_perf = roc_explainer.explain_perf(X_test, y_test, name="EBM")
show(ebm_perf)

In [21]:
from interpret.glassbox import LogisticRegression, ClassificationTree

# Logistic Regression and the Decision Tree need numeric inputs, so the
# categorical columns are one-hot encoded first. Encoded columns are named
# "<column>.<level>" because of prefix_sep='.'.
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)

# Same test_size and random_state as the earlier split of X, applied to the
# encoded frame; y_train / y_test are rebound by this call.
X_train_enc, X_test_enc, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.20, random_state=seed
)

# L1-penalised logistic regression, seeded like the EBM.
lr = LogisticRegression(random_state=seed, feature_names=feature_names, penalty='l1', solver='liblinear')
lr.fit(X_train_enc, y_train)

# Seed the tree as well: scikit-learn decision trees permute features at each
# split, so equally good splits can be chosen differently between runs unless
# random_state is fixed. The extra keyword is forwarded to the underlying
# scikit-learn tree.
tree = ClassificationTree(random_state=seed)
tree.fit(X_train_enc, y_train)

<interpret.glassbox.decisiontree.ClassificationTree at 0x7f4496f7e510>

In [24]:
# ROC explanations for the logistic regression and the tree, both evaluated
# on the encoded test features.
lr_perf = ROC(lr.predict_proba).explain_perf(
    X_test_enc, y_test, name="Logistic Regression"
)
tree_perf = ROC(tree.predict_proba).explain_perf(
    X_test_enc, y_test, name="Classification Tree"
)

# Render the three ROC views one after another: LR, tree, then the EBM.
for perf_explanation in (lr_perf, tree_perf, ebm_perf):
    show(perf_explanation)

In [27]:
# Global explanations for the logistic regression and the tree.
lr_global = lr.explain_global(name="Logistic Regression")
tree_global = tree.explain_global(name="Classification Tree")

# Render the three global views one after another: LR, tree, then the EBM.
for global_explanation in (lr_global, tree_global, ebm_global):
    show(global_explanation)

In [28]:
# Passing a list to show() puts every explanation into the single InterpretML
# dashboard instead of rendering them one by one.
dashboard_explanations = [
    hist,
    lr_global,
    lr_perf,
    tree_global,
    tree_perf,
    ebm_global,
    ebm_perf,
]
show(dashboard_explanations, share_tables=True)