In [1]:
## Setup a classification experiment
# Loads the data file, binarises the response, and builds an independent
# 80/20 train/test split for each of the two `sex` cohorts
# (sex == 0 -> *f variables, sex == 1 -> *m variables).

import pandas as pd
from sklearn.model_selection import train_test_split

seed = 1  # random_state shared by the splits below and by the models in later cells

df = pd.read_csv(
    'processed.cleveland.data',
    header=None)
# The file has no header row, so column names are assigned by position.
# NOTE(review): the UCI Cleveland file is commonly reported to mark missing
# values with '?'; if that holds here, `ca`/`thal` load as strings and '?'
# becomes its own category downstream -- confirm whether that is intended.
df.columns = [
    "age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang",
    "oldpeak","slope","ca","thal","disease"
]
train_cols = df.columns[0:-1]   # every column except the last is a feature
label = df.columns[-1]          # the last column is the response
X = df[train_cols]
# Turning response into 0 and 1: 0 stays 0, any other value becomes 1.
# Vectorised comparison instead of a per-row Python lambda.
y = df[label].ne(0).astype('int64')

# Compute each cohort mask once and reuse it for features and labels so the
# two selections cannot drift apart.
mask_f = X['sex'] == 0
mask_m = X['sex'] == 1

Xf, yf = X.loc[mask_f], y.loc[mask_f]
Xm, ym = X.loc[mask_m], y.loc[mask_m]

Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, test_size=0.20, random_state=seed)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.20, random_state=seed)

In [13]:
## Explore the dataset
# Build a ClassHistogram explanation of the training split for each cohort,
# then render them in the same order as before (f first, then m).

from interpret import show
from interpret.data import ClassHistogram
from interpret.perf import ROC

histf = ClassHistogram().explain_data(Xf_train, yf_train, name='Train Data f')
histm = ClassHistogram().explain_data(Xm_train, ym_train, name='Train Data m')

show(histf)
show(histm)

In [3]:
## Train the Explainable Boosting Machine (EBM)
# One classifier per cohort, both created with the notebook-wide `seed`.

from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

ebmf = ExplainableBoostingClassifier(random_state=seed)
ebmm = ExplainableBoostingClassifier(random_state=seed)

# fit() works on dataframes and numpy arrays.
ebmf.fit(Xf_train, yf_train)
ebmm.fit(Xm_train, ym_train)

# NOTE(review): this expression looks like the captured repr (cell output) of
# the estimator returned by the preceding `ebmm.fit(...)` call rather than
# authored code -- its result is not assigned to anything. Confirm before
# relying on it or re-running it.
ExplainableBoostingClassifier(feature_names=['age', 'sex', 'cp', 'trestbps',
                                             'chol', 'fbs', 'restecg',
                                             'thalach', 'exang', 'oldpeak',
                                             'slope', 'ca', 'thal'],
                              feature_types=['continuous', 'categorical',
                                             'continuous', 'continuous',
                                             'continuous', 'categorical',
                                             'continuous', 'continuous',
                                             'categorical', 'continuous',
                                             'continuous', 'categorical',
                                             'categorical'],
                              random_state=1)

In [4]:
## Global Explanations: What the model learned overall
# Request the global explanation from each cohort's EBM, then render both
# (f first, then m).

ebmf_global = ebmf.explain_global(name='EBM f')
ebmm_global = ebmm.explain_global(name='EBM m')

show(ebmf_global)
show(ebmm_global)

In [5]:
## Local Explanations: How an individual prediction was made
# Explain the first five rows of each cohort's held-out split.

ebmf_local = ebmf.explain_local(Xf_test.iloc[:5], yf_test.iloc[:5], name='EBM f')
ebmm_local = ebmm.explain_local(Xm_test.iloc[:5], ym_test.iloc[:5], name='EBM m')

show(ebmf_local)
show(ebmm_local)

In [11]:
from interpret.glassbox import LogisticRegression, ClassificationTree

# We have to transform categorical variables to use Logistic Regression and Decision Tree
# Each cohort is encoded on its full frame (before splitting) so its train and
# test parts share exactly the same dummy columns.
Xf_enc = pd.get_dummies(Xf, prefix_sep='.')
Xm_enc = pd.get_dummies(Xm, prefix_sep='.')
feature_namesf = list(Xf_enc.columns)
feature_namesm = list(Xm_enc.columns)

# Reuse the row partition already made for the EBMs instead of re-running
# train_test_split: selecting by the existing split indices guarantees the
# encoded frames line up row-for-row with yf_train/yf_test/ym_train/ym_test
# and avoids overwriting those label variables from this cell.
Xf_train_enc, Xf_test_enc = Xf_enc.loc[Xf_train.index], Xf_enc.loc[Xf_test.index]
Xm_train_enc, Xm_test_enc = Xm_enc.loc[Xm_train.index], Xm_enc.loc[Xm_test.index]

# L1-penalised logistic regression, one model per cohort.
lrf = LogisticRegression(random_state=seed, feature_names=feature_namesf, penalty='l1', solver='liblinear')
lrm = LogisticRegression(random_state=seed, feature_names=feature_namesm, penalty='l1', solver='liblinear')
lrf.fit(Xf_train_enc, yf_train)
lrm.fit(Xm_train_enc, ym_train)

# Seed the trees like every other model in the notebook so that re-running
# the cell reproduces the same fitted trees.
treef = ClassificationTree(random_state=seed)
treem = ClassificationTree(random_state=seed)
treef.fit(Xf_train_enc, yf_train)
treem.fit(Xm_train_enc, ym_train)

treef_global = treef.explain_global(name='Classification Tree f')
treem_global = treem.explain_global(name='Classification Tree m')

show(treef_global)
show(treem_global)

In [16]:
# ROC performance explanations on the held-out encoded split, one per
# model (logistic regression, classification tree) and cohort (f, m).
lrf_perf = ROC(lrf.predict_proba).explain_perf(Xf_test_enc, yf_test, name='Logistic Regression f')
lrm_perf = ROC(lrm.predict_proba).explain_perf(Xm_test_enc, ym_test, name='Logistic Regression m')
treef_perf = ROC(treef.predict_proba).explain_perf(Xf_test_enc, yf_test, name='Classification Tree f')
treem_perf = ROC(treem.predict_proba).explain_perf(Xm_test_enc, ym_test, name='Classification Tree m')

# The m-cohort results were originally bound to `*_perm`, inconsistent with
# the `*_perf` names used for the f cohort. The old names are kept as aliases
# so any other cell that still refers to them keeps working.
lrm_perm = lrm_perf
treem_perm = treem_perf

show(lrf_perf)
show(lrm_perf)
show(treef_perf)
show(treem_perf)