In [None]:
import os, warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import urllib.request as request

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

In [None]:
!pip install lime

In [None]:
import lime
import lime.lime_tabular

In [None]:
# UCI Adult Dataset Download
# https://archive.ics.uci.edu/ml/datasets/Adult
# Direct link to the raw `adult.data` file; it is retrieved into the local
# data folder by a later cell.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

In [None]:
# Downloaded files live in a "data" folder under the current working directory.
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'data')
if not os.path.isdir(data_dir):
    # exist_ok keeps this safe if the folder appears between the check and the call.
    os.makedirs(data_dir, exist_ok=True)

In [None]:
# Local path of the cached dataset file.
save_fname = os.path.join(data_dir, 'adult.data')

# Only hit the network when the file is not cached yet, so Restart & Run All
# does not re-download it every time.
if not os.path.exists(save_fname):
    # Download to a temporary name first and rename afterwards: an interrupted
    # download then never leaves a truncated file at the final path.
    tmp_fname = save_fname + '.part'
    request.urlretrieve(url, tmp_fname)
    os.replace(tmp_fname, save_fname)

In [None]:
# Names assigned to the columns of the header-less CSV when it is loaded.
cols = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

In [None]:
# Load the header-less CSV using the column names defined above.
# The raw file separates fields with ", " (comma + space); skipinitialspace
# drops that blank so string values and class labels are read as "Private" /
# "<=50K" instead of " Private" / " <=50K".
df = pd.read_csv(save_fname, header=None,
                 names=cols, skipinitialspace=True)

In [None]:
# Split the columns of `df` into three groups:
#   col_target      - the 'Income' label column
#   col_numerical   - columns with any numeric dtype
#   col_categorical - columns holding text (object or string dtype)
col_categorical = list()
col_numerical = list()
col_target = list()

for col, type_ in df.dtypes.items():
    if col == 'Income':
        col_target.append(col)
    # Use pandas' dtype predicates instead of comparing the dtype's string form:
    # an exact match on 'int64' / 'object' breaks when pandas infers a dedicated
    # string dtype or a different integer/float width.
    elif pd.api.types.is_numeric_dtype(type_):
        col_numerical.append(col)
    elif pd.api.types.is_object_dtype(type_) or pd.api.types.is_string_dtype(type_):
        col_categorical.append(col)
    else:
        raise ValueError(f"Unsupported dtype {type_} for column {col!r}")

print(f"X_Features(Categorical): \n {col_categorical} \n")
print(f"X_Features(Numercial): \n {col_numerical} \n")
print(f"Y_Features(Categorical): \n {col_target}")

In [None]:
# Show the collected target column name(s) as the cell output.
col_target

In [None]:
# Replace the target column's labels with integer codes and keep the original
# class labels around (they are passed to the explainer as class names later).
scaler_LB = LabelEncoder().fit(df[col_target[0]])
df[col_target[0]] = scaler_LB.transform(df[col_target[0]])
df_target_class = scaler_LB.classes_
print(df_target_class.ravel())

In [None]:
# Preview of the categorical columns cast to str. The result is only shown as
# the cell output; it is not assigned, so `df` itself is left unchanged.
df[col_categorical].astype(str)

In [None]:
# Integer-encode every categorical column and remember, per column, the array
# of original labels (position in the array == integer code).
categorical_names = {}
for col in col_categorical:
    scaler_LB = LabelEncoder()
    df[col] = scaler_LB.fit_transform(df[col])
    categorical_names[col] = scaler_LB.classes_

# One-hot encode the integer codes (cast to str so each code becomes its own
# dummy column, named "<column>_<code>").
# dtype is pinned explicitly: pandas >= 2.0 defaults to bool dummies, and a frame
# mixing bool and int64 columns yields an object-dtype array from `.values`,
# which the explainers further down are not meant to receive.
cat_dummy = pd.get_dummies(df[col_categorical].astype(str), dtype=np.uint8)

col_categorical_oh = cat_dummy.columns.tolist()

# Keep numerical + target columns and attach the dummies by index
# (DataFrame.join is a left join on the index by default).
df = df[col_numerical + col_target].join(cat_dummy)

In [None]:
# 80/20 train/test split of features (numerical + one-hot columns) and target.
# random_state is fixed (same seed as the model) so the split - and therefore the
# accuracy and every explanation computed from it - is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[col_numerical + col_categorical_oh],
    df[col_target],
    train_size=0.80,
    random_state=20220713,
)

In [None]:
# A bare expression in the middle of a cell is not rendered; display() makes the
# preview of the training features actually show up.
display(X_train.head())

# Gradient-boosting classifier with a fixed seed for reproducible fits.
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=20220713)
# y_train may be a one-column DataFrame; flatten it to the 1-D label array
# scikit-learn expects instead of relying on its implicit conversion.
model_gb.fit(X_train, np.ravel(y_train))

In [None]:
# Accuracy of the fitted model on the held-out test split.
accuracy_score(
    y_true=y_test,
    y_pred=model_gb.predict(X_test),
)

In [None]:
def predict_fn(x):
    """Return the class probabilities predicted by `model_gb` for the rows in `x`."""
    return model_gb.predict_proba(x)

In [None]:
# LIME expects the *positional indices* of the categorical columns, not their
# names. When names are passed, no index matches them, so every column -
# including the 0/1 dummies - is treated as continuous and quartile-discretized.
categorical_idx = [X_train.columns.get_loc(name) for name in col_categorical_oh]

explainer = lime.lime_tabular.LimeTabularExplainer(
    # Plain float matrix, independent of the dtype the dummy columns carry.
    training_data=X_train.to_numpy(dtype=float),
    feature_names=X_train.columns.tolist(),
    categorical_features=categorical_idx,
    class_names=df_target_class,
    kernel_width=3,
    # Seed LIME's sampling so explanations are reproducible (same seed as the model).
    random_state=20220713,
)

In [None]:
# Positional index of the test row to explain.
i = 2
# Pass the row as a float vector so its dtype does not depend on how the dummy
# columns are stored (a mixed bool/int frame would otherwise give an object row).
exp = explainer.explain_instance(X_test.iloc[i].to_numpy(dtype=float), predict_fn, num_features=5)
exp.show_in_notebook(show_all=False)

In [None]:
!pip install shap

In [None]:
import shap
shap.initjs()

In [None]:
# Build a SHAP tree explainer for the fitted gradient-boosting model, passing
# the training features as `data`.
# NOTE(review): this rebinds `explainer`, which held the LIME explainer until now.
# NOTE(review): the full training frame is used as background data - this may be
# slow; consider a subsample (confirm against the SHAP documentation).
explainer = shap.TreeExplainer(model=model_gb, data=X_train)

In [None]:
# Call the explainer on the held-out features; the result is the object the
# plotting cells below consume.
shap_values = explainer(X_test)

In [None]:
# Waterfall plot for a single test row (positional index 1).
shap.plots.waterfall(shap_values[1])

In [None]:
# Force plot for the first test row: the explainer's expected value, that row's
# SHAP values, and the row's feature values.
shap.plots.force(
    explainer.expected_value,
    shap_values.values[0],
    features=X_test.iloc[0],
)


In [None]:
# Beeswarm plot over the SHAP values of all explained test rows.
shap.plots.beeswarm(shap_values)

In [None]:
# Bar plot built from the same SHAP values
# (presumably a per-feature aggregate - confirm against the SHAP documentation).
shap.plots.bar(shap_values)

In [None]:
# Draw one SHAP dependence plot per feature column, using the test features
# both as the data and as the displayed feature values.
# NOTE(review): after one-hot encoding this loops over every dummy column, so it
# can produce a large number of figures.
for name in X_train.columns:
    shap.dependence_plot(name, shap_values.values, X_test, display_features=X_test)