In [0]:
!pip install shap


In [0]:
import pandas as pd
import numpy as np
import model_evaluation_utils as meu #es necesario tener el módulo model_evaluation_utils.py
import matplotlib.pyplot as plt
from collections import Counter
import shap




In [0]:
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline

In [0]:
# Initialise SHAP's JavaScript support. Per the SHAP docs this is presumably
# what the interactive force plots rendered further down rely on.
shap.initjs()

In [0]:
# Load a well-known, already cleaned US census dataset that ships with shap.
# The dataset description is available at https://archive.ics.uci.edu/ml/datasets/adult
data, labels = shap.datasets.adult(display=True)

# Cast every label to an integer in one vectorised step.
labels = np.asarray(labels).astype(int)

print(data.shape, labels.shape)
data.head()

In [0]:
# Is the dataset balanced? Count how many samples carry each label value.
Counter(labels)

In [0]:
# Basic feature engineering: turn categorical (string-valued) columns into numbers.

# Columns whose dtype is pandas 'category'.
cat_cols = data.select_dtypes(include=['category']).columns

# Replace each categorical column, in place, with its integer category codes.
for _cat_col in cat_cols:
    data[_cat_col] = data[_cat_col].cat.codes

data.head()

In [0]:
# Build the training and test sets. Two datasets are kept: one with the
# categorical values encoded and one with the original values, so the model can
# be trained on the encoded data while the original is kept for interpreting
# the model afterwards.

from sklearn.model_selection import train_test_split

# 70/30 split of the encoded data with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape)
X_train.head(3)

In [0]:
# Reload the dataset so we also have a copy with the original, human-readable
# values (the first copy had its categorical columns overwritten with codes).
data_disp, labels_disp = shap.datasets.adult(display=True)

# Same test_size and random_state as the encoded split, so these rows are
# intended to line up with X_train / X_test.
X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(
    data_disp,
    labels_disp,
    test_size=0.3,
    random_state=42,
)
print(X_train_disp.shape, X_test_disp.shape)
X_train_disp.head(3)

In [0]:
#Entrenamos el modelo
%%time

import xgboost as xgb
xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5,
                        objective='binary:logistic', random_state=42)
xgc.fit(X_train, y_train)

In [0]:
# Predict on the test set with the trained classifier and preview the first ten results.
predictions = xgc.predict(X_test)
predictions[:10]

# Interpretación de Modelos con SHAP

In [0]:
# Use the TreeExplainer bundled with SHAP to explain the XGBoost model.
explainer = shap.TreeExplainer(xgc)
# SHAP values for the test split; presumably one row per test sample and one
# column per feature — confirm against the SHAP version in use.
shap_values = explainer.shap_values(X_test)
print('Expected Value:', explainer.expected_value)
# Preview the first rows of the SHAP values as a DataFrame.
pd.DataFrame(shap_values).head()

In [0]:
# Force plot for the first test row, shown with the original (non-encoded)
# feature values taken from X_test_disp.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X_test_disp.iloc[0],
)

In [0]:
# Force plot for the third test row (position 2), shown with the original
# (non-encoded) feature values taken from X_test_disp.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[2],
    X_test_disp.iloc[2],
)

In [0]:
# Combined force plot over the first 1000 test rows, shown with the original
# (non-encoded) feature values taken from X_test_disp.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[:1000],
    X_test_disp.iloc[:1000],
)

In [0]:
# Feature importance: SHAP summary plot of the test-set SHAP values as a bar chart.
shap.summary_plot(shap_values, X_test, plot_type="bar")


In [0]:

# SHAP summary plot of the test-set SHAP values, using the default plot type.
shap.summary_plot(shap_values, X_test)

In [0]:
# SHAP dependence plot for 'Age', with 'Age' itself as the interaction index.
# X_test_disp is passed as display_features (original, non-encoded values).
shap.dependence_plot(
    ind='Age',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Age',
)

In [0]:

# SHAP dependence plot for 'Education-Num', with the same feature as the
# interaction index. X_test_disp is passed as display_features.
shap.dependence_plot(
    ind='Education-Num',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Education-Num',
)

In [0]:

# SHAP dependence plot for 'Relationship', with the same feature as the
# interaction index. X_test_disp is passed as display_features.
shap.dependence_plot(
    ind='Relationship',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Relationship',
)

In [0]:

# SHAP dependence plot for 'Capital Gain', with the same feature as the
# interaction index. X_test_disp is passed as display_features.
shap.dependence_plot(
    ind='Capital Gain',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Capital Gain',
)

In [0]:
# SHAP dependence plot for 'Age' with 'Capital Gain' as the interaction index.
# X_test_disp is passed as display_features (original, non-encoded values).
shap.dependence_plot(
    ind='Age',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Capital Gain',
)

In [0]:
# SHAP dependence plot for 'Education-Num' with 'Relationship' as the
# interaction index. X_test_disp is passed as display_features.
shap.dependence_plot(
    ind='Education-Num',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Relationship',
)

In [0]:

# SHAP dependence plot for 'Marital Status' with 'Relationship' as the
# interaction index. X_test_disp is passed as display_features.
shap.dependence_plot(
    ind='Marital Status',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Relationship',
)

In [0]:

# SHAP dependence plot for 'Age' with 'Hours per week' as the interaction index.
# X_test_disp is passed as display_features (original, non-encoded values).
shap.dependence_plot(
    ind='Age',
    shap_values=shap_values,
    features=X_test,
    display_features=X_test_disp,
    interaction_index='Hours per week',
)