In [0]:
!pip install shap


In [0]:
import pandas as pd
import numpy as np
import model_evaluation_utils as meu #es necesario tener el módulo model_evaluation_utils.py
import matplotlib.pyplot as plt
from collections import Counter
import shap



In [0]:
# Silence all Python warnings so library warnings do not clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore')
# Use matplotlib's 'fivethirtyeight' style sheet for the figures in this notebook.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [0]:
# Load the well-known US census ("adult") dataset, already cleaned, as shipped
# with shap. The dataset definition is available at
# https://archive.ics.uci.edu/ml/datasets/adult
data, labels = shap.datasets.adult(display=True)
# Convert every label to a plain int and pack the result into a NumPy array.
labels = np.array(list(map(int, labels)))

print(data.shape, labels.shape)
data.head()

In [0]:
# Is the dataset balanced? Count how many samples carry each label value.
Counter(labels)

In [0]:
# Basic feature engineering: replace every categorical column with its integer
# category codes so the model can be trained on numeric values only.

def _to_category_codes(column):
    """Return the integer category codes of a categorical column."""
    return column.cat.codes

cat_cols = data.select_dtypes(include=['category']).columns
encoded_cols = data[cat_cols].apply(_to_category_codes)
data[cat_cols] = encoded_cols
data.head()

In [0]:
# Build the training and test sets. Two parallel datasets are kept: one with the
# categorical values encoded (used for training) and one with the original
# values (kept so the model can be interpreted afterwards).

from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed so the partition is reproducible.
encoded_split = train_test_split(data, labels, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = encoded_split
print(X_train.shape, X_test.shape)
X_train.head(3)

In [0]:
# Load the dataset again with display=True to obtain a copy that has not been
# integer-encoded, then split it with the same test_size and random_state as the
# encoded split (presumably so both versions hold the same rows -- verify).
data_disp, labels_disp = shap.datasets.adult(display=True)
display_split = train_test_split(data_disp, labels_disp, test_size=0.3, random_state=42)
X_train_disp, X_test_disp, y_train_disp, y_test_disp = display_split
print(X_train_disp.shape, X_test_disp.shape)
X_train_disp.head(3)

In [0]:
#Entrenamos el modelo
%%time

import xgboost as xgb
xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5,
                        objective='binary:logistic', random_state=42)
xgc.fit(X_train, y_train)

In [0]:
# Predict on the held-out test set and preview the first ten predictions.
predictions = xgc.predict(X_test)
predictions[:10]

In [0]:
# Distinct label values, sorted so the class order handed to the report does not
# depend on set iteration order.
class_labels = sorted(set(labels))
# Compare the test-set predictions against the true labels using the helper from
# model_evaluation_utils.
meu.display_model_performance_metrics(true_labels=y_test, 
                                      predicted_labels=predictions, 
                                      classes=class_labels)

In [0]:
# What does the algorithm give us out of the box?
# XGBoost can report feature importance according to three criteria:
#   - Weight: number of times the feature appears in a tree across the ensemble.
#   - Gain: average gain of the splits that use the feature.
#   - Cover: number of samples affected by the splits that use the feature.
#
# As we will see, the results are not conclusive in this case, which is why a
# framework that gives us the right information becomes necessary.

fig = plt.figure(figsize=(16, 12))
title = fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)

# One panel per importance criterion, laid out on a 2x2 grid (positions 1 to 3).
importance_panels = [
    ('weight', "Feature Importance - Feature Weight"),
    ('gain', "Feature Importance - Split Mean Gain"),
    ('cover', "Feature Importance - Sample Coverage"),
]
for panel_position, (importance_kind, panel_title) in enumerate(importance_panels, start=1):
    panel_ax = fig.add_subplot(2, 2, panel_position)
    xgb.plot_importance(xgc, importance_type=importance_kind, ax=panel_ax)
    t = panel_ax.set_title(panel_title)

Uso de SKATER

In [0]:
!pip install skater

In [0]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

# Skater interpreter built over the test split, labelled with the dataset's
# column names.
skater_feature_names = list(data.columns)
interpreter = Interpretation(training_data=X_test, training_labels=y_test,
                             feature_names=skater_feature_names)

# Expose the classifier's predict_proba to Skater as an in-memory model, with
# the training split as example inputs and a readable name for each class.
skater_target_names = ['$50K or less', 'More than $50K']
im_model = InMemoryModel(xgc.predict_proba, examples=X_train,
                         target_names=skater_target_names)

In [0]:
# Feature importance computed with Skater, plotted in ascending order.
plot_skater_importance = interpreter.feature_importance.plot_feature_importance
plots = plot_skater_importance(im_model, ascending=True, n_samples=23000)

In [0]:
# Partial dependence plot (PDP) demonstration: one-way PDP for 'Age'.

r = interpreter.partial_dependence.plot_partial_dependence(
    ['Age'], im_model,
    grid_resolution=50, grid_range=(0,1), n_samples=23000,
    with_variance=True, figsize = (6, 4),
)
# r[0][1] is the object whose y-limits are set here (presumably the matplotlib
# axes of the first plot); clamp the y-axis to [0, 1].
pdp_ax = r[0][1]
yl = pdp_ax.set_ylim(0, 1)

In [0]:
# One-way partial dependence plot for 'Education-Num'.
r = interpreter.partial_dependence.plot_partial_dependence(
    ['Education-Num'], im_model,
    grid_resolution=50, grid_range=(0,1), n_samples=23000,
    with_variance=True, figsize = (6, 4),
)
# Clamp the y-axis of the first plot to [0, 1].
pdp_ax = r[0][1]
yl = pdp_ax.set_ylim(0, 1)

In [0]:

# One-way partial dependence plot for 'Capital Gain'.
r = interpreter.partial_dependence.plot_partial_dependence(
    ['Capital Gain'], im_model,
    grid_resolution=50, grid_range=(0,1),
    with_variance=True, figsize = (8, 4), n_samples=23000,
)
pdp_ax = r[0][1]
# Clamp the y-axis to [0, 1].
yl = pdp_ax.set_ylim(0, 1)
# Place an x tick every 10000 units across the current x-axis range.
s, e = pdp_ax.get_xlim()
xl = pdp_ax.set_xticks(np.arange(s, e, 10000))

In [0]:
# Put the 'Relationship' column of the display dataset next to the one of the
# encoded dataset and keep only the distinct pairs, giving a lookup between the
# original values and their integer codes.
relationship_pairs = pd.concat(
    [data_disp[['Relationship']], data[['Relationship']]],
    axis=1,
)
relationship_pairs.drop_duplicates()

In [0]:

# One-way partial dependence plot for the integer-encoded 'Relationship' feature.
r = interpreter.partial_dependence.plot_partial_dependence(
    ['Relationship'], im_model,
    grid_resolution=50, grid_range=(0,1), n_samples=23000,
    with_variance=True, figsize = (6, 4),
)
# Clamp the y-axis of the first plot to [0, 1].
pdp_ax = r[0][1]
yl = pdp_ax.set_ylim(0, 1)

In [0]:
# Two-way partial dependence plot for the feature pair ('Age', 'Education-Num').
age_education_pair = ('Age', 'Education-Num')
plots_list = interpreter.partial_dependence.plot_partial_dependence(
    [age_education_pair],
    im_model,
    grid_range=(0,1),
    n_samples=23000,
    figsize=(12, 5),
    grid_resolution=100,
)

In [0]:
# Two-way partial dependence plot for the feature pair
# ('Education-Num', 'Capital Gain').
education_gain_pair = ('Education-Num', 'Capital Gain')
plots_list = interpreter.partial_dependence.plot_partial_dependence(
    [education_gain_pair],
    im_model,
    grid_range=(0,1),
    n_samples=23000,
    figsize=(12, 5),
    grid_resolution=100,
)