# SHAP with XGBoost

In this notebook, we will use the Titanic dataset to predict the survival of passengers using XGBoost. We will use SHAP to explain the predictions of the models.

### Install and import the necessary libraries

In [None]:
# Install the necessary libraries

# !pip install -q dalex xgboost shap

In [1]:
# Import the necessary libraries
import dalex as dx
import xgboost as xgb
import shap

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

### Load the Titanic dataset

In [2]:
df = dx.datasets.load_titanic()
df.head()

Unnamed: 0,gender,age,class,embarked,fare,sibsp,parch,survived
0,male,42.0,3rd,Southampton,7.11,0,0,0
1,male,13.0,3rd,Southampton,20.05,0,2,0
2,male,16.0,3rd,Southampton,20.05,1,1,0
3,female,39.0,3rd,Southampton,20.05,1,1,1
4,female,16.0,3rd,Southampton,7.13,0,0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gender    2207 non-null   object 
 1   age       2207 non-null   float64
 2   class     2207 non-null   object 
 3   embarked  2207 non-null   object 
 4   fare      2207 non-null   float64
 5   sibsp     2207 non-null   int64  
 6   parch     2207 non-null   int64  
 7   survived  2207 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 138.1+ KB


The columns labelled 'object' as their dtype are categorical columns. We can convert them into a 'category' dtype so that they can be used in the model.

In [4]:
df.loc[:, df.dtypes == 'object'] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gender    2207 non-null   object 
 1   age       2207 non-null   float64
 2   class     2207 non-null   object 
 3   embarked  2207 non-null   object 
 4   fare      2207 non-null   float64
 5   sibsp     2207 non-null   int64  
 6   parch     2207 non-null   int64  
 7   survived  2207 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 138.1+ KB


### Split the data into features and target

We will also use the 'get_dummies' method to convert gender to a 0-1 encoding.

In [None]:
X = df.drop('survived', axis=1)
y = df['survived']

X = pd.get_dummies(X, columns=['gender'], drop_first=True)

In [None]:
X.head()

### Split the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Create our XGBoost model

In [None]:
model = xgb.XGBClassifier(
    n_estimators = 200,        # Number of trees to fit
    max_depth = 4,             # Maximum tree depth for individual trees
    use_label_encoder = False, # Leave this as False to avoid warnings
    enable_categorical = True,  # Leave this as True to use categorical columns
    tree_method = 'hist'       # Use a histogram-based method for faster training
)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

### Explain with Dalex

In [None]:
def pf_xgboost_classifier_categorical(model, df):
    df.loc[:, df.dtypes == 'object'] = \
        df.select_dtypes(['object']) \
            .apply(lambda x: x.astype('category'))
    return model.predict_proba(df)[:, 1]

explainer = dx.Explainer(model, X, y, predict_function=pf_xgboost_classifier_categorical, label='XGBoost')

In [None]:
explainer.model_performance()

In [None]:
explainer.predict(X_test[0:10])

In [None]:
shap_attributions = [explainer.predict_parts(X.iloc[[i]], type="shap", label=f'passenger {i}') for i in range(5)]

In [None]:
shap_attributions[0].plot(shap_attributions[1::])

In [None]:
bd_attributions = [explainer.predict_parts(X.iloc[[i]], type="break_down", label=f'passenger {i}') for i in range(5)]

In [None]:
bd_attributions[0].plot(bd_attributions[1::])

### Explain with SHAP

In [None]:
# Need to convert the categorical columns to one-hot encoding
X_ohe = pd.get_dummies(X, columns=['class', 'embarked'], drop_first=True)

In [None]:
X_ohe.head()

In [None]:
X_train_ohe, X_test_ohe, y_train, y_test = train_test_split(X_ohe, y, test_size=0.2, random_state=42)

model_ohe = xgb.XGBClassifier(
    n_estimators = 200,        # Number of trees to fit
    max_depth = 4,             # Maximum tree depth for individual trees
    use_label_encoder = False, # Leave this as False to avoid warnings
    tree_method = 'hist'       # Use a histogram-based method for faster training
)

In [None]:
model_ohe.fit(X_train_ohe, y_train)

In [None]:
model_ohe.score(X_test_ohe, y_test)

In [None]:
explainer_ohe = shap.explainers.Tree(model_ohe, data=X_train_ohe, model_output='probability')

In [None]:
shap_values = explainer_ohe.shap_values(X_test_ohe)

In [None]:
shap.summary_plot(shap_values, X_test_ohe)

In [None]:
for i in range(5):
    shap.force_plot(explainer_ohe.expected_value, shap_values[i], X_test_ohe.iloc[i], matplotlib=True)