## Install libraries

In [None]:
pip install shap xgboost

In [None]:
pip install plotly

## Import libraries

In [None]:
import matplotlib.pylab as pl
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
import shap

# Load SHAP's JavaScript visualization code into the notebook output; the
# interactive force plots further down need it in order to render.
shap.initjs()

## Import dataset

In [None]:
# Load the adult dataset bundled with SHAP twice: the default variant (X, y)
# is what the model is trained and explained on, while the display variant
# (X_display, y_display) is used below for printing and for labelling plots.
X, y = shap.datasets.adult(display=False)
X_display, y_display = shap.datasets.adult(display=True)

In [None]:
# Show the display variant of the feature table (loaded with display=True).
print(X_display)

In [None]:
# Show the labels that were returned alongside the display features.
print(y_display)

## Create test-train split

In [None]:
# Hold out 20% of the rows for testing (fixed seed so the split is
# reproducible), then wrap each split in the DMatrix container that
# xgboost.train / Booster.predict consume.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)
d_train, d_test = (
    xgboost.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Dimensions of the full feature matrix, before splitting.
print(X.shape)

In [None]:
# Show the training portion of the features.
print(X_train)

In [None]:
# Dimensions of the training split (test_size=0.2 leaves roughly 80% of the rows).
print(X_train.shape)

## Define parameters and train the XGBoost model

In [None]:
# Early stopping needs a monitoring set. Carve a validation split out of the
# training data for that purpose so the test set stays untouched until the
# accuracy check: stopping on d_test would tune the number of trees on the
# very rows that are later used to report accuracy, making that figure
# optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=7
)
d_fit = xgboost.DMatrix(X_fit, label=y_fit)
d_val = xgboost.DMatrix(X_val, label=y_val)

params = {
    "eta": 0.01,  # learning rate
    "objective": "binary:logistic",
    "subsample": 0.5,  # row subsampling ratio per boosting round
    # Start boosting from the positive-class rate of the rows being fitted.
    "base_score": float(np.mean(y_fit)),
    "eval_metric": "logloss",
}
model = xgboost.train(
    params,
    d_fit,
    num_boost_round=5000,  # upper bound on rounds; early stopping may end sooner
    evals=[(d_val, "validation")],
    verbose_eval=100,  # report the validation metric every 100 rounds
    early_stopping_rounds=20,  # stop after 20 rounds without improvement
)

## Print accuracy

In [None]:
from sklearn.metrics import accuracy_score

# The model was trained with early stopping, but the native Booster.predict
# uses the full model (including the rounds grown after the best one) unless
# told otherwise. Restrict prediction to the best iteration so the reported
# accuracy reflects the model early stopping actually selected.
best_rounds = model.best_iteration + 1
y_pred_prob = model.predict(d_test, iteration_range=(0, best_rounds))

# Turn predicted probabilities into hard 0/1 labels with a 0.5 threshold.
y_pred = (y_pred_prob > 0.5).astype(int)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

## Weight metric

In [None]:
# "weight" importance: the number of times each feature is used to split the
# data across all trees.
weight_ax = xgboost.plot_importance(model, importance_type="weight")
weight_ax.set_title("xgboost.plot_importance(model)")
pl.show()

## Cover metric

In [None]:
# "cover" is the average coverage over all splits that use the feature, where
# a split's coverage is the sum of the second-order gradients (hessians) of
# the training rows that reach it. That sum equals a plain row count only for
# squared-error loss; with the binary:logistic objective each row contributes
# p * (1 - p), so cover is a hessian-weighted measure rather than a number of
# samples. It also does not say how often a feature is used -- that is what
# the "weight" importance type reports.

xgboost.plot_importance(model, importance_type="cover")
pl.title('xgboost.plot_importance(model, importance_type="cover")')
pl.show()

## Gain metric

In [None]:
# "gain" is the average gain over all splits that use the feature, i.e. the
# average reduction in the training objective (log loss for binary:logistic)
# achieved by those splits. Accuracy is not an objective XGBoost optimises,
# so gain quantifies a feature's contribution to lowering the loss, not a
# direct change in accuracy.

xgboost.plot_importance(model, importance_type="gain")
pl.title('xgboost.plot_importance(model, importance_type="gain")')
pl.show()

## SHAP explainer

In [None]:
# Build a tree explainer for the trained booster and compute SHAP values for
# every row of X (training and test rows together).
# Original author's note: this takes 5-6 minutes since it explains over 30
# thousand samples in a model with over a thousand trees.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

## SHAP force plot

In [None]:
import plotly.io as pio

# Make 'colab' the default Plotly renderer.
# NOTE(review): nothing in this cell draws with Plotly; the force plot below
# comes from SHAP itself -- confirm whether this setting is still needed.
pio.renderers.default = 'colab'

# Emit SHAP's JS visualization code into this cell's output so the force plot
# can render.
shap.initjs()

# Explain a single prediction: the row at position 100. The display features
# supply the values shown on the plot.
sample_idx = 100
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx],
    X_display.iloc[sample_idx],
)

In [None]:
# Make 'colab' the default Plotly renderer (relies on `pio` having been
# imported by an earlier cell).
pio.renderers.default = 'colab'

# Emit SHAP's JS visualization code into this cell's output so the force plot
# can render.
shap.initjs()

# Explain the first 1000 rows together in one force plot.
n_rows = 1000
shap.force_plot(
    explainer.expected_value,
    shap_values[:n_rows],
    X_display.iloc[:n_rows],
)

## SHAP summary plot

In [None]:
# Bar-type summary of the SHAP values; X_display supplies the feature table.
shap.summary_plot(shap_values, X_display, plot_type="bar")

In [None]:
# Default summary plot of the same SHAP values, this time paired with the
# model-input features X rather than the display variant.
shap.summary_plot(shap_values, X)

## SHAP dependence plot

In [None]:
# One dependence plot per feature column: SHAP values are paired with the
# model-input features X, while X_display provides the values used for display.
for feature_name in X_train.columns:
    shap.dependence_plot(
        feature_name,
        shap_values,
        X,
        display_features=X_display,
    )