# CP and PDP with XGBoost

In [None]:
import dalex as dx
import xgboost

import sklearn

import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Imported explicitly: a bare `import sklearn` does not by itself load the
# `sklearn.model_selection` submodule, so attribute access on it only works
# when some other package happens to have imported it first.
from sklearn.model_selection import train_test_split

df = dx.datasets.load_titanic()

# Features are every column except the target `survived`.
X = df.drop(columns="survived")
# One-hot encode the three categorical columns; `drop_first=True` drops one
# level per variable (this is what creates e.g. the `gender_male` column).
X = pd.get_dummies(X, columns=["gender", "class", "embarked"], drop_first=True)
y = df.survived

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Hyperparameters for a deliberately small booster (50 rounds, depth-2 trees),
# kept in one place so they are easy to copy and tweak.
xgb_params = {
    "n_estimators": 50,
    "max_depth": 2,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "enable_categorical": True,
    "tree_method": "hist",
}
model = xgboost.XGBClassifier(**xgb_params)

# Train on the training split only; the test split is kept for the explainer.
model.fit(X_train, y_train)

In [None]:
def pf_xgboost_classifier_categorical(model, df):
    """Return the positive-class probability for every row of ``df``.

    Columns of dtype ``object`` are cast to ``category`` before the frame is
    handed to the model. The cast is performed on a new frame, so the
    caller's ``df`` is never modified.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``.
    df : pandas.DataFrame
        Observations to score.

    Returns
    -------
    The second column (index 1) of ``model.predict_proba``'s output.
    """
    object_columns = df.columns[df.dtypes == "object"]
    # `astype` with a column->dtype mapping returns a new frame. Assigning the
    # converted columns back through `df.loc[:, mask] = ...` would write into
    # the caller's frame and, on pandas >= 2.0, keep the `object` dtype.
    prepared = df.astype({name: "category" for name in object_columns})
    return model.predict_proba(prepared)[:, 1]

# Wrap the fitted model together with the held-out split; all explanations
# below are computed through the custom predict function.
explainer = dx.Explainer(
    model,
    data=X_test,
    y=y_test,
    predict_function=pf_xgboost_classifier_categorical,
)

In [None]:
# Performance summary of `model`, computed by the explainer (which was built
# from X_test / y_test).
explainer.model_performance()

In [None]:
# `.result` exposes the model_parts() output as a table.
# NOTE(review): with default arguments this is presumably dalex's
# permutation-based variable importance — confirm against the dalex docs.
explainer.model_parts().result

### Ceteris Paribus

Now we will calculate the Ceteris Paribus profiles for a single observation: the row at position 400 of `X`.

See the API documentation for all possible parameters:

- [Explainer.predict_profile](https://dalex.drwhy.ai/python/reference/dalex.explainer.Explainer.html#dalex.explainer.Explainer.predict_profile)
- [CeterisParibus](https://dalex.drwhy.ai/python/reference/dalex.cp.CeterisParibus.html)
- [CeterisParibus.plot](https://dalex.drwhy.ai/python/reference/dalex.cp.CeterisParibus.html#dalex.cp.CeterisParibus.plot)

In [None]:
# `X.iloc[[400]]` (double brackets) selects the row at position 400 of the
# full feature table as a one-row DataFrame rather than a Series.
cp = explainer.predict_profile(new_observation=X.iloc[[400]])

In [None]:
# Plot the profile only for the two listed variables.
cp.plot(variables=["age", "sibsp"])

We can also calculate the profiles for multiple observations at once.

In [None]:
# Positions 400..409 of X (the slice end is exclusive): ten observations.
cp_10 = explainer.predict_profile(new_observation=X.iloc[400:410])
# Same two variables as for the single-observation plot.
cp_10.plot(variables=["age", "sibsp"])

### Partial Dependence Plots

Now we will calculate Partial Dependence profiles, i.e. Ceteris Paribus profiles averaged over many observations, and plot them.

In [None]:
# Model-level profiles with dalex's default settings, computed on the
# explainer's data (X_test).
pdp = explainer.model_profile()

In [None]:
# Inspect the computed profiles as a table before plotting.
pdp.result

In [None]:
# Plot the aggregated profiles for the two listed variables only.
pdp.plot(variables=["age", "fare"])

In [None]:
# Same variables, but with geom="profiles" so that (per the title) the
# individual profiles are drawn together with the aggregated one.
pdp.plot(variables=["age", "fare"], geom="profiles", title="Partial Dependence Plot with individual profiles")

We can compare the profiles for different groups, such as by recorded gender:

In [None]:
# `gender_male` is the dummy column created by pd.get_dummies(..., drop_first=True)
# when the features were built; profiles are computed separately per its value.
pdp_grouped = explainer.model_profile(groups="gender_male")

In [None]:
# Plot the grouped profiles for the two listed variables.
pdp_grouped.plot(variables=["age", "fare"], title="PDP")

### Comparing Models

Create a larger model with more trees (a higher `n_estimators`) by copying the model code from above. Fit it on the training data and name it `model_large`.

In [None]:
### Your code here

In [None]:
# Second explainer for the larger model: same held-out data and predict
# function as the first one, with an explicit label so the two models can be
# told apart, and the construction log silenced.
explainer_large = dx.Explainer(
    model_large,
    data=X_test,
    y=y_test,
    predict_function=pf_xgboost_classifier_categorical,
    label="Larger XGBoost",
    verbose=False,
)

Now we can compare the two models.

In [None]:
# Stack the two performance tables on top of each other (one per explainer)
# so the models can be compared side by side.
pd.concat([explainer.model_performance().result, explainer_large.model_performance().result])

In [None]:
# Model-level profiles for the larger model, with the same default settings
# used for `pdp`.
pdp_large = explainer_large.model_profile()

In [None]:
# The first model's profiles (`pdp`) are passed as the first positional
# argument to plot().
# NOTE(review): presumably this overlays both models on the same axes —
# confirm against the dalex plot() reference.
pdp_large.plot(pdp, variables=["age", "fare"], title="PDP")