# Environment

In [None]:
# LeafSim
from leafsim import LeafSim, SUPPORTED_MODELS

# Helper function for the toy example
from utils import apply_formatting

# Models currently supported by LeafSim
# from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Dataset used in the example
from sklearn.datasets import load_iris

# To perform train - test splits
from sklearn.model_selection import train_test_split

# Useful libraries
import numpy as np
import pandas as pd

# Useful function
from IPython.display import display

In [None]:
# Make the example reproducible
# (seeds NumPy's global random number generator)
np.random.seed(42)

# Data

In [None]:
# Load the Iris dataset; as_frame=True makes the features and the target
# available as pandas objects (they are used as such in the next cell)
data = load_iris(as_frame=True)

In [None]:
# Features (pandas dataframe) and target (pandas series)
X, y = data["data"], data["target"]

# Column names: the features, the target, and the column
# that will hold the model's predictions
feature_cols = data["feature_names"]
target_col = y.name
predicted_col = f"Predicted{target_col}"

In [None]:
# Split into train and test data:
# 20% of the rows are held out for testing, and the fixed
# random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=46
)

In [None]:
# Number of observations in the train and in the test set
len(X_train), len(X_test)

# Model

In [None]:
# Models that LeafSim currently supports.
# For your own custom model,
# see the "Custom model" section at the end of the notebook
SUPPORTED_MODELS

In [None]:
# Fit a random forest on the training split
model = RandomForestClassifier(
    n_estimators=50,
    min_samples_leaf=4,
    random_state=46,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [None]:
# Store the predictions as series.
# Note: no index is passed, so (assuming model.predict returns a plain
# array) these series get a default 0..n-1 index instead of the index of
# X_train / X_test. They are therefore accessed by position
# (.iloc / .values) further down rather than aligned by label.
y_hat_train = pd.Series(model.predict(X_train))
y_hat_test = pd.Series(model.predict(X_test))

# Apply LeafSim

In [None]:
# Wrap the trained model in a LeafSim instance
leafsim_instance = LeafSim(model)

In [None]:
# For every test observation, get the most similar training
# observations (their ids) together with the similarity scores
top_n_ids, top_n_similarities = leafsim_instance.generate_explanations(
    X_train[feature_cols], X_test[feature_cols]
)

In [None]:
# One row for each observation in the test set.
# By default, the top 10 most similar observations
# in the training set are found
top_n_ids.shape

In [None]:
# One row for each observation in the test set.
# By default, these are the similarity scores of the top 10
# most similar observations
top_n_similarities.shape

## Example where model performs poorly

In [None]:
def showcase(idx_to_explain: int):
    """
    Display one test observation, followed by the training observations
    that are most similar to it.

    :param idx_to_explain:
        Integer location of row in test feature dataframe and target series
    """
    # Single-row frame with the test observation we want to explain,
    # extended with the model's prediction and the true target
    query_row = X_test.iloc[idx_to_explain].to_frame().T
    query_row[predicted_col] = y_hat_test.iloc[idx_to_explain]
    query_row[target_col] = y_test.iloc[idx_to_explain]
    display(query_row)

    # Training observations the model relies on most
    # when predicting this example
    neighbour_ids = top_n_ids[idx_to_explain]
    neighbours = X_train.iloc[neighbour_ids].copy()
    # Predictions are taken by position (.values), the true targets
    # are aligned on the index shared with X_train
    neighbours[predicted_col] = y_hat_train.iloc[neighbour_ids].values
    neighbours[target_col] = y_train.iloc[neighbour_ids]
    neighbours["similarity"] = top_n_similarities[idx_to_explain]

    def one_decimal(x):
        # Thousands separator, one digit after the decimal point
        return f"{x:,.1f}"

    # Same formatter for every column of the neighbours frame
    formatting = dict.fromkeys(neighbours.columns, one_decimal)

    neighbours_without_prediction = neighbours.drop(columns=[predicted_col])
    display(
        apply_formatting(query_row, neighbours_without_prediction, formatting)
    )

In [None]:
# First test observation that the model misclassifies.
# np.argmax returns 0 when the mask holds no True value, which would
# silently showcase a correctly classified observation, so make sure
# there is at least one misclassification before looking it up.
misclassified = y_test != y_hat_test.values
if misclassified.any():
    idx_to_explain = np.argmax(misclassified)
    showcase(idx_to_explain)
else:
    print("No misclassified test observation to showcase.")

## Example where model performs well

In [None]:
# First test observation that the model classifies correctly.
# np.argmax returns 0 when the mask holds no True value, which would
# silently showcase a misclassified observation, so make sure
# there is at least one correct prediction before looking it up.
correctly_classified = y_test == y_hat_test.values
if correctly_classified.any():
    idx_to_explain = np.argmax(correctly_classified)
    showcase(idx_to_explain)
else:
    print("No correctly classified test observation to showcase.")

## Custom model

LeafSim does not currently support XGBoost by default. To use it, or any other ensemble model, adapt how the LeafSim instance is initialised:

In [None]:
from xgboost import XGBRegressor

In [None]:
# Create a model wrapper
# The custom model really can be any type of instance
# that makes sense for LeafSim
# All that it requires is for it to have two attributes:
#   - get_leaf_indices (see next cell)
# NOTE(review): only one attribute is listed here -- confirm whether
# a second one is actually required.
# CustomModel is just another name for the XGBRegressor class itself
# (no subclass is created), so an attribute set on CustomModel is also
# set on XGBRegressor.
CustomModel = XGBRegressor

In [None]:
# Every custom model passed to LeafSim needs a "get_leaf_indices" attribute.
# It is a function that takes X of shape (n_data, n_features)
# and provides a matrix of shape (n_data, n_predictors),
# where the predictors are the individual predictors in the ensemble (trees)
# and the values are the leaves.
CustomModel.get_leaf_indices = XGBRegressor.apply

In [None]:
# Verify that the model wrapper indeed has the desired attribute
# (the cell output shows the function that was attached)
CustomModel.get_leaf_indices

In [None]:
# Fit an XGBoost regressor with default settings on the training split
model = XGBRegressor()
model.fit(X_train, y_train)

In [None]:
# Hand the fitted custom model to LeafSim, exactly as for the
# random forest earlier. (Calling CustomModel(model) would invoke the
# XGBRegressor constructor instead of building a LeafSim instance,
# and the result would have no generate_explanations method.)
leafsim_instance = LeafSim(model)

In [None]:
# Generate the explanations as before, this time with the
# instance created from the XGBoost model
top_n_ids, top_n_similarities = leafsim_instance.generate_explanations(
    X_train[feature_cols], X_test[feature_cols]
)

...

# Further resources

For a more comprehensive usage example, please refer to this blog post [https://datascience.ch/leafsim/](https://datascience.ch/leafsim/) and the corresponding [notebook](https://renkulab.io/projects/lucas.chizzali/leafsim/files/blob/notebooks/LeafSim.ipynb).