In [19]:
# standard data science packages
import numpy as np
import pandas as pd

# imodels imports
from imodels.tree.rf_plus.rf_plus.rf_plus_models import \
    RandomForestPlusRegressor, RandomForestPlusClassifier
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import \
    RFPlusMDI, AloRFPlusMDI

# functions for subgroup experiments
from subgroup_detection import *
from subgroup_experiment import *
import shap

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, \
    accuracy_score, r2_score, f1_score, log_loss, root_mean_squared_error

# pipeline imports
from subgroup import *

In [20]:
import lime
def get_lime(X, rf, task, training_data=None, random_state=None):
    """Compute per-sample LIME feature attributions for a fitted model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to explain, one explanation per row.
    rf : fitted estimator
        Must expose ``predict_proba`` when ``task == 'classification'`` and
        ``predict`` otherwise.
    task : str
        Passed to LIME as ``mode``; ``'classification'`` selects
        ``rf.predict_proba``, any other value selects ``rf.predict``.
    training_data : array-like of shape (n_train, n_features), optional
        Background data used to build the LIME explainer. When omitted, the
        notebook-level ``X_train`` is used, which matches the previous
        behavior (and raises ``NameError`` if that global is not defined).
    random_state : int or numpy RandomState, optional
        Forwarded to ``LimeTabularExplainer`` so the perturbation sampling
        can be made reproducible. ``None`` keeps LIME's default behavior.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features)
        ``result[i, j]`` is the LIME weight of feature ``j`` for sample ``i``.
        Features LIME does not report for a sample are left at 0.
    """
    # `import lime` alone does not load the `lime_tabular` submodule, so import
    # it explicitly instead of relying on another module having done so.
    from lime import lime_tabular

    if training_data is None:
        # Backward-compatible fallback to the notebook global; prefer passing
        # `training_data` explicitly to avoid depending on hidden kernel state.
        training_data = X_train

    # Accept DataFrames as well as arrays: the row loop below needs positional
    # indexing.
    X = np.asarray(X)
    training_data = np.asarray(training_data)
    n_samples, num_features = X.shape

    explainer = lime_tabular.LimeTabularExplainer(training_data,
                                                  verbose=False,
                                                  mode=task,
                                                  random_state=random_state)

    # The prediction function does not change across rows; pick it once.
    predict_fn = rf.predict_proba if task == 'classification' else rf.predict

    result = np.zeros((n_samples, num_features))
    for i in range(n_samples):
        exp = explainer.explain_instance(X[i, :], predict_fn,
                                         num_features=num_features)
        # as_map()[1] is a list of (feature_index, weight) pairs in arbitrary
        # order. Writing by feature index (rather than by sorted position)
        # stays correct even if fewer than `num_features` pairs come back.
        for feature_idx, weight in exp.as_map()[1]:
            result[i, feature_idx] = weight
    return result

In [21]:
# experiment configuration
seed = 1  # used as random_state for the train/test split
clustertype = "hierarchical"

# candidate dataset ids for get_openml_data; this run uses the first entry
dataids = [
    361247, 361243, 361242, 361251, 361253,
    361260, 361259, 361256, 361254, 361622,
]
dataid = dataids[0]

In [22]:
# load the dataset selected in the configuration cell
X, y = get_openml_data(dataid)

# hold out 30% of the samples for testing; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)

# a response with exactly two distinct values is treated as classification,
# anything else as regression
task = 'classification' if len(np.unique(y)) == 2 else 'regression'

# fit the prediction models on the training split
rf, rf_plus_baseline, rf_plus = fit_models(X_train, y_train, task)

  X, y = get_openml_data(dataid)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.6min finished


In [23]:
# SHAP: one tree explainer on the fitted forest, reused for both splits
shap_explainer = shap.TreeExplainer(rf)
shap_train_values, shap_train_rankings = get_shap(
    X_train, shap_explainer, task
)
shap_test_values, shap_test_rankings = get_shap(
    X_test, shap_explainer, task
)

# LIME: explain the train split first, then the test split
lime_train_values, lime_test_values = (
    get_lime(split, rf, task) for split in (X_train, X_test)
)

In [24]:
# build the collection of LMDI variants to evaluate; it is passed unchanged to
# get_lmdi_explainers and get_lmdi in the next cell
# NOTE(review): the helper is named "..._map", so this is presumably a mapping
# rather than a list — confirm against its definition
lmdi_variants = create_lmdi_variant_map()

In [25]:
# build the LMDI explainers from the fitted RF+ models, then score both splits
lmdi_explainers = get_lmdi_explainers(
    rf_plus, lmdi_variants, rf_plus_baseline=rf_plus_baseline
)
lfi_train_values, lfi_train_rankings = get_lmdi(
    X_train, y_train, lmdi_variants, lmdi_explainers
)
# the test split is scored without labels (None is passed in place of y)
lfi_test_values, lfi_test_rankings = get_lmdi(
    X_test, None, lmdi_variants, lmdi_explainers
)

# register the comparison methods alongside the LMDI variants:
# shap, lime, and the raw features as a baseline of comparison
lfi_train_values.update({
    "shap": shap_train_values,
    "lime": lime_train_values,
    "rawdata": X_train,
})
lfi_test_values.update({
    "shap": shap_test_values,
    "lime": lime_test_values,
    "rawdata": X_test,
})

# only shap contributes rankings; lime and rawdata have values only
lfi_train_rankings["shap"] = shap_train_rankings
lfi_test_rankings["shap"] = shap_test_rankings

In [26]:
# get the clusterings
# 1) cluster the training-split importance values using the configured
#    `clustertype`
# 2) derive centroids from those training clusters
# 3) assign the test-split importance values using the training centroids
# NOTE(review): presumably one clustering per method key in lfi_train_values
# and per cluster count — confirm against subgroup helpers
train_clusters = get_train_clusters(lfi_train_values, clustertype)
cluster_centroids = get_cluster_centroids(lfi_train_values, train_clusters)
test_clusters = get_test_clusters(lfi_test_values, cluster_centroids)

In [27]:
# compute the performance from the raw train/test data and the cluster
# assignments of both splits; `task` is forwarded to the helper
# NOTE(review): presumably `task` selects which metrics are reported — confirm
# in compute_performance
metrics_to_scores = compute_performance(X_train, X_test, y_train, y_test,
                                        train_clusters, test_clusters, task)

In [28]:
# display the nested results: metric name -> method name -> {integer key: score}
# NOTE(review): the integer keys (2..10) are presumably the number of clusters
# — confirm
metrics_to_scores

{'r2': {'lmdi_baseline': {2: -2.336045506625965,
   3: -3.005802671352251,
   4: -21.624380540066195,
   5: -7.218289660433478,
   6: -69.72658022011487,
   7: -2104.952208042981,
   8: -2037.866249202602,
   9: -2656.2605027100244,
   10: -2654.052975487508},
  'aloo_l2_signed_normed_leafavg_rank': {2: 0.9925519955163827,
   3: 0.9205601738779873,
   4: 0.9977108495250491,
   5: 0.9977210802341477,
   6: 0.9987282808399349,
   7: 0.9987878584955502,
   8: 0.9993031134884751,
   9: 0.9993042108049444,
   10: 0.9993008218759488},
  'aloo_l2_signed_normed_leafavg_norank': {2: 0.9925519955163827,
   3: 0.9205601738779873,
   4: 0.9977108495250491,
   5: 0.9977210802341477,
   6: 0.9987282808399349,
   7: 0.9987878584955502,
   8: 0.9993031134884751,
   9: 0.9993042108049444,
   10: 0.9993008218759488},
  'aloo_l2_signed_normed_noleafavg_rank': {2: 0.9925519955163827,
   3: 0.8842846473707685,
   4: 0.9977098389153599,
   5: 0.9977210927216864,
   6: 0.9987219670536219,
   7: 0.99877235320