In [1]:
# standard data science packages
import numpy as np
import pandas as pd

# imodels imports
from imodels.tree.rf_plus.rf_plus.rf_plus_models import \
    RandomForestPlusRegressor, RandomForestPlusClassifier
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import \
    RFPlusMDI, AloRFPlusMDI

# functions for subgroup experiments
from subgroup_detection import *
from subgroup_experiment import *
import shap

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, \
    RidgeCV, LassoCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, \
    accuracy_score, r2_score, log_loss, root_mean_squared_error

# pipeline imports
from subgroup import *

In [2]:
# experiment configuration
clustertype = "hierarchical"  # clustering method handed to get_train_clusters
seed = 1  # seed for the randomized steps (e.g. the train/test split)

# candidate OpenML dataset ids; this run only uses the first one
dataids = [
    361247, 361243, 361242, 361251, 361253,
    361260, 361259, 361256, 361254, 361622,
]
dataid = dataids[0]

In [3]:
# End-to-end subgroup experiment for a single dataset:
#   1. load the data and split it in half,
#   2. fit RF / RF+ models,
#   3. compute SHAP, LIME and LMDI feature-importance values on the test half,
#   4. cluster the test points on each set of values,
#   5. split every cluster into a "fitting" and an "evaluation" half,
#   6. score the clusterings with compute_performance.

# get data
X, y = get_openml_data(dataid)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=seed)

print("Step 1")

# check if task is regression or classification
if len(np.unique(y)) == 2:
    task = 'classification'
else:
    task = 'regression'

# fit the prediction models
rf, rf_plus_baseline, rf_plus = fit_models(X_train, y_train, task)

# NOTE(review): the ridge/lasso variants always use the *regressor* class,
# even when `task` is 'classification' - confirm this is intended before
# running on a binary-target dataset.
rf_plus_ridge = RandomForestPlusRegressor(rf_model=rf,
                                          prediction_model=RidgeCV(cv=5))
rf_plus_ridge.fit(X_train, y_train)

rf_plus_lasso = RandomForestPlusRegressor(
    rf_model=rf,
    prediction_model=LassoCV(cv=5, max_iter=10000, random_state=0))
rf_plus_lasso.fit(X_train, y_train)

print("Step 2")

# obtain shap feature importances
shap_explainer = shap.TreeExplainer(rf)
shap_test_values, shap_test_rankings = get_shap(X_test, shap_explainer, task)

print("Step 3")

# get lime feature importances
lime_test_values, lime_test_rankings = get_lime(X_test, rf, task)

print("Step 4")

# create list of lmdi variants
lmdi_variants = create_lmdi_variant_map()

# obtain lmdi feature importances
lmdi_explainers = get_lmdi_explainers(rf_plus, lmdi_variants,
                                      rf_plus_baseline=rf_plus_baseline,
                                      rf_plus_lasso=rf_plus_lasso,
                                      rf_plus_ridge=rf_plus_ridge)

print("Step 5")

# we don't actually want to use the training values, but for leaf averaging
# variants, we need to have the training data to compute the leaf averages
lfi_train_values, lfi_train_rankings = get_lmdi(X_train, y_train,
                                                lmdi_variants,
                                                lmdi_explainers)
lfi_test_values, lfi_test_rankings = get_lmdi(X_test, None,
                                              lmdi_variants,
                                              lmdi_explainers)

print("Step 6")

# add shap to the dictionaries
lfi_test_values["shap"] = shap_test_values
lfi_test_rankings["shap"] = shap_test_rankings

# add the raw data to the dictionaries as a baseline of comparison
lfi_test_values["rawdata"] = X_test

# add lime to the dictionaries
# NOTE(review): unlike shap, lime_test_rankings is never stored in
# lfi_test_rankings - confirm whether that omission is deliberate.
lfi_test_values["lime"] = lime_test_values

# get the clusterings - while we are not doing this on the training values,
# the get_train_clusters function still does what we want it to.
clusters = get_train_clusters(lfi_test_values, clustertype)

print("Step 7")

# for each cluster, assign half of the indices to the "fitting" set and
# the other half to the "evaluation" set.
# A generator seeded with `seed` makes the split (and therefore every score
# below) reproducible across kernel restarts; rng.permutation returns a
# shuffled copy, so the index arrays stored in `clusters` are left untouched.
rng = np.random.default_rng(seed)
fitting_clusters = {}
evaluation_clusters = {}
for variant, nclust_map in clusters.items():
    fitting_nclust_to_c = {}
    evaluation_nclust_to_c = {}
    for nclust, cluster_map in nclust_map.items():
        fitting_c_to_idxs = {}
        evaluation_c_to_idxs = {}
        for c, idxs in cluster_map.items():
            # shuffle the indices and split them in half; for an odd-sized
            # cluster the extra index goes to the evaluation set
            shuffled = rng.permutation(idxs)
            half = len(shuffled) // 2
            fitting_c_to_idxs[c] = shuffled[:half]
            evaluation_c_to_idxs[c] = shuffled[half:]
        fitting_nclust_to_c[nclust] = fitting_c_to_idxs
        evaluation_nclust_to_c[nclust] = evaluation_c_to_idxs
    fitting_clusters[variant] = fitting_nclust_to_c
    evaluation_clusters[variant] = evaluation_nclust_to_c

print("fitting_clusters", fitting_clusters["shap"][2])
print("evaluation_clusters", evaluation_clusters["shap"][2])

# steps 8 and 9 are kept as progress markers so the logged step numbering
# stays stable; step 8 currently performs no work.
print("Step 8")

print("Step 9")

# compute the performance - we are using test data for both, not an error
metrics_to_scores = compute_performance(X_test, X_test, y_test, y_test,
                                        fitting_clusters,
                                        evaluation_clusters, task)


  X, y = get_openml_data(dataid)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)


Step 1


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  mod

Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
fitting_clusters {0: array([426, 948, 105, 720, 877,  84, 252, 385, 767, 912, 216, 350, 428,
       497, 423, 828, 596, 725, 856, 229, 142, 357, 867, 150,  31,  49,
       544, 407, 362, 898, 479, 669, 162, 643, 984, 319, 523, 997, 649,
       260, 328, 465, 291, 950, 662, 262, 484,   7, 949, 883, 790, 705,
        42, 911, 155, 805, 994, 633, 962,  70, 951, 116, 665, 976, 591,
       584, 121, 304, 277, 773, 632, 224, 450, 163, 772, 533, 470, 239,
       837, 695, 389, 566,  92, 125, 129, 507, 175, 623, 658, 490, 757,
       174, 785, 459, 896, 712, 943, 278, 782, 169, 127, 483, 244, 338,
        33,  37,  38, 191, 280, 967, 320, 585, 814, 809, 271, 135, 913,
       250, 452, 354, 181, 139, 433, 454, 728, 531, 573, 340, 963,  26,
        87,  19, 501, 689, 455, 959, 166, 580, 973, 791, 838, 259, 936,
       795, 136, 438,  21, 575, 861, 734, 849, 118, 917, 505, 583, 392,
       627, 258, 621, 157, 646, 709, 807, 396, 743, 655, 491, 449, 845,
 

In [4]:
# Display the scores returned by compute_performance. In the saved output the
# nesting is metric -> importance variant -> number of clusters -> score.
# NOTE(review): the saved output contains a strongly negative r2 for
# 'aloo_l2_signed_normed_noleafavg_rank' at 9 clusters - worth investigating.
metrics_to_scores

{'r2': {'lmdi_baseline': {2: 0.7992074117784433,
   3: 0.9390334652869762,
   4: 0.9713182320280844,
   5: 0.9764709176942732,
   6: 0.9728849782565656,
   7: 0.9832270126374433,
   8: 0.9834457237157822,
   9: 0.9767287097287944,
   10: 0.9899374987628033},
  'lmdi_lasso': {2: 0.9954593209188786,
   3: 0.994245609475536,
   4: 0.999196774577157,
   5: 0.9993400364677412,
   6: 0.9994040109911236,
   7: 0.9993889676458666,
   8: 0.999633892585917,
   9: 0.9996355579936119,
   10: 0.999851211258718},
  'lmdi_ridge': {2: 0.9943770624342212,
   3: 0.9944156870598685,
   4: 0.9993091622328684,
   5: 0.9993920493177577,
   6: 0.9992283505830387,
   7: 0.9995160553518698,
   8: 0.9996309700854302,
   9: 0.9992661374242299,
   10: 0.9998320381985973},
  'aloo_l2_signed_normed_noleafavg_rank': {2: 0.9939486598064695,
   3: 0.9979568983564288,
   4: 0.9978316129787653,
   5: 0.9980183933127245,
   6: 0.9987020488897803,
   7: 0.9988547522485848,
   8: 0.998719769441688,
   9: -961.5687107940583