In [1]:
# standard data science packages
import numpy as np
import pandas as pd

# imodels imports
from imodels.tree.rf_plus.rf_plus.rf_plus_models import \
    RandomForestPlusRegressor, RandomForestPlusClassifier
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import \
    RFPlusMDI, AloRFPlusMDI

# functions for subgroup experiments
from subgroup_detection import *
from subgroup_experiment import *
import shap

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, \
    accuracy_score, r2_score, f1_score, log_loss, root_mean_squared_error

# pipeline imports
from subgroup import *

In [2]:
# ---- experiment configuration ----

# random seed; forwarded to train_test_split as random_state
seed = 5

# list of dataset ids (presumably OpenML ids, going by get_openml_data);
# NOTE(review): not referenced by any other cell visible in this notebook
dataids = [
    361247, 361243, 361242, 361251, 361253,
    361260, 361259, 361256, 361254, 361622,
]

# dataset id actually passed to get_openml_data for this run
dataid = 361617

# clustering method name handed to get_train_clusters
clustertype = "hierarchical"

# flag forwarded to get_openml_data
standardize = True

In [3]:
# get data for the dataset id chosen in the configuration cell
X, y = get_openml_data(dataid, standardize)

# split data: half for training the models, half held out for the
# feature-importance / clustering experiments below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,
                                                    random_state=seed)

print("Step 1")

# check if task is regression or classification: a target with exactly two
# distinct values is treated as binary classification, anything else as
# regression
if len(np.unique(y)) == 2:
    task = 'classification'
else:
    task = 'regression'

# fit the prediction models
# NOTE: this call must sit at top level, not inside the `else` branch above.
# When it was indented under `else`, the models were only fitted for
# regression, and classification runs failed later with a NameError on `rf`.
rf, rf_plus_baseline, rf_plus_ridge, rf_plus_lasso, rf_plus_elastic = \
    fit_models(X_train, y_train, task)

print("Step 2")

# obtain shap feature importances on the held-out half, explaining the
# fitted `rf` model
shap_explainer = shap.TreeExplainer(rf)
shap_test_values, shap_test_rankings = get_shap(X_test, shap_explainer,
                                                task)

print("Step 3")

# get lime feature importances for the same model and the same held-out data
lime_test_values, lime_test_rankings = get_lime(X_test, rf, task)

print("Step 4")

# create the collection of lmdi variants (returned by
# create_lmdi_variant_map; presumably a mapping, going by the name)
lmdi_variants = create_lmdi_variant_map()

# build the lmdi explainers from the four RF+ models and the variants
lmdi_explainers = get_lmdi_explainers(rf_plus_baseline, rf_plus_ridge,
                                          rf_plus_lasso, rf_plus_elastic,
                                          lmdi_variants)

print("Step 5")

# we don't actually want to use the training values, but for leaf averaging
# variants, we need to have the training data to compute the leaf averages
# NOTE(review): this implies the training call must run before the test call
# (i.e. the explainers keep state) - confirm against get_lmdi.
lfi_train_values, lfi_train_rankings = get_lmdi(X_train, y_train,
                                                lmdi_variants,
                                                lmdi_explainers)
# the test call passes None in place of the labels
lfi_test_values, lfi_test_rankings = get_lmdi(X_test, None,
                                                lmdi_variants,
                                                lmdi_explainers)

print("Step 6")

# add shap to the dictionaries (both values and rankings)
lfi_test_values["shap"] = shap_test_values
lfi_test_rankings["shap"] = shap_test_rankings

# add the raw data to the dictionaries as a baseline of comparison
# (values only; no "rawdata" entry is added to lfi_test_rankings)
lfi_test_values["rawdata"] = X_test

# add lime to the dictionaries
# NOTE(review): only the lime values are stored; lime_test_rankings is not
# added to lfi_test_rankings in this cell - confirm that is intended.
lfi_test_values["lime"] = lime_test_values

# get the clusterings - while we are not doing this on the training values,
# the get_train_clusters function still does what we want it to.
# The result is consumed below as {variant: {nclust: {cluster: indices}}}.
clusters = get_train_clusters(lfi_test_values, clustertype)

print("Step 7")

# for each cluster, assign half of the indices to the "fitting" set and
# the other half to the "evaluation" set. Both outputs mirror the nesting of
# `clusters`: {variant: {nclust: {cluster: indices}}}.
fitting_clusters = {}
evaluation_clusters = {}
for variant, nclust_map in clusters.items():
    fitting_nclust_to_c = {}
    evaluation_nclust_to_c = {}
    for nclust, cluster_map in nclust_map.items():
        fitting_c_to_idxs = {}
        evaluation_c_to_idxs = {}
        for c, idxs in cluster_map.items():
            # debugging aid: report cluster sizes for one specific
            # variant / number-of-clusters combination
            if nclust == 9 and variant == "lasso_l2_signed_nonnormed_noleafavg_rank":
                print(f"Cluster {c} in variant {variant} has {len(idxs)} " + \
                    "observations.")
            # tiny clusters are only reported, not skipped: they are still
            # split below and still appear in both output dictionaries
            if len(idxs) < 3:
                print(f"For {nclust} clusters, cluster #{c} in " + \
                    f"variant {variant} has fewer than 3 observations.")
            # Shuffle a copy rather than `idxs` itself: np.random.shuffle
            # works in place, so shuffling `idxs` directly would silently
            # reorder the index containers stored inside `clusters`, and the
            # halves below would alias that dictionary.
            # NOTE: the halves are therefore always NumPy arrays, even if
            # get_train_clusters returned plain lists.
            shuffled_idxs = np.array(idxs)
            # the seed is deliberately reset before every shuffle (unchanged
            # from before), so the split of a cluster depends only on that
            # cluster and not on the order in which clusters are visited
            np.random.seed(1)
            np.random.shuffle(shuffled_idxs)
            half = len(shuffled_idxs) // 2
            # with an odd size the fitting half gets the extra observation
            fitting_c_to_idxs[c] = shuffled_idxs[half:]
            evaluation_c_to_idxs[c] = shuffled_idxs[:half]
        fitting_nclust_to_c[nclust] = fitting_c_to_idxs
        evaluation_nclust_to_c[nclust] = evaluation_c_to_idxs
    fitting_clusters[variant] = fitting_nclust_to_c
    evaluation_clusters[variant] = evaluation_nclust_to_c
    
print("Step 8")

# DISABLED (kept for reference): stacks the per-cluster rows of X_test/y_test
# into flat X_fit, y_fit, X_eval, y_eval arrays and prints their shapes.
# Nothing below depends on it.
# obtain dataframes X_fit, y_fit, X_eval, y_eval
# X_fit = []
# y_fit = []
# X_eval = []
# y_eval = []
# for variant, nclust_map in fitting_clusters.items():
#     for nclust, cluster_map in nclust_map.items():
#         for c, idxs in cluster_map.items():
#             X_fit.append(X_test[idxs])
#             y_fit.append(y_test[idxs])
# for variant, nclust_map in evaluation_clusters.items():
#     for nclust, cluster_map in nclust_map.items():
#         for c, idxs in cluster_map.items():
#             X_eval.append(X_test[idxs])
#             y_eval.append(y_test[idxs])
# X_fit = np.vstack(X_fit)
# y_fit = np.hstack(y_fit)
# X_eval = np.vstack(X_eval)
# y_eval = np.hstack(y_eval)

# print("X_fit shape", X_fit.shape)
# print("X_eval shape", X_eval.shape)

print("Step 9")

# compute the performance - we are using test data for both, not an error:
# the fitting and evaluation clusters are both built from clusterings of the
# held-out (test) feature importances, so X_test / y_test are passed twice
# (presumably once as the fitting data and once as the evaluation data -
# confirm the argument order against compute_performance).
# The result is indexed below as metrics_to_scores[metric][variant].
metrics_to_scores = compute_performance(X_test, X_test, y_test, y_test,
                                        fitting_clusters,
                                        evaluation_clusters, task)


  X, y = get_openml_data(dataid, standardize)


Step 1


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.9s finished


Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Cluster 0 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 46 observations.
Cluster 1 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 32 observations.
Cluster 2 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 95 observations.
Cluster 3 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 82 observations.
Cluster 4 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 55 observations.
Cluster 5 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 8 observations.
Cluster 6 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 30 observations.
Cluster 7 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 26 observations.
Cluster 8 in variant lasso_l2_signed_nonnormed_noleafavg_rank has 10 observations.
Step 8
Step 9
Cluster 0 in variant lasso_l2_signed_nonnormed_noleafavg_rank has RMSE 0.11628180543355045
model coef:
[-1.22974173e-01 -1.59912510e+11  7.91999974e+10 -2.73711338e+11
  2.89916992e-04  9.14573669e-

In [4]:
# display the RMSE scores of a single LMDI variant; the displayed dict has
# keys 2..10, which appear to be the number of clusters
metrics_to_scores["rmse"]["lasso_l2_signed_nonnormed_noleafavg_rank"]

{2: 0.22644940353139587,
 3: 0.16843560066612098,
 4: 0.168237575339965,
 5: 0.1644902141996583,
 6: 0.1727456979236419,
 7: 0.17394522700718568,
 8: 0.17470725839037096,
 9: 1145868541.5440714,
 10: 1145868541.5421774}

In [5]:
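# NOTE(review): presumably the RMSE-by-number-of-clusters output of an
# earlier run, kept for comparison. Here the scores for 9 and 10 clusters
# are about 0.18 and 0.17, instead of about 1.1e9 as in the output of the
# previous cell.
# {2: 0.22644940353139587,
#  3: 0.19826779591905566,
#  4: 0.1953203953041628,
#  5: 0.18712289629518433,
#  6: 0.1799813913000766,
#  7: 0.18342879068896262,
#  8: 0.17688507989749783,
#  9: 0.18064867586887284,
#  10: 0.1718511382385029}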