In [None]:
import copy
import os
from os.path import join as oj
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
warnings.filterwarnings("ignore", message="Bins whose width")
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import openml
import shap
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.ensemble import GradientBoostingClassifier
import sklearn
from scripts.competing_methods_local import *
from scripts.simulations_util import *

### Test MOE

In [None]:
def apply_splitting_strategy(X: np.ndarray,
                             y: np.ndarray,
                             splitting_strategy: str,
                             split_seed: int):
    """Split ``X``/``y`` into train, optional tune, and test partitions.

    The held-out test fraction depends on ``splitting_strategy``:
    0.90 for ``'train-test-lowdata'`` / ``'train-tune-test-lowdata'``,
    0.33 for ``'train-test'``, and 0.2 for anything else.  For the
    ``'train-tune-test'`` / ``'train-tune-test-lowdata'`` strategies a further
    20% of the training partition is carved out as a tuning set.

    :param X: design matrix
    :param y: response
    :param splitting_strategy: name of the strategy (see above)
    :param split_seed: integer seed forwarded as ``random_state`` to every
        ``train_test_split`` call (the same seed is used for both splits)
    :return: ``(X_train, X_tune, X_test, y_train, y_tune, y_test)``;
        ``X_tune`` and ``y_tune`` are ``None`` when no tuning split is made
    """
    if splitting_strategy in {'train-test-lowdata', 'train-tune-test-lowdata'}:
        test_size = 0.90
    elif splitting_strategy == "train-test":
        test_size = 0.33
    else:
        test_size = 0.2

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=split_seed)
    X_tune = None
    y_tune = None

    if splitting_strategy in {'train-tune-test', 'train-tune-test-lowdata'}:
        # The tuning set is taken out of the training partition only.
        X_train, X_tune, y_train, y_tune = model_selection.train_test_split(
            X_train, y_train, test_size=0.2, random_state=split_seed)

    return X_train, X_tune, X_test, y_train, y_tune, y_test

In [None]:
# Fetch the OpenML task's dataset as arrays.
task = openml.tasks.get_task(361260) #361260 361259 361253 361254 361242
dataset = task.get_dataset()
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")
# Subsample at most 2000 rows with a seeded generator so that a fresh
# "Restart & Run All" selects the same rows (the unseeded np.random.choice
# picked a different subset on every run and failed on datasets < 2000 rows).
keep_idx = np.random.default_rng(0).choice(X.shape[0], min(2000, X.shape[0]), replace=False)
X = X[keep_idx, :]
y = y[keep_idx]
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", 0)
# Standardise using statistics of the training partition only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Baseline random forest; it is passed as `rf_model` to the RF+ wrappers below.
# NOTE(review): RandomForestRegressor and r2_score are not imported explicitly in
# the import cell — presumably they arrive via the star imports; confirm.
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
est.fit(X_train, y_train)

# RF+ using a cross-validated elastic net as the prediction model.
rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
rf_plus_elastic.fit(X_train, y_train)

# RF+ with its default prediction model.
rf_plus_default = RandomForestPlusRegressor(rf_model=est)
rf_plus_default.fit(X_train, y_train)

# The MOE variants below are disabled in this section.
# NOTE(review): the disabled calls hard-code an absolute, user-specific checkpoint path.
# rf_plus_moe_elasticnet = SklearnRFPlusRegMOE(rfplus_model=rf_plus_elastic, checkpoint_path="/accounts/projects/binyu/zhongyuan_liang/local_MDI+/imodels-experiments/feature_importance/tempcheckpoints")
# rf_plus_moe_elasticnet.fit(X_train,y_train)

# rf_plus_moe_default = SklearnRFPlusRegMOE(rfplus_model=rf_plus_default, checkpoint_path="/accounts/projects/binyu/zhongyuan_liang/local_MDI+/imodels-experiments/feature_importance/tempcheckpoints")
# rf_plus_moe_default.fit(X_train,y_train)

# rf_plus_moe_default_loo = SklearnRFPlusRegMOE(rfplus_model=rf_plus_default, checkpoint_path="/accounts/projects/binyu/zhongyuan_liang/local_MDI+/imodels-experiments/feature_importance/tempcheckpoints", use_loo=True)
# rf_plus_moe_default_loo.fit(X_train,y_train)

# Test-set R^2 of each fitted model (MOE scores disabled along with the models).
est_r2 = r2_score(y_test, est.predict(X_test))
rf_plus_elastic_r2 = r2_score(y_test, rf_plus_elastic.predict(X_test))
rf_plus_default_r2 = r2_score(y_test, rf_plus_default.predict(X_test))
# rf_plus_moe_elasticnet_r2 = r2_score(y_test, rf_plus_moe_elasticnet.predict(X_test))
# rf_plus_moe_default_r2 = r2_score(y_test, rf_plus_moe_default.predict(X_test))
# rf_plus_moe_default_loo_r2 = r2_score(y_test, rf_plus_moe_default_loo.predict(X_test))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.5min finished


In [6]:
# Test-set R^2 in the order: RF, RF+ (elastic net), RF+ (default); MOE scores are disabled.
print(est_r2, rf_plus_elastic_r2, rf_plus_default_r2)#, rf_plus_moe_elasticnet_r2, rf_plus_moe_default_r2, rf_plus_moe_default_loo_r2)

0.8392554548518385 0.45922955936080434 0.8652551201070563


In [None]:
# Mean regularisation strength (`alpha_`) over the fitted estimators of the
# elastic-net RF+ model. Iterating over `estimators_` directly replaces the
# hard-coded range(100), which silently duplicated n_estimators.
alpha_elastic = np.array([estimator.alpha_ for estimator in rf_plus_elastic.estimators_])
print(alpha_elastic.mean())

184.94850081751116


In [15]:
# Mean regularisation strength (`alpha_`) over the fitted estimators of the
# default RF+ model. Iterating over `estimators_` directly replaces the
# hard-coded range(100), which silently duplicated n_estimators.
alpha_loo = np.array([estimator.alpha_ for estimator in rf_plus_default.estimators_])
print(alpha_loo.mean())

112.94685509618063


In [None]:
# Absolute local feature-importance rankings for the two RF+ models fitted above.
LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train=X_train, y_train=y_train, X_test=X_test, fit=rf_plus_elastic, mode="absolute")
LFI_evaluation_MDIRFPlus_all_ranking_default_retrain(X_train=X_train, y_train=y_train, X_test=X_test, fit=rf_plus_default, mode="absolute")
# The MOE models are not fitted in this section (their construction is commented
# out in the model-fitting cell), so the calls below would raise NameError on a
# fresh kernel. They stay disabled, matching the disabled MOE scores in the print
# cell; re-enable them together with the MOE fits.
# LFI_evaluation_MDIRFPlus_all_ranking_moe_retrain(X_train=X_train, y_train=y_train, X_test=X_test, fit=rf_plus_moe_elasticnet, mode="absolute")
# LFI_evaluation_MDIRFPlus_all_ranking_moe_retrain(X_train=X_train, y_train=y_train, X_test=X_test, fit=rf_plus_moe_default, mode="absolute")
# LFI_evaluation_MDIRFPlus_all_ranking_moe_default_retrain(X_train=X_train, y_train=y_train, X_test=X_test, fit=rf_plus_moe_default_loo, mode="absolute")

In [None]:
# Always raises AssertionError, so a top-to-bottom run stops at this cell.
# Presumably a deliberate barrier before the scratch cells below — confirm.
assert False

In [None]:
def apply_splitting_strategy(X: np.ndarray,
                             y: np.ndarray,
                             splitting_strategy: str,
                             split_seed: int):
    """Split ``X``/``y`` into train, optional tune, and test partitions.

    The held-out test fraction depends on ``splitting_strategy``:
    0.90 for ``'train-test-lowdata'`` / ``'train-tune-test-lowdata'``,
    0.33 for ``'train-test'``, and 0.2 for anything else.  For the
    ``'train-tune-test'`` / ``'train-tune-test-lowdata'`` strategies a further
    20% of the training partition is carved out as a tuning set.

    :param X: design matrix
    :param y: response
    :param splitting_strategy: name of the strategy (see above)
    :param split_seed: integer seed forwarded as ``random_state`` to every
        ``train_test_split`` call (the same seed is used for both splits)
    :return: ``(X_train, X_tune, X_test, y_train, y_tune, y_test)``;
        ``X_tune`` and ``y_tune`` are ``None`` when no tuning split is made
    """
    if splitting_strategy in {'train-test-lowdata', 'train-tune-test-lowdata'}:
        test_size = 0.90
    elif splitting_strategy == "train-test":
        test_size = 0.33
    else:
        test_size = 0.2

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=split_seed)
    X_tune = None
    y_tune = None

    if splitting_strategy in {'train-tune-test', 'train-tune-test-lowdata'}:
        # The tuning set is taken out of the training partition only.
        X_train, X_tune, y_train, y_tune = model_selection.train_test_split(
            X_train, y_train, test_size=0.2, random_state=split_seed)

    return X_train, X_tune, X_test, y_train, y_tune, y_test

def mask_unimportant_features(X, sorted_feature, mask, mask_values):
    """Return a copy of ``X`` with the tail of each row's feature ranking masked.

    For every row, the first ``ceil(n_features * mask)`` entries of that row of
    ``sorted_feature`` are left untouched and every later entry is overwritten.

    :param X: 2-D array of samples; it is deep-copied, never modified
    :param sorted_feature: 2-D integer array, one row of column indices per sample
    :param mask: fraction used to compute the cut-off position in each ranking
    :param mask_values: mapping from column index to either a scalar
        replacement value, or a two-element sequence whose members are swapped
        (a cell equal to the first element becomes the second, anything else
        becomes the first)
    :return: ``(cutoff, masked_copy)`` where ``cutoff`` is
        ``ceil(n_features * mask)``
    """
    masked = copy.deepcopy(X)
    num_masked = int(np.ceil(masked.shape[1] * mask))
    scalar_kinds = (int, float, np.integer, np.floating)
    columns_to_mask = sorted_feature[:, num_masked:]

    for row_idx in range(masked.shape[0]):
        for col_idx in columns_to_mask[row_idx]:
            replacement = mask_values[col_idx]
            if isinstance(replacement, scalar_kinds):
                # Scalar: overwrite the cell directly.
                masked[row_idx, col_idx] = replacement
            elif masked[row_idx, col_idx] == replacement[0]:
                # Two-valued feature currently at its first value: flip to the second.
                masked[row_idx, col_idx] = replacement[1]
            else:
                masked[row_idx, col_idx] = replacement[0]

    return num_masked, masked

def LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=None, mode="absolute"):
    """Local feature-importance scores from ``RFPlusMDI`` for train and test data.

    :param X_train: training design matrix (explained together with ``y_train``)
    :param y_train: training response
    :param X_test: test design matrix (explained with ``y=None``)
    :param fit: fitted ``RandomForestPlusRegressor`` or ``RandomForestPlusClassifier``
    :param mode: ``"absolute"`` returns element-wise absolute values; any other
        value returns the scores unchanged
    :return: ``(train_scores, test_scores)``
    """
    assert isinstance(fit, (RandomForestPlusRegressor, RandomForestPlusClassifier))
    explainer = RFPlusMDI(fit, mode='only_k', evaluate_on="all")
    train_scores = explainer.explain_linear_partial(X=X_train, y=y_train, ranking=True)
    test_scores = explainer.explain_linear_partial(X=X_test, y=None, ranking=True)
    if mode != "absolute":
        return train_scores, test_scores
    return np.abs(train_scores), np.abs(test_scores)

def _select_positive_class_shap(values):
    """Pick the class-1 SHAP values from either output layout of a classifier."""
    if isinstance(values, list):
        # Older shap releases return one (n_samples, n_features) array per class.
        return values[1]
    # Newer releases return a single (n_samples, n_features, n_classes) array.
    return values[:, :, 1]


def tree_shap_evaluation_RF_retrain(X_train, y_train, X_test, fit=None, mode="absolute"):
    """
    Compute per-sample TreeSHAP values for the training and test design matrices.
    Larger absolute values indicate more important features.
    :param X_train: training design matrix
    :param y_train: training response (unused; kept for a uniform signature)
    :param X_test: test design matrix
    :param fit: fitted model of interest (tree-based)
    :param mode: "absolute" returns element-wise absolute values; any other
        value returns the signed SHAP values
    :return: tuple (train_scores, test_scores). For classifiers other than
        GradientBoostingClassifier only the class-1 values are returned.
    """
    explainer = shap.TreeExplainer(fit)
    local_fi_score_train = explainer.shap_values(X_train, check_additivity=False)
    local_fi_score_test = explainer.shap_values(X_test, check_additivity=False)

    # GradientBoostingClassifier output is used as returned; other classifiers
    # need the class-1 values selected.
    if not isinstance(fit, GradientBoostingClassifier) and sklearn.base.is_classifier(fit):
        local_fi_score_train = _select_positive_class_shap(local_fi_score_train)
        local_fi_score_test = _select_positive_class_shap(local_fi_score_test)

    if mode == "absolute":
        return np.abs(local_fi_score_train), np.abs(local_fi_score_test)
    return local_fi_score_train, local_fi_score_test

In [None]:
# Fetch the OpenML task's dataset as arrays.
task = openml.tasks.get_task(361260) #361260 361259 361253 361254 361242
dataset = task.get_dataset()
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")
# Subsample at most 2000 rows with a seeded generator so that re-running the
# cell selects the same rows (the unseeded np.random.choice picked a different
# subset on every run and failed on datasets with fewer than 2000 rows).
keep_idx = np.random.default_rng(0).choice(X.shape[0], min(2000, X.shape[0]), replace=False)
X = X[keep_idx, :]
y = y[keep_idx]
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", 0)
# Standardise using statistics of the training partition only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Baseline random forest; it is passed as `rf_model` to the RF+ wrappers below.
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
est.fit(X_train, y_train)

# RF+ using a cross-validated elastic net as the prediction model.
rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
rf_plus_elastic.fit(X_train, y_train)

# RF+ with its default prediction model.
rf_plus_default = RandomForestPlusRegressor(rf_model=est)
rf_plus_default.fit(X_train, y_train)

# MOE wrappers around each RF+ model.
# NOTE(review): SklearnRFPlusRegMOE is only imported explicitly in a later cell;
# presumably it is also provided by the star imports — confirm.
rf_plus_moe_elasticnet = SklearnRFPlusRegMOE(rfplus_model=rf_plus_elastic)
rf_plus_moe_elasticnet.fit(X_train,y_train)

rf_plus_moe_default = SklearnRFPlusRegMOE(rfplus_model=rf_plus_default)
rf_plus_moe_default.fit(X_train,y_train)

# Leave-one-out variant: without use_loo=True this model was constructed exactly
# like rf_plus_moe_default, so the "loo" comparison compared a model with itself.
rf_plus_moe_default_loo = SklearnRFPlusRegMOE(rfplus_model=rf_plus_default, use_loo=True)
rf_plus_moe_default_loo.fit(X_train,y_train)

# Test-set R^2 of every fitted model.
est_r2 = r2_score(y_test, est.predict(X_test))
rf_plus_elastic_r2 = r2_score(y_test, rf_plus_elastic.predict(X_test))
rf_plus_default_r2 = r2_score(y_test, rf_plus_default.predict(X_test))
rf_plus_moe_elasticnet_r2 = r2_score(y_test, rf_plus_moe_elasticnet.predict(X_test))
rf_plus_moe_default_r2 = r2_score(y_test, rf_plus_moe_default.predict(X_test))
rf_plus_moe_default_loo_r2 = r2_score(y_test, rf_plus_moe_default_loo.predict(X_test))

In [None]:
# Recompute the test-set R^2 of every fitted model in one pass; the targets on
# the left line up one-to-one with the models in the tuple on the right.
(est_r2,
 rf_plus_elastic_r2,
 rf_plus_default_r2,
 rf_plus_moe_elasticnet_r2,
 rf_plus_moe_default_r2,
 rf_plus_moe_default_loo_r2) = [
    r2_score(y_test, fitted_model.predict(X_test))
    for fitted_model in (est,
                         rf_plus_elastic,
                         rf_plus_default,
                         rf_plus_moe_elasticnet,
                         rf_plus_moe_default,
                         rf_plus_moe_default_loo)
]

In [None]:
# Per-column replacement values consumed by mask_unimportant_features.
# This definition was commented out although the ablation cells further down
# read `mask_values`, which raised NameError on a fresh kernel.
mask_values = {}
for i in range(X_train.shape[1]):
    unique_values = np.unique(X_train[:, i])
    if len(unique_values) == 2:
        # Two-valued column: store both values so masking can flip between them.
        mask_values[i] = list(unique_values)
    else:
        # Any other column (including a constant one, which would otherwise
        # produce a one-element list that cannot be flipped): mask with the mean.
        mask_values[i] = np.mean(X_train[:, i])

In [None]:
# Disabled classification variant of this cell, kept for reference.
# est = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features="sqrt", random_state=0)
# est.fit(X_train, y_train)

# rf_plus_elastic = RandomForestPlusClassifier(rf_model=est, prediction_model=LogisticRegressionCV(penalty='elasticnet', l1_ratios=[0.1,0.5,0.99], solver = 'saga', cv=3, n_jobs=-1, tol=5e-4, max_iter=2000, random_state=0))
# rf_plus_elastic.fit(X_train, y_train)

# NOTE(review): import placed mid-notebook rather than in the import cell.
from sklearn.linear_model import RidgeCV

# Refit the baseline forest (random_state=0 here) and rebind `est`.
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=0)
est.fit(X_train, y_train)

# RF+ with its default prediction model.
rf_plus_default = RandomForestPlusRegressor(rf_model=est) #[0.1,0.5,0.99]
rf_plus_default.fit(X_train, y_train)


# RF+ with RidgeCV as the prediction model.
rf_plus_ridge = RandomForestPlusRegressor(rf_model=est, prediction_model=RidgeCV())
rf_plus_ridge.fit(X_train, y_train)


# RF+ with a cross-validated elastic net as the prediction model.
rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
rf_plus_elastic.fit(X_train, y_train)


In [None]:
# Build an RFPlusMDI explainer for the default RF+ model and display the result
# of explain_linear_partial on the training data.
rf_plus_mdi = RFPlusMDI(rf_plus_default, mode = 'only_k', evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)

In [None]:
# Rebind rf_plus_elastic to a variant whose ElasticNetCV searches a single
# alpha (0.1) instead of the default alpha grid.
rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, alphas=[0.1], l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
rf_plus_elastic.fit(X_train, y_train)

In [None]:
# Test-set R^2 for the forest and the elastic-net, default and ridge RF+ models.
r2_est = r2_score(y_test, est.predict(X_test))
r2_rf_plus_elastic = r2_score(y_test, rf_plus_elastic.predict(X_test))
r2_rf_plus_default = r2_score(y_test, rf_plus_default.predict(X_test))
# y_pred ends up holding the ridge model's predictions.
y_pred = rf_plus_ridge.predict(X_test)
r2_rf_plus_ridge = r2_score(y_test, y_pred)
print(r2_est, r2_rf_plus_elastic, r2_rf_plus_default, r2_rf_plus_ridge)

In [None]:
# NOTE(review): import placed mid-notebook; an earlier cell already uses this name.
from imodels.tree.rf_plus.rf_plus.MOE.rfplus_MOE import SklearnRFPlusRegMOE

# Refit the baseline forest and the elastic-net RF+ model (rebinding both names).
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=0)
est.fit(X_train, y_train)

rf_plus_elastic = RandomForestPlusRegressor(rf_model=est, prediction_model=ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.99], max_iter=2000,random_state=0))
rf_plus_elastic.fit(X_train, y_train)

# MOE wrapper around the elastic-net RF+ model.
sklearn_rfplus_moe = SklearnRFPlusRegMOE(rfplus_model=rf_plus_elastic)
sklearn_rfplus_moe.fit(X_train,y_train)

# Local feature-importance scores with and without the MOE weights passed as
# `moe_weight`, for both the training and the test data.
rf_plus_mdi = RFPlusMDI(rf_plus_elastic, mode = 'only_k', evaluate_on="all")
local_fi_score_train = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)
local_fi_score_train_moe = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True, moe_weight=sklearn_rfplus_moe.get_weights(X_train))
local_fi_score_test = rf_plus_mdi.explain_linear_partial(X=X_test, y=None, ranking = True)
local_fi_score_test_moe = rf_plus_mdi.explain_linear_partial(X=X_test, y=None, ranking = True, moe_weight=sklearn_rfplus_moe.get_weights(X_test))

In [None]:
# Shape of the array returned by get_weights for the test data.
sklearn_rfplus_moe.get_weights(X_test).shape

In [None]:
# Test-data weights with a new axis of length 1 inserted at position 1.
np.expand_dims(sklearn_rfplus_moe.get_weights(X_test), axis=1)

In [None]:
# Display the repr of the default RF+ model.
rf_plus_default

In [None]:
# Rebind sklearn_rfplus_moe to an MOE wrapper around the default RF+ model,
# constructed with use_loo=True, and fit it on the training data.
sklearn_rfplus_moe = SklearnRFPlusRegMOE(rfplus_model=rf_plus_default, use_loo=True)
sklearn_rfplus_moe.fit(X_train,y_train)

In [None]:
# Per-row ascending ordering of column indices by MOE-weighted test scores.
np.argsort(local_fi_score_test_moe, axis=1)

In [None]:
# Per-row ascending ordering of column indices by unweighted test scores.
np.argsort(local_fi_score_test, axis=1)

In [None]:
# Display the test-data local feature-importance scores (full array).
local_fi_score_test

In [None]:
# Display the training-data local feature-importance scores (full array).
local_fi_score_train

In [None]:
# Rebuild the explainer for the elastic-net RF+ model and recompute the
# training-data scores, overwriting local_fi_score_train.
rf_plus_mdi = RFPlusMDI(rf_plus_elastic, mode = 'only_k', evaluate_on="all")
local_fi_score_train = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)

In [None]:
# Insert a length-1 axis at position 1 of the MOE weights.
# NOTE(review): `moe_weight` is not assigned anywhere in the visible notebook —
# presumably it was sklearn_rfplus_moe.get_weights(X_train); confirm and define it.
expanded_moe_weight = np.expand_dims(moe_weight, axis=1) 

In [None]:
# Shape of the expanded weight array.
expanded_moe_weight.shape

In [None]:
# NOTE(review): `moe_weight` is not assigned anywhere in the visible notebook;
# the shape quoted below is the author's note and is not verifiable from here.
expanded_moe_weight = np.expand_dims(moe_weight, axis=1)  # Shape: (670, 1, 100)

# Compute the weighted sum while ignoring NaNs
# NOTE(review): this overwrites local_fi_score_train with a reduction of itself,
# so re-running the cell without recomputing the scores is not idempotent.
local_fi_score_train = np.nansum(local_fi_score_train * expanded_moe_weight, axis=-1)

In [None]:
# Check whether local_fi_score_train contains any NaN values.
np.isnan(local_fi_score_train).any()

In [None]:
# NaN-ignoring sum of the expanded weights along the last axis.
# NOTE(review): `moe_weight` is not assigned anywhere in the visible notebook.
expanded_moe_weight = np.expand_dims(moe_weight, axis=1) 
np.nansum(expanded_moe_weight, axis=-1)

In [None]:
# Display the expanded weight array (full array).
expanded_moe_weight

In [None]:
# NOTE(review): `moe_weight` is not assigned anywhere in the visible notebook.
expanded_moe_weight = np.expand_dims(moe_weight, axis=1) 
# Compute the weighted average along the last axis (-1)
# NOTE(review): this overwrites local_fi_score_train with a reduction of itself,
# so re-running the cell without recomputing the scores is not idempotent.
local_fi_score_train = np.nansum(local_fi_score_train * expanded_moe_weight, axis=-1) / np.nansum(expanded_moe_weight, axis=-1)

# The resulting shape will be (670, 48)


In [None]:
# Recompute the test-data scores with the current explainer (y is not supplied).
local_fi_score_test = rf_plus_mdi.explain_linear_partial(X=X_test, y=None, ranking = True)

In [None]:
# Display the first element (training scores) of the helper's returned pair.
LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_elastic, mode="absolute")[0]

In [None]:
# Test-set R^2 for the forest, the elastic-net RF+ model and the MOE model.
r2_est = r2_score(y_test, est.predict(X_test))
r2_rf_plus_elastic = r2_score(y_test, rf_plus_elastic.predict(X_test))
# y_pred ends up holding the MOE model's predictions.
y_pred = sklearn_rfplus_moe.predict(X_test)
r2_sklearn_rfplus_moe = r2_score(y_test, y_pred)

print(r2_est, r2_rf_plus_elastic, r2_sklearn_rfplus_moe)

In [None]:
# get AUROC for est, rf_plus_elastic, sklearn_rfplus_moe
# NOTE(review): in the visible cells `est` is a RandomForestRegressor and the
# RF+/MOE models are regressors, so predict_proba looks like a leftover from a
# classification run — confirm before executing.
y_pred = est.predict_proba(X_test)[:,1]
print("AUROC for RF: ", roc_auc_score(y_test, y_pred))
y_pred = rf_plus_elastic.predict_proba(X_test)[:,1]
print("AUROC for RF+: ", roc_auc_score(y_test, y_pred))
y_pred = sklearn_rfplus_moe.predict_proba(X_test)[:,1]
print("AUROC for RF+MOE: ", roc_auc_score(y_test, y_pred))

In [None]:
# First row of the array returned by get_weights for the test data.
sklearn_rfplus_moe.get_weights(X_test)[0, :]

In [None]:
# Absolute local importance scores on the training data from LMDI+ and TreeSHAP.
local_fi_score_train_lmdi, _ = LFI_evaluation_MDIRFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_elastic, mode="absolute")
local_fi_score_train_shap, _ = tree_shap_evaluation_RF_retrain(X_train, y_train, X_test, fit=est, mode="absolute")

# Negating before argsort orders each row's features from largest to smallest score.
sorted_feature_train_lmdi = np.argsort(-local_fi_score_train_lmdi)
sorted_feature_train_shap = np.argsort(-local_fi_score_train_shap)
# Pick the ablation model family that matches the fitted forest: the previously
# hard-coded RandomForestClassifier cannot be fitted on the continuous target
# used with a RandomForestRegressor `est`.
if sklearn.base.is_classifier(est):
    ablation_models = {"RF_Classifier": RandomForestClassifier(random_state=0)}
else:
    ablation_models = {"RF_Regressor": RandomForestRegressor(random_state=0)}

In [None]:
# Retrain-ablation using the LMDI+ ranking: for each ratio, mask the training
# features with mask_unimportant_features, refit every ablation model on the
# masked training data and score it on the unmasked test data.
mask_ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for mask in mask_ratio:
    num_features_masked, X_train_masked = mask_unimportant_features(X_train, sorted_feature_train_lmdi, mask, mask_values)
    for a_model in ablation_models:
        ablation_models[a_model].fit(X_train_masked, y_train)
        # Choose the metric from the model type instead of hard-coding
        # predict_proba/AUROC, which fails for regressors.
        if sklearn.base.is_classifier(ablation_models[a_model]):
            y_pred = ablation_models[a_model].predict_proba(X_test)[:, 1]
            print(roc_auc_score(y_test, y_pred))
        else:
            y_pred = ablation_models[a_model].predict(X_test)
            print(sklearn.metrics.r2_score(y_test, y_pred))

In [None]:
# Retrain-ablation using the TreeSHAP ranking: for each ratio, mask the training
# features with mask_unimportant_features, refit every ablation model on the
# masked training data and score it on the unmasked test data.
mask_ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for mask in mask_ratio:
    num_features_masked, X_train_masked = mask_unimportant_features(X_train, sorted_feature_train_shap, mask, mask_values)
    for a_model in ablation_models:
        ablation_models[a_model].fit(X_train_masked, y_train)
        # Choose the metric from the model type instead of hard-coding
        # predict_proba/AUROC, which fails for regressors.
        if sklearn.base.is_classifier(ablation_models[a_model]):
            y_pred = ablation_models[a_model].predict_proba(X_test)[:, 1]
            print(roc_auc_score(y_test, y_pred))
        else:
            y_pred = ablation_models[a_model].predict(X_test)
            print(sklearn.metrics.r2_score(y_test, y_pred))