In [1]:
import numpy as np
import pandas as pd
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusClassifier, RandomForestPlusRegressor
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import  AloRFPlusMDI, RFPlusMDI
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from imodels import get_clean_dataset

In [None]:
# X = np.load('X.npy')
# y = np.load('y.npy')
# sample train and test data from diabetes dataset
X, y, feature_names = get_clean_dataset('diabetes_regr')
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# print shape of X_train
print("X_train shape: ", X_train.shape)

fetching diabetes from sklearn
X_train shape:  (309, 10)


In [3]:
# initialize RF model
rf = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 5,
                           max_depth = 5, max_features = 0.33,
                           random_state = 42)

# fit RF+ model
rf_plus = RandomForestPlusRegressor(rf_model=rf,
                                    prediction_model=SGDRegressor(alpha=0.001))
rf_plus.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.0s finished


In [4]:
rf_plus.check_data_time

3.886222839355469e-05

In [5]:
rf_plus.fit_rf_time

0.19572162628173828

In [6]:
rf_plus.fit_forest_time

9.045751571655273

In [7]:
lst = []

In [8]:
rf_plus.fit_trees_time.mean()

init_transformer        0.000505
get_transformed_data    0.014320
fit_prediction_model    0.004274
total_ith_tree          0.020401
dtype: float64

In [9]:
# get feature importances
mdi_explainer = RFPlusMDI(rf_plus)

In [10]:
mdi_explainer.init_ppm_time

0.015544652938842773

In [11]:
# train_lmdi = mdi_explainer.explain_r2(X_train, y_train, l2norm=True)
test_lmdi = mdi_explainer.explain_linear_partial(X_train, y_train, leaf_average=True, l2norm=True, njobs = 1)

In [17]:
lst = list()
lst2 = list()
for explainer in mdi_explainer.tree_explainers:
    lst.append(explainer._total_partial_preds_time)
    lst2.append(explainer._partial_preds_time)
print("Total partial preds time: ", np.array(lst).mean())
print("Partial preds time: ", np.array(lst2).mean())

Total partial preds time:  0.0034155869483947754
Partial preds time:  0.0003127286434173584


In [13]:
mdi_explainer.get_leafs_in_test_samples_time

4.76837158203125e-06

In [14]:
mdi_explainer.partial_predictions_time

12.056845903396606

In [15]:
mdi_explainer.leaf_average_time

0.1785750389099121

In [16]:
mdi_explainer.get_lfi_time

12.23560905456543

In [12]:
mdi_explainer.rf_plus_model.

AttributeError: 'RFPlusMDI' object has no attribute '_partial_preds_time'

In [16]:
data = np.load("X.npy")
data.shape
n_samples = 100
n_features = 5

In [17]:
# randomly sample n_samples and n_features from X and y
sampled_rows = np.random.choice(X.shape[0], n_samples, replace=False)

# Randomly sample m columns
sampled_cols = np.random.choice(X.shape[1], n_features, replace=False)

# Create the sampled array
X_train = X[sampled_rows][:, sampled_cols]

In [20]:
data.shape

(174368, 367)

In [1]:
from scripts.simulations_util import *
from sklearn.ensemble import RandomForestRegressor
from util import apply_splitting_strategy
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import AloRFPlusMDI

# get data
X = sample_real_data_X(source = "imodels", data_name = "diabetes_regr", sample_row_n = 400)
y = linear_model(X, beta = 1, sigma = None, heritability = 0.8, s = 5)
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", 1)

# initialize RF model
rf = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 5, max_features = 0.33, random_state = 42)

# fit RF+ model
rf_plus = RandomForestPlusRegressor(rf_model=rf)
rf_plus.fit(X_train, y_train)

fetching diabetes from sklearn


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   15.4s finished


In [2]:
# get feature importances
mdi_explainer = AloRFPlusMDI(rf_plus, evaluate_on='oob')
mdi, partial_preds = mdi_explainer.explain(np.asarray(X_train), y_train, leaf_average=True)
# mdi_rankings = mdi_explainer.get_rankings(mdi)

In [3]:
mdi

array([[-0.15401222, -0.12353409, -0.09846435, ..., -0.1649551 ,
        -0.16848943, -0.16695095],
       [-0.04393247, -0.08856134, -0.01087691, ..., -0.02182093,
        -0.03546607, -0.03541808],
       [-0.04454344, -0.01119317, -0.05182914, ..., -0.06075052,
        -0.05162437, -0.04708101],
       ...,
       [-0.12245341, -0.05899384, -0.06858969, ..., -0.09567499,
        -0.11051321, -0.10448243],
       [-0.06144286, -0.06667639, -0.01863273, ..., -0.02394332,
        -0.0196163 , -0.0212131 ],
       [-0.0227569 , -0.01432783, -0.01247537, ..., -0.06030641,
        -0.06521143, -0.06407156]])

In [None]:
X = sample_real_data_X(source = "imodels", data_name = "diabetes_regr", sample_row_n = 400)
y = linear_model(X, beta = 1, sigma = None, heritability = 0.8, s = 5)
X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, "train-test", 1)
np.random.seed(42)
indices_train = np.random.choice(X_train.shape[0], int(X_train.shape[0]*.25), replace=False)
indices_test = np.random.choice(X_test.shape[0], int(X_test.shape[0]*.25), replace=False)
X_train_subset = X_train[indices_train]
y_train_subset = y_train[indices_train]
X_test_subset = X_test[indices_test]
y_test_subset = y_test[indices_test]
# fit RF model
est = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 5, max_features = 0.33, random_state = 42)

est.fit(X_train, y_train)

# fit RF_plus model
rf_plus_base = RandomForestPlusRegressor(rf_model=est)
rf_plus_base.fit(X_train, y_train)


In [13]:
local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = LFI_evaluation_RF_plus(X_train=X_train, y_train=y_train,
                       X_train_subset = X_train_subset, y_train_subset=y_train_subset,
                       X_test_subset=X_test_subset, X_test=X_test,
                       fit=rf_plus_base)
local_fi_score_train_subset = local_fi_score_train[indices_train]
local_partial_pred_train_subset = local_parital_pred_train[indices_train]

In [14]:
local_fi_score_test_subset = pd.DataFrame(local_fi_score_test_subset)
auroc = []
auprc = []
f1 = []
support = [1,1,1,1,1,0,0,0,0,0]
for rownum in range(local_fi_score_test_subset.shape[0]):
    auroc.append(roc_auc_score(support, local_fi_score_test_subset.iloc[rownum,:]))
    auprc.append(average_precision_score(support, local_fi_score_test_subset.iloc[rownum,:]))
    f1.append(f1_score(support, local_fi_score_test_subset.iloc[rownum,:] > 0.5))
print(auroc)
print(auprc)
print(f1)
print(np.array(auroc).mean())
print(np.array(auprc).mean())
print(np.array(f1).mean())
                    

[0.6399999999999999, 1.0, 0.44000000000000006, 0.76, 0.8400000000000001, 0.4, 0.7600000000000001, 0.6399999999999999, 0.88, 0.92, 1.0, 0.28, 0.24000000000000002, 0.6400000000000001, 0.72, 0.8400000000000001, 1.0, 0.8, 0.6799999999999999, 0.4, 0.6, 0.6799999999999999, 0.7600000000000001, 1.0, 0.88, 0.52, 0.7200000000000002, 0.6799999999999999, 0.7600000000000001, 0.92, 0.28, 0.8, 0.48]
[0.8, 1.0, 0.6746031746031746, 0.8211111111111111, 0.911111111111111, 0.5088888888888888, 0.8599999999999999, 0.8, 0.925, 0.9428571428571428, 1.0, 0.45460317460317456, 0.5305555555555554, 0.7642857142857142, 0.8333333333333333, 0.911111111111111, 1.0, 0.8999999999999999, 0.8111111111111111, 0.6638888888888889, 0.7888888888888888, 0.7833333333333332, 0.8599999999999999, 1.0, 0.925, 0.7088888888888889, 0.81, 0.8111111111111111, 0.8599999999999999, 0.9428571428571428, 0.5412698412698412, 0.8999999999999999, 0.6888888888888889]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0