In [1]:
import numpy as np
import pandas as pd
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusClassifier, RandomForestPlusRegressor
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import  AloRFPlusMDI, RFPlusMDI
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, \
    GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import SGDRegressor
from imodels import get_clean_dataset
from sklearn.linear_model import RidgeCV, LogisticRegressionCV, LinearRegression, LogisticRegression

In [2]:
# Synthetic data: 1000 samples x 100 features, every entry drawn i.i.d. from N(0, 1).
np.random.seed(0)
X = np.random.randn(1000, 100)
# Continuous regression target: a linear combination of the first 3 features only
# (the remaining 97 columns carry no signal).
y = X[:, 0] + 2*X[:, 1] + 3*X[:, 2]
# Binary classification target derived from y: 1.0 where y > 0, else 0.0.
y_class = (y > 0).astype(float)
# Split X, y and y_class in a single call so all three share the exact same row
# partition; this avoids a second split that re-assigns X_train/X_test and only
# stays aligned because the seed happens to be repeated.
X_train, X_test, y_train, y_test, y_train_class, y_test_class = train_test_split(
    X, y, y_class, test_size=0.3, random_state=42
)

In [3]:
# Reference regressors fitted on the continuous target.
# sklearn's fit() returns the estimator itself, so construction and fitting can be chained.
gb_regr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
rf_regr = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=42,
)
rf_regr.fit(X_train, y_train)

In [4]:
# Reference classifiers fitted on the binary target.
# (An `init="zero"` argument for the boosting classifier was left disabled.)
gb_class = GradientBoostingClassifier(random_state=42).fit(X_train, y_train_class)
rf_class = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)
rf_class.fit(X_train, y_train_class)

In [5]:
# RF+ "baseline" regressors: each wraps an already-fitted tree ensemble and uses an
# unregularised LinearRegression as the prediction model with include_raw=False.
# NOTE(review): include_raw=False presumably excludes the raw input features from
# the per-tree design matrices -- confirm against the imodels RF+ implementation.
gb_regr_plus_baseline = RandomForestPlusRegressor(
    rf_model=gb_regr,
    prediction_model=LinearRegression(),
    include_raw=False,
)
gb_regr_plus_baseline.fit(X_train, y_train)

# Same wrapper around the random forest; only this one passes fit_on="inbag".
rf_regr_plus_baseline = RandomForestPlusRegressor(
    rf_model=rf_regr,
    prediction_model=LinearRegression(),
    fit_on="inbag",
    include_raw=False,
)
rf_regr_plus_baseline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.3s finished


In [6]:
# Does the GB-backed RF+ baseline reproduce the wrapped model's test predictions
# (element-wise, within np.isclose's default tolerances)?
np.isclose(
    gb_regr_plus_baseline.predict(X_test),
    gb_regr.predict(X_test),
).all()

False

In [7]:
# Does the RF-backed RF+ baseline reproduce the wrapped forest's test predictions
# (element-wise, within np.isclose's default tolerances)?
np.isclose(
    rf_regr_plus_baseline.predict(X_test),
    rf_regr.predict(X_test),
).all()

True

In [8]:
# Rounded mean of the 0/1 training labels, i.e. the more frequent class, as a float.
float(round(y_train_class.mean()))

0.0

In [9]:
# RF+ "baseline" classifiers wrapping the already-fitted tree ensembles.
# NOTE(review): the GB-backed wrapper uses LinearRegression as its prediction model
# while the RF-backed one uses LogisticRegression -- presumably deliberate because
# boosting stages are regression trees, but confirm this asymmetry is intended.
gb_class_plus_baseline = RandomForestPlusClassifier(
    rf_model=gb_class,
    prediction_model=LinearRegression(),
    include_raw=False,
)
gb_class_plus_baseline.fit(X_train, y_train_class)

# Only the forest-backed wrapper passes fit_on="inbag".
rf_class_plus_baseline = RandomForestPlusClassifier(
    rf_model=rf_class,
    prediction_model=LogisticRegression(),
    fit_on="inbag",
    include_raw=False,
)
rf_class_plus_baseline.fit(X_train, y_train_class)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished


In [10]:
# Fraction of test rows where the GB-backed RF+ baseline predicts the same label
# as the wrapped gradient-boosting classifier.
(gb_class_plus_baseline.predict(X_test) == gb_class.predict(X_test)).mean()

0.9766666666666667

In [11]:
# Fraction of test rows where the RF-backed RF+ baseline predicts the same label
# as the wrapped random-forest classifier.
(rf_class_plus_baseline.predict(X_test) == rf_class.predict(X_test)).mean()

0.99

In [12]:
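# NOTE: disabled scratch code. It refers to `gb` and `gb_plus_baseline`, which are
# not defined in the cells above (the fitted models there are gb_regr / gb_class and
# gb_regr_plus_baseline / gb_class_plus_baseline), so it cannot be re-enabled as written.
# for i in range(100):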
#     gb_pred = gb.estimators_[i][0].predict(X_train)
#     transformed_data = gb_plus_baseline.transformers_[i].transform(X_train)
#     gb_base_pred = gb_plus_baseline.estimators_[i].predict(transformed_data.get_all_data())
#     print(np.mean(gb_pred/gb_base_pred))
#     gb_tree_pred = gb_pred > 0
#     gb_base_tree_pred = gb_base_pred > 0
#     # print(np.all(gb_tree_pred == gb_base_tree_pred))

In [13]:
# RF+ regressor on top of the fitted random forest, using ridge regression with
# 5-fold cross-validation as the prediction model (all other options at defaults).
rf_regr_plus = RandomForestPlusRegressor(
    rf_model=rf_regr,
    prediction_model=RidgeCV(cv=5),
)
rf_regr_plus.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.1s finished


In [14]:
# MDI-style explainer for the RF+ regressor.
# NOTE(review): the meaning of mode="only_k" / evaluate_on="all" is defined by
# imodels' RFPlusMDI -- check its documentation before changing them.
rf_regr_explainer = RFPlusMDI(
    rf_regr_plus,
    mode="only_k",
    evaluate_on="all",
)

In [None]:
# Number of boosting stages held by the fitted gradient-boosting regressor
# (first dimension of its estimators_ array).
gb_regr.estimators_.shape[0]

100

In [16]:
# Per-sample feature rankings on the test set.
# NOTE(review): the second positional argument is passed as None -- presumably the
# target vector, which is not supplied here; confirm against RFPlusMDI's signature.
rf_regr_explainer.explain_linear_partial(
    X_test,
    None,
    ranking=True,
)

array([[97.        , 98.        , 99.        , ..., 67.75675676,
        77.75675676, 86.28571429],
       [97.        , 98.        , 99.        , ..., 87.48648649,
        80.7027027 , 73.57142857],
       [98.96938776, 97.02      , 98.05      , ..., 68.2972973 ,
        83.43243243, 83.60714286],
       ...,
       [97.12244898, 97.88      , 99.        , ..., 80.64864865,
        75.24324324, 73.78571429],
       [97.        , 98.27      , 98.73      , ..., 83.59459459,
        77.86486486, 80.39285714],
       [97.        , 99.        , 98.        , ..., 80.97297297,
        87.16216216, 75.03571429]])

In [17]:
import shap

In [18]:
# SHAP values for the gradient-boosting classifier on both splits
# (train first, then test), with the additivity check switched off.
explainer = shap.TreeExplainer(gb_class)
local_fi_score_train, local_fi_score_test = (
    explainer.shap_values(split, check_additivity=False)
    for split in (X_train, X_test)
)
local_fi_score_train.shape

(700, 100)

In [19]:
# Shape of the training-set SHAP matrix (same value the previous cell already displays).
np.shape(local_fi_score_train)

(700, 100)

In [20]:
# Import the submodule explicitly: a bare `import lime` does not load
# `lime.lime_tabular`, so `lime.lime_tabular.LimeTabularExplainer` would only
# resolve if some other package had already imported it as a side effect.
# This still binds the top-level name `lime`.
import lime.lime_tabular



In [21]:
# LIME local feature weights for the gradient-boosting classifier, one row per
# training sample and one column per feature.
# test_result is allocated with the test-set shape but is not filled in this cell.
train_result = np.zeros(X_train.shape)
test_result = np.zeros(X_test.shape)
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, verbose=False, mode="classification")
num_features = X_train.shape[1]
for i, row in enumerate(X_train):
    # Request weights for every feature so the explanation covers all columns.
    exp = explainer.explain_instance(row, gb_class.predict_proba, num_features=num_features)
    # as_map()[1] holds (feature_index, weight) pairs for label 1; order them by
    # feature index so position j in the row corresponds to feature j.
    original_feature_importance = exp.as_map()[1]
    sorted_feature_importance = sorted(original_feature_importance, key=lambda pair: pair[0])
    train_result[i] = [weight for _, weight in sorted_feature_importance]
train_result.shape

(700, 100)