In [1]:
# imodels imports
from imodels import get_clean_dataset
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusClassifier, RandomForestPlusRegressor
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import  AloRFPlusMDI, RFPlusMDI

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegressionCV, LinearRegression
from sklearn.metrics import accuracy_score

# other important libraries
import numpy as np
import pandas as pd

In [2]:
# get abalone data
X, y, feature_names = get_clean_dataset("compas_two_year_clean", data_source='imodels')

In [3]:
# split data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1)

In [4]:
# fit rf
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt',
                            min_samples_leaf=5, random_state=1)
rf_baseline = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                                    min_samples_leaf=5, random_state=1)
rf.fit(X_train, y_train)
rf_baseline.fit(X_train, y_train)

# fit rf+
rf_plus = RandomForestPlusClassifier(rf_model = rf,
                                     prediction_model = LogisticRegressionCV())
rf_plus_baseline = RandomForestPlusRegressor(rf_model = rf_baseline,
                                             include_raw=False, fit_on='inbag',
                                             prediction_model = LinearRegression())
rf_plus.fit(X_train, y_train)
rf_plus_baseline.fit(X_train, y_train)

# check performance on test set
yhat_rfplus = rf_plus.predict(X_test)
yhat_rfplus_baseline = rf_plus_baseline.predict(X_test)

# evaluate accuracy on test set
accuracy_rf_plus = accuracy_score(y_test, yhat_rfplus)
accuracy_rf_plus_baseline = accuracy_score(y_test, yhat_rfplus_baseline > 0.5)
print(f'RF+ Test Set Accuracy: {accuracy_rf_plus}')
print(f'RF+ Baseline Test Set Accuracy: {accuracy_rf_plus_baseline}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.8s finished


RF+ Test Set Accuracy: 0.7046436285097192
RF+ Baseline Test Set Accuracy: 0.6868250539956804


In [5]:
# create explainers
mdi_explainer = RFPlusMDI(rf_plus)
baseline_explainer = RFPlusMDI(rf_plus_baseline, evaluate_on='inbag')

# get feature importances for train and test sets
mdi_train_values = mdi_explainer.explain_linear_partial(X_train, y_train, l2norm = True, sign = True, leaf_average=False)
mdi_test_values = mdi_explainer.explain_linear_partial(X_test, y=None, l2norm = True, sign = True, leaf_average=False)
baseline_train_values = baseline_explainer.explain_linear_partial(X=X_train, y=y_train)
baseline_test_values = baseline_explainer.explain_linear_partial(X=X_test, y=None)

# get feature rankings for train and test sets
mdi_train_rankings = mdi_explainer.get_rankings(mdi_train_values)
mdi_test_rankings = mdi_explainer.get_rankings(mdi_test_values)
baseline_train_rankings = baseline_explainer.get_rankings(baseline_train_values)
baseline_test_rankings = baseline_explainer.get_rankings(baseline_test_values)

In [10]:
# get unique number of points for raw feature importance values
print(f'Train Data Shape: {X_train.shape}; Test Data Shape: {X_test.shape}')

print(f'Number of unique rows of MDI+ training values: {np.unique(mdi_test_values, axis=0, return_counts=True)[0].shape}')
print(f'Number of unique rows of baseline MDI+ training values: {np.unique(baseline_test_values, axis=0, return_counts=True)[0].shape}')

print(f'Number of unique rows of MDI+ test values: {np.unique(mdi_test_values, axis=0, return_counts=True)[0].shape}')
print(f'Number of unique rows of baseline MDI+ test values: {np.unique(baseline_test_values, axis=0, return_counts=True)[0].shape}')

# get unique number of points for feature rankings
print(f'Number of unique rows of MDI+ training rankings: {np.unique(mdi_train_rankings, axis=0, return_counts=True)[0].shape}')
print(f'Number of unique rows of baseline MDI+ training rankings: {np.unique(baseline_train_rankings, axis=0, return_counts=True)[0].shape}')

print(f'Number of unique rows of MDI+ test rankings: {np.unique(mdi_test_rankings, axis=0, return_counts=True)[0].shape}')
print(f'Number of unique rows of baseline MDI+ test rankings: {np.unique(baseline_test_rankings, axis=0, return_counts=True)[0].shape}')

Train Data Shape: (4320, 20); Test Data Shape: (1852, 20)
Number of unique rows of MDI+ training values: (1738, 20)
Number of unique rows of baseline MDI+ training values: (1737, 20)
Number of unique rows of MDI+ test values: (1738, 20)
Number of unique rows of baseline MDI+ test values: (1737, 20)
Number of unique rows of MDI+ training rankings: (4318, 20)
Number of unique rows of baseline MDI+ training rankings: (4313, 20)
Number of unique rows of MDI+ test rankings: (1671, 20)
Number of unique rows of baseline MDI+ test rankings: (1708, 20)
