In [1]:
# import required functions
import sys
sys.path.append('..')
from scripts.simulations_util import *
from scripts.competing_methods_local import *
from util import apply_splitting_strategy
from subgroup_detection import *
from metrics import rbo

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [2]:
# Simulate a dataset with four equally sized subgroups that differ only in
# which features drive the response:
# - subgroups based on mean shift are equivalent to moving the intercept,
#   so we instead choose to change the betas.
# - ten features total
# - group 1: odd column indices (1, 3, 5, 7, 9) carry the signal
# - group 2: even column indices (0, 2, 4, 6, 8) carry the signal
# - group 3: the first five columns (0-4) carry the signal
# - group 4: the last five columns (5-9) carry the signal
# - within each group the five signal features get betas 1, 2, 3, 4, 5
#   (in increasing column order); every other beta is 0
# - y is a noiseless linear combination of the features

# Seed the RNG so that Restart & Run All regenerates the same data.
# NOTE(review): this presumes sample_normal_X draws from NumPy's global RNG -- confirm.
np.random.seed(0)

n_features = 10
n_per_group = 250          # rows belonging to each subgroup
n_train_per_group = 200    # leading rows of each subgroup used for training
signal_betas = (1, 2, 3, 4, 5)
group_signal_features = (
    (1, 3, 5, 7, 9),   # group 1
    (0, 2, 4, 6, 8),   # group 2
    (0, 1, 2, 3, 4),   # group 3
    (5, 6, 7, 8, 9),   # group 4
)
n_samples = n_per_group * len(group_signal_features)

X = sample_normal_X(n_samples, n_features)
y = np.zeros(n_samples)

train_slices = []
test_slices = []
for group_no, signal_features in enumerate(group_signal_features):
    row_start = group_no * n_per_group
    row_split = row_start + n_train_per_group
    row_stop = row_start + n_per_group

    # Accumulate the terms one at a time, in increasing column order, so the
    # floating-point result matches a plain left-to-right sum of the terms.
    for beta, col in zip(signal_betas, signal_features):
        y[row_start:row_stop] += beta * X[row_start:row_stop, col]

    # The first n_train_per_group rows of every group go to the training set,
    # the remainder to the test set.
    train_slices.append(slice(row_start, row_split))
    test_slices.append(slice(row_split, row_stop))

# split the data into train and test sets; rows stay ordered group by group
trainX = np.concatenate([X[rows, :] for rows in train_slices])
testX = np.concatenate([X[rows, :] for rows in test_slices])
trainy = np.concatenate([y[rows] for rows in train_slices])
testy = np.concatenate([y[rows] for rows in test_slices])

In [3]:
# Base random forest handed to RF+ (random_state pinned to 0).
est = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=0,
)

# Wrap the forest in an RF+ regressor and fit it on the training split.
rf_plus_base = RandomForestPlusRegressor(rf_model=est)
rf_plus_base.fit(trainX, trainy)

# Explainer built on the fitted RF+ model, configured with evaluate_on="oob".
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")

# Training rows are explained with their labels; test rows are explained
# with y=None.
train_scores, train_partial_preds = rf_plus_mdi.explain(X=trainX, y=trainy)
test_scores, test_partial_preds = rf_plus_mdi.explain(X=testX, y=None)

# Convert both score sets into rankings (training first, then test).
train_rankings, test_rankings = (
    rf_plus_mdi.get_rankings(scores) for scores in (train_scores, test_scores)
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   19.8s finished


In [4]:
# Column-wise mean of the rankings within each simulated subgroup.
# The train/test matrices were built by concatenating 200 / 50 rows per group,
# so consecutive row chunks of that size are taken to be groups 1-4.
# NOTE(review): assumes get_rankings returns one row per input sample, in input
# order -- confirm.
# NOTE(review): a column-wise mean reads as "average rank of feature j" only if
# each row holds the rank of every feature; if rows instead list feature indices
# ordered by importance, these are per-position averages -- confirm.
n_groups = 4
for split_label, split_rankings, rows_per_group in (
    ("Training", train_rankings, 200),
    ("Test", test_rankings, 50),
):
    for group_no in range(1, n_groups + 1):
        group_rows = split_rankings[(group_no - 1) * rows_per_group:group_no * rows_per_group, :]
        print(f"Average Feature Rankings for {split_label} Group #{group_no}:",
              np.mean(group_rows, axis=0))

Average Feature Rankings for Training Group #1: [5.85  5.495 4.695 4.34  3.655 3.55  3.65  3.785 4.37  5.61 ]
Average Feature Rankings for Training Group #2: [6.02  4.895 4.93  3.825 3.795 3.11  3.555 4.23  4.7   5.94 ]
Average Feature Rankings for Training Group #3: [5.06  4.85  4.25  3.975 3.575 3.315 3.53  4.48  5.35  6.615]
Average Feature Rankings for Training Group #4: [6.515 5.49  4.835 4.185 3.515 3.39  3.565 3.815 4.515 5.175]
Average Feature Rankings for Test Group #1: [6.16 5.66 4.78 4.32 4.14 3.   2.78 3.86 4.48 5.82]
Average Feature Rankings for Test Group #2: [5.9  5.24 5.   3.86 3.96 2.84 4.48 3.24 4.62 5.86]
Average Feature Rankings for Test Group #3: [5.64 4.68 5.38 4.66 3.66 3.58 3.4  4.28 4.2  5.52]
Average Feature Rankings for Test Group #4: [5.82 5.72 5.   4.22 4.28 3.06 3.34 3.9  3.94 5.72]


In [5]:
# Column-wise median of the rankings within each simulated subgroup.
# The train/test matrices were built by concatenating 200 / 50 rows per group,
# so consecutive row chunks of that size are taken to be groups 1-4.
# NOTE(review): assumes get_rankings returns one row per input sample, in input
# order -- confirm.
# NOTE(review): a column-wise median reads as "median rank of feature j" only if
# each row holds the rank of every feature; if rows instead list feature indices
# ordered by importance, these are per-position medians -- confirm.
n_groups = 4
for split_label, split_rankings, rows_per_group in (
    ("Training", train_rankings, 200),
    ("Test", test_rankings, 50),
):
    for group_no in range(1, n_groups + 1):
        group_rows = split_rankings[(group_no - 1) * rows_per_group:group_no * rows_per_group, :]
        print(f"Median Feature Rankings for {split_label} Group #{group_no}:",
              np.median(group_rows, axis=0))

Median Feature Rankings for Training Group #1: [6. 6. 5. 5. 3. 3. 3. 3. 4. 6.]
Median Feature Rankings for Training Group #2: [6. 5. 5. 3. 3. 3. 3. 4. 5. 6.]
Median Feature Rankings for Training Group #3: [4.  4.5 4.  3.  3.  3.  3.  5.  6.  7. ]
Median Feature Rankings for Training Group #4: [7.  6.  5.  4.  3.  3.  3.  4.  4.  5.5]
Median Feature Rankings for Test Group #1: [7.  6.  4.5 5.  4.  2.  2.  4.  4.  6.5]
Median Feature Rankings for Test Group #2: [6.  5.  4.5 4.  4.  2.5 4.5 2.  5.  6. ]
Median Feature Rankings for Test Group #3: [6.5 4.5 5.  5.  4.  3.  2.5 4.  4.  5.5]
Median Feature Rankings for Test Group #4: [7.  6.  5.  4.5 4.  2.  2.  4.  4.  6. ]
