In [1]:
# NOTE: original import order is preserved on purpose. The star import below can
# rebind names, so anything listed after it deliberately wins over it.
import imodels
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusClassifier, RandomForestPlusRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *
from sklearn.preprocessing import StandardScaler
import copy
import matplotlib.pyplot as plt
import openml
import shap


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "diabetes" dataset through imodels; the third returned value is discarded.
X, y, _ = imodels.get_clean_dataset("diabetes")
# Disabled alternatives kept for reference: drop column 4, or load OpenML dataset 588.
# X = np.delete(X, 4,1)
# dataset = openml.datasets.get_dataset(588)
# X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute, dataset_format="array")

fetching diabetes from pmlb


In [3]:
# np.random.seed(42) 
# data = np.random.randn(1000, 10)
# n_groups = 2
# group_indicator = np.random.choice(n_groups, size=1000)
# y = np.zeros(1000)
# coefficients = np.random.randn(n_groups, data.shape[1])
# for group in range(n_groups):
#     group_mask = group_indicator == group
#     selected_features = data[group_mask]
#     y[group_mask] = np.dot(selected_features, coefficients[group])
# X = np.column_stack((data, group_indicator))

In [4]:
# X, y, _ = imodels.get_clean_dataset("diabetes_regr")
# Hold out 33% of the rows as the test split; random_state pins the partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape)
# standardize the data using sklearn's StandardScaler
# (disabled: the cells below work on the unscaled features)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

(514, 8) (254, 8)


In [5]:
# Baseline random forest classifier that the RF+ wrapper is built on.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)
# Give RF+ its own copy of the fitted forest (as the regressor cell further down
# does) so `rf` stays an independent baseline whatever RF+ does with its model.
rf_plus_base = RandomForestPlusClassifier(rf_model=copy.deepcopy(rf))
rf_plus_base.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.7s finished


In [6]:
# NOTE(review): the recorded output of this cell is an AttributeError
# ('AloMDIPlusPartialPredictionModelClassifier' object has no attribute
# 'predict_partial_k_subtract_intercept'), so partial_preds_subtract_intercept is
# never bound when the notebook is run top to bottom — confirm that the installed
# imodels version supports explain_subtract_intercept for classifiers.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
partial_preds_subtract_intercept = rf_plus_mdi.explain_subtract_intercept(X=X_test)

AttributeError: 'AloMDIPlusPartialPredictionModelClassifier' object has no attribute 'predict_partial_k_subtract_intercept'

In [None]:
# Display the first entry of the subtract-intercept partial predictions.
partial_preds_subtract_intercept[0]

In [None]:
# NOTE(review): identical to the display in the preceding cell — likely a leftover duplicate.
partial_preds_subtract_intercept[0]

In [None]:
# Display predict_proba output of the current rf_plus_base for every test row.
rf_plus_base.predict_proba(X_test)

In [None]:
# Run the explainer on the test split; the result is indexed as temp[0][0] in the next cell.
temp = rf_plus_mdi.explain(X=X_test, y=y_test)

In [None]:
# Display the first element of the first item returned by explain().
temp[0][0]

In [None]:
# Fit a random forest regressor and an RF+ regressor built on a copy of it,
# then report their test-set MSE and R^2 side by side.
rf = RandomForestRegressor(n_estimators=100, random_state=41, min_samples_leaf=5, max_features=0.33)
rf.fit(X_train, y_train)
rf_plus_base = RandomForestPlusRegressor(rf_model=copy.deepcopy(rf))
rf_plus_base.fit(X_train, y_train)

# Predict once per model and score both metrics from the same predictions.
rf_test_pred = rf.predict(X_test)
rf_plus_test_pred = rf_plus_base.predict(X_test)

test_all_mse_rf = mean_squared_error(y_test, rf_test_pred)
test_all_r2_rf = r2_score(y_test, rf_test_pred)
test_all_mse_rf_plus = mean_squared_error(y_test, rf_plus_test_pred)
test_all_r2_rf_plus = r2_score(y_test, rf_plus_test_pred)
print("Test MSE RF: ", test_all_mse_rf)
print("Test R2 RF: ", test_all_r2_rf)
print("Test MSE RF+: ", test_all_mse_rf_plus)
print("Test R2 RF+: ", test_all_r2_rf_plus)

In [None]:
# Get shap values
# `shap` is imported in the notebook's import cell rather than here.
explainer = shap.TreeExplainer(rf)
# Signed SHAP values are kept as-is; the ablation cells apply np.abs themselves
# where they need magnitudes.
shap_values_train = explainer.shap_values(X_train, check_additivity=True)
shap_values_test = explainer.shap_values(X_test, check_additivity=True)

In [None]:
def ablation_removal(train_mean, data, feature_importance_rank, feature_index):
    """Replace one ranked feature per row with its training mean.

    For every row of ``data``, the column given by
    ``feature_importance_rank[row, feature_index]`` is overwritten with the
    matching entry of ``train_mean``.

    Parameters
    ----------
    train_mean : array indexed by feature, supplying the replacement values.
    data : 2-D array of samples; it is copied, not modified.
    feature_importance_rank : 2-D integer array of per-row feature indices.
    feature_index : which rank position (column of the rank array) to ablate.

    Returns
    -------
    A copy of ``data`` with the selected entries replaced.
    """
    target_cols = feature_importance_rank[:, feature_index]
    row_ids = np.arange(data.shape[0])
    ablated = data.copy()
    # Paired fancy indexing: one (row, column) cell is written per sample.
    ablated[row_ids, target_cols] = train_mean[target_cols]
    return ablated
def ablation_removal_positive(train_mean, data, feature_importance_rank, feature_importance, feature_index):
    """Mean-ablate the ranked feature only in rows where its importance is positive.

    For each row, the feature at rank position ``feature_index`` is looked up in
    ``feature_importance_rank``; if that feature's score in ``feature_importance``
    is strictly greater than zero, its value is replaced by the training mean.
    Rows whose score is zero or negative are left unchanged.

    Parameters
    ----------
    train_mean : array indexed by feature, supplying the replacement values.
    data : 2-D array of samples; it is copied, not modified.
    feature_importance_rank : 2-D integer array of per-row feature indices.
    feature_importance : 2-D array of signed per-row feature scores.
    feature_index : which rank position (column of the rank array) to ablate.

    Returns
    -------
    A copy of ``data`` with the qualifying entries replaced. The number of
    replaced rows is printed as ``Remove sum:  <count>``.
    """
    n_rows = data.shape[0]
    rows = np.arange(n_rows)
    cols = np.asarray(feature_importance_rank)[:n_rows, feature_index]
    # One boolean per row: is the selected feature's score strictly positive?
    positive = np.asarray(feature_importance)[rows, cols] > 0

    data_copy = data.copy()
    data_copy[rows[positive], cols[positive]] = np.asarray(train_mean)[cols[positive]]

    # Plain int so the printed text matches a Python counter exactly.
    n_removed = int(np.count_nonzero(positive))
    print("Remove sum: ", n_removed)
    return data_copy
def ablation_removal_negative(train_mean, data, feature_importance_rank, feature_importance, feature_index):
    """Mean-ablate the ranked feature only in rows where its importance is negative.

    For each row, the feature at rank position ``feature_index`` is looked up in
    ``feature_importance_rank``; if that feature's score in ``feature_importance``
    is strictly less than zero, its value is replaced by the training mean.
    Rows whose score is zero or positive are left unchanged.

    Parameters
    ----------
    train_mean : array indexed by feature, supplying the replacement values.
    data : 2-D array of samples; it is copied, not modified.
    feature_importance_rank : 2-D integer array of per-row feature indices.
    feature_importance : 2-D array of signed per-row feature scores.
    feature_index : which rank position (column of the rank array) to ablate.

    Returns
    -------
    A copy of ``data`` with the qualifying entries replaced. The number of
    replaced rows is printed as ``Remove sum:  <count>``.
    """
    n_rows = data.shape[0]
    rows = np.arange(n_rows)
    cols = np.asarray(feature_importance_rank)[:n_rows, feature_index]
    # One boolean per row: is the selected feature's score strictly negative?
    negative = np.asarray(feature_importance)[rows, cols] < 0

    data_copy = data.copy()
    data_copy[rows[negative], cols[negative]] = np.asarray(train_mean)[cols[negative]]

    # Plain int so the printed text matches a Python counter exactly.
    n_removed = int(np.count_nonzero(negative))
    print("Remove sum: ", n_removed)
    return data_copy

In [None]:
# Estimators whose predictions are re-scored after each ablation step, keyed by display name.
# The dict holds references to the objects currently bound to `rf` and `rf_plus_base`.
ablation_models = {"RF_Regressor": rf,
                    "Linear": LinearRegression(),
                    "RF_Plus_Regressor": rf_plus_base}
# Split that the ablation cells below evaluate on, plus the label used in metric keys.
X_data = X_test
y_data = y_test
ablation_data="test"
# The linear baseline is the only entry not fitted yet, so fit it here.
ablation_models["Linear"].fit(X_train, y_train)

### SHAP

In [None]:
# SHAP "negative" ablation for the Linear model: walking each row's features from
# most negative SHAP value upward, mean-ablate those with a negative SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "Linear"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Ascending argsort: rank column 0 holds each row's most negative SHAP feature.
local_fi_score_data_rank = np.argsort(shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    print(f"enter i: {i}")
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_negative(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP "positive" ablation for the Linear model: walking each row's features from
# most positive SHAP value downward, mean-ablate those with a positive SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "Linear"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Argsort of the negated values: rank column 0 holds each row's most positive SHAP feature.
local_fi_score_data_rank = np.argsort(-1*shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP magnitude ablation for the Linear model: mean-ablate every row's features in
# order of decreasing |SHAP| and track MSE / R^2 after every cumulative step.
a_model = "Linear"
ablation_est = ablation_models[a_model]
metric_results = {}
# Rank by absolute SHAP value, largest first. Other rankings tried here during
# development: LMDI+ scores, train-split SHAP values, and a random baseline.
local_fi_score_data_rank = np.argsort(-1*np.abs(shap_values_test))
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal(train_mean, X_temp, local_fi_score_data_rank, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP "negative" ablation for the RF regressor: walking each row's features from
# most negative SHAP value upward, mean-ablate those with a negative SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "RF_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Ascending argsort: rank column 0 holds each row's most negative SHAP feature.
local_fi_score_data_rank = np.argsort(shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    print(f"enter i: {i}")
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_negative(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP "positive" ablation for the RF regressor: walking each row's features from
# most positive SHAP value downward, mean-ablate those with a positive SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "RF_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Argsort of the negated values: rank column 0 holds each row's most positive SHAP feature.
local_fi_score_data_rank = np.argsort(-1*shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP magnitude ablation for the RF regressor: mean-ablate every row's features in
# order of decreasing |SHAP| and track MSE / R^2 after every cumulative step.
a_model = "RF_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
# Rank by absolute SHAP value, largest first. Other rankings tried here during
# development: LMDI+ scores, train-split SHAP values, and a random baseline.
local_fi_score_data_rank = np.argsort(-1*np.abs(shap_values_test))
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal(train_mean, X_temp, local_fi_score_data_rank, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP "negative" ablation for the RF+ regressor: walking each row's features from
# most negative SHAP value upward, mean-ablate those with a negative SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "RF_Plus_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Ascending argsort: rank column 0 holds each row's most negative SHAP feature.
local_fi_score_data_rank = np.argsort(shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    print(f"enter i: {i}")
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_negative(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP "positive" ablation for the RF+ regressor: walking each row's features from
# most positive SHAP value downward, mean-ablate those with a positive SHAP value
# and track MSE / R^2 after every cumulative step.
a_model = "RF_Plus_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
local_fi_score_data = shap_values_test
# Argsort of the negated values: rank column 0 holds each row's most positive SHAP feature.
local_fi_score_data_rank = np.argsort(-1*shap_values_test)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# SHAP magnitude ablation for the RF+ regressor: mean-ablate every row's features in
# order of decreasing |SHAP| and track MSE / R^2 after every cumulative step.
a_model = "RF_Plus_Regressor"
ablation_est = ablation_models[a_model]
metric_results = {}
# Rank by absolute SHAP value, largest first. Other rankings tried here during
# development: LMDI+ scores, train-split SHAP values, and a random baseline.
local_fi_score_data_rank = np.argsort(-1*np.abs(shap_values_test))
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Cumulative ablation: step i starts from the output of step i - 1.
    ablation_X_data = ablation_removal(train_mean, X_temp, local_fi_score_data_rank, i)
    # Predict once per step and score both metrics from the same predictions.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[a_model + f'_{ablation_data}_MSE_before_ablation']
r2_before = metric_results[a_model + f'_{ablation_data}_R_2_before_ablation']

# Results are stored under keys 1..num_ablate_features, so the upper bound must be
# inclusive; stopping at num_ablate_features would leave the final step off the plots.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

### Check whether the SHAP values sum to $\hat{y}$

In [None]:
# Additivity check: for every test row, the base value plus the row's SHAP values
# should reproduce the forest's prediction. The comparison is element-wise with a
# float tolerance; reducing each side with np.all() first would only compare two
# booleans and could never detect a mismatch.
shap_reconstruction = np.sum(shap_values_test, axis=1) + explainer.expected_value
assert np.allclose(rf.predict(X_test), shap_reconstruction, rtol=1e-5, atol=1e-6), \
    "SHAP values plus expected_value do not reproduce rf.predict(X_test)"

In [None]:
# Display the explainer's base value, the term added to the per-row SHAP sums in the check above.
explainer.expected_value

In [None]:
# get localMDI+
# Explain the training split with the current rf_plus_base; explain() returns two
# values here, and the second one (`a`) is not used in the cells visible below.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
local_feature_importances_train, a = rf_plus_mdi.explain(X=X_train, y=y_train)
# Display the first entry of the result.
local_feature_importances_train[0]

In [None]:
# Rebuild RF+ on a deep copy of `rf` (so the original forest object is not shared)
# with include_raw=False, fit_on="inbag" and a Ridge(alpha=1e-6) prediction model,
# then recompute the local importances with RFPlusMDI evaluated on "inbag".
# This overwrites both `rf_plus_base` and `local_feature_importances_train`.
from sklearn.linear_model import Ridge
rf_plus_base = RandomForestPlusRegressor(rf_model=copy.deepcopy(rf), include_raw=False, fit_on="inbag", prediction_model=Ridge(alpha=1e-6))
rf_plus_base.fit(X_train, y_train)
rf_plus_mdi = RFPlusMDI(rf_plus_base, evaluate_on="inbag")
# Second return value of explain() is discarded.
local_feature_importances_train, _ = rf_plus_mdi.explain(X=X_train, y=y_train)
# Show the first entry (presumably the scores for the first training sample).
local_feature_importances_train[0]

### LMDI+

In [None]:
# Ablation curve for the "Linear" ablation model using ablation_removal_negative.
# Features are ranked by ascending LMDI+ score; after each ablation step the
# model's MSE and R^2 on the ablated data are recorded and finally plotted.
a_model = "Linear"
ablation_est = ablation_models[a_model]
X_data = X_train
y_data = y_train
# NOTE(review): the label says "test" although X_data/y_data are the training
# split. It only affects the metric_results key names.
ablation_data = "test"
metric_results = {}
metric_prefix = f'{a_model}_{ablation_data}'

local_fi_score_data = local_feature_importances_train
# Ascending argsort along the last axis: smallest scores first.
local_fi_score_data_rank = np.argsort(local_feature_importances_train)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[f'{metric_prefix}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{metric_prefix}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Each step starts from the previous step's output, so ablations accumulate.
    ablation_X_data = ablation_removal_negative(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and reuse the result for both metrics.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{metric_prefix}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{metric_prefix}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[f'{metric_prefix}_MSE_before_ablation']
r2_before = metric_results[f'{metric_prefix}_R_2_before_ablation']

# Steps are stored under 1..num_ablate_features inclusive; the upper bound must
# be num_ablate_features + 1 or the final ablation step is dropped from the plot.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{metric_prefix}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{metric_prefix}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# Ablation curve for the "Linear" ablation model using ablation_removal_positive.
# Features are ranked by descending LMDI+ score; after each ablation step the
# model's MSE and R^2 on the ablated data are recorded and finally plotted.
a_model = "Linear"
ablation_est = ablation_models[a_model]
X_data = X_train
y_data = y_train
# NOTE(review): the label says "test" although X_data/y_data are the training
# split. It only affects the metric_results key names.
ablation_data = "test"
metric_results = {}
metric_prefix = f'{a_model}_{ablation_data}'

local_fi_score_data = local_feature_importances_train
# Negating before argsort yields indices ordered from largest to smallest score.
local_fi_score_data_rank = np.argsort(-1*local_feature_importances_train)
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[f'{metric_prefix}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{metric_prefix}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Each step starts from the previous step's output, so ablations accumulate.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, local_fi_score_data_rank, local_fi_score_data, i)
    # Predict once per step and reuse the result for both metrics.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{metric_prefix}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{metric_prefix}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[f'{metric_prefix}_MSE_before_ablation']
r2_before = metric_results[f'{metric_prefix}_R_2_before_ablation']

# Steps are stored under 1..num_ablate_features inclusive; the upper bound must
# be num_ablate_features + 1 or the final ablation step is dropped from the plot.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{metric_prefix}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{metric_prefix}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# Ablation curve for the "Linear" ablation model using ablation_removal.
# Features are ranked by descending absolute LMDI+ score; after each ablation
# step the model's MSE and R^2 on the ablated data are recorded and plotted.
a_model = "Linear"
ablation_est = ablation_models[a_model]
X_data = X_train
y_data = y_train
# NOTE(review): the label says "test" although X_data/y_data are the training
# split. It only affects the metric_results key names.
ablation_data = "test"
metric_results = {}
metric_prefix = f'{a_model}_{ablation_data}'

# Negated absolute values: indices ordered from largest to smallest magnitude.
local_fi_score_data_rank = np.argsort(-1*np.abs(local_feature_importances_train))
num_ablate_features = X_train.shape[1]
train_mean = np.mean(X_train, axis=0)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[f'{metric_prefix}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{metric_prefix}_R_2_before_ablation'] = r2_score(y_data, y_pred)

ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Each step starts from the previous step's output, so ablations accumulate.
    ablation_X_data = ablation_removal(train_mean, X_temp, local_fi_score_data_rank, i)
    # Predict once per step and reuse the result for both metrics.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
for i in range(num_ablate_features):
    metric_results[f'{metric_prefix}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{metric_prefix}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

mse_before = metric_results[f'{metric_prefix}_MSE_before_ablation']
r2_before = metric_results[f'{metric_prefix}_R_2_before_ablation']

# Steps are stored under 1..num_ablate_features inclusive; the upper bound must
# be num_ablate_features + 1 or the final ablation step is dropped from the plot.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{metric_prefix}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{metric_prefix}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

#### Analysis 1

In [None]:
# Pairwise correlation matrix of the training feature columns.
pd.DataFrame(X_train).corr()

In [None]:
# Analysis 1 with LMDI+ ranks: feature indices ordered by descending importance.
# Reuses a_model, ablation_data, metric_results, num_ablate_features, train_mean,
# X_data, y_data and ablation_est from the preceding ablation cell.
local_fi_score_data = np.argsort(-1 * local_feature_importances_train)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[f'{a_model}_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{a_model}_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

imp_vals = copy.deepcopy(local_fi_score_data)
ablation_results_list = []
ablation_results_list_r2 = []
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # NOTE(review): called with 4 arguments here, but with 5 (rank and scores)
    # in the LMDI+ section -- confirm against the helper's current signature.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, imp_vals, i)
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list.append(mean_squared_error(y_data, ablated_pred))
    ablation_results_list_r2.append(r2_score(y_data, ablated_pred))
    X_temp = ablation_X_data

    # How many entries per column now differ from the original training data.
    differences = X_temp != X_train
    differences_per_col = np.sum(differences, axis=0)
    print(differences_per_col)

    # In-sample R^2 of a fresh linear fit on the ablated data.
    lm = LinearRegression()
    lm.fit(X_temp, y_train)
    print(r2_score(y_train, lm.predict(X_temp)))

# Store the per-step metrics under 1-based step numbers.
for step, (mse_val, r2_val) in enumerate(zip(ablation_results_list, ablation_results_list_r2), start=1):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{step}'] = mse_val
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{step}'] = r2_val

In [None]:
# Analysis 1 baseline: ablate features in a random per-sample order.
# Reuses a_model, ablation_data, metric_results, num_ablate_features, train_mean,
# X_data, y_data and ablation_est from the preceding ablation cell.
# The random score matrix must have one column per feature of X_train. The
# earlier hard-coded 10 columns produced ranks for features that do not exist
# when X_train has a different number of columns.
# NOTE(review): np.random is not seeded here, so the ranking changes between runs.
local_fi_score_data = np.argsort(-1*np.random.rand(X_train.shape[0], X_train.shape[1]))

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[a_model + f'_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[a_model + f'_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)
imp_vals = copy.deepcopy(local_fi_score_data)
ablation_results_list = [0] * num_ablate_features
ablation_results_list_r2 = [0] * num_ablate_features
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # NOTE(review): called with 4 arguments here, but with 5 (rank and scores)
    # in the LMDI+ section -- confirm against the helper's current signature.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, imp_vals, i)
    # Predict once per step and reuse the result for both metrics.
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list[i] = mean_squared_error(y_data, ablated_pred)
    ablation_results_list_r2[i] = r2_score(y_data, ablated_pred)
    X_temp = ablation_X_data
    # How many entries per column now differ from the original training data.
    differences = X_temp != X_train
    differences_per_col = np.sum(differences, axis=0)
    print(differences_per_col)
    # In-sample R^2 of a fresh linear fit on the ablated data.
    lm = LinearRegression()
    lm.fit(X_temp, y_train)
    print(r2_score(y_train, lm.predict(X_temp)))
for i in range(num_ablate_features):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{i+1}'] = ablation_results_list[i]
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]

In [None]:
# Plot MSE and R^2 before and after each ablation step for the run currently
# stored in metric_results.
# The keys are built from a_model / ablation_data, exactly as the ablation cells
# write them. The earlier hard-coded 'Linear_train_...' keys and range(1, 10)
# raised KeyError whenever the label or the number of features differed.
metric_prefix = f'{a_model}_{ablation_data}'
mse_before = metric_results[f'{metric_prefix}_MSE_before_ablation']
r2_before = metric_results[f'{metric_prefix}_R_2_before_ablation']

# Steps are stored under 1..num_ablate_features inclusive.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{metric_prefix}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{metric_prefix}_R_2_after_ablation_{i}'] for i in ablation_steps]
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# Analysis 1 with SHAP ranks: feature indices ordered by descending SHAP value.
# Reuses a_model, ablation_data, metric_results, num_ablate_features, train_mean,
# X_data, y_data and ablation_est from the preceding ablation cell.
local_fi_score_data = np.argsort(-1 * shap_values_train)

# Baseline metrics on the unmodified data.
y_pred = ablation_est.predict(X_data)
metric_results[f'{a_model}_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{a_model}_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

imp_vals = copy.deepcopy(local_fi_score_data)
ablation_results_list = []
ablation_results_list_r2 = []
X_temp = X_data.copy()
for i in range(num_ablate_features):
    # Each step starts from the previous step's output, so ablations accumulate.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, imp_vals, i)
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list.append(mean_squared_error(y_data, ablated_pred))
    ablation_results_list_r2.append(r2_score(y_data, ablated_pred))
    X_temp = ablation_X_data

    # How many entries per column now differ from the original training data.
    differences = X_temp != X_train
    differences_per_col = np.sum(differences, axis=0)
    print(differences_per_col)

    # In-sample R^2 of a fresh linear fit on the ablated data.
    lm = LinearRegression()
    lm.fit(X_temp, y_train)
    print(r2_score(y_train, lm.predict(X_temp)))

# Store the per-step metrics under 1-based step numbers.
for step, (mse_val, r2_val) in enumerate(zip(ablation_results_list, ablation_results_list_r2), start=1):
    metric_results[f'{a_model}_{ablation_data}_MSE_after_ablation_{step}'] = mse_val
    metric_results[f'{a_model}_{ablation_data}_R_2_after_ablation_{step}'] = r2_val

In [None]:
# Plot MSE and R^2 before and after each ablation step for the run currently
# stored in metric_results.
# The keys are built from a_model / ablation_data, exactly as the ablation cells
# write them. The earlier hard-coded 'Linear_train_...' keys and range(1, 10)
# raised KeyError whenever the label or the number of features differed, and the
# R^2 plot paired num_ablate_features-based labels with a 9-element series.
metric_prefix = f'{a_model}_{ablation_data}'
mse_before = metric_results[f'{metric_prefix}_MSE_before_ablation']
r2_before = metric_results[f'{metric_prefix}_R_2_before_ablation']

# Steps are stored under 1..num_ablate_features inclusive.
ablation_steps = range(1, num_ablate_features + 1)
mse_after = [metric_results[f'{metric_prefix}_MSE_after_ablation_{i}'] for i in ablation_steps]
r2_after = [metric_results[f'{metric_prefix}_R_2_after_ablation_{i}'] for i in ablation_steps]
# One shared label list keeps both plots' x and y lengths in sync.
step_labels = ['Before'] + [f'After {i}' for i in ablation_steps]

# Plotting MSE
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [mse_before] + mse_after, color='tab:red', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('MSE')
plt.title(f'{a_model} MSE Before and After Ablation')
plt.show()

# Plotting R^2
plt.figure(figsize=(12, 6))
plt.plot(step_labels, [r2_before] + r2_after, color='tab:blue', marker='o')
plt.xlabel('Ablation Step')
plt.ylabel('R^2')
plt.title(f'{a_model} R^2 Before and After Ablation')
plt.show()

In [None]:
# Re-run the ablation with whatever ranking is currently in local_fi_score_data
# and track, per step, the variance across columns of the number of changed entries.
y_pred = ablation_est.predict(X_data)
metric_results[f'{a_model}_{ablation_data}_MSE_before_ablation'] = mean_squared_error(y_data, y_pred)
metric_results[f'{a_model}_{ablation_data}_R_2_before_ablation'] = r2_score(y_data, y_pred)

imp_vals = copy.deepcopy(local_fi_score_data)
ablation_results_list = []
ablation_results_list_r2 = []
X_temp = X_data.copy()
var = []
for i in range(num_ablate_features):
    # Each step starts from the previous step's output, so ablations accumulate.
    ablation_X_data = ablation_removal_positive(train_mean, X_temp, imp_vals, i)
    ablated_pred = ablation_est.predict(ablation_X_data)
    ablation_results_list.append(mean_squared_error(y_data, ablated_pred))
    ablation_results_list_r2.append(r2_score(y_data, ablated_pred))
    X_temp = ablation_X_data

    # Count changed entries per column, then record the variance of those counts.
    differences = X_temp != X_train
    differences_per_col = np.sum(differences, axis=0)
    var.append(np.array(differences_per_col).var())

In [None]:
# MSE after each ablation step, as filled in by the most recently run ablation loop.
ablation_results_list

In [None]:
# R^2 after each ablation step, as filled in by the most recently run ablation loop.
ablation_results_list_r2

In [None]:
# For every training row, count how many entries of the ablated matrix X_temp
# differ from the original X_train.
differences = X_temp != X_train
differences_per_row = np.sum(differences, axis=1)
print(differences_per_row)

In [None]:
# Shape of the training feature matrix, for reference against the counts above.
X_train.shape

In [None]:
# For every column, count how many entries of the ablated matrix X_temp differ
# from the original X_train, then print the variance of those per-column counts.
differences = X_temp != X_train
differences_per_col = np.sum(differences, axis=0)
print(differences_per_col)
print(np.array(differences_per_col).var())

In [None]:
# Column-wise mean of X_train (np.mean(X_train, axis=0)); this is the first
# argument passed to the ablation helpers.
train_mean

In [None]:
# Fitted coefficients of the ablation estimator (ablation_models[a_model]).
ablation_est.coef_

In [None]:
# Fitted intercept of the ablation estimator.
ablation_est.intercept_

In [None]:
# Mean of the training targets.
y_train.mean()

In [None]:
# Total sum of squares of y_train around its mean.
((y_train - y_train.mean())**2).sum()

In [None]:
# Residual sum of squares of the ablation estimator on the unmodified X_train.
((y_train - ablation_est.predict(X_train))**2).sum()

In [None]:
# Residual sum of squares of the ablation estimator on the ablated matrix X_temp.
((y_train - ablation_est.predict(X_temp))**2).sum()

### Analysis 2

In [None]:
# Fit a fresh linear regression on the ablated matrix X_temp and report its
# in-sample R^2 against y_train.
lm = LinearRegression()
lm.fit(X_temp, y_train)
r2_score(y_train, lm.predict(X_temp))

In [None]:
# Coefficients of the linear model refitted on the ablated data.
lm.coef_

In [None]:
# Coefficients of the ablation estimator, shown for comparison with lm.coef_.
ablation_est.coef_

In [None]:
# Element-wise difference between the refitted coefficients and the ablation
# estimator's coefficients.
lm.coef_-ablation_est.coef_