In [8]:
import imodels
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import *
from sklearn.preprocessing import StandardScaler
import copy
import matplotlib.pyplot as plt
import openml
import sys
sys.path.append('..')
sys.path.append('../..')
sys.path.append('.')
sys.path.append('./scripts')
from competing_methods_local import *
from simulations_util import *

In [9]:
# Load features and targets for UCI dataset id 189 through the project helpers.
# NOTE(review): the helpers are not imported by name; presumably they come from
# one of the star imports above — verify.
X = sample_real_data_X(source="uci", data_id=189)
y = sample_real_data_y(source="uci", data_id=189, return_support=False)

In [10]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Fit the base random forest on the training split, then hand the fitted forest
# to the RF+ regressor and fit that on the same training data.
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
rf.fit(X_train, y_train)
rf_plus_base = RandomForestPlusRegressor(rf_model=rf)
rf_plus_base.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.2s finished


In [12]:
# Run the ranking-retrain LFI evaluation on the fitted RF+ model; the helper
# returns two objects, unpacked here as the train-side and test-side results.
train_data, test_data = LFI_evaluation_RFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_base, mode="absolute")

In [15]:
# First entry of the train-side result (the captured output is a 19-value vector).
train_data[0]

array([13.16, 14.13,  5.62, 14.16,  9.41,  7.43,  6.22,  5.52,  7.24,
        4.43,  5.65,  7.56,  4.37,  4.72,  9.56, 15.22, 11.59, 13.48,
       11.53])

In [21]:
# Positions of the three largest values in train_data[0], largest first
# (negating before argsort turns the ascending sort into a descending one).
np.argsort(-1*train_data[0])[:3].tolist()

[15, 3, 1]

In [None]:
# Always raises AssertionError, so a top-to-bottom run stops at this cell.
# NOTE(review): presumably a deliberate guard so the cells below are only run by hand — confirm.
assert False

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Call the evaluation with the same arguments as the cell that produced
# `train_data, test_data` (X_test included) and unpack the pair it returns.
# The previous call omitted X_test and passed the un-unpacked return value
# straight to np.argsort.
train_scores, _ = LFI_evaluation_RFPlus_all_ranking_retrain(X_train, y_train, X_test, fit=rf_plus_base, mode="absolute")
data = np.argsort(train_scores, axis=1)  # per row: column indices ordered by ascending score

# Adjust the figure size
plt.figure(figsize=(12, 6))  # Width = 12 inches, Height = 6 inches
plt.imshow(data, cmap='viridis', interpolation='nearest', aspect='auto')
plt.colorbar()  # Add a color bar to show the scale
plt.title("Heatmap of NumPy Array")
plt.show()



In [None]:
# Run the l2-norm variant of the ranking-retrain evaluation and display its return value.
# NOTE(review): unlike the ranking-retrain call earlier in the notebook, no X_test is
# passed here — confirm this helper's signature allows that.
LFI_evaluation_RFPlus_all_l2_norm_ranking_retrain(X_train, y_train, fit=rf_plus_base, mode="absolute")

In [None]:
# Build one explainer on the fitted RF+ model and collect several outputs on the
# training data for side-by-side inspection in the cells below:
#   temp1   - explain_linear_partial, default options
#   temp_10 - explain_linear_partial with leaf_average=True
#   temp2   - explain_linear_partial_error_metric, default options
#   temp3   - explain_linear_partial_error_metric with leaf_average=True
#   temp4   - explain_linear_partial_error_metric with ranking=True
rf_plus_mdi = RFPlusMDI(rf_plus_base, evaluate_on="all")
temp1 = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train)
temp_10 = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, leaf_average=True)
temp2 = rf_plus_mdi.explain_linear_partial_error_metric(X=X_train, y=y_train)
temp3 = rf_plus_mdi.explain_linear_partial_error_metric(X=X_train, y=y_train, leaf_average=True)
temp4 = rf_plus_mdi.explain_linear_partial_error_metric(X=X_train, y=y_train, ranking=True)


In [None]:
from collections import defaultdict

# rf.apply returns one leaf id per (sample, tree) with shape
# (n_samples, n_estimators). Leaf ids are only meaningful inside a single tree,
# so samples are grouped by (tree index, leaf id) and the recorded value is the
# row index into X_train. Flattening the matrix first would turn the enumerate
# counter into a flattened position and merge leaves of different trees.
TREE_IDX = 0  # tree whose leaf membership is displayed below
leaf_indices = rf.apply(X_train)
leaf_mapping = defaultdict(list)
for sample_idx, leaves_per_tree in enumerate(leaf_indices):
    for tree_idx, leaf_idx in enumerate(leaves_per_tree):
        leaf_mapping[(tree_idx, int(leaf_idx))].append(sample_idx)
# Training rows that land in node 20 of the chosen tree (empty list if node 20 is not a leaf there).
leaf_mapping[(TREE_IDX, 20)]

In [None]:
# Entry 148 of temp3 (the error-metric output computed with leaf_average=True).
temp3[148]

In [None]:
# Entry 82 of temp3, for comparison with entry 148 above.
temp3[82]

In [None]:
# First entry of temp1 (explain_linear_partial with default options).
temp1[0]

In [None]:
# Hand computation for the first training row: (target - RF+ prediction + temp1[0]) squared.
# NOTE(review): presumably meant to be compared against temp2[0] shown in the next cell — confirm.
(y_train[0] - rf_plus_base.predict(X_train[0].reshape(1, -1)) + temp1[0])**2

In [None]:
# First entry of temp2 (explain_linear_partial_error_metric with default options).
temp2[0]

In [None]:
# NOTE(review): no cell above assigns `temp`; its first assignment is in a later
# cell, so this only works after out-of-order execution.
temp.shape

In [None]:
# Shape of the training target array.
y_train.shape

In [None]:
# Squared difference between `temp` and the targets; the two added axes let
# y_train broadcast along the leading axis of a 3-D `temp`.
# NOTE(review): relies on `temp` from a later cell (see the note on temp.shape above in the notebook).
result = (temp - y_train[:, np.newaxis, np.newaxis])**2

In [None]:
# Shape of y_train after appending two singleton axes (used for the broadcast above).
y_train[:, np.newaxis, np.newaxis].shape

In [None]:
# Target value of training row 20.
y_train[20]

In [None]:
# Squares a hand-typed number.
# NOTE(review): the origin of 0.1676066 is not recorded in the notebook.
(0.1676066)**2

In [None]:
# Slice of `temp` for row 20 (depends on `temp` being assigned by a later cell).
temp[20]

In [None]:
# Squared-difference slice for row 20, from `result` computed above.
result[20]

In [None]:
# Rebuild the RF+ regressor from the current `rf`, refit it on the training split,
# and display its score on the held-out split.
rf_plus_base = RandomForestPlusRegressor(rf_model=rf)
rf_plus_base.fit(X_train, y_train)
rf_plus_base.score(X_test, y_test)

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
# (id 189; note this rebinds X and y, replacing the arrays loaded at the top of the notebook)
parkinsons_telemonitoring = fetch_ucirepo(id=189) 
  
# data (as pandas dataframes) 
X = parkinsons_telemonitoring.data.features 
y = parkinsons_telemonitoring.data.targets 


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Step 1: start from a private copy of the fetched feature table. This keeps the
# cell re-runnable (X is rebound to an ndarray at the end) and avoids writing
# into the DataFrame owned by the dataset object.
features_df = parkinsons_telemonitoring.data.features.copy()

categorical_cols = features_df.select_dtypes(include=["object", "category"]).columns
numerical_cols = features_df.select_dtypes(include=["number"]).columns

# Step 2: Handle missing values (if any)
# Numerical columns: mean imputation.
if features_df[numerical_cols].isnull().any().any():
    num_imputer = SimpleImputer(strategy="mean")
    features_df[numerical_cols] = num_imputer.fit_transform(features_df[numerical_cols])

# Categorical columns: most-frequent imputation.
if len(categorical_cols) > 0 and features_df[categorical_cols].isnull().any().any():
    cat_observed = features_df[categorical_cols].astype(object)
    # Stringify only the observed values so types are consistent, but keep real
    # NaN for the missing cells. Casting everything with astype(str) first would
    # turn NaN into the literal string "nan", leaving the imputer nothing to fill
    # and creating a spurious "nan" category.
    cat_as_text = cat_observed.astype(str).where(cat_observed.notna())
    cat_imputer = SimpleImputer(strategy="most_frequent")
    features_df[categorical_cols] = cat_imputer.fit_transform(cat_as_text.astype(object))

# Step 3: Encode categorical variables using OneHotEncoder (if any categorical columns)
if len(categorical_cols) > 0:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_categorical = encoder.fit_transform(features_df[categorical_cols])

    # Convert encoded categorical data back to a DataFrame aligned on the original index
    X_categorical_df = pd.DataFrame(
        X_categorical,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=features_df.index
    )

    # Step 4: Concatenate numerical columns and the encoded categorical DataFrame
    features_encoded = pd.concat([features_df[numerical_cols], X_categorical_df], axis=1)
else:
    # If no categorical columns, we just use the numerical columns
    features_encoded = features_df[numerical_cols]
X = features_encoded.to_numpy()

In [None]:
# Displays the full encoded feature array; a slice such as X[:5] keeps the output short.
X

In [None]:
# Use the first target column as the response, as a 1-D array. With a single
# target column this is the same as flattening the whole frame, so both original
# branches reduce to this line. Reading from the fetched dataset instead of
# rebinding `y` from itself keeps the cell re-runnable.
y = parkinsons_telemonitoring.data.targets.iloc[:, 0].to_numpy().flatten()

In [None]:
# Fit a random forest model on the full dataset. random_state is fixed, as for
# every other forest in this notebook, so the fit and score are reproducible.
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X, y)
# Score on the same rows used for fitting (in-sample, not a generalisation estimate).
rf.score(X, y)

In [None]:
# Wrap the forest fitted above in an RF+ regressor, fit on the full dataset, and
# display its score on the same rows it was fitted on.
rf_plus_base = RandomForestPlusRegressor(rf_model=rf)
rf_plus_base.fit(X, y)
rf_plus_base.score(X, y)

In [None]:
# Switch datasets: rebind X and y to the imodels "diabetes_regr" dataset
# (the third return value is discarded). The commented line is the "diabetes" alternative.
# X, y, _ = imodels.get_clean_dataset("diabetes")
X, y, _ = imodels.get_clean_dataset("diabetes_regr")

In [None]:
# split the data: 80/20 train/test with a fixed seed; rebinds the split variables
# used by every cell below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## Debug
# RF Regressor
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
est.fit(X_train, y_train)

# RFplus default(fit on all)
rf_plus_base = RandomForestPlusRegressor(rf_model=est)
rf_plus_base.fit(X_train, y_train)

# Build the ALO explainer on the fitted RF+ model and display explain_linear_partial
# on the training data (the result is shown, not stored).
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train)

In [None]:
# Same explainer call as the debug cell above, this time keeping the result in
# `temp` for the inspection cells that follow.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
temp = rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train)

In [None]:
# Shape of the explain_linear_partial output stored in `temp`.
temp.shape

In [None]:
# Length of the last axis of `temp` (indexing with three subscripts shows temp is 3-D).
temp[0,0,:].shape

In [None]:
# R² between 100 copies of y_train[0] and the slice temp[1, 1, :].
# NOTE(review): the target comes from row 0 while the slice uses row 1, and a constant
# y_true has zero variance, which makes R² degenerate — confirm this is the intended check.
r2_score([y_train[0]]*100, temp[1,1,:])

In [None]:
# # RF Regressor
# est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=42)
# est.fit(X_train, y_train)

# # RFplus default(fit on all)
# rf_plus_base = RandomForestPlusRegressor(rf_model=est)
# rf_plus_base.fit(X_train, y_train)

# # RFplus oob 
# rf_plus_base_oob = RandomForestPlusRegressor(rf_model=est, fit_on="oob")
# rf_plus_base_oob.fit(X_train, y_train)

# #RFplus inbag RF
# rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=est, include_raw=False, fit_on="inbag", prediction_model=LinearRegression())
# rf_plus_base_inbag.fit(X_train, y_train)

In [None]:
# RF Classifier
# NOTE(review): RandomForestClassifier / RandomForestPlusClassifier are not imported by
# name in the import cell; presumably they arrive through a star import — verify.
# NOTE(review): the most recent data-loading cell selects "diabetes_regr"; confirm the
# targets are class labels before fitting classifiers on them.
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
est.fit(X_train, y_train)

# RFplus default(fit on all)
rf_plus_base = RandomForestPlusClassifier(rf_model=est)
rf_plus_base.fit(X_train, y_train)

# RFplus oob 
rf_plus_base_oob = RandomForestPlusClassifier(rf_model=est, fit_on="oob")
rf_plus_base_oob.fit(X_train, y_train)

# RFplus inbag, built without the raw features (include_raw=False)
rf_plus_base_inbag = RandomForestPlusClassifier(rf_model=est, include_raw=False, fit_on="inbag")
rf_plus_base_inbag.fit(X_train, y_train)

# #RFplus inbag RF
# est_regressor = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
# est_regressor.fit(X_train, y_train)
# rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=est_regressor, include_raw=False, fit_on="inbag", prediction_model=LinearRegression())
# rf_plus_base_inbag.fit(X_train, y_train)

In [None]:
# Smoke-run every LMDI+ variant under every explain_linear_partial setting.
# The twelve copy-pasted call pairs are expressed as two small tables; the loop
# visits them in the same order as before (settings outer, variants inner) and
# builds a fresh explainer for each call.

# (label, explainer class, fitted RF+ model, evaluate_on)
lmdi_variants = [
    ("Inbag LMDI+", RFPlusMDI, rf_plus_base_inbag, "inbag"),
    ("OOB LMDI+", AloRFPlusMDI, rf_plus_base_oob, "oob"),
    ("ALL LMDI+", AloRFPlusMDI, rf_plus_base, "all"),
]

# (label, keyword arguments forwarded to explain_linear_partial)
explain_settings = [
    ("default", {}),
    ("l2 norm with sign", {"l2norm": True, "sign": True}),
    ("l2 norm without sign", {"l2norm": True, "sign": False}),
    ("ranking then average", {"ranking": True}),
]

lmdi_results = {}
for setting_label, explain_kwargs in explain_settings:
    for variant_label, explainer_cls, fitted_model, evaluate_on in lmdi_variants:
        rf_plus_mdi = explainer_cls(fitted_model, evaluate_on=evaluate_on)
        lmdi_results[(variant_label, setting_label)] = rf_plus_mdi.explain_linear_partial(
            X=X_train, y=y_train, **explain_kwargs
        )

# Show the result of the final call, which is what the cell displayed before.
lmdi_results[("ALL LMDI+", "ranking then average")]

In [None]:
# Signed l2-norm setting only (l2norm=True, sign=True), run for the three variants.
# The same three calls are also made by the larger all-variants cell above; only the
# last call's result is displayed.

# Inbag LMDI+ l2 norm with sign
rf_plus_mdi = RFPlusMDI(rf_plus_base_inbag, evaluate_on="inbag")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True)

# OOB LMDI+ l2 norm with sign
rf_plus_mdi = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="oob")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True)

# ALL LMDI+ l2 norm with sign
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True)

In [None]:
# Unsigned l2-norm setting only (l2norm=True, sign=False), run for the three variants.
# Only the last call's result is displayed.

# Inbag LMDI+ l2 norm without sign
rf_plus_mdi = RFPlusMDI(rf_plus_base_inbag, evaluate_on="inbag")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=False)

# OOB LMDI+ l2 norm without sign
rf_plus_mdi = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="oob")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=False)

# ALL LMDI+ l2 norm without sign
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=False)

In [None]:
# Ranking setting only (ranking=True), run for the three variants.
# Only the last call's result is displayed.

# Inbag LMDI+ with ranking then average
rf_plus_mdi = RFPlusMDI(rf_plus_base_inbag, evaluate_on="inbag")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)

# OOB LMDI+ with ranking then average
rf_plus_mdi = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="oob")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)

# ALL LMDI+ with ranking then average
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, ranking = True)

In [None]:
# OOB variant alone: signed l2-norm output on the training data, displayed for inspection.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base_oob, evaluate_on="oob")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True)

In [None]:
# "all" variant alone: signed l2-norm output on the training data, displayed for inspection.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="all")
rf_plus_mdi.explain_linear_partial(X=X_train, y=y_train, l2norm=True, sign=True)

In [None]:
# RF Classifier
# NOTE(review): this cell mixes model types — `est`, `rf_plus_base` and `rf_plus_base_oob`
# are classifiers, while `rf_plus_base_inbag` is rebuilt as a regressor on a separate
# RandomForestRegressor. Confirm that is intended for the same y_train.
est = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
est.fit(X_train, y_train)

# RFplus default(fit on all)
rf_plus_base = RandomForestPlusClassifier(rf_model=est)
rf_plus_base.fit(X_train, y_train)

# RFplus oob 
rf_plus_base_oob = RandomForestPlusClassifier(rf_model=est, fit_on="oob")
rf_plus_base_oob.fit(X_train, y_train)

#RFplus inbag RF: regressor variant, no raw features, plain linear model as the prediction model
est_regressor = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42)
est_regressor.fit(X_train, y_train)
rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=est_regressor, include_raw=False, fit_on="inbag", prediction_model=LinearRegression())
rf_plus_base_inbag.fit(X_train, y_train)

In [None]:
# Predict the held-out split with `est` and print R² against y_test.
# (Despite its name, X_test_pred holds predictions, not features.)
# NOTE(review): `est` was last bound to a RandomForestClassifier in the cell above,
# so R² is computed on predicted labels — confirm this is the intended metric.
X_test_pred = est.predict(X_test)
print("R2 score of RF: ", r2_score(y_test, X_test_pred))

In [None]:
# Same check for the RF+ model; X_test_pred is overwritten with its predictions.
# NOTE(review): `rf_plus_base` was last bound to a RandomForestPlusClassifier — confirm R² is intended.
X_test_pred = rf_plus_base.predict(X_test)
print("R2 score of RF+: ", r2_score(y_test, X_test_pred))

In [None]:
# Cap the number of rows used by the per-sample evaluations below.
# A seeded Generator makes the chosen rows identical on every run; the previous
# unseeded np.random.choice picked a different subset each time.
SUBSET_SIZE = 100
subset_rng = np.random.default_rng(42)

if X_train.shape[0] > SUBSET_SIZE:
    # Sample without replacement so no row appears twice.
    indices_train = subset_rng.choice(X_train.shape[0], SUBSET_SIZE, replace=False)
    X_train_subset = X_train[indices_train]
    y_train_subset = y_train[indices_train]
else:
    # Small split: use every row as-is.
    indices_train = np.arange(X_train.shape[0])
    X_train_subset = X_train
    y_train_subset = y_train

if X_test.shape[0] > SUBSET_SIZE:
    indices_test = subset_rng.choice(X_test.shape[0], SUBSET_SIZE, replace=False)
    X_test_subset = X_test[indices_test]
    y_test_subset = y_test[indices_test]
else:
    indices_test = np.arange(X_test.shape[0])
    X_test_subset = X_test
    y_test_subset = y_test

In [None]:
# ALO explainer evaluated on out-of-bag samples.
# NOTE(review): this wraps `rf_plus_base` (built without fit_on) rather than
# `rf_plus_base_oob` — confirm the pairing is intended.
rf_plus_mdi = AloRFPlusMDI(rf_plus_base, evaluate_on="oob")

In [None]:
# Display the full return value of explain() on the training data.
rf_plus_mdi.explain(X=X_train, y=y_train)

In [None]:
# Keep the first five entries of the first element returned by explain();
# rebinds `temp`, replacing the explain_linear_partial output stored earlier.
temp = rf_plus_mdi.explain(X=X_train, y=y_train)[0][:5]

In [None]:
# Replace `temp` with its element-wise absolute value and display it.
# (Re-running is harmless: abs of abs is unchanged.)
temp = np.abs(temp)
temp

In [None]:
# For each row, the column indices ordered from largest to smallest value
# (argsort works along the last axis by default).
np.argsort(-1*temp)

In [None]:
# Average rank of each feature across rows (0 = largest value in a row).
# A single argsort lists which feature sits at each rank position, so averaging it
# would average feature indices; the second argsort inverts that permutation into
# the rank held by each feature, which is the quantity worth averaging.
feature_ranks = np.argsort(np.argsort(-1*temp, axis=-1), axis=-1)
np.mean(feature_ranks, axis=0)

In [None]:
# First entry of the second element returned by explain() (recomputes the explanation).
rf_plus_mdi.explain(X=X_train, y=y_train)[1][0]

In [None]:
# Display the `estimators_` attribute of the fitted RF+ model.
rf_plus_base.estimators_

In [None]:
# Target value of the first training row.
y_train[0]

In [None]:
# TreeSHAP scores from the fitted forest `est`; the helper returns four values and
# only the first is kept. train_only=True is forwarded to the helper.
treeshap_fi, _, _, _ = tree_shap_evaluation_RF(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, X_test_subset, y_test_subset, fit=est, mode="absolute", train_only=True)

In [None]:
# LMDI+ (oob, signed l2-norm) scores from the fitted RF+ model; same call shape as the
# TreeSHAP cell above, again keeping only the first of four returned values.
lmdi_fi, _, _, _ = LFI_evaluation_RFPlus_oob_l2_norm_sign(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, X_test_subset, y_test_subset,  fit=rf_plus_base, mode="absolute", train_only=True)

In [None]:
def select_top_features(array, sorted_indices, percentage):
    """Keep the leading fraction of columns according to a precomputed ranking.

    Parameters
    ----------
    array : numpy.ndarray of shape (n_rows, n_features)
        Data matrix to take columns from. It is not modified.
    sorted_indices : sequence of int
        Column indices ordered from most to least important.
    percentage : float
        Fraction of the columns to keep. The count is rounded up, so any
        positive fraction keeps at least one column.

    Returns
    -------
    num_selected : int
        Number of columns kept.
    selected_array : numpy.ndarray of shape (n_rows, num_selected)
        The kept columns, in ranking order. This is a new array and never a
        view of ``array``.
    """
    num_features = array.shape[1]
    num_selected = int(np.ceil(num_features * percentage))
    selected_indices = np.asarray(sorted_indices, dtype=np.intp)[:num_selected]
    # Indexing with an integer array always copies, so the input needs no
    # defensive deep copy beforehand.
    selected_array = array[:, selected_indices]
    return num_selected, selected_array

In [None]:
# Feature-selection ablation driven by the TreeSHAP ranking: for each ratio, keep
# the top-ranked columns, fit a linear model on them, and record test MSE / R².
mask_ratio = [0.05, 0.1, 0.25, 0.5, 0.9]
metric_results_shap_mse = []
metric_results_shap_r2 = []
# Rank features by their mean score over the rows of treeshap_fi, largest first.
train_fi_mean = np.mean(treeshap_fi, axis=0)
sorted_feature = np.argsort(-train_fi_mean)
for mask in mask_ratio:
    print(X_train.shape)
    num_features_masked, X_train_masked = select_top_features(X_train, sorted_feature, mask)
    print(X_train_masked.shape)
    num_features_masked, X_test_masked = select_top_features(X_test, sorted_feature, mask)
    print(X_test_masked.shape)
    # The downstream model is a plain linear regression. It is kept in its own
    # variable so the notebook-level fitted forest `rf` is not overwritten, and the
    # never-used `ablation_models` dict has been dropped.
    ablation_model = LinearRegression()
    ablation_model.fit(X_train_masked, y_train)
    y_pred = ablation_model.predict(X_test_masked)
    metric_results_shap_mse.append(mean_squared_error(y_test, y_pred))
    metric_results_shap_r2.append(r2_score(y_test, y_pred))

In [None]:
# First training row restricted to the columns kept at ratio 0.01.
select_top_features(X_train, sorted_feature, 0.01)[1][0]

In [None]:
# First test row restricted to the same columns.
select_top_features(X_test, sorted_feature, 0.01)[1][0]

In [None]:
# Sanity check: find which positions of the first training row hold the selected values.
# Matching is by value, so any other column that happens to share a value with a
# selected column is reported as well.
indices = np.nonzero(np.isin(X_train[0], select_top_features(X_train, sorted_feature, 0.01)[1][0]))[0]

print(indices)

In [None]:
# Same value-matching check on the first test row; overwrites `indices`.
indices = np.nonzero(np.isin(X_test[0], select_top_features(X_test, sorted_feature, 0.01)[1][0]))[0]

print(indices)

In [None]:
# First training row, shown in full for comparison with the indices printed above.
X_train[0]

In [None]:
# Test MSE recorded for each ratio in mask_ratio (one entry per ratio).
metric_results_shap_mse

In [None]:
# plot metric_results_shap_r2
# One row per ratio; column 0 is the only model evaluated in the SHAP ablation loop.
metric_results_shap_r2 = np.array(metric_results_shap_r2).reshape(len(mask_ratio), -1)
plt.figure()
# The ablation loop fits a LinearRegression on the selected columns, so the
# legend names that model (it previously read "RF_Regressor").
plt.plot(mask_ratio, metric_results_shap_r2[:, 0], label="Linear")
plt.xlabel("Feature Ratio")
plt.ylabel("R2")
plt.legend()
plt.show()

In [None]:
# Feature-selection ablation driven by the LMDI+ ranking: for each ratio, keep the
# top-ranked columns, refit every downstream model on them, and record test MSE / R².
mask_ratio = [0.05, 0.1, 0.25, 0.5, 0.9]
metric_results_lmdi_mse = []
metric_results_lmdi_r2 = []
# Rank features by their mean LMDI+ score. The scores live in `lmdi_fi`;
# `local_fi_score_train` is never assigned anywhere in this notebook.
train_fi_mean = np.mean(lmdi_fi, axis=0)
sorted_feature = np.argsort(-train_fi_mean)
for mask in mask_ratio:
    print(X_train.shape)
    num_features_masked, X_train_masked = select_top_features(X_train, sorted_feature, mask)
    print(X_train_masked.shape)
    num_features_masked, X_test_masked = select_top_features(X_test, sorted_feature, mask)
    print(X_test_masked.shape)
    # Fresh, unfitted models for every ratio so no state carries over between ratios.
    # NOTE(review): `xgb` is not imported by name in the import cell; presumably it
    # comes from one of the star imports — verify.
    ablation_models = {"RF_Regressor": RandomForestRegressor(n_estimators=100,min_samples_leaf=5,max_features=0.33,random_state=42),
                    "Linear": LinearRegression(),
                    "XGB_Regressor": xgb.XGBRegressor(random_state=42),
                    # 'Kernel_Ridge': KernelRidge(),
                    "RF_Plus_Regressor": RandomForestPlusRegressor(rf_model=RandomForestRegressor(n_estimators=100,min_samples_leaf=5,max_features=0.33,random_state=42))}
    # Results are appended model by model within each ratio, so the flat lists are
    # ordered ratio-major and can be reshaped to (len(mask_ratio), n_models).
    for a_model in ablation_models:
        ablation_models[a_model].fit(X_train_masked, y_train)
        y_pred = ablation_models[a_model].predict(X_test_masked)
        metric_results_lmdi_mse.append(mean_squared_error(y_test, y_pred))
        metric_results_lmdi_r2.append(r2_score(y_test, y_pred))