In [None]:
# Third-party and standard-library dependencies for the notebook.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import seaborn as sns
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
import sys
# Extend the module search path with nearby directories; presumably this is
# what lets the project-local modules imported below resolve -- TODO confirm
# which of these entries is actually required.
sys.path.append('..')
sys.path.append('../..')
sys.path.append('.')
sys.path.append('./scripts')
# NOTE(review): wildcard imports hide which names these modules contribute to
# the notebook namespace; prefer explicit imports of the names actually used.
from competing_methods_local import *
from simulations_util import *
from util import apply_splitting_strategy
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_selection_results(task="regression", results_root="/accounts/projects/binyu/zhongyuan_liang/local_MDI+/imodels-experiments/feature_importance/results"):
    """Load the feature-selection results and average them over all seeds.

    For every dataset of the requested task, one ``results.csv`` is read per
    (split seed, sample seed) pair, tagged with its dataset name, and the
    metric columns are then averaged per (``fi``, ``data``) pair.

    Parameters
    ----------
    task : str
        Either ``"regression"`` or ``"classification"``; selects the dataset
        list and the metric columns that are averaged.
    results_root : str
        Directory containing the ``mdi_local.real_data_*`` result folders.
        Defaults to the location the notebook was originally run against.

    Returns
    -------
    pandas.DataFrame
        One row per (``fi``, ``data``) pair with columns ``fi``, ``data`` and
        the four metric columns of the task, each holding the mean over seeds.

    Raises
    ------
    ValueError
        If ``task`` is not one of the two supported values.
    """
    if task == "regression":
        datasets = ['openml_361242', 'openml_361243', 'openml_361253', 'openml_361254', 'openml_361259', 'openml_361260']
        result_cols = ["RF_Regressor_R2_keep_0.1", "RF_Regressor_R2_keep_0.2", "RF_Regressor_R2_keep_0.3", "RF_Regressor_R2_keep_0.4"]
    elif task == "classification":
        datasets = ['openml_361062', 'openml_361063', 'openml_361069', 'openml_361071', 'openml_43', 'openml_9978']
        result_cols = ["RF_Classifier_AUROC_keep_0.1", "RF_Classifier_AUROC_keep_0.2", "RF_Classifier_AUROC_keep_0.3", "RF_Classifier_AUROC_keep_0.4"]
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"task must be 'regression' or 'classification', got {task!r}")

    split_seeds = [1, 2, 3, 4]
    sample_seeds = [1, 2, 3, 4, 5]

    # Collect the per-seed frames and concatenate once; growing a DataFrame
    # with concat inside the loop copies all accumulated rows every iteration.
    frames = []
    for data in datasets:
        ablation_directory = os.path.join(
            results_root,
            f"mdi_local.real_data_{task}_{data}",
            f"{data}_selection",
            "varying_sample_row_n",
        )
        for split_seed in split_seeds:
            for sample_seed in sample_seeds:
                df = pd.read_csv(os.path.join(ablation_directory, f"seed_{split_seed}_{sample_seed}", "results.csv"))
                df["data"] = data
                frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df.groupby(['fi', 'data'])[result_cols].mean().reset_index()

def load_stability_results(task="regression", results_root="/accounts/projects/binyu/zhongyuan_liang/local_MDI+/imodels-experiments/feature_importance/results"):
    """Load the stability results and average them over all seeds.

    For every dataset of the requested task, one ``results.csv`` is read per
    (split seed, sample seed) pair, tagged with its dataset name, and the
    ``avg_*_features`` columns are then averaged per (``fi``, ``data``) pair.

    Parameters
    ----------
    task : str
        Either ``"regression"`` or ``"classification"``; selects the dataset
        list that is read.
    results_root : str
        Directory containing the ``mdi_local.real_data_*_stability`` result
        folders. Defaults to the location the notebook was originally run
        against.

    Returns
    -------
    pandas.DataFrame
        One row per (``fi``, ``data``) pair with columns ``fi``, ``data``,
        ``avg_10_features``, ``avg_20_features``, ``avg_30_features`` and
        ``avg_40_features``, each holding the mean over seeds.

    Raises
    ------
    ValueError
        If ``task`` is not one of the two supported values.
    """
    if task == "regression":
        datasets = ['openml_361242', 'openml_361243', 'openml_361253', 'openml_361254', 'openml_361259', 'openml_361260']
    elif task == "classification":
        datasets = ['openml_361062', 'openml_361063', 'openml_361069', 'openml_361071', 'openml_43', 'openml_9978']
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"task must be 'regression' or 'classification', got {task!r}")

    result_cols = ["avg_10_features", "avg_20_features", "avg_30_features", "avg_40_features"]
    # NOTE(review): only three split seeds are read here, whereas the selection
    # loader reads four -- confirm this matches the experiments that were run.
    split_seeds = [1, 2, 3]
    sample_seeds = [1, 2, 3, 4, 5]

    # Collect the per-seed frames and concatenate once; growing a DataFrame
    # with concat inside the loop copies all accumulated rows every iteration.
    frames = []
    for data in datasets:
        ablation_directory = os.path.join(
            results_root,
            f"mdi_local.real_data_{task}_{data}_stability",
            f"{data}_stability",
            "varying_sample_row_n",
        )
        for split_seed in split_seeds:
            for sample_seed in sample_seeds:
                df = pd.read_csv(os.path.join(ablation_directory, f"seed_{split_seed}_{sample_seed}", "results.csv"))
                df["data"] = data
                frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df.groupby(['fi', 'data'])[result_cols].mean().reset_index()

In [None]:
def compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="Local_MDI+_MDI_elasticnet_fit_on_all_ranking_RFPlus", baselines=['LIME_RF', 'TreeSHAP_RF']):
    """Average rank of ``method`` against ``baselines`` on the selection metrics.

    Within each dataset, the compared methods are ranked on each metric column
    with the highest value receiving rank 1 (ties share the smallest rank).
    The ranks are then averaged per method over all regression and
    classification datasets, and the row belonging to ``method`` is returned.

    Parameters
    ----------
    regression_results_df : pandas.DataFrame
        Frame with columns ``fi``, ``data`` and the four
        ``RF_Regressor_R2_keep_*`` columns.
    classification_results_df : pandas.DataFrame
        Frame with columns ``fi``, ``data`` and the four
        ``RF_Classifier_AUROC_keep_*`` columns.
    method : str
        Value of ``fi`` whose average rank is reported.
    baselines : list of str
        Values of ``fi`` that ``method`` is ranked against. Not modified.

    Returns
    -------
    pandas.DataFrame
        The row(s) for ``method`` with columns ``fi``, ``top_0.1``,
        ``top_0.2``, ``top_0.3`` and ``top_0.4``. The row index is the
        method's position after sorting all compared methods by ``top_0.1``.
    """
    # Build a fresh list: appending to `baselines` would mutate the caller's
    # list and the shared default, leaking earlier methods into later calls.
    compared_methods = list(baselines) + [method]
    rank_cols = ['top_0.1', 'top_0.2', 'top_0.3', 'top_0.4']

    def _rank_within_datasets(results_df, datasets, result_cols):
        """Rank the compared methods per dataset (higher metric -> rank 1)."""
        keep = results_df["fi"].isin(compared_methods) & results_df["data"].isin(datasets)
        subset = results_df[keep].reset_index(drop=True)
        ranks = subset.groupby("data")[result_cols].rank(ascending=False, method='min')
        ranks.columns = rank_cols
        return pd.concat([subset[["fi", "data"]], ranks], axis=1)

    rank_results_regression = _rank_within_datasets(
        regression_results_df,
        ['openml_361242', 'openml_361243', 'openml_361253', 'openml_361254', 'openml_361259', 'openml_361260'],
        ["RF_Regressor_R2_keep_0.1", "RF_Regressor_R2_keep_0.2", "RF_Regressor_R2_keep_0.3", "RF_Regressor_R2_keep_0.4"],
    )
    rank_results_classification = _rank_within_datasets(
        classification_results_df,
        ['openml_361062', 'openml_361063', 'openml_361069', 'openml_361071', 'openml_43', 'openml_9978'],
        ["RF_Classifier_AUROC_keep_0.1", "RF_Classifier_AUROC_keep_0.2", "RF_Classifier_AUROC_keep_0.3", "RF_Classifier_AUROC_keep_0.4"],
    )

    rank_results = pd.concat([rank_results_regression, rank_results_classification], ignore_index=True)
    rank_results_avg = rank_results.groupby('fi')[rank_cols].mean().sort_values(by='top_0.1').reset_index()
    return rank_results_avg[rank_results_avg["fi"] == method]

def compute_stability_ablation_rank_results(regression_results_df, classification_results_df, method="Local_MDI+_MDI_elasticnet_fit_on_all_ranking_RFPlus", baselines=['LIME_RF', 'TreeSHAP_RF']):
    """Average rank of ``method`` against ``baselines`` on the stability metrics.

    Within each dataset, the compared methods are ranked on each
    ``avg_*_features`` column with the lowest value receiving rank 1 (ties
    share the smallest rank). The ranks are then averaged per method over all
    regression and classification datasets, and the row belonging to
    ``method`` is returned.

    Parameters
    ----------
    regression_results_df : pandas.DataFrame
        Frame with columns ``fi``, ``data`` and the four ``avg_*_features``
        columns for the regression datasets.
    classification_results_df : pandas.DataFrame
        Same layout for the classification datasets.
    method : str
        Value of ``fi`` whose average rank is reported.
    baselines : list of str
        Values of ``fi`` that ``method`` is ranked against. Not modified.

    Returns
    -------
    pandas.DataFrame
        The row(s) for ``method`` with columns ``fi``, ``top_0.1``,
        ``top_0.2``, ``top_0.3`` and ``top_0.4``. The row index is the
        method's position after sorting all compared methods by ``top_0.1``.
    """
    # Build a fresh list: appending to `baselines` would mutate the caller's
    # list and the shared default, leaking earlier methods into later calls.
    compared_methods = list(baselines) + [method]
    result_cols = ["avg_10_features", "avg_20_features", "avg_30_features", "avg_40_features"]
    rank_cols = ['top_0.1', 'top_0.2', 'top_0.3', 'top_0.4']

    def _rank_within_datasets(results_df, datasets):
        """Rank the compared methods per dataset (lower metric -> rank 1)."""
        keep = results_df["fi"].isin(compared_methods) & results_df["data"].isin(datasets)
        subset = results_df[keep].reset_index(drop=True)
        ranks = subset.groupby("data")[result_cols].rank(ascending=True, method='min')
        ranks.columns = rank_cols
        return pd.concat([subset[["fi", "data"]], ranks], axis=1)

    rank_results_regression = _rank_within_datasets(
        regression_results_df,
        ['openml_361242', 'openml_361243', 'openml_361253', 'openml_361254', 'openml_361259', 'openml_361260'],
    )
    rank_results_classification = _rank_within_datasets(
        classification_results_df,
        ['openml_361062', 'openml_361063', 'openml_361069', 'openml_361071', 'openml_43', 'openml_9978'],
    )

    rank_results = pd.concat([rank_results_regression, rank_results_classification], ignore_index=True)
    rank_results_avg = rank_results.groupby('fi')[rank_cols].mean().sort_values(by='top_0.1').reset_index()
    return rank_results_avg[rank_results_avg["fi"] == method]

In [None]:
# Load the seed-averaged feature-selection results for both tasks.
regression_results_df = load_selection_results(task="regression")
classification_results_df = load_selection_results(task="classification")
# Selection task: average rank of "MDI" when ranked against LIME_RF and TreeSHAP_RF.
compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="MDI", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
2,MDI,2.333333,1.916667,1.916667,1.916667


In [None]:
# Selection task: average rank of "Ablation_model_ranking" against LIME_RF and TreeSHAP_RF.
# NOTE(review): this cell has no saved output -- re-run to confirm the method name exists in the results.
compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="Ablation_model_ranking", baselines=['LIME_RF', 'TreeSHAP_RF'])

In [32]:
# Selection task: average rank of "Ablation_model0" against LIME_RF and TreeSHAP_RF.
compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="Ablation_model0", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
0,Ablation_model0,1.666667,1.75,1.416667,1.666667


In [None]:
# Selection task: average rank of "Ablation_model1" against LIME_RF and TreeSHAP_RF.
# NOTE(review): the saved output under this cell reports "Ablation_model2", so the
# cell was edited after it was last executed -- re-run it, or confirm which method
# name is intended (the stability section ranks "Ablation_model2").
compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="Ablation_model1", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
0,Ablation_model2,1.666667,1.166667,1.083333,1.0


In [34]:
# Selection task: average rank of the Local MDI+ variant against LIME_RF and TreeSHAP_RF.
compute_selection_ablation_rank_results(regression_results_df, classification_results_df, method="Local_MDI+_MDI_elasticnet_fit_on_all_ranking_RFPlus", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
0,Local_MDI+_MDI_elasticnet_fit_on_all_ranking_R...,1.25,1.166667,1.0,1.083333


In [35]:
# Load the seed-averaged stability results for both tasks.
# NOTE(review): this rebinds regression_results_df / classification_results_df,
# which previously held the selection results. After this cell runs, the
# selection cells above will not work until the selection results are reloaded;
# distinct names for the two result sets would avoid this hidden state.
regression_results_df = load_stability_results(task="regression")
classification_results_df = load_stability_results(task="classification")
# Stability task: average rank of "MDI" when ranked against LIME_RF and TreeSHAP_RF.
compute_stability_ablation_rank_results(regression_results_df, classification_results_df, method="MDI", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
2,MDI,2.75,2.666667,2.666667,2.416667


In [36]:
# Stability task: average rank of "Ablation_model0" against LIME_RF and TreeSHAP_RF.
compute_stability_ablation_rank_results(regression_results_df, classification_results_df, method="Ablation_model0", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
2,Ablation_model0,2.363636,2.272727,2.181818,1.727273


In [37]:
# Stability task: average rank of "Ablation_model2" against LIME_RF and TreeSHAP_RF.
compute_stability_ablation_rank_results(regression_results_df, classification_results_df, method="Ablation_model2", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
0,Ablation_model2,1.25,1.166667,1.166667,1.083333


In [38]:
# Stability task: average rank of the Local MDI+ variant against LIME_RF and TreeSHAP_RF.
compute_stability_ablation_rank_results(regression_results_df, classification_results_df, method="Local_MDI+_MDI_elasticnet_fit_on_all_ranking_RFPlus", baselines=['LIME_RF', 'TreeSHAP_RF'])

Unnamed: 0,fi,top_0.1,top_0.2,top_0.3,top_0.4
0,Local_MDI+_MDI_elasticnet_fit_on_all_ranking_R...,1.166667,1.0,1.0,1.0
