# Scalarizing results analysis

In [1]:
import mlflow
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 150)

In [3]:
SCALARIZING_EXPERIMENT_ID = '1'
BASELINE_EXPERIMENT_ID = '2'

## Load baseline

In [4]:
baseline_df = mlflow.search_runs(BASELINE_EXPERIMENT_ID, output_format='pandas')

In [5]:
baseline_df = baseline_df.query("not `metrics.kne_test_acc`.isna()")\
    .assign(dataset=lambda df: df['params.train_path'].str.split('/').str.get(-1).str.split('-train').str.get(0))

## Load scalarizing

In [6]:
scalarizing_df = mlflow.search_runs(SCALARIZING_EXPERIMENT_ID, output_format='pandas')

In [7]:
scalarizing_df = scalarizing_df.query("status == 'FINISHED'")\
    .dropna(axis=1, how='all')\
    .drop('params.train_and_test_paths', axis=1)\
    .assign(**{"metrics.accuracy_ensemble_selected": lambda df: df['metrics.accuracy_ensemble_selected'].fillna(0.0)})\
    .dropna()\
    .assign(dataset=lambda row: row['params.dataset'].str.split('-').str[0])

## PyPruning

In [12]:
pypruning_df = pd.read_csv('../../pypruning-new/notebooks/results_pypruning.csv')

## Join

In [8]:
analysis_df = scalarizing_df.merge(baseline_df, left_on=['dataset', 'params.bagging_size'], right_on=['dataset', 'params.bagging_size'], how='inner', suffixes=("","_base"))

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.selected_ensemble_accuracy,metrics.accuracy_selection_accuracy,metrics.method_selection_accuracy,metrics.accuracy_ensemble_selected,...,metrics.bagging_test_acc,metrics.ola_test_acc,params.train_path_base,tags.mlflow.runName_base,metrics.RandomPruningClassifier,metrics.bagging_acc,metrics.ProxPruningClassifier,metrics.RankPruningClassifier,metrics.ClusterPruningClassifier,metrics.GreedyPruningClassifier
0,31c1227103394770b95ce77822ba2310,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,2022-12-02 00:50:26.979000+00:00,0.656250,0.656250,0.692708,1.0,...,0.664062,0.705729,../../datasets/processed/pima-train-1-s1.csv,clean-hawk-836,0.731192,0.731626,0.731264,0.731192,0.731192,0.731192
1,31c1227103394770b95ce77822ba2310,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,2022-12-02 00:50:26.979000+00:00,0.656250,0.656250,0.692708,1.0,...,0.674479,0.658854,../../datasets/processed/pima-train-2-s1.csv,upset-fowl-255,0.731192,0.731626,0.731264,0.731192,0.731192,0.731192
2,31c1227103394770b95ce77822ba2310,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,2022-12-02 00:50:26.979000+00:00,0.656250,0.656250,0.692708,1.0,...,0.679688,0.664062,../../datasets/processed/pima-train-1-s2.csv,merciful-dolphin-250,0.731192,0.731626,0.731264,0.731192,0.731192,0.731192
3,31c1227103394770b95ce77822ba2310,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,2022-12-02 00:50:26.979000+00:00,0.656250,0.656250,0.692708,1.0,...,0.643229,0.705729,../../datasets/processed/pima-train-0-s2.csv,honorable-bird-265,0.731192,0.731626,0.731264,0.731192,0.731192,0.731192
4,31c1227103394770b95ce77822ba2310,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,2022-12-02 00:50:26.979000+00:00,0.656250,0.656250,0.692708,1.0,...,0.664062,0.692708,../../datasets/processed/pima-train-2-s2.csv,monumental-goose-694,0.731192,0.731626,0.731264,0.731192,0.731192,0.731192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121014,68ee610ff9864df7aff716cdd49e635a,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:50.651000+00:00,2022-12-03 22:29:39.153000+00:00,0.732143,0.732143,0.732143,0.0,...,0.738095,0.779762,../../datasets/processed/ecoli-train-0-s2.csv,painted-koi-769,0.768043,0.767485,0.767857,0.768043,0.768043,0.768043
121015,68ee610ff9864df7aff716cdd49e635a,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:50.651000+00:00,2022-12-03 22:29:39.153000+00:00,0.732143,0.732143,0.732143,0.0,...,0.750000,0.797619,../../datasets/processed/ecoli-train-3-s2.csv,sedate-conch-868,0.768043,0.767485,0.767857,0.768043,0.768043,0.768043
121016,68ee610ff9864df7aff716cdd49e635a,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:50.651000+00:00,2022-12-03 22:29:39.153000+00:00,0.732143,0.732143,0.732143,0.0,...,0.761905,0.809524,../../datasets/processed/ecoli-train-2-s2.csv,angry-stork-841,0.768043,0.767485,0.767857,0.768043,0.768043,0.768043
121017,68ee610ff9864df7aff716cdd49e635a,1,FINISHED,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:50.651000+00:00,2022-12-03 22:29:39.153000+00:00,0.732143,0.732143,0.732143,0.0,...,0.684524,0.803571,../../datasets/processed/ecoli-train-2-s1.csv,judicious-boar-265,0.768043,0.767485,0.767857,0.768043,0.768043,0.768043


In [14]:
analysis_df['dataset']

0          pima
1          pima
2          pima
3          pima
4          pima
          ...  
121014    ecoli
121015    ecoli
121016    ecoli
121017    ecoli
121018    ecoli
Name: dataset, Length: 121019, dtype: object

In [20]:
mean_accuracies = analysis_df.groupby(['params.scoring_method', 'dataset'])\
    [['metrics.selected_ensemble_accuracy', 'metrics.kne_test_acc', 'metrics.bagging_test_acc', 'metrics.ola_test_acc', 'metrics.mcb_test_acc', 'metrics.desp_test_acc', 'metrics.knorau_test_acc']]\
    .mean()\
    .rename(lambda name: name.replace('metrics.', ''), axis=1)\
    .assign(best=lambda row: row.max(axis=1))\
    .assign(is_method_best=lambda row: row['selected_ensemble_accuracy'] == row['best'])\
    .assign(difference_method_to_best=lambda row: row['selected_ensemble_accuracy'] - row['best'])

In [22]:
mean_accuracies.merge(pypruning_df, left_on=['dataset'], right_on=['dataset'], how='inner', suffixes=("", "_pyprun"))

Unnamed: 0,dataset,selected_ensemble_accuracy,kne_test_acc,bagging_test_acc,ola_test_acc,mcb_test_acc,desp_test_acc,knorau_test_acc,best,is_method_best,difference_method_to_best,metrics.RandomPruningClassifier,metrics.bagging_acc,metrics.ProxPruningClassifier,metrics.RankPruningClassifier,metrics.ClusterPruningClassifier,metrics.GreedyPruningClassifier
0,adult,0.786742,0.720706,0.786407,0.768272,0.768433,0.765017,0.789379,0.789379,False,-0.002637,0.790724,0.79072,0.789449,0.790724,0.790724,0.790724
1,adult,0.786727,0.720744,0.786416,0.768323,0.768486,0.765019,0.789394,0.789394,False,-0.002667,0.790724,0.79072,0.789449,0.790724,0.790724,0.790724
2,appendicitis,0.861347,0.80843,0.877514,0.80135,0.81847,0.859668,0.869119,0.877514,False,-0.016168,0.861111,0.861111,0.861111,0.861111,0.861111,0.861111
3,appendicitis,0.859242,0.808582,0.877475,0.801236,0.818484,0.859609,0.869043,0.877475,False,-0.018233,0.861111,0.861111,0.861111,0.861111,0.861111,0.861111
4,automobile,0.443408,0.369073,0.413454,0.373195,0.382034,0.41806,0.418545,0.443408,True,0.0,0.099525,0.099525,0.099525,0.099525,0.099525,0.099525
5,automobile,0.442181,0.369203,0.413639,0.372934,0.381757,0.418548,0.418621,0.442181,True,0.0,0.099525,0.099525,0.099525,0.099525,0.099525,0.099525
6,balance,0.887584,0.881798,0.875034,0.871689,0.884852,0.877869,0.883187,0.887584,True,0.0,0.887117,0.887117,0.888185,0.887117,0.887117,0.887117
7,balance,0.891569,0.882323,0.875029,0.871915,0.884918,0.877914,0.883321,0.891569,True,0.0,0.887117,0.887117,0.888185,0.887117,0.887117,0.887117
8,banana,0.635011,0.866438,0.57294,0.88577,0.885842,0.891057,0.891702,0.891702,False,-0.256691,0.388008,0.388008,0.387987,0.388008,0.388008,0.388008
9,banana,0.632207,0.866492,0.573567,0.885658,0.885751,0.891057,0.891701,0.891701,False,-0.259494,0.388008,0.388008,0.387987,0.388008,0.388008,0.388008


In [21]:
mean_accuracies

Unnamed: 0_level_0,Unnamed: 1_level_0,selected_ensemble_accuracy,kne_test_acc,bagging_test_acc,ola_test_acc,mcb_test_acc,desp_test_acc,knorau_test_acc,best,is_method_best,difference_method_to_best
params.scoring_method,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
diversity,adult,0.786742,0.720706,0.786407,0.768272,0.768433,0.765017,0.789379,0.789379,False,-0.002637
diversity,appendicitis,0.861347,0.80843,0.877514,0.80135,0.81847,0.859668,0.869119,0.877514,False,-0.016168
diversity,automobile,0.443408,0.369073,0.413454,0.373195,0.382034,0.41806,0.418545,0.443408,True,0.0
diversity,balance,0.887584,0.881798,0.875034,0.871689,0.884852,0.877869,0.883187,0.887584,True,0.0
diversity,banana,0.635011,0.866438,0.57294,0.88577,0.885842,0.891057,0.891702,0.891702,False,-0.256691
diversity,bands,0.641894,0.639812,0.634077,0.605868,0.605119,0.604381,0.619774,0.641894,True,0.0
diversity,breast,0.723302,0.668483,0.724554,0.666525,0.664254,0.709109,0.725406,0.725406,False,-0.002104
diversity,bupa,0.688638,0.64101,0.672311,0.660133,0.649098,0.650301,0.671865,0.688638,True,0.0
diversity,car,0.709165,0.811742,0.690762,0.826505,0.815046,0.777367,0.76572,0.826505,False,-0.11734
diversity,chess,0.964278,0.982624,0.9626,0.970263,0.968589,0.968633,0.968071,0.982624,False,-0.018345


In [10]:
mean_accuracies.to_csv('results.csv')

In [11]:
def count_better_than(series):
    size = len(series)
    new_values = series.copy(deep=True)
    
    for idx, value in enumerate(series):
        values_without_current = series[~series.index.isin([idx])]
        
        rank = (values_without_current > value).sum()
        new_values[idx] = rank
        
    
    return new_values

In [12]:
datasets_ranking = analysis_df.groupby(['params.scoring_method', 'dataset'])\
    [['metrics.selected_ensemble_accuracy', 'metrics.kne_test_acc', 'metrics.bagging_test_acc', 'metrics.ola_test_acc', 'metrics.mcb_test_acc', 'metrics.desp_test_acc', 'metrics.knorau_test_acc']]\
    .mean()\
    .rename(lambda name: name.replace('metrics.', ''), axis=1)\
    .apply(count_better_than, axis=1)\
    .assign(method_best=lambda df: df['selected_ensemble_accuracy'] == 0)

In [None]:
datasets_ranking['selected_ensemble_accuracy'].groupby('params.scoring_method').mean()

params.scoring_method
diversity    3.125
normal       3.125
Name: selected_ensemble_accuracy, dtype: float64

In [None]:
datasets_ranking.to_csv('rankings.csv')

In [18]:
for_analysis_df = analysis_df.groupby(['params.scoring_method', 'dataset'])\
    .mean()\
    .assign(is_method_better=lambda row: row['metrics.selected_ensemble_accuracy'] > row['metrics.accuracy_selection_accuracy'])\
    .assign(is_method_equal=lambda row: row['metrics.selected_ensemble_accuracy'] == row['metrics.accuracy_selection_accuracy'])\
    .assign(difference=lambda row: row['metrics.selected_ensemble_accuracy'] - row['metrics.accuracy_selection_accuracy'])
    
    

In [19]:
for_analysis_df['is_method_better'].groupby(level=0).count()

params.scoring_method
diversity    56
normal       56
Name: is_method_better, dtype: int64

In [20]:
for_analysis_df['difference']

params.scoring_method  dataset        
diversity              adult              0.000008
                       appendicitis      -0.004501
                       automobile         0.002729
                       balance            0.003376
                       banana             0.019280
                       bands              0.001289
                       breast            -0.002172
                       bupa               0.004449
                       car                0.000000
                       chess              0.000232
                       cleveland          0.000122
                       coil2000          -0.000315
                       crx               -0.000698
                       dermatology        0.004012
                       fars               0.000314
                       flare              0.002582
                       german             0.000000
                       glass              0.025573
                       haberman           0

In [22]:
analysis_df\
 .assign(is_method_better=lambda row: row['metrics.selected_ensemble_accuracy'] > row['metrics.accuracy_selection_accuracy'])\
    .assign(is_method_equal=lambda row: row['metrics.selected_ensemble_accuracy'] == row['metrics.accuracy_selection_accuracy'])\
    .assign(difference=lambda row: row['metrics.selected_ensemble_accuracy'] - row['metrics.accuracy_selection_accuracy'])\
    ['is_method_better'].value_counts()

False    98040
True     22979
Name: is_method_better, dtype: int64