In [None]:
import json
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc('pdf',fonttype = 42)
plt.rc('ps',fonttype = 42)
import seaborn as sns

sns.set_style('whitegrid')
sns.set_context('paper')

# Directories that are searched for per-run JSON result files.
rdirs = [
    '../results/BCH/',
        ]
# Directory (with trailing slash) that `save` writes figures into.
figdir='figs/'
def save(fig, name):
    """Write `fig` into `figdir` as both a PDF and a JPG.

    Parameters
    ----------
    fig : object
        Figure-like object. If it exposes a ``savefig`` method, that method
        is used so the object that was passed in is the one written to disk.
        Otherwise the current pyplot figure is saved.
    name : str
        File name without extension; it is appended to `figdir`.
    """
    import os

    # Use the figure that was handed in rather than whatever pyplot
    # currently considers the active figure.
    writer = getattr(fig, 'savefig', None)
    if writer is None:
        writer = plt.savefig
    for ext in ['.pdf','.jpg']:
        target = figdir+name+ext
        # Create the output directory on demand so a fresh checkout can run.
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        writer(target,
               dpi=300,
               bbox_inches='tight'
              )

In [None]:
def _ml_name(algorithm):
    """Return the base learner part of an algorithm id (e.g. 'rf_pmc_cv' -> 'rf')."""
    base = algorithm.split('_pmc')[0].split('_mc')[0]
    # The cross-validated logistic regression is reported together with 'lr'.
    return 'lr' if base == 'lr_cv' else base


def _postprocessing(algorithm):
    """Return the post-processing label encoded in an algorithm id ('-' if none)."""
    # Order matters: the '_cv' variants contain the shorter suffixes.
    # '_mc' never matches inside '_pmc', so the MC checks can come first.
    if '_mc_cv' in algorithm:
        return 'MC-CV'
    if '_mc' in algorithm:
        return 'MC'
    if '_pmc_cv' in algorithm:
        return 'PMC-CV'
    if '_pmc' in algorithm:
        return 'PMC'
    return '-'


# Collect every JSON result file; sorted so the row order is reproducible.
globs = []
for rdir in rdirs:
    globs.extend(sorted(glob(rdir+'/*.json')))

# One dict per run; unreadable files are reported and skipped.
frames = []
for f in globs:
    with open(f,'r') as file:
        try:
            d = json.load(file)
        except Exception as e:
            print('!!!Failed to load',f)
            print(e)
            continue
    frames.append(d)

# Fail with a clear message instead of an IndexError on frames[0] below.
if not frames:
    raise RuntimeError(f'No result records could be loaded from {rdirs}')

df_results = pd.DataFrame.from_records(frames)
print(len(frames),'records')
print(frames[0].keys())

df_results['ml_name'] = df_results['algorithm'].apply(_ml_name)
df_results['postprocessing'] = df_results['algorithm'].apply(_postprocessing)
# text embedding type
# df_results['text_encoding'] = df_results['text_encoding'].apply(lambda x: {1:'OHC',-1:'Label Encoding',0:'Word Embedding'}[x])

# Exclude the xgb runs; .copy() keeps the result independent of the unfiltered frame.
df_results = df_results.loc[df_results.ml_name!='xgb'].copy()
# Number of runs available per algorithm.
df_results.groupby('algorithm')['random_state'].count()

In [None]:
# Distinct post-processing labels present in the results.
df_results.postprocessing.unique()

In [None]:
# Number of runs per text_encoding value.
df_results.text_encoding.value_counts()

In [None]:
# Number of runs per rho value.
df_results.rho.value_counts()

In [None]:
# Per-algorithm mean of every numeric column. numeric_only=True skips the
# string columns (e.g. ml_name, postprocessing), which pandas >= 2.0 refuses
# to average instead of silently dropping.
df_results.groupby('algorithm').mean(numeric_only=True).round(3)

In [None]:
# Display names for metric columns (used with DataFrame.rename).
nice_names = {
    'roc_auc_test':'AUROC',
    'auprc_test':'AUPRC',
    'MC_loss_test':'MC loss',
    'PMC_loss_test':'PMC loss',
    'DC_loss_test':'DC loss',

}
# Display labels for categorical values, keyed by column name.
# Every label follows '<MODEL>+<POSTPROCESSING>' with '-CV' for the
# cross-validated variants.
nice_levels = {
    'algorithm':{
        'lr':'LR',
        'lr_mc':'LR+MC',
        'lr_mc_cv':'LR+MC-CV',
        'lr_pmc':'LR+PMC',
        'lr_pmc_cv':'LR+PMC-CV',
        'rf':'RF',
        'rf_mc':'RF+MC',
        'rf_mc_cv':'RF+MC-CV',
        'rf_pmc':'RF+PMC',
        'rf_pmc_cv':'RF+PMC-CV'
    }
}

In [None]:
import pdb
def nice_stat(x):
    """Format `x` as 'mean (2.5% quantile-97.5% quantile)' with two decimals.

    `x` must provide `.mean()` and `.quantile(q)` (e.g. a pandas Series).
    """
    center = x.mean()
    lower = x.quantile(.025)
    upper = x.quantile(.975)
    return f'{center:.2f} ({lower:.2f}-{upper:.2f})'
def nice_stats(x):
    """Apply `nice_stat` through `x.apply` and return the result."""
    summarised = x.apply(nice_stat)
    return summarised


In [None]:
# Spread of the test AUROC across runs, per algorithm.
df_results.groupby('algorithm').roc_auc_test.std()

In [None]:
# Base metric names; the result columns carry a '_test' / '_train' suffix.
metrics = ['roc_auc', 'auprc', 'MC_loss', 'PMC_loss', 'DC_loss']
test_metrics = [f'{metric}_test' for metric in metrics]
train_metrics = [f'{metric}_train' for metric in metrics]

# NOTE: only the last expression of a cell is rendered, so this rounded-mean
# table is computed but not displayed.
df_results.groupby(['ml_name', 'postprocessing'])[test_metrics].mean().round(3)

# Formatted 'mean (2.5%-97.5% quantile)' summary per model / post-processing.
df_results.groupby(['ml_name', 'postprocessing'])[test_metrics].apply(nice_stats)

In [None]:
# From here on test_metrics holds the display names of the metric columns.
test_metrics = ['AUROC','MC loss','PMC loss','DC loss']
# rename() returns a new frame, so df_results itself is left untouched.
df_tbl = df_results.rename(columns=nice_names)
for level, nicety in nice_levels.items():
    # Replace known raw labels by their display label; keep anything else.
    df_tbl[level] = df_tbl[level].apply(lambda raw: nicety.get(raw, raw))
# NOTE: not the last expression of the cell, so this table is not displayed.
df_tbl.groupby(['alpha','gamma','rho','algorithm'])[test_metrics].apply(nice_stats)
df_tbl.groupby(['algorithm'])[test_metrics].apply(nice_stats)

In [None]:
# Display names of the metric columns summarised below.
test_metrics = ['AUROC','MC loss','PMC loss','DC loss']
# rename() returns a new frame, so df_results itself is left untouched.
df_tbl = df_results.rename(columns=nice_names)
for level, nicety in nice_levels.items():
    # Replace known raw labels by their display label; keep anything else.
    df_tbl[level] = df_tbl[level].apply(lambda raw: nicety.get(raw, raw))
# One formatted row per hyperparameter setting and algorithm.
tbl = (
    df_tbl
    .groupby(['n_bins','gamma','alpha','rho','algorithm'])[test_metrics]
    .apply(nice_stats)
)
tbl

In [None]:
# Inspect the base-learner column of the table frame.
df_tbl.ml_name

In [None]:
# Formatted summary for a single hyperparameter setting
# (n_bins == 5, rho == 0.01, gamma == 0.05).
# The three filters are combined into one mask built from df_tbl, and the
# metric columns are selected with a list: pandas >= 2.0 no longer accepts
# the tuple form groupby(...)['a', 'b', 'c'].
(df_tbl
 .loc[(df_tbl.n_bins==5) & (df_tbl.rho==0.01) & (df_tbl.gamma==0.05)]
 .groupby(['gamma','rho','alpha','ml_name','postprocessing'])
 [['AUROC', 'MC loss', 'PMC loss']]
 .apply(nice_stats)
)

In [None]:
# Column-wise minimum of `tbl`.
# NOTE(review): `tbl` is built from nice_stats, whose cells are formatted
# strings, so this minimum is presumably lexicographic rather than numeric
# -- confirm that this is the intended comparison.
tbl.apply(min)

In [None]:
sns.set_style('white')
# Plot frame: display names for the metric columns, gamma == 0.01 runs removed.
df_plt = df_results.rename(columns=nice_names).loc[df_results.gamma != 0.01]
for level, nicety in nice_levels.items():
    # Replace known raw labels by their display label; keep anything else.
    df_plt[level] = df_plt[level].apply(lambda raw: nicety.get(raw, raw))
x, row, col, hue = 'alpha', 'ml_name', 'gamma', 'postprocessing'
# alpha is cast to str before being used as the categorical x variable.
df_plt['alpha'] = df_plt['alpha'].astype(str)
# test_metrics may hold raw or display names; map raw ones to display names.
for m in [nice_names.get(t, t) for t in test_metrics]:
    print(m)
    print('----------------------------------------')
    # One facet per (ml_name, gamma); notched boxes without outlier markers.
    g = sns.catplot(
        data=df_plt,
        kind='box',
        notch=True,
        showfliers=False,
        x=x,
        y=m,
        hue=hue,
        hue_order=['-','MC','PMC'],
        row=row,
        col=col,
        # Only the AUROC panels share a y axis.
        sharey=(m=='AUROC'),
        height=2,
        aspect=2,
    )
    # Horizontal grid lines only.
    for ax in g.axes.flat:
        ax.yaxis.grid(True)

In [None]:
from statannotations.Annotator import Annotator
import ipdb
# sns.set_style('whitegrid')

def make_plot(df_results, kind='box', plot_kwargs=None, facet_kwargs=None):
    """Plot each metric against the algorithm, annotate significance and save.

    A `sns.PairGrid` with one column per metric ('AUROC', 'MC loss',
    'PMC loss', 'DC loss') and the algorithm on the y axis is drawn with the
    plotting function selected by `kind`. Each panel is then annotated with
    Bonferroni-corrected Mann-Whitney tests between selected algorithm pairs,
    and the grid is written to disk through `save` under the name
    f'{kind}_AUROC_MC_PMC_DC'.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Results table. It is not modified; a renamed copy (metric columns via
        `nice_names`, labels via `nice_levels`) is plotted.
    kind : str
        One of 'box', 'point' or 'violin'.
    plot_kwargs : dict, optional
        Extra keyword arguments for the plotting function; they override the
        defaults `order` and `orient`.
    facet_kwargs : dict, optional
        Extra keyword arguments for `sns.PairGrid`; they override the defaults.

    Returns
    -------
    seaborn.PairGrid
        The grid that was drawn.

    Raises
    ------
    KeyError
        If `kind` is not one of the supported plot kinds.
    """
    # None instead of mutable {} defaults; treat a missing dict as empty.
    plot_kwargs = {} if plot_kwargs is None else plot_kwargs
    facet_kwargs = {} if facet_kwargs is None else facet_kwargs

    # rename() returns a new frame, so the caller's frame is left untouched.
    df_plt = df_results.rename(columns=nice_names)
    for level,nicety in nice_levels.items():
        df_plt[level] = df_plt[level].apply(lambda x: nicety.get(x, x))

    # One entry per algorithm label present in the data.
    order=list(df_plt.groupby('algorithm').groups.keys())

    common_plot_kwargs = dict(
          order=order,
          orient="h",
    )
    common_plot_kwargs.update(plot_kwargs)

    # Plotting function and its fixed options for every supported kind.
    plot_args = dict(
       box=dict(
            func=sns.boxplot,
            showfliers=False,
            notch=True,
            dodge=False,
       ),
       point=dict(
          func=sns.pointplot,
          join=False,
       ),
       violin=dict(
          func=sns.violinplot,
          dodge=False,
       )
    )

    x_vars=[
        'AUROC',
        'MC loss',
        'PMC loss',
        'DC loss'
    ]
    pairgrid_kwargs = dict(
        data=df_plt,
        x_vars=x_vars,
        y_vars=['algorithm'],
        hue='postprocessing',
        aspect=0.8
    )
    pairgrid_kwargs.update(facet_kwargs)
    g = sns.PairGrid( **pairgrid_kwargs )

    # The 'func' entry of plot_args[kind] becomes PairGrid.map's function argument.
    g.map(**plot_args[kind],**common_plot_kwargs)

    # stat annotation pairs; keep only pairs whose two algorithms are present,
    # so a frame that lacks one of them does not break the annotation step.
    pairs=[
           ("LR", "LR+MC"), ("LR", "LR+PMC"), ("LR+MC", "LR+PMC"),
           ("RF", "RF+MC"), ("RF", "RF+PMC"), ("RF+MC", "RF+PMC")
          ]
    pairs = [pair for pair in pairs if all(label in order for label in pair)]

    for (ax,x) in zip(g.axes.flat, x_vars):
        ax.yaxis.grid(True)
        ax.xaxis.grid(False)
        ax.set_ylabel('')
        print(x)
        print('.......')
        if pairs:
            annotator = Annotator(
                ax,
                pairs,
                data=df_plt,
                x=x,
                y='algorithm',
                orient='h',
                order=order
            )
            annotator.configure(test='Mann-Whitney',
                                comparisons_correction="Bonferroni",
                                text_format='star',
                                loc='inside')
            annotator.apply_and_annotate()
        print('........................................')

        # The PMC loss axis is anchored at zero.
        if x == 'PMC loss':
            ax.set_xlim(left=0.0)
    save(g, f'{kind}_AUROC_MC_PMC_DC')
    return g

In [None]:
# Point plot of the per-algorithm medians with 99% intervals, xgb runs excluded.
df_tmp = df_results.loc[~df_results.algorithm.str.contains('xgb')]
g = make_plot(
    df_tmp,
    kind='point',
    # Only options that sns.pointplot defines are forwarded: 'saturation' and
    # 'legend_out' are not pointplot parameters, so they had no effect on the
    # figure and can make the call fail on seaborn versions that validate
    # extra keyword arguments.
    plot_kwargs=dict(
        ci=99,
        estimator=np.median
    ),
    facet_kwargs=dict(
        aspect=.85,
    )
)

In [None]:
# Box-plot version of the metric comparison, drawn with the Set2 palette.
make_plot(
    df_results,
    kind='box',
    plot_kwargs={'saturation': 0.9, 'palette': 'Set2'},
    facet_kwargs={'aspect': 0.85},
)

In [None]:
# Violin-plot version of the metric comparison with default options.
make_plot(df_results, 'violin')

In [None]:
# pct difference btw RF / LR with and without PMC/MC

# Mean of every metric per algorithm (rows: algorithm label, columns: metric).
# GroupBy.mean() is used instead of apply(np.mean): np.mean on a DataFrame
# reduces to a single scalar in pandas >= 2.0, which would break the
# (algorithm, metric) lookups below.
df = df_tbl.groupby('algorithm')[test_metrics].mean()

for metric in test_metrics:
    print(metric)
    for m in ['LR','RF']:
        for post in ['MC','PMC']:
            a = df.loc[m,metric]               # base model
            b = df.loc[f'{m}+{post}',metric]   # base model + post-processing
            # Relative change with respect to the base model, in percent.
            pct= 100*((a-b)/a)
            print('\t\t\t',m,post,'% diff:',round(pct,2))
    print('---')