In [None]:
import json
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc('pdf',fonttype = 42)
plt.rc('ps',fonttype = 42)
import seaborn as sns

# Global seaborn configuration: 'whitegrid' style and 'paper' context.
sns.set_style('whitegrid')
sns.set_context('paper')

# Result directories whose *.json files are aggregated by the loading cell.
# Earlier runs are kept commented out for reference.
# rdir = '../results/lpc/results_22-05-13r1/'
# rdir = '../results/lpc/results_22-05-14/'
rdirs = [
#     '../results/lpc/results_22-05-14r1/',
#     '../results/lpc/results_22-05-16/'
#     '../results/lpc/results_22-07-29r1/'
    '../results/lpc/results_22-07-30/'
        ]
figdir='../overleaf/figs/'
def save(fig, name):
    """Write `fig` into `figdir` as both a PDF and a JPG.

    Parameters
    ----------
    fig : object
        Figure-like object. When it exposes a ``savefig`` method (matplotlib
        figures and seaborn grids do) that method is used, so the object that
        was passed in is the one written to disk. Otherwise the current pyplot
        figure is saved, which is what the previous implementation always did.
    name : str
        File stem appended to `figdir`; the extension is added here.
    """
    # Previously `fig` was ignored and pyplot's *current* figure was saved,
    # which is only correct when `fig` happens to be the latest figure.
    writer = fig.savefig if hasattr(fig, 'savefig') else plt.savefig
    for ext in ('.pdf', '.jpg'):
        writer(figdir + name + ext, dpi=300, bbox_inches='tight')

In [None]:
def _postprocessing_label(algorithm):
    """Map a raw algorithm name to its post-processing label.

    Returns 'MC-CV', 'PMC-CV', 'MC' or 'PMC' when the corresponding suffix
    occurs in `algorithm`, and 'Base Model' otherwise.
    """
    # The '_cv' variants are tested first so '_mc_cv' is never reported as plain 'MC'.
    for suffix, label in (('_mc_cv', 'MC-CV'), ('_pmc_cv', 'PMC-CV'),
                          ('_mc', 'MC'), ('_pmc', 'PMC')):
        if suffix in algorithm:
            return label
    return 'Base Model'


# Collect every JSON result file from the configured result directories.
frames = []
globs = []
for rdir in rdirs:
    globs.extend(glob(rdir+'/*.json'))
for f in globs:
    with open(f,'r') as file:
        try:
            d = json.load(file)
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print('!!!Failed to load',f)
            print(e)
            continue
    frames.append(d)
if not frames:
    # Fail with an actionable message instead of an IndexError on frames[0] below.
    raise FileNotFoundError(f'No readable result files found in {rdirs}')
df_results = pd.DataFrame.from_records(frames)
print(len(frames),'records')
print(frames[0].keys())

# Base learner name: strip the post-processing suffix, then upper-case.
df_results['ML'] = df_results['algorithm'].apply(lambda x: x.split('_pmc')[0].split('_mc')[0].upper())
df_results['postprocessing'] = df_results['algorithm'].apply(_postprocessing_label)
# 'ML' is already upper-cased at this point, so the cross-validated LR shows up
# as 'LR_CV'; comparing against lower-case 'lr_cv' could never match.
df_results.loc[df_results['ML']=='LR_CV','ML'] = 'LR'

df_results['Wall Clock Time (s)'] = df_results['time_time']

# Group lists become comma-joined strings so they can be compared and grouped on.
df_results['groups'] = df_results['groups'].apply(lambda x: ','.join(x) if isinstance(x,list) else x)
# df_results = df_results.loc[df_results.n_bins==10]
# Keep one experimental setting: drop the anchor_year_group grouping and fix gamma and rho.
keep = (
    (df_results.groups!='ethnicity,gender,anchor_year_group')
    & (df_results.gamma==0.05)
    & (df_results.rho==0.001)
)
df_results = df_results.loc[keep]
# Number of runs available per algorithm after filtering.
df_results.groupby('algorithm')['random_state'].count()

In [None]:
# Mean train/test MC loss for each (algorithm, alpha, gamma) combination.
df_results.groupby(['algorithm','alpha','gamma'])[['MC_loss_train','MC_loss_test']].mean()

In [None]:
# Sanity check: which post-processing labels are present after filtering.
df_results['postprocessing'].unique()

In [None]:
# Number of runs per (comma-joined) group definition.
df_results['groups'].value_counts()

In [None]:
# Sanity check: distinct rho values remaining in the filtered results.
df_results['rho'].unique()

In [None]:

# Per-algorithm means of the numeric columns only. The frame also holds string
# columns (e.g. 'ML', 'postprocessing', 'groups'), and pandas >= 2.0 raises
# instead of silently dropping them when numeric_only is not set.
df_results.groupby('algorithm').mean(numeric_only=True).round(3)

In [None]:
# Display names for result columns (raw column name -> label used in tables and plots).
nice_names = {
    'roc_auc_test':'AUROC',
    'auprc_test':'AUPRC',
    'MC_loss_test':'MC loss',
    'PMC_loss_test':'PMC loss',
    'DC_loss_test':'DC loss',
    'n_updates':'# of Updates'
}
# Display names for column values, keyed by column name and then by raw value.
nice_levels = {
    'algorithm':{
        'lr':'LR',
        'lr_mc':'LR+MC',
        'lr_mc_cv':'LR+MC-CV',
        'lr_pmc':'LR+PMC',
        'lr_pmc_cv':'LR+PMC-CV',
        'rf':'RF',
        'rf_mc':'RF+MC',
        'rf_mc_cv':'RF+MC-CV',
        'rf_pmc':'RF+PMC',
        # Was 'RF+PMC_CV'; every other cross-validated label uses a hyphen.
        'rf_pmc_cv':'RF+PMC-CV'
    }
}

In [None]:
import pdb
def nice_stat(x):
    """Format a sample as its mean and standard error joined by a LaTeX plus-minus.

    `x` must provide ``mean()`` and ``sem()`` (for example a pandas Series).
    Both numbers are printed with two decimals.
    """
    # The backslash is escaped explicitly: a bare "\p" is an invalid escape
    # sequence that newer Python versions warn about. The produced text is unchanged.
    return f'{x.mean():.2f}$\\pm${x.sem():.2f}'
def nice_stats(x):
    """Apply `nice_stat` through ``x.apply`` (column-wise for a DataFrame)."""
    return x.apply(nice_stat)


In [None]:
# Raw metric stems; the result files store each one with a _train / _test suffix.
metrics = ['roc_auc','auprc', 'MC_loss', 'PMC_loss', 'DC_loss' ]
test_metrics = [m+'_test' for m in metrics]
train_metrics = [m+'_train' for m in metrics]

by_model = df_results.groupby(['ML','postprocessing'])[test_metrics]
# The plain means used to be computed mid-cell and thrown away; only the last
# expression of a cell is rendered, so show this table explicitly.
display(by_model.mean().round(3))

# Same grouping, formatted as mean +/- standard error strings.
by_model.apply(nice_stats)

In [None]:
# Work on a copy with display-friendly column names and algorithm labels.
df_tbl = df_results.copy()
test_metrics = ['AUROC','MC loss','PMC loss','DC loss']
df_tbl = df_tbl.rename(columns=nice_names) 
for level,nicety in nice_levels.items():
    df_tbl[level] = df_tbl[level].apply(lambda x: nicety[x] if x in nicety else x)
# The per-hyperparameter table used to be computed and discarded (only the
# last expression of a cell is rendered), so display it explicitly.
display(df_tbl.groupby(['alpha','gamma','rho','algorithm'])[test_metrics].apply(nice_stats))
# Overall summary per algorithm.
df_tbl.groupby(['algorithm'])[test_metrics].apply(nice_stats)

In [None]:
# Summary table per (alpha, gamma, algorithm), this time including wall-clock time.
# Filters tried earlier and currently disabled: n_bins==10, alpha>0.01, rho>0.01, gamma>0.01.
test_metrics = ['AUROC','MC loss','PMC loss','DC loss','Wall Clock Time (s)']
df_tbl = df_results.copy().rename(columns=nice_names)
for level, nicety in nice_levels.items():
    # Swap raw values for their display labels; unknown values pass through unchanged.
    df_tbl[level] = df_tbl[level].apply(lambda raw: nicety.get(raw, raw))
grouped = df_tbl.groupby(['alpha','gamma','algorithm'])[test_metrics]
tbl = grouped.apply(nice_stats)
tbl

# comparison of winning percentages

In [None]:
import ipdb
# For every metric and every random seed, record the configuration of the best run.
best_config = []
test_metrics = ['AUROC','MC loss','PMC loss','DC loss']
df = df_results.rename(columns=nice_names) 
config_cols = ['algorithm','alpha','gamma','rho','n_bins','ML','Wall Clock Time (s)',
               'postprocessing','groups']
for metric in test_metrics:
    for seed, dfgs in df.groupby('random_state'):
        # AUROC is maximised; the loss metrics are minimised.
        if metric=='AUROC':
            best = dfgs.loc[dfgs[metric].idxmax()]
        else:
            best = dfgs.loc[dfgs[metric].idxmin()]
        # Build the record as a plain dict rather than adding keys to a slice of
        # the row Series, which can trigger pandas' SettingWithCopyWarning.
        record = best[config_cols].to_dict()
        record['metric'] = metric
        record['value'] = best[metric]
        best_config.append(record)

best_df = pd.DataFrame.from_records(best_config)

In [None]:
# Count, per metric, how often each post-processing option produced the best run,
# and export the counts as a LaTeX table.
# (Alternative breakdown: group by ['metric','ML'] instead of ['metric'].)
win_counts = best_df.groupby(['metric'])['postprocessing'].value_counts()
tbl = win_counts.unstack()
tbl = tbl.fillna(0).astype(int)  # metric/option pairs that never won become 0
tbl.to_latex('../overleaf/tbls/winning_configs.tex')
tbl

In [None]:
# Mean winning value per metric (rows) and post-processing option (columns).
best_df.groupby(['metric','postprocessing'])['value'].mean().unstack()

In [None]:
# Inspect the per-seed winners. NOTE(review): this renders the whole frame; consider best_df.head().
best_df

In [None]:
from sklearn.utils import resample
def boot_med(x, n_boot=1000, random_state=0):
    """Bootstrap estimate of the median of `x`.

    Draws `n_boot` resamples of `x` with replacement (each the same length as
    `x`, sampled along the first axis), takes the median of every resample and
    returns the mean of those medians.

    Parameters
    ----------
    x : array-like
        Observations; must contain at least one element.
    n_boot : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.Generator or None, default 0
        Seed or generator for the resampling. The fixed default makes repeated
        notebook runs reproducible; pass None for fresh entropy on every call.

    Raises
    ------
    ValueError
        If `x` is empty or `n_boot` is smaller than 1.
    """
    values = np.asarray(x)
    n_obs = values.shape[0] if values.ndim else 0
    if n_obs == 0:
        raise ValueError('boot_med requires at least one observation')
    if n_boot < 1:
        raise ValueError('n_boot must be at least 1')
    rng = np.random.default_rng(random_state)
    # One row of indices per resample; fancy indexing draws them all at once.
    draws = rng.integers(0, n_obs, size=(n_boot, n_obs))
    medians = np.median(values[draws].reshape(n_boot, -1), axis=1)
    return np.mean(medians)

In [None]:
# One point-plot figure per metric (median vs. alpha), faceted by model (rows) and
# group definition (columns). For '# of Updates' and 'Wall Clock Time (s)' a figure
# is produced for each loss, annotated with the best median loss per post-processing.
sns.set_style('white')
df_plt = df_results.copy()
# Filter with df_plt's own columns so every mask matches the frame it indexes;
# the final .copy() keeps the column assignments below from writing into a slice.
df_plt = df_plt.loc[df_plt.rho==0.001]
# df_plt = df_plt.loc[df_plt.gamma==0.05]
df_plt = df_plt.loc[df_plt.n_bins==10].copy()
alpha_order=['0.15','0.1','0.05','0.01','0.001']
post_order=['Base Model','MC','PMC']
ml_order=['LR','RF']
group_order=['ethnicity,gender','ethnicity,gender,insurance']
# alpha is plotted as a categorical axis, so it is converted to strings matching alpha_order.
df_plt['alpha'] = df_plt['alpha'].astype(str)
df_plt = df_plt.rename(columns=nice_names) 
for level,nicety in nice_levels.items():
    df_plt[level] = df_plt[level].apply(lambda x: nicety[x] if x in nicety else x)
row,col,hue=('ML','groups','postprocessing') 

# Filled below with the best (lowest median loss) alpha per model/group/post-processing.
best_cfgs = []
for m in [nice_names[t] if t in nice_names else t for t in test_metrics+['Wall Clock Time (s)','# of Updates']]:
    # Only the cost metrics are annotated, once per loss; other metrics get a single figure.
    losses = ['PMC loss', 'MC loss', 'DC loss'] if m in ['# of Updates','Wall Clock Time (s)'] else ['none']
    for loss in losses:
        print(m)
        if m == '# of Updates':
            # The base model is left out of the update-count plot.
            hue_order=post_order[1:]
        else:
            hue_order=post_order
        print('----------------------------------------')
        g=sns.catplot(
            kind='point',
            dodge=.3 if m not in ['# of Updates','Wall Clock Time (s)'] else False,
            join=False,
            estimator=np.median,
            data=df_plt,
            x=m,
            y='alpha',
            order=alpha_order,
            hue=hue,
            marker=hue,
            hue_order=hue_order,
            row=row,
            row_order=ml_order,
            col=col,
            col_order=group_order,
            sharex=True if m=='AUROC' else False,
            aspect=2,
            height=2.5
        )
        for (ml,group),ax in g.axes_dict.items():
            ax.yaxis.grid(True)
            if m == '# of Updates' or m == 'Wall Clock Time (s)':
                # Offsets that move the annotation text away from the marker.
                xoff=3 if m == '# of Updates' else 1
                yoff=-.1
                df = df_plt.copy().loc[(df_plt.ML==ml) & (df_plt.groups==group) & (df_plt.postprocessing.isin(hue_order))]
                # Median loss per (alpha, postprocessing), then the row with the lowest
                # median for each post-processing option.
                scores = df.groupby(['alpha','postprocessing'])[loss].median().reset_index()
                best_idx = scores.groupby('postprocessing')[loss].idxmin()
                # Series.iteritems() was removed in pandas 2.0; items() works on all versions.
                for p, val in best_idx.items():
                    s = scores.loc[val]
                    a = s['alpha'] 
                    score = s[loss]
                    y = alpha_order.index(a)
                    # x position: bootstrapped median of the plotted metric at the best alpha.
                    x = boot_med(df.loc[(df.postprocessing==p) & (df.alpha==a)][m].values )
                    lossname=loss.split(' ')[0]
                    if p in ['MC','PMC']:
                        best_cfgs.append({
                            'metric': loss,
                            'Best Loss': score,
                             m:x,
                            'alpha':a,
                            'postprocessing':p,
                            'ml':ml,
                            'groups':group,
                        })
                    ax.annotate(f'{lossname}={score:.2f}', xy=(x+xoff,y+yoff), horizontalalignment='left',va='bottom',
                               rotation=5)
                # Extend the y-axis slightly to make room for the annotations.
                ylim=ax.get_ylim()
                print(ylim)
                ax.set_ylim(bottom=ylim[0]-.1)

        plt.savefig((f"../overleaf/figs/catpoint_{m}_vs_alpha_row-{row}_col-{col}_hue-{hue}_annot-{loss}.pdf"
                    .replace(' ','-')
                    .replace('#','n')
                    ),
                    dpi=300, bbox_inches='tight' 
                   )

In [None]:
# Turn the best-configuration records collected while plotting into a frame and show it.
df_cfgs = pd.DataFrame.from_records(best_cfgs)
display(df_cfgs)

In [None]:
# NOTE(review): repeated display of df_cfgs; this cell looks redundant.
df_cfgs

In [None]:
# NOTE(review): repeated display of df_cfgs; this cell looks redundant.
df_cfgs

In [None]:
import itertools as it
# Row order of the exported table: every (loss, model, post-processing) combination,
# in the order itertools.product yields them.
order = list(it.product(['MC loss','PMC loss','DC loss'], ml_order, post_order[1:]))

# Median best loss, update count and wall-clock time per combination.
cfg_medians = (
    df_cfgs
    .rename(columns={'metric':'Metric','ml':'ML','postprocessing':'Postprocessing'})
    .groupby(['Metric','ML','Postprocessing'])
    [['Best Loss','# of Updates','Wall Clock Time (s)']]
    .median()
)
tbl = cfg_medians.sort_index(level='Metric',ascending=False).loc[order]

# One formatter per column: 3 decimals for the loss, integer updates, 1 decimal for seconds.
column_formatters = [
    lambda x: f'{x:0.3f}',
    lambda x: f'{x:.0f}',
    lambda x: f'{x:.1f}',
]
tbl.to_latex('../overleaf/tbls/best_cfg_time.tex',
             multirow=True,
             formatters=column_formatters)
display(tbl)

In [None]:
# Average of two relative increases: (378 vs. 52) and (504.69 vs. 188.07).
# NOTE(review): the literals look hand-copied from a results table — confirm their
# source, or compute them from the table so they cannot go stale.
((378-52)/52
 + (504.69-188.07)/188.07
)/2

In [None]:
from statannotations.Annotator import Annotator
import ipdb
# sns.set_style('whitegrid')

def make_plot(df_results, kind='box', plot_kwargs=None, facet_kwargs=None):
    """Plot each test metric against the algorithm, annotate pairwise tests, and save.

    Builds a seaborn PairGrid with one panel per metric ('AUROC', 'MC loss',
    'PMC loss', 'DC loss') and the algorithm on the y-axis, draws the requested
    plot kind, adds Mann-Whitney annotations (Bonferroni-corrected) for the
    LR/RF base-vs-MC-vs-PMC pairs, and writes the figure through `save`.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Raw results; columns and algorithm labels are renamed via `nice_names`
        and `nice_levels` on an internal copy.
    kind : {'box', 'point', 'violin'}
        Which seaborn plotting function to map onto the grid.
    plot_kwargs : dict, optional
        Extra keyword arguments for the plotting function. They take precedence
        over both the shared defaults and the per-kind defaults.
    facet_kwargs : dict, optional
        Extra keyword arguments for ``sns.PairGrid``.

    Returns
    -------
    The PairGrid that was drawn.

    Raises
    ------
    KeyError
        If `kind` is not one of the supported plot kinds.
    """
    # None instead of mutable {} defaults, which would be shared across calls.
    plot_kwargs = {} if plot_kwargs is None else plot_kwargs
    facet_kwargs = {} if facet_kwargs is None else facet_kwargs

    df_plt = df_results.copy()
    df_plt = df_plt.rename(columns=nice_names) 
    for level,nicety in nice_levels.items():
        df_plt[level] = df_plt[level].apply(lambda x: nicety[x] if x in nicety else x)

    # Algorithms in the key order of the groupby; reused for the plot and the annotations.
    order=list(df_plt.groupby('algorithm').groups.keys())

    common_plot_kwargs = dict(
          order=order,
          orient="h",
    )
    common_plot_kwargs.update(plot_kwargs)
    
    # Per-kind plotting function and its default arguments.
    plot_args = dict(
       box=dict( 
            func=sns.boxplot, 
            showfliers=False,
            notch=True,
            dodge=False,
       ),
       point=dict(
          func=sns.pointplot, 
          join=False,
       ),
       violin=dict(  
          func=sns.violinplot, 
          dodge=False,
       )
    )
    
    x_vars=[
        'AUROC',
        'MC loss',
        'PMC loss',
        'DC loss'
    ]
    pairgrid_kwargs = dict(
        data=df_plt, 
        x_vars=x_vars,
        y_vars=['algorithm'],
        hue='postprocessing',
        aspect=0.8
    )
    pairgrid_kwargs.update(facet_kwargs)
    g = sns.PairGrid( **pairgrid_kwargs )

    # Merge into one dict so a caller-supplied option (e.g. dodge) overrides the
    # per-kind default instead of raising a duplicate-keyword TypeError.
    map_kwargs = {**plot_args[kind], **common_plot_kwargs}
    g.map(**map_kwargs)
    # stat annotation pairs
    pairs=[
           ("LR", "LR+MC"), ("LR", "LR+PMC"), ("LR+MC", "LR+PMC"),
           ("RF", "RF+MC"), ("RF", "RF+PMC"), ("RF+MC", "RF+PMC")
          ]

    for (ax,x) in zip(g.axes.flat, x_vars):
        ax.yaxis.grid(True)
        ax.xaxis.grid(False)
        ax.set_ylabel('')
        print(x)
        print('.......')
        annotator = Annotator(
            ax,
            pairs, 
            data=df_plt, 
            x=x, 
            y='algorithm', 
            orient='h',
            order=order
        )
        annotator.configure(test='Mann-Whitney', 
                            comparisons_correction="Bonferroni",
                            text_format='star', 
                            loc='inside')
        annotator.apply_and_annotate()
        print('........................................')
        
        if x == 'PMC loss':
            ax.set_xlim(left=0.0)
    save(g, f'{kind}_AUROC_MC_PMC_DC')
    return g

In [None]:
# Point-plot version of the metric comparison, using the median as estimator and ci=99.
g = make_plot(
    df_results, kind='point', 
    plot_kwargs=dict(
        saturation=0.5,
        ci=99,
        legend_out=True,
        estimator=np.median
    ),
    facet_kwargs=dict(
        aspect=.85,
    )
)


In [None]:
# Box-plot version of the metric comparison with the 'Set2' palette.
make_plot(
    df_results, kind='box', 
    plot_kwargs=dict(saturation=0.9,
                     palette='Set2'
                    ),
    facet_kwargs=dict(aspect=0.85)
)

In [None]:
# Violin-plot version of the metric comparison with default options.
make_plot(df_results, kind='violin')

In [None]:
# Percent difference between the base LR / RF models and their MC / PMC
# post-processed versions, and between MC and PMC, for every metric.

# Column-wise mean per algorithm. GroupBy.mean() is used rather than
# apply(np.mean): np.mean on a DataFrame passes axis=None, which pandas >= 2.0
# reduces over both axes to a single scalar, breaking the lookups below.
df = df_tbl.groupby('algorithm')[test_metrics].mean()

for metric in test_metrics:
    print(metric)
    for m in ['LR','RF']:
        for post in ['MC','PMC']:
            # Relative change of the post-processed model with respect to the base model.
            a = df.loc[m,metric] 
            b = df.loc[f'{m}+{post}',metric]
            pct= 100*((a-b)/a)
            print('\t\t\t',m,post,'% diff:',round(pct,2))
        # Relative change of PMC with respect to MC.
        a = df.loc[f'{m}+MC',metric] 
        b = df.loc[f'{m}+PMC',metric]
        pct= 100*((a-b)/a)
        print('\t\t\t',m,'MC-PMC','% diff:',round(pct,2))
    print('---')

In [None]:
# loss versus running time 

In [None]:
# sns.set_style('white')
# df_plt = df_results.copy()
# df_plt = df_plt.loc[df_results.rho==0.01]
# df_plt = df_plt.loc[df_results.gamma==0.1]
# df_plt = df_plt.loc[df_results.n_bins==10]
# # df_plt = df_plt.loc[df_results.alpha.isin([0.001,0.01,0.1])]
# # df_plt = df_plt.copy().loc[df_results.gamma==0.1]
# df_plt['alpha'] = df_plt['alpha'].astype(str)
# df_plt = df_plt.rename(columns=nice_names) 
# for level,nicety in nice_levels.items():
#     df_plt[level] = df_plt[level].apply(lambda x: nicety[x] if x in nicety else x)
# # row,col,hue=('rho','alpha','gamma') 
# # row,col,hue=('ml_name','gamma','alpha') 
# row,col,hue=('ML','groups','postprocessing') 
# # row,col,hue=('alpha','gamma','postprocessing') 
# # row,col,hue=('gamma','rho','postprocessing') 
# for m in [nice_names[t] if t in nice_names else t for t in test_metrics]:
#     # m = 'Wall Clock Time (s)'
#     print(m)
#     print('----------------------------------------')
#     g=sns.relplot(
#     #         kind='box',
#         kind='scatter',
#         data=df_plt,
#         x='Wall Clock Time (s)',
#         y=m,
#         hue=hue,
#         style='alpha',
#         hue_order=['Base Model','MC','PMC'],
#         row=row,
#         row_order=['LR','RF'],
#         col=col,
# #         facet_kws=dict(sharey=False),
#         aspect=1,
#         height=4
#     )
#     # g.set(grid=True,axis='y')
#     for ax in g.axes.flat:
#         ax.yaxis.grid(True)
#     #         ax.set_ylabel('')