In [None]:
import json
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

In [None]:
def nice_stat(x):
    """Summarise a pandas Series as ``'mean (q2.5-q97.5)'`` with two decimals.

    Parameters
    ----------
    x : pandas.Series
        Values to summarise (anything exposing ``.mean()`` and ``.quantile()``).

    Returns
    -------
    str
        The mean followed by the 2.5% and 97.5% quantiles in parentheses.
    """
    # Alternative format kept for reference: f'{x.mean():.2f}$\pm${x.std():.2f}'
    center = x.mean()
    lower = x.quantile(.025)
    upper = x.quantile(.975)
    return f'{center:.2f} ({lower:.2f}-{upper:.2f})'

# bootstrap metric fn

In [None]:
from sklearn.utils import resample
from pqdm.processes import pqdm
from tqdm.notebook import tqdm

def ballogloss(y_true, y_pred):
    """Class-balanced log loss.

    Every sample is weighted by the inverse of its class frequency, and the
    weights are normalised to sum to one, so each class contributes equally to
    the loss regardless of prevalence.

    Parameters
    ----------
    y_true : 1-D array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted probabilities, passed straight to ``log_loss``.

    Returns
    -------
    float
        ``log_loss`` evaluated with the balanced sample weights.
    """
    labels = np.asarray(y_true)
    # Count each class once and broadcast the count back to the samples,
    # instead of re-scanning the whole label array for every sample.
    _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
    weights = 1.0 / counts[inverse]
    weights /= np.sum(weights)
    return log_loss(y_true, y_pred, sample_weight=weights)

def bootstrap_metric(fn, n, y_true, y_pred, n_samples):
    """Score one bootstrap resample of ``(y_true, y_pred)`` with ``fn``.

    Parameters
    ----------
    fn : callable
        Metric called as ``fn(y_true, y_pred)``; its ``__name__`` labels the record.
    n : int
        Bootstrap replicate number, stored as-is in the record.
    y_true, y_pred : array-like
        Labels and predictions, resampled together.
    n_samples : int or None
        Forwarded to ``sklearn.utils.resample``.

    Returns
    -------
    dict
        Record with the keys ``'metric'``, ``'value'`` and ``'bootstrap'``.
    """
    metric_name = fn.__name__
    sample_true, sample_pred = resample(y_true, y_pred, n_samples=n_samples)
    score = fn(sample_true, sample_pred)
    return dict(metric=metric_name, value=score, bootstrap=n)
        
def bootstrap_metrics(y_true, y_pred, n_bootstraps=100, n_samples=None):
    """Compute every metric on ``n_bootstraps`` bootstrap resamples.

    For each replicate, AUROC, average precision, log loss and balanced log
    loss are each evaluated on their own resample via ``bootstrap_metric``.

    Returns
    -------
    list of dict
        One record per (replicate, metric) pair, ordered by replicate.
    """
    # Printing a blank first works around a tqdm rendering issue in worker
    # processes: https://github.com/tqdm/tqdm/issues/485
    print(' ', end='', flush=True)
    metric_fns = (roc_auc_score, average_precision_score, log_loss, ballogloss)
    return [
        bootstrap_metric(metric_fn, replicate, y_true, y_pred, n_samples)
        for replicate in tqdm(range(n_bootstraps))
        for metric_fn in metric_fns
    ]

# load lstm results
- to save time, this step is saved to a csv and re-loaded below

In [None]:
import numpy as np
from mimic3models.common_utils import phenotype_names 
# from tqdm.notebook import tqdm
# df_lstm = pd.read_csv('results_lstm/k_lstm.n256.d0.3.dep1.bs8.ts1.0.epoch100.test0.4631129801273346.state.csv')
df_lstm = pd.read_csv('results_lstm/k_lstm.n256.d0.3.dep1.bs8.ts1.0.epoch14.test0.4256526231765747.state.csv')

# Number of bootstrap replicates per phenotype; forwarded to bootstrap_metrics
# by bootstrap_phenotype below (previously defined but never used).
n_bootstraps = 100

def bootstrap_phenotype(i,p):
    """Bootstrap all metrics for the LSTM predictions of one phenotype.

    Parameters
    ----------
    i : int
        Zero-based phenotype position; the columns read from ``df_lstm`` are
        ``label_{i+1}`` and ``pred_{i+1}``.
    p : str
        Phenotype name; semicolons are stripped to form the ``'task'`` value.

    Returns
    -------
    list of dict
        The records from ``bootstrap_metrics``, each extended with
        ``'method': 'LSTM'`` and ``'task'``.
    """
    # Printing a blank first works around a tqdm rendering issue in worker
    # processes: https://github.com/tqdm/tqdm/issues/485
    print(' ', end='', flush=True)
    y_true = df_lstm[f'label_{i+1}'].values
    y_pred = df_lstm[f'pred_{i+1}'].values
    scores = bootstrap_metrics(y_true, y_pred, n_bootstraps=n_bootstraps)
    task = p.replace(';','')
    return [{'method':'LSTM', 'task':task, **s} for s in scores]

# Bootstrap every phenotype in parallel; each job receives (index, name).
frames = pqdm(
    list(enumerate(phenotype_names)),
    bootstrap_phenotype,
    n_jobs=20,
    argument_type='args',
)

# Each job returns a list of records; flatten them into one long table.
flat_frames = [record for job_records in frames for record in job_records]
df_lsr = pd.DataFrame.from_records(flat_frames)
df_lsr['data'] = 'raw'
df_lsr['n_nodes'] = 341249
# Number of nodes gotten with this code:
# from mimic3models.keras_models.lstm import Network
# lstm = Network(dim=256, batch_norm=False, dropout=0.3, rec_dropout=0.1, task='ph')
# lstm.summary()
# Model: "network"
# _________________________________________________________________
#  Layer (type)                Output Shape              Param #
# =================================================================
#  X (InputLayer)              [(None, None, 76)]        0

#  masking (Masking)           (None, None, 76)          0

#  lstm (LSTM)                 (None, 256)               340992

#  dropout (Dropout)           (None, 256)               0

#  dense (Dense)               (None, 1)                 257

# =================================================================
# Total params: 341,249
# Trainable params: 341,249
# Non-trainable params: 0
    
# Persist the bootstrapped LSTM scores so the expensive step above can be skipped later.
df_lsr.to_csv('lstm_bootstrapped.k_lstm.n256.d0.3.dep1.bs8.ts1.0.epoch14.test0.4256526231765747.state.csv', index=False)
# Only the last expression of a cell is rendered, so the frame goes last.
df_lsr

## Get Micro-averaged scores for LSTM

In [None]:
# YTrues = []
# YPreds = []
# for i in range(len(phenotype_names)):
#     y_true = df_lstm[f'label_{i+1}'].values
#     y_pred = df_lstm[f'pred_{i+1}'].values
#     YTrues.extend(y_true)
#     YPreds.extend(y_pred)
# len(YTrues)==len(YPreds)

# results = bootstrap_metrics(np.asarray(YTrues), np.asarray(YPreds), n_samples=10000)
# # micro_lstm = pd.DataFrame.from_records(frames,columns=frames[0].keys())
# # micro_lstm
# micro_lstm = pd.DataFrame.from_records(results)

# FEAT and LR results 

In [None]:

# Result roots that are scanned below. Each is used as a prefix for
# '/results/*.csv' (metric tables) and 'predictions/' (per-run prediction
# JSON), so the trailing slash on every entry matters.
rdirs = [
         'results/lpc/results_linear/',
         'results/lpc/results_archive_22-06-07/',
         # paths containing 'dim100_' are labelled with the '-100' method suffix downstream
         'results/lpc/results_archive_dim100_22-06-07/',
#          'results/lpc/results_feat_22-05-26/',
        ]
# rdirs = ['arch_test/']

In [None]:
dropcols = ['metrics','acc', 'prec0', 'prec1', 'rec0', 'rec1', 'minpse' ]

metrics = ['roc_auc_score','average_precision_score','ballogloss','log_loss']

# Collect every per-run result table under the configured result roots.
globs = [path for rdir in rdirs for path in glob(rdir+'/results/*.csv')]

rframes = []
for f in globs:
    d = pd.read_csv(f).drop(columns='metrics', errors='ignore')
    # FEAT result files are labelled here from their path; other files are
    # appended with whatever columns they already have.
    if '.feat.' in f:
        d['method'] = 'FEAT-100' if 'dim100_' in f else 'FEAT'
    rframes.append(d)

print('loaded',len(rframes),'frames')
df_r = pd.concat(rframes) #.dropna()

# rename metrics to the sklearn function names used by the bootstrap code
metnames = {'auroc':'roc_auc_score',
            'auprc':'average_precision_score',
            'logloss':'log_loss'
           }
df_r['metric'] = df_r['metric'].map(lambda name: metnames.get(name, name))
df_r['task'] = df_r['task'].map(lambda name: name.replace('-',' '))
df_r = df_r.loc[df_r['metric'].isin(metrics)]


In [None]:
# Which methods were loaded?
df_r['method'].unique()

In [None]:
# Spot-check one LR run on the Septicemia task.
# Optional extra filters: (df_r.metric=='log_loss'), (df_r.fold=='val')
is_lr = df_r['method']=='LR'
is_septicemia = df_r['task'].str.contains('Septicemia')
is_run = df_r['run_id']=='3194bf64dd0e11ec9865a0369feec84c'
df_r.loc[is_lr & is_septicemia & is_run]

In [None]:


# Files belonging to the LR Septicemia run inspected above.
lr_pred_name = "results/lpc/results_linear/predictions/lr.run_3194bf64dd0e11ec9865a0369feec84c.param_4.json"
lr_res_name = 'results/lpc/results_linear/results/Septicemia-(except-in-labor).lr.run_3194bf64dd0e11ec9865a0369feec84c.param_4.csv'
lr_params = 'results/lpc/results_linear/results/Septicemia-(except-in-labor).lr.run_3194bf64dd0e11ec9865a0369feec84c.param_4.params'

df_lr_sept = pd.read_csv(lr_res_name)

with open(lr_pred_name) as file:
    d = json.load(file)

# show which fields the prediction file contains
d.keys()

In [None]:
# Load and display the stored parameters of that LR run.
with open(lr_params) as file:
    p = json.load(file)
p

# parameter selection: pick results with best val score

In [None]:
import ipdb
def not_overfit_best(data, metric='roc_auc_score', threshold=0.05):
    """Select the run with the best validation score among runs that do not overfit.

    ``data`` is pivoted to one row per (run_id, task, method, archive_id). A
    run's overfitting is the relative train-to-validation drop
    ``(train - val) / train`` of ``metric``. Runs whose drop is at most
    ``threshold`` are kept; if none qualifies, the threshold is widened in
    steps of 0.01 until at least one does. Among the kept runs, the one with
    the largest validation ``metric`` wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format results with the columns ``run_id``, ``task``, ``method``,
        ``archive_id``, ``fold``, ``metric`` and ``value``; must contain the
        ``'train'`` and ``'val'`` folds for ``metric``.
    metric : str
        Metric name to compare; larger is treated as better.
    threshold : float
        Initial maximum relative train-to-validation drop.

    Returns
    -------
    dict
        ``{'run_id', 'task', 'method', 'archive_id'}`` of the selected run.

    Raises
    ------
    ValueError
        If no run has a usable overfitting ratio (all NaN or +inf), or if no
        kept run has a validation score.
    """
    indexer = ['run_id','task','method','archive_id']
    dfp = data.pivot(
             index = indexer,
             columns=['fold','metric'],
             values='value'
            )
    train_score = dfp[('train',metric)]
    overfitting = (train_score - dfp[('val',metric)])/train_score

    # NaN and +inf never satisfy `<= threshold`; without this guard the
    # widening loop below would never terminate.
    if not (overfitting < np.inf).any():
        raise ValueError(
            f"no run has a usable train/val '{metric}' pair; cannot select a best run"
        )

    mask = overfitting <= threshold
    while np.sum(mask) == 0:
        threshold += 0.01
        mask = overfitting <= threshold
    print(data.iloc[0]['method'],data.iloc[0]['task'],'mask sum:',np.sum(mask))

    best = dfp[mask].nlargest(1,columns=[('val',metric)])
    if best.empty:
        raise ValueError(f"no validation '{metric}' score among the selected runs")
    # The pivot index holds exactly the identifying fields, in `indexer` order.
    return dict(zip(indexer, best.index[0]))
     

In [None]:

from model_selection import (smallest_of_best_three_quartiles,
                             best_of_smallest_three_quartiles, 
                             best, 
                             smallest_of_best_quartile)
selector = best_of_smallest_three_quartiles
# selector = smallest_of_best_three_quartiles
# selector = smallest_of_best_quartile
bests = []

# Validation metric used to pick the winning run of each method.
metric = {
    'FEAT': 'roc_auc_score',
    'FEAT-100': 'roc_auc_score',
    'LR': 'log_loss',
    'LR-10': 'log_loss',
    'LR-100': 'log_loss',
}

lr_indexer = ['run_id','task','method']
feat_indexer = ['run_id','task','method','archive_id']

for (method,task),dfg in df_r.groupby(['method','task']):
    if 'LR' not in method:
        # FEAT and FEAT-100 share one rule: best validation score among runs
        # that do not overfit. (A best_of_smallest_three_quartiles variant was
        # tried for FEAT-100 and left disabled.)
        bests.append(not_overfit_best(dfg, metric=metric[method], threshold=0.2))
        continue
    # LR variants: the run with the smallest validation loss wins.
    val_scores = dfg.loc[(dfg.fold=='val') & (dfg.metric==metric[method])]
    winner = val_scores.nsmallest(1, columns='value')
    bests.append(winner[lr_indexer].to_dict(orient='records')[0])

# Pull every row of each winning run out of df_r, one key at a time.
dfs = []
for elem in bests:
    selected = df_r
    for column, wanted in elem.items():
        selected = selected.loc[selected[column]==wanted]
    dfs.append(selected)

# Only the held-out test fold is reported.
df_best = pd.concat(dfs).loc[lambda frame: frame.fold=='test']

In [None]:
# Methods that survived model selection.
df_best['method'].unique()

In [None]:
# import pdb
# # Option: try heuristic from before (smallest of best 3 quartiles etc)
# # val_metric = 'roc_auc_score'
# # val_metric = 'average_precision_score'
# val_metric = 'log_loss'
# fn = np.min
# dfs = []
# for method, val_metric, fn in [
#     ('FEAT','roc_auc_score',np.max),
#     ('FEAT-100','roc_auc_score',np.max),
#     ('LR','log_loss',np.min)
#     ]:
#     task_best = (df_r.loc[(df_r.method==method) & (df_r.fold=='val') & (df_r.metric==val_metric)]
#                  .groupby(['method','task'],as_index=False)
#                  ['value']
#                  .apply(fn)
#                  .reset_index()
#                 )
#     df = pd.merge(df_r,task_best,on=['method','task'],suffixes=('','_best')).set_index(['run_id','task','method','archive_id'])
#     idx = df.loc[(df.fold=='val') & (df.metric==val_metric) & (df.value==df.value_best)].index.values
#     df_best = df.loc[idx].reset_index()
# #     pdb.set_trace()
#     df_best = df_best.loc[df_best.fold=='test']
#     dfs.append(df_best)
# df_best = pd.concat(dfs)
# df_best = df_best.drop_duplicates(subset=['task',
#                                 'data',
#                                 'param_id',
#                                 'fold','model','n_nodes','metric','value','method','value_best'])

In [None]:
# Test AUROC of the selected runs: one row per task, one column per method.
(
    df_best
    .loc[df_best['metric']=='roc_auc_score']
    .groupby(['task','metric','method'])
    ['value']
    .mean()
    .unstack()
    .round(3)
)

In [None]:
# Non-null row counts per method, as a quick completeness check.
df_best.groupby(by='method').count()

## load feat predictions and calculate bootstrapped metrics

In [None]:
import os
import ipdb 

def get_pred_name(x):
    """Build the prediction-file name for one row of the selected results.

    LR variants:   ``<method without dashes>.run_<run_id>.param_<param_id>.json``
    Other methods: ``<task with dashes>.<method family>.run_<run_id>.param_<param_id>.arc<archive_id>.json``

    ``x`` must expose ``method``, ``run_id`` and ``param_id``; non-LR rows also
    need ``task`` and an integer-convertible ``archive_id``.
    """
    method = x.method.lower()
    if 'lr' in method:
        stem = method.replace('-','')
        return f"{stem}.run_{x.run_id}.param_{x.param_id}.json"
    # e.g. 'feat-100' -> 'feat'
    family = method.split('-')[0]
    task_slug = x.task.replace(' ','-')
    return f"{task_slug}.{family}.run_{x.run_id}.param_{x.param_id}.arc{int(x.archive_id)}.json"
        
        
# One prediction file per selected run (df_best has several metric rows per run).
pred_names = df_best.apply(get_pred_name, axis=1).unique()
pframes = []
# for f in [rdir + 'predictions/'+pn for pn in pred_names]:
# #     print(f)
# #     pframes.append(d)
# # print(pframes)
# df_p = pd.DataFrame.from_records(pframes)
# df_p

# Number of bootstrap replicates per prediction file; forwarded to
# bootstrap_metrics by bootstrap_phenotype below (previously defined but unused).
n_bootstraps = 100

def bootstrap_phenotype(f):
    """Bootstrap all metrics for one FEAT/LR prediction file.

    Parameters
    ----------
    f : str
        Path to a prediction JSON holding the keys ``'label'``, ``'pred'``
        and ``'task'``.

    Returns
    -------
    list of dict
        The records from ``bootstrap_metrics``, each extended with ``'method'``
        (parsed from the file name) and ``'task'`` (dashes replaced by spaces).
    """
    with open(f,'r') as file:
        d = json.load(file)
    y_true = np.array(d['label'])
    y_pred = np.array(d['pred'])
    scores = bootstrap_metrics(y_true, y_pred, n_bootstraps=n_bootstraps)

    # Parse the method from the file name only, so a directory that happens to
    # contain 'feat' cannot change which dot-separated field is read.
    fname = os.path.basename(f)
    # File names with 'feat' carry the method in the second field (the task
    # comes first); otherwise the method is the first field.
    idx = 1 if 'feat' in fname else 0
    method = fname.split('.')[idx].upper()
    if 'LR10' in method:
        # 'LR10' -> 'LR-10', 'LR100' -> 'LR-100'
        method = method.replace('10','-10')
    # The '-100' variant is identified by its directory, not its file name.
    if 'dim100_' in f:
        method += '-100'

    task = d['task'].replace('-',' ')
    return [{'method':method, 'task':task, **s} for s in scores]

# frames = []
# for f in [rdir + 'predictions/'+pn for pn in pred_names]:
#     frames.append(bootstrap_phenotype(f))
# Resolve each prediction file name to the first result directory containing it.
pred_files = []
for p in pred_names:
    for r in rdirs:
        f = r+ 'predictions/'+p
        if os.path.exists(f):
            pred_files.append(f)
            # Stop at the first hit. The previous `continue` was a no-op, so a
            # file present in several directories was queued (and bootstrapped)
            # once per directory.
            break
    else:
        # no directory contained the file
        raise ValueError(f"uh oh spaghetti-ohs, {p} not found")
    
# frames = [bootstrap_phenotype(p) for p in tqdm(pred_files)]
# Bootstrap every selected prediction file in parallel.
frames = pqdm(pred_files, bootstrap_phenotype, n_jobs=20)

# Each job returns a list of records; flatten them into one long table.
flat_frames = [record for job_records in frames for record in job_records]
df_feat_lr = pd.DataFrame.from_records(flat_frames)
df_feat_lr['data'] = 'tsfresh'



In [None]:
# Bootstrapped mean and 95% interval per task/metric, one column per method.
(
    df_feat_lr
    .groupby(['task','metric','method'])
    ['value']
    .apply(nice_stat)
    .unstack()
)

# Get Micro-averaged scores for FEAT and LR

In [None]:
# YTrues = {}
# YPreds = {}
# YTruesByTask = {}
# YPredsByTask = {}
# for f in pred_files:
#     with open(f,'r') as file:
#         d = json.load(file)
# #     import ipdb
# #     ipdb.set_trace()
#     y_true = np.array(d['label'])
#     y_pred = np.array(d['pred'])
#     if 'feat' in f:
#         idx = 1
#     else:
#         idx = 0
#     method = f.split('/')[-1].split('.')[idx].upper()
#     if 'dim100_' in f:
#         method += '-100'
#     if not (method in YTrues.keys()):
#         YTrues[method] = []
#         YPreds[method] = []
#         YTruesByTask[method] = {} 
#         YPredsByTask[method] = {} 
#     YTrues[method].extend(y_true)
#     YPreds[method].extend(y_pred)
#     YTruesByTask[method][d['task']] = y_true
#     YPredsByTask[method][d['task']] = y_pred
    

# frames = []
# # scores = pqdm([dict(y_true=np.asarray(YTrues[m]),y_pred=np.asarray(YPreds[m])) for m in YTrues.keys()],
# #               bootstrap_metrics,
# #               n_jobs=3,
# #               argument_type='kwargs' 
# # )
# # for scrs,m in zip(scores,YTrues.keys()):
# #     [s.update({'method':m}) for s in scrs]
# #     frames.extend(scrs)
    
# for m in YTrues.keys():
#     scores = bootstrap_metrics(np.asarray(YTrues[m]),np.asarray(YPreds[m]), n_samples=10_000, n_bootstraps=100)
#     [s.update({'method':m}) for s in scores]
#     frames.extend(scores)
# micro_feat_lr = pd.DataFrame.from_records(frames)

# frames=[]
# for m,v in YTruesByTask.items():
#     for t in v.keys():
#         scores = bootstrap_metrics(np.asarray(YTruesByTask[m][t]),np.asarray(YPredsByTask[m][t]), 
#                                    n_samples=100000, n_bootstraps=1)
#         [s.update({'method':m,'task':t}) for s in scores]
#         frames.extend(scores)
               
# scores_feat_lr = pd.DataFrame.from_records(frames)    

## sanity checks

In [None]:
# step = 6281
# for i,(task,vals) in enumerate(YTruesByTask['FEAT'].items()):
#     print(task,
#           all(YTrues['FEAT'][i*step:i*step+step] == vals)
#          )

In [None]:
# step = 6281
# for m,v in YPredsByTask.items():
#     for i,t in enumerate(v.keys()):
#         print(m,t,
#               all(YPreds[m][i*step:i*step+step] == YPredsByTask[m][t])
#              )

In [None]:
# scores_feat_lr.loc[scores_feat_lr['metric']=='roc_auc_score'].groupby(['task','method']).mean().unstack().round(3)

In [None]:
# import math
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_context('paper')
# sns.set(font_scale=1.2)
# sns.set_style('whitegrid')

# df_plt = scores_feat_lr.loc[scores_feat_lr.metric.isin(['roc_auc_score','average_precision_score'])]

# df_plt['nice-task'] = df_plt['task'].apply(lambda x: task_names[x] if x in task_names.keys() else x)
# # task_order = df_plt[df_plt.metric=='roc_auc_score'].groupby('nice-task')['value'].mean().sort_values().index[::-1]
# task_order = df_plt[df_plt.metric=='average_precision_score'].groupby('nice-task')['value'].mean().sort_values().index[::-1]
# print('task_order:',task_order)
# g = sns.catplot(
#     kind='point',
#     estimator=np.median,
# #     kind='strip',
# #     showfliers=False,
# #     dodge=False,
#     data=df_plt,
#     x='value',
# #     y='task',
#     y='nice-task',
#     hue='method',
#     order=task_order,
# #     hue_order=['FEAT','FEAT-100','LR-10','LR-100','LR'], #,'LSTM'],
#     hue_order=['FEAT','LR-10'], #,'LSTM'],
#     col='metric',
#     ci='sd',
#     join=False,
#     height=6,
#     aspect=1.2,
#     sharex=False,
# #     palette='Spectral',
# )
# g.set(ylabel='',xlabel='')
# # plt.xlabel('AUROC')

# for i,ax in enumerate(g.axes.flat):
#     ax.grid(True,axis='y')
#     ax.grid(False,axis='x')
# #     ax.set_title('')
# #     if i == 0:
# #         ax.set_title('AUROC')
# # #         ax.set_title('AUPRC')
# # #     elif i == 1:
# # #         ax.set_title('AUPRC')
# #     elif i == 1:
# # #     elif i == 2:
# #         ax.set_xscale('log')
# #         ax.set_title('Model Size')

# case counts per task 

In [None]:
# frames = []

# YTBT = YTruesByTask['LR']
# for t,v in YTBT.items():
#     frames.append(
#         {
#             'task': t,
#             'cases':np.sum(v==1),
#             'controls':np.sum(v==0),
#             'prevalence':np.sum(v==1)/len(v),
#         }
#     )
# case_counts= pd.DataFrame.from_records(frames)
# case_counts.sort_values(by='cases',ascending=False) #.value_counts() #['cases'].value_counts()

In [None]:
# tmp = scores_feat_lr.merge(case_counts,on='task')
# metric='average_precision_score'
# x = (tmp.loc[tmp.metric==metric]
#  .groupby(['task','cases','method'])
#  ['value']
#  .mean()
#  .unstack()
#  .sort_values(by='cases',ascending=False)
# )
# x['$\Delta$'] = x['FEAT']-x['LR']
# x['$\Delta$100'] = x['FEAT-100']-x['LR']
# x.round(3)
# # tmp.sort_values(by='cases')

In [None]:
# x = (tmp
#      .groupby(['task','metric','bootstrap','method'])
#      ['value']
#      .mean()
#      .unstack()
# )
# x['FEAT - LR'] = x['FEAT']-x['LR']
# x['FEAT-100 - LR'] = x['FEAT-100']-x['LR']
# x = \
# (x.reset_index()
#  .melt(id_vars=['task','metric','bootstrap'],
#        var_name='method',
#        value_name='value'
#       )
# )
 
# # tmp.merge(x.reset_index(), on = ['task','metric','bootstrap'])
# x

In [None]:
# import math
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_context('paper')
# sns.set(font_scale=1.2)
# sns.set_style('whitegrid')

# df_plt = x.copy()
# df_plt = df_plt.loc[df_plt.metric.isin(['roc_auc_score','average_precision_score'])]

# df_plt['nice-task'] = df_plt['task'].apply(lambda x: task_names[x] if x in task_names.keys() else x)
# task_order = df_plt[df_plt.metric=='average_precision_score'].groupby('nice-task')['value'].mean().sort_values().index[::-1]
# print('task_order:',task_order)
# g = sns.catplot(
#     kind='point',
#     estimator=np.median,
#     data=df_plt,
#     x='value',
#     y='nice-task',
#     hue='method',
#     order=task_order,
#     hue_order=['FEAT - LR', 'FEAT-100 - LR'],
#     col='metric',
#     ci='sd',
#     join=False,
#     height=6,
#     aspect=1.2,
#     sharex=False,
#     palette='cividis_r',
# )
# g.set(ylabel='',xlabel='')
# # plt.xlabel('AUROC')

# for i,ax in enumerate(g.axes.flat):
#     ax.grid(True,axis='y')
#     ax.grid(False,axis='x')

### combine micro scores

In [None]:
# micro_lstm['method']='LSTM'

# micro_df = micro_feat_lr.append(micro_lstm)
# micro_df

In [None]:
# micro_df.groupby('method')['value'].count()

In [None]:
# import math
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_context('paper')
# sns.set(font_scale=1.2)
# sns.set_style('whitegrid')
# # n_nodes = df_best.melt(id_vars=['run_id','method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
# # feat_lr = pd.concat((df_best,n_nodes))

# # df_best_lsr = df_lsr.groupby(['method','task','metric','n_nodes'])['value'].mean().reset_index()
# # n_nodes = df_best_lsr.melt(id_vars=['method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
# # lstm = pd.concat((df_best_lsr,n_nodes))

# # df_plt = pd.concat((feat_lr,lstm))
# df_plt = micro_df.copy()
# df_plt = df_plt.loc[df_plt.metric.isin(['roc_auc_score','average_precision_score'])]
# g = sns.catplot(
#     kind='box',
#     showfliers=False,
#     dodge=False,
#     data=df_plt,
#     y='value',
#     x='method',
#     col='metric',
#     sharex=False,
#     legend=False,
#     color='w',
# )

# for i,ax in enumerate(g.axes.flat):
#     ax.grid(True,axis='y')
#     ax.grid(False,axis='x')
#     ttl = ax.get_title()[8:]
# #     if len(ttl) > 40:
# #         words = ttl.split(' ')
# #         i = math.floor(len(words)/2)
# #         ttl = (' '.join(words[:i])
# #                +'\n'
# #                +' '.join(words[i:])
# #               )
#     ax.set_xlabel(ttl.replace('_',' '))
#     ax.set_title('')
#     if i == 0:
# #         ax.set_xlabel('Micro AUROC')
#         ax.set_title('Micro AUROC')
#     elif i == 1:
# #         ax.set_xlabel('Micro AUPRC')
#         ax.set_title('Micro AUPRC')
#     for j, child in enumerate(ax._children):
#         if hasattr(child,'_edgecolor'):
#             child.set_edgecolor('black')
#     for j, line in enumerate(ax.lines):
#         line.set_color('k')
# #         # iterate over whiskers and median lines
# # #         for k in range(6*j,6*(j+1)):
# # #          box.lines[k].set_color('black') 
# #         box.set_color('black')

## Micro Scores Table

In [None]:
# df_plt.groupby(['metric','method'])['value'].apply(nice_stat).unstack()

# nice task names 

In [None]:
from collections import defaultdict
# Short display names for the tasks with long names; tasks not listed here
# (the commented entries) are shown under their original name.
task_names = {
#        'Diabetes mellitus with complications',
       'Chronic obstructive pulmonary disease and bronchiectasis':'COPD and bronchiectasis',
#        'Congestive heart failure nonhypertensive', 
#        'Conduction disorders',
       'Hypertension with complications and secondary hypertension':'HTN with complications and secondary HTN',
       'Diabetes mellitus without complication':'Diabetes mellitus', 
       'Essential hypertension':'Essential HTN',
#        'Cardiac dysrhythmias', 
       'Chronic kidney disease':'CKD',
       'Coronary atherosclerosis and other heart disease':'Heart disease',
#        'Disorders of lipid metabolism', 
#        'Gastrointestinal hemorrhage', 
#        'Shock',
#        'Pleurisy pneumothorax pulmonary collapse',
#        'Acute cerebrovascular disease', 
#        'Other liver diseases',
#        'Other lower respiratory disease',
#        'Fluid and electrolyte disorders',
#        'Acute myocardial infarction',
#        'Other upper respiratory disease',
#        'Acute and unspecified renal failure',
       'Pneumonia (except that caused by tuberculosis or sexually transmitted disease)':'Pneumonia',
       'Septicemia (except in labor)':'Septicemia',
       'Respiratory failure insufficiency arrest (adult)':'Respiratory failure insufficiency arrest',
       'Complications of surgical procedures or medical care':'Complications, surgical or medical'
}


class _KeyFallbackDict(defaultdict):
    """Mapping whose missing keys map to themselves.

    ``defaultdict(lambda x: x)`` cannot express this: the default factory is
    called with no arguments, so every missing key raised ``TypeError``.
    Overriding ``__missing__`` receives the key; the fallback is not stored,
    so membership tests still reflect only the explicit entries.
    """

    def __missing__(self, key):
        return key


nice_task = _KeyFallbackDict()
nice_task.update(task_names)

## combined data frame of all bootstrapped results 
- the number of nodes is also included, without bootstrapping 

In [None]:
# Methods present in the bootstrapped FEAT/LR scores.
df_feat_lr['method'].unique()

In [None]:
# Bootstrapped scores of all methods in one long table.
df_comb = pd.concat([df_lsr,df_feat_lr])

# metrics = ['roc_auc_score','average_precision_score','ballogloss','log_loss']
# Task ordering by the relative gap between the LSTM mean and the FEAT/LR
# mean of `sort_metric`; only used when the `.loc[task_idx]` line is enabled.
sort_metric = 'roc_auc_score'
lstm = (df_lsr
        .loc[df_lsr['metric']==sort_metric]
        .groupby('task')['value']
        .mean())
tmp = (df_feat_lr
       .loc[df_feat_lr['metric']==sort_metric]
       .groupby('task')['value']
       .mean())
task_idx = (np.abs(lstm - tmp)/lstm).sort_values().index

## Add n_nodes as a pseudo-metric (not bootstrapped)
n_nodes1 = df_best.melt(
    id_vars=['run_id','method','task'],
    value_vars=['n_nodes'],
    var_name='metric',
    value_name='value',
)

df_best_lsr = (df_lsr
               .groupby(['method','task','metric','n_nodes'])['value']
               .mean()
               .reset_index())
n_nodes2 = df_best_lsr.melt(
    id_vars=['method','task'],
    value_vars=['n_nodes'],
    var_name='metric',
    value_name='value',
)
df_comb = pd.concat((df_comb,n_nodes1,n_nodes2))

# Maximum of each metric per task and method, laid out wide for display.
wanted = df_comb.metric.isin(metrics+['n_nodes'])
(df_comb
  .loc[wanted]
  .groupby(['task','method','metric'])
  ['value']
  .max()
  .round(2)
  .unstack()
  .unstack()
 #  .loc[task_idx]
)
# df_comb


In [None]:
# Display the combined long-format table (bootstrapped scores plus the n_nodes rows).
df_comb

# catplot of performance and size

In [None]:
# seaborn was only imported in a later cell, so this cell failed with a
# NameError on a fresh kernel (Restart & Run All); import it where it is used.
import seaborn as sns

# Five colours for the five plotted methods: the first four "Paired" colours
# plus the eighth.
palette=sns.color_palette("Paired")
# palette
palette=palette[0:4]+[palette[7]]
# print(palette)

In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set(font_scale=1.2)
sns.set_style('whitegrid')
# Metrics shown as plot columns; the first one also defines task_order.
mets = ['roc_auc_score', 'n_nodes']
# mets = ['roc_auc_score']
# mets = ['average_precision_score']
# mets = ['n_nodes']

# .copy(): columns are added and edited below. Working on an explicit copy
# avoids pandas' SettingWithCopyWarning and leaves df_comb untouched.
df_plt = df_comb.loc[df_comb.metric.isin(mets)].copy()

# Short display name where one is defined, original task name otherwise.
df_plt['nice-task'] = df_plt['task'].apply(lambda x: task_names.get(x, x))
# task_order = df_plt[df_plt.metric=='roc_auc_score'].groupby('nice-task')['value'].mean().sort_values().index[::-1]
task_order = df_plt[df_plt.metric==mets[0]].groupby('nice-task')['value'].mean().sort_values().index[::-1]
print('task_order:',task_order)
df_plt.loc[df_plt.method=='FEAT','method'] = 'FEAT-10'
g = sns.catplot(
    kind='point',
    estimator=np.median,
#     kind='strip',
#     showfliers=False,
#     dodge=False,
    data=df_plt,
    x='value',
#     y='task',
    y='nice-task',
    hue='method',
#     order=task_order,
#     hue_order=['FEAT','FEAT-100','LR','LR-10','LR-100','LSTM'],
    hue_order=['LR-10','FEAT-10','LR-100','FEAT-100','LSTM'],
    col='metric',
#     col_wrap=5,
    ci='sd',
#     scale=.7,
    join=False,
#     dodge=True,
    height=6,
#     aspect=2.4,
    aspect=1.2,
    sharex=False,
#     other palettes tried: 'cividis_r', 'nipy_spectral', 'colorblind', 'deep', 'Paired'
    palette=palette
)
g.set(ylabel='',xlabel='')
# plt.xlabel('AUROC')

# Replace seaborn's 'metric = ...' facet titles with readable ones.
for k,ax in g.axes_dict.items():
    ax.grid(True,axis='y')
    ax.grid(False,axis='x')
    ax.set_title('')
    if k=='roc_auc_score':
        ax.set_title('AUROC')
    elif k == 'average_precision_score':
        ax.set_title('AUPRC')
    elif k == 'n_nodes':
        # model sizes span orders of magnitude
        ax.set_xscale('log')
        ax.set_title('Model Size')
        
g.savefig('mimic3_detail-'+'-'.join(mets)+'-scores.pdf',dpi=300,bbox_inches='tight')

<!-- # table -->

# AUROC

In [None]:
# df_plt.loc[df_plt.metric=='roc_auc_score'].groupby(['nice-task','method'])['value'].apply(nice_stat).unstack().round(2)

# AUPRC

In [None]:
# df_plt.loc[df_plt.metric=='average_precision_score'].groupby(['nice-task','method'])['value'].apply(nice_stat).unstack().round(2)

# Size 

In [None]:
# Mean model size per task (rows) and method (columns).
(
    df_plt
    .loc[df_plt['metric']=='n_nodes']
    .groupby(['nice-task','method'])
    ['value']
    .mean()
    .unstack()
    .round()
)

# Macro scores

In [None]:
# Annotator was only imported in a later cell, so this lookup failed with a
# NameError on a fresh kernel; import it where it is first used.
from statannotations.Annotator import Annotator

# Development aid: print the options accepted by Annotator.configure.
help(Annotator.configure)

In [None]:
Annotator.configure?

In [None]:
import math
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
import seaborn as sns
sns.set_context('paper')
sns.set(font_scale=1.2)
sns.set_style('whitegrid')

# --- assemble per-task scores and model sizes for FEAT/LR and the LSTM ---
# FEAT/LR: test-fold scores of the selected runs, plus n_nodes as a pseudo-metric.
n_nodes = df_best.melt(id_vars=['run_id','method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
# n_nodes['value'] = n_nodes['value']+10
feat_lr = pd.concat((df_best.loc[df_best.fold=='test'],n_nodes))

# LSTM: bootstrap mean per task and metric, plus n_nodes as a pseudo-metric.
df_best_lsr = df_lsr.groupby(['method','task','metric','n_nodes'])['value'].mean().reset_index()
n_nodes = df_best_lsr.melt(id_vars=['method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
lstm = pd.concat((df_best_lsr,n_nodes))

df_plt = pd.concat((feat_lr,lstm))
# .copy(): the method labels are rewritten below. Working on an explicit copy
# avoids pandas' SettingWithCopyWarning.
df_plt = df_plt.loc[df_plt.metric.isin(['roc_auc_score','average_precision_score','n_nodes'])].copy()

# Render methods as mathtext: 'FEAT' -> '$FEAT_{10}$', 'LR-100' -> '$LR_{100}$'.
df_plt.loc[df_plt.method=='FEAT','method'] = '$FEAT_{10}$'
df_plt['method'] = df_plt['method'].apply(lambda x: '$' +x.replace('-','_{') + '}$' if '-' in x else x)

# Kept under a second name for the percent-difference analysis further down.
df_macro = df_plt

order = ['$LR_{10}$','$FEAT_{10}$','$LR_{100}$','$FEAT_{100}$','LSTM']
g = sns.catplot(
    kind='box',
    showfliers=False,
    dodge=False,
    data=df_plt,
    y='value',
    x='method',
    order=order,
    notch=True,
    bootstrap=1000,
    col='metric',
    sharey=False,
    legend=False,
    color='w',
)
g.set(ylabel='',xlabel='')
# plt.xlabel('AUROC')

# Method pairs compared in every panel.
pairs=[('$LR_{10}$','$FEAT_{10}$'),
       ('$LR_{100}$','$FEAT_{10}$'),
       ('$LR_{100}$','$FEAT_{100}$'),
       ('$FEAT_{100}$','LSTM')]
for k,ax in g.axes_dict.items():
    ax.grid(True,axis='y')
    ax.grid(False,axis='x')

    # Drop seaborn's 'metric = ...' title; the y label names the panel instead.
    ax.set_title('')
    if k == 'roc_auc_score':
        ax.set_ylabel('Macro AUROC')
    elif k == 'average_precision_score':
        ax.set_ylabel('Macro AUPRC')
    elif k == 'n_nodes':
        # model sizes span orders of magnitude
        ax.set_yscale('log')
        ax.set_ylabel('Model Size')
    # make box edges, whiskers and median lines black
    for child in ax._children:
        if hasattr(child,'_edgecolor'):
            child.set_edgecolor('black')
    for line in ax.lines:
        line.set_color('k')

    # Significance stars for the method pairs within this panel.
    df_ax = df_plt.loc[df_plt.metric==k] 
    annotator = Annotator(ax, pairs, data=df_ax, x='method', y='value', order=order)
    annotator.configure(
#                         alternatives tried: 'Mann-Whitney', 'Kruskal', 't-test_paired'
                        test='Wilcoxon',
                        text_format='star',
                        show_test_name=False, 
                        loc='inside',
#                         comparisons_correction='holm'
                        comparisons_correction='bonferroni'
                       )
    annotator.apply_and_annotate()
    

g.savefig('mimic3_macro-scores.pdf',dpi=300,bbox_inches='tight')

In [None]:
import math
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
import seaborn as sns
sns.set_context('paper')
sns.set(font_scale=1.2)
sns.set_style('whitegrid')

# --- assemble per-task scores and model sizes for FEAT/LR and the LSTM ---
# FEAT/LR: test-fold scores of the selected runs, plus n_nodes as a pseudo-metric.
n_nodes = df_best.melt(id_vars=['run_id','method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
# n_nodes['value'] = n_nodes['value']+10
feat_lr = pd.concat((df_best.loc[df_best.fold=='test'],n_nodes))

# LSTM: bootstrap mean per task and metric, plus n_nodes as a pseudo-metric.
df_best_lsr = df_lsr.groupby(['method','task','metric','n_nodes'])['value'].mean().reset_index()
n_nodes = df_best_lsr.melt(id_vars=['method','task'],value_vars=['n_nodes'],var_name='metric',value_name='value')
lstm = pd.concat((df_best_lsr,n_nodes))

df_plt = pd.concat((feat_lr,lstm))
# .copy(): the method labels are rewritten below. Working on an explicit copy
# avoids pandas' SettingWithCopyWarning.
df_plt = df_plt.loc[df_plt.metric.isin(['roc_auc_score','average_precision_score','n_nodes'])].copy()

# Render methods as mathtext: 'FEAT' -> '$FEAT_{10}$', 'LR-100' -> '$LR_{100}$'.
df_plt.loc[df_plt.method=='FEAT','method'] = '$FEAT_{10}$'
df_plt['method'] = df_plt['method'].apply(lambda x: '$' +x.replace('-','_{') + '}$' if '-' in x else x)

# Kept under a second name for the percent-difference analysis further down.
df_macro = df_plt

order = ['$LR_{10}$','$FEAT_{10}$','$LR_{100}$','$FEAT_{100}$','LSTM']
# Same data as the box plot, drawn as one point per task.
g = sns.catplot(
    kind='strip',
#     showfliers=False,
#     dodge=False,
    data=df_plt,
    y='value',
    x='method',
    order=order,
    col='metric',
    sharey=False,
    legend=False,
    hue='task',
    palette='cividis'
)
g.set(ylabel='',xlabel='')
# plt.xlabel('AUROC')

# Method pairs compared in every panel.
pairs=[('$LR_{10}$','$FEAT_{10}$'),
       ('$LR_{100}$','$FEAT_{10}$'),
       ('$LR_{100}$','$FEAT_{100}$'),
       ('$FEAT_{100}$','LSTM')]
for k,ax in g.axes_dict.items():
    ax.grid(True,axis='y')
    ax.grid(False,axis='x')

    # Drop seaborn's 'metric = ...' title; the y label names the panel instead.
    ax.set_title('')
    if k == 'roc_auc_score':
        ax.set_ylabel('Macro AUROC')
    elif k == 'average_precision_score':
        ax.set_ylabel('Macro AUPRC')
    elif k == 'n_nodes':
        # model sizes span orders of magnitude
        ax.set_yscale('log')
        ax.set_ylabel('Model Size')
    # give the plotted artists black edges / lines
    for child in ax._children:
        if hasattr(child,'_edgecolor'):
            child.set_edgecolor('black')
    for line in ax.lines:
        line.set_color('k')

    # Significance stars for the method pairs within this panel.
    df_ax = df_plt.loc[df_plt.metric==k] 
    annotator = Annotator(ax, pairs, data=df_ax, x='method', y='value', order=order)
    annotator.configure(
#                         alternatives tried: 'Mann-Whitney', 'Kruskal', 't-test_paired'
                        test='Wilcoxon',
                        text_format='star',
                        show_test_name=False, 
                        loc='inside',
#                         comparisons_correction='holm'
                        comparisons_correction='bonferroni'
                       )
    annotator.apply_and_annotate()
    

g.savefig('mimic3_macro-scores-strip.pdf',dpi=300,bbox_inches='tight')

# percent differences

In [None]:
import pdb
# LSTM reference value for every (task, metric) pair.
df_macro_lstm = (
    df_macro[df_macro['method'] == 'LSTM']
    .set_index(['task', 'metric'])['value']
)

# For each method, express its values as a percent change relative to the
# LSTM value that shares the same (task, metric) index.
frames = []
for method, dfg in df_macro.groupby('method'):
    scores = dfg.set_index(['task', 'metric'])['value']
    pct_diff = (scores - df_macro_lstm) / df_macro_lstm * 100
    frames.append(pct_diff.to_frame().reset_index().assign(method=method))

df_lstm_diff = pd.concat(frames).rename(columns={'value': '% difference from LSTM'})

# Left-to-right order of the methods on the x-axis (and of the table columns).
order = ['$LR_{10}$', '$FEAT_{10}$', '$LR_{100}$', '$FEAT_{100}$', 'LSTM']
g = sns.catplot(
    data=df_lstm_diff,
    x='method',
    y='% difference from LSTM',
    col='metric',
    col_order=['roc_auc_score', 'average_precision_score', 'n_nodes'],
    kind='box',
    order=order,
    sharey=False,
    legend=False,
    palette='cividis',
)

# Cell output: mean percent difference per metric (rows) and method (columns).
mean_pct_diff = df_lstm_diff.groupby(['metric', 'method'])['% difference from LSTM'].mean()
mean_pct_diff.unstack().round(1)[order]

In [None]:
import math
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
import seaborn as sns
# Theme configured in a single call. `sns.set()` always applies a plotting
# context (default 'notebook'), so a separate `set_context('paper')` issued
# before it is overwritten. The arguments below spell out the theme that was
# effectively in use, so existing figures render unchanged.
# NOTE(review): pass context='paper' here if the paper context was intended.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)
# plt.figure(figsize=(5,9))
# df_plt = pd.concat((df_best[['n_nodes','method','task']],
#                    df_lsr[['n_nodes','method','task']]) # on = ['task'])
#                   )
# df_plt['nice-task'] = df_plt['task'].apply(lambda x: nice_task[x] if x in nice_task.keys() else x)
# --- FEAT / LR rows ---------------------------------------------------------
# Recast the `n_nodes` column as rows of a pseudo-metric named 'n_nodes' so it
# can be drawn as its own facet next to the score metrics.
# NOTE(review): these rows are melted from all of df_best, while the score
# rows below keep only fold == 'test' -- confirm that is intended.
n_nodes = pd.melt(
    df_best,
    id_vars=['run_id', 'method', 'task'],
    value_vars=['n_nodes'],
    var_name='metric',
    value_name='value',
)
feat_lr = pd.concat((df_best[df_best['fold'] == 'test'], n_nodes))

# --- LSTM rows --------------------------------------------------------------
# Mean value per (method, task, metric, n_nodes), then the same recast.
df_best_lsr = (
    df_lsr
    .groupby(['method', 'task', 'metric', 'n_nodes'])['value']
    .mean()
    .reset_index()
)
n_nodes = pd.melt(
    df_best_lsr,
    id_vars=['method', 'task'],
    value_vars=['n_nodes'],
    var_name='metric',
    value_name='value',
)
lstm = pd.concat((df_best_lsr, n_nodes))

# --- Combined frame, restricted to the three plotted metrics ----------------
df_plt = pd.concat((feat_lr, lstm))
df_plt = df_plt[df_plt['metric'].isin(['roc_auc_score', 'average_precision_score', 'n_nodes'])]


def _method_label(name):
    """Return the mathtext axis label for a raw method name.

    'FEAT' becomes '$FEAT_{10}$'; a name containing '-' such as 'LR-100'
    becomes '$LR_{100}$'; any other name is returned unchanged.
    """
    if name == 'FEAT':
        return '$FEAT_{10}$'
    if '-' in name:
        return '$' + name.replace('-', '_{') + '}$'
    return name


df_plt['method'] = df_plt['method'].map(_method_label)

# Same object under a second name.
df_macro = df_plt

# Left-to-right order of the methods on every x-axis.
order = ['$LR_{10}$', '$FEAT_{10}$', '$LR_{100}$', '$FEAT_{100}$', 'LSTM']

# Appearance of the boxes: white, notched, outliers hidden, no dodging.
box_style = dict(
    color='w',
    notch=True,
    bootstrap=1000,
    showfliers=False,
    dodge=False,
)
# One facet (column) per metric, each with its own y-axis.
g = sns.catplot(
    data=df_plt,
    x='method',
    y='value',
    col='metric',
    kind='box',
    order=order,
    sharey=False,
    legend=False,
    **box_style,
)
# Clear the default axis labels on every facet.
g.set(xlabel='', ylabel='')

# Method pairs compared on every facet; each pair gets one significance bracket.
pairs = [
    ('$LR_{10}$', '$FEAT_{10}$'),
    ('$LR_{100}$', '$FEAT_{10}$'),
    ('$LR_{100}$', '$FEAT_{100}$'),
    ('$FEAT_{100}$', 'LSTM'),
]
# y-axis label for each facet, keyed by the facet's `metric` value.
ylabel_by_metric = {
    'roc_auc_score': 'Macro AUROC',
    'average_precision_score': 'Macro AUPRC',
    'n_nodes': 'Model Size',
}

# `k` is the facet key from `g.axes_dict` (a `metric` value), `ax` its Axes.
for k, ax in g.axes_dict.items():
    # Horizontal grid lines only.
    ax.grid(True, axis='y')
    ax.grid(False, axis='x')

    # Drop the facet title; the y-axis label names the metric instead.
    ax.set_title('')
    if k in ylabel_by_metric:
        ax.set_ylabel(ylabel_by_metric[k])
    if k == 'n_nodes':
        # Model size is shown on a log axis.
        ax.set_yscale('log')

    # Outline the box patches in black and recolor every line black. The
    # public `ax.patches` list is used instead of scanning the private
    # `ax._children` for a private `_edgecolor` attribute.
    for patch in ax.patches:
        patch.set_edgecolor('black')
    for line in ax.lines:
        line.set_color('k')

    # Test only the rows that belong to this facet.
    df_ax = df_plt.loc[df_plt.metric == k]
    annotator = Annotator(ax, pairs, data=df_ax, x='method', y='value', order=order)
    # NOTE(review): 'Wilcoxon' is a paired test -- confirm that the two groups
    # in every pair contain matching observations.
    annotator.configure(
        test='Wilcoxon',
        text_format='star',
        show_test_name=False,
        loc='inside',
        comparisons_correction='bonferroni',
    )
    annotator.apply_and_annotate()

g.savefig('mimic3_macro-scores.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Display the frame that the percent-difference plot and the macro table read from.
df_macro

In [None]:
# Inspect the rows for method '$FEAT_{10}$' and metric 'roc_auc_score'.
df_plt[(df_plt['method'] == '$FEAT_{10}$') & (df_plt['metric'] == 'roc_auc_score')]

# macro table

In [None]:
# Summary table: one row per metric, one column per method. Each cell is the
# string produced by `nice_stat` for that group's values.
values_by_group = df_macro.groupby(['metric', 'method'])['value']
tmp = values_by_group.apply(nice_stat).unstack()
tmp

In [None]:
# Display the plotting frame for inspection.
df_plt

# TODO
- look at micro AUROC/AUPRC
- first or second figure? 
- selecting from archive
- different model selection for FEAT / LR
    - do something conventional/standard for LR
- push changes to main repo 

In [None]:
# For method 'FEAT': count how often each `model` value occurs within each task.
df_best[df_best['method'] == 'FEAT'].groupby('task')['model'].value_counts()

In [None]:
# For method 'FEAT-100': count how often each `model` value occurs within each task.
df_best[df_best['method'] == 'FEAT-100'].groupby('task')['model'].value_counts()

In [None]:
# Mean value per (method, task, metric, n_nodes) in df_lsr.
df_best_lsr = (
    df_lsr
    .groupby(['method', 'task', 'metric', 'n_nodes'])['value']
    .mean()
    .reset_index()
)
# The `n_nodes` column recast as rows of a pseudo-metric named 'n_nodes'.
n_nodes = pd.melt(
    df_best_lsr,
    id_vars=['method', 'task'],
    value_vars=['n_nodes'],
    var_name='metric',
    value_name='value',
)
# Cell output: score rows followed by the model-size rows.
pd.concat((df_best_lsr, n_nodes))