In [1]:
import pandas as pd
import os
import numpy as np
import logging
import sys
import torch
import copy
import yaml
import random
import argparse

from prediction_utils.pytorch_utils.metrics import (
    StandardEvaluator,
    FairOVAEvaluator,
    CalibrationEvaluator
)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
sns.set_style("ticks")

# Label shown for each integer key 1-4 (presumably the cohort's group codes -- confirm).
grp_label_dict = dict(zip(
    (1, 2, 3, 4),
    ('Black women', 'White women', 'Black men', 'White men'),
))

# Notebook-wide settings: input locations, number of bootstrap replicates and
# the fold that is evaluated.
args = dict(
    cohort_path='/labs/shahlab/projects/agataf/data/cohorts/pooled_cohorts/cohort/all_cohorts.csv',
    base_path='/labs/shahlab/projects/agataf/data/cohorts/pooled_cohorts',
    n_bootstrap=5,
    eval_fold='test',
)

# Cohort table read from the path configured above.
cohort = pd.read_csv(args['cohort_path'])

# Both evaluators are built with the same pair of thresholds (0.075 and 0.2).
fair_evaluator = FairOVAEvaluator(
    thresholds=[0.075, 0.2],
    metrics=['emd_ova', 'emd_1_ova', 'emd_0_ova'],
)

standard_evaluator = StandardEvaluator(
    thresholds=[0.075, 0.2],
    metrics=['auc', 'auprc', 'loss_bce', 'ace_rmse_logistic_log'],
)

# Column-name keyword arguments intended for the evaluators' result methods.
eval_dict = dict(
    label_var='labels',
    pred_prob_var='pred_probs',
    weight_var='weights',
    group_var_name='group',
    strata_vars=['model_id', 'fold_id'],
)

In [2]:
# Read predictions.csv for every experiment, tag each frame with its experiment
# name and stack them into one long frame.
experiment_names = ['original_pce', 'revised_pce', 'apr14_erm',
                    'apr14_erm_recalib', 'apr14_mmd', 'apr14_thr']
pred_frames = []
for experiment in experiment_names:
    aggregate_path = os.path.join(args['base_path'], 'experiments', experiment,
                                  'performance', 'all')
    preds_path = os.path.join(aggregate_path, 'predictions.csv')
    preds = pd.read_csv(preds_path).assign(experiment=experiment)
    # Files that lack model/fold identifiers get a constant id of 0 so that
    # every frame carries both columns.
    for id_column in ('model_id', 'fold_id'):
        if id_column not in preds.columns:
            preds = preds.assign(**{id_column: 0})
    pred_frames.append(preds)
all_preds = pd.concat(pred_frames)

In [31]:
# eval_df = all_preds.query("(phase=='test') & (experiment==['original_pce', 'revised_pce', 'apr14_erm'])")
# evaluator = StandardEvaluator(metrics=['auc', 'loss_bce'])
# result_df_ci = evaluator.bootstrap_evaluate(
#     df=eval_df,
#     n_boot=10,
#     strata_vars_eval=['phase', 'model_type', 'model_id', 'fold_id', 'group'],
#     strata_vars_boot=['phase', 'labels', 'group'],
#     strata_var_replicate='fold_id',
#     replicate_aggregation_mode=None,
#     strata_var_experiment='model_id',
#     baseline_experiment_name=0,
#     strata_var_group='group',
#     weight_var='weights',
#     compute_overall=True,
#    # group_overall_name='overall'
# )


In [None]:
# Keyword arguments shared by both evaluators. 'strata_vars' is intentionally
# not part of this dict: it is passed explicitly in the calls below, and
# supplying it twice would raise "got multiple values for keyword argument".
eval_dict = {'label_var': 'labels',
             'pred_prob_var': 'pred_probs',
             'weight_var': 'weights',
             'group_var_name': 'group'}

strata_vars = ['model_id', 'fold_id', 'experiment']

# One result frame per bootstrap replicate, tagged with the replicate index.
eval_overall_all = []
eval_fair_ova_all = []
for iter_idx in range(args['n_bootstrap']):
    # Draw person ids with replacement inside every (ascvd_10yr, grp) cell of
    # the evaluation fold. Seeding with the replicate index makes each
    # replicate reproducible across kernel restarts.
    cohort_bootstrap_sample = (cohort
                               .loc[cohort['fold_id'] == args['eval_fold']]
                               .groupby(['ascvd_10yr', 'grp'])
                               .sample(frac=1, replace=True, random_state=iter_idx)
                               .person_id)
    # Join instead of `isin`: `isin` keeps each sampled person at most once and
    # would silently drop the duplicates that make this a bootstrap resample.
    # The inner merge repeats a person's prediction rows once per draw.
    df_bootstrap = all_preds.merge(cohort_bootstrap_sample.to_frame(name='person_id'),
                                   on='person_id',
                                   how='inner')

    eval_overall = standard_evaluator.get_result_df(df_bootstrap,
                                                    strata_vars=strata_vars,
                                                    **eval_dict)
    eval_fair_ova = fair_evaluator.get_result_df(df_bootstrap,
                                                 strata_vars=strata_vars,
                                                 **eval_dict)
    eval_overall_all.append(eval_overall.assign(bootstrap=iter_idx))
    # The fairness results were previously computed and then discarded.
    eval_fair_ova_all.append(eval_fair_ova.assign(bootstrap=iter_idx))

In [None]:
# Display the standard-metric result frame left over from the last bootstrap
# iteration of the loop that assigns `eval_overall`.
eval_overall

In [None]:
# Load the aggregated predictions of a single experiment.
# (A stray token `fo` that raised NameError was removed from the top of this cell.)
base_path = args['base_path']
# NOTE(review): `experiment_name` was never defined in the visible notebook. It is
# read from `args` when present; the fallback value is a guess -- TODO confirm
# which experiment this cell is meant to analyse.
experiment_name = args.get('experiment_name', 'apr14_erm')

aggregate_path = os.path.join(base_path, 'experiments',
                              experiment_name, 'performance',
                              'all')

preds_path = os.path.join(aggregate_path, 'predictions.csv')

preds = pd.read_csv(preds_path)
# Files without fold/model identifiers get a constant id of 0 so that the
# group-bys and queries on these columns further down still work.
if 'fold_id' not in preds.columns:
    preds = preds.assign(fold_id=0)
if 'model_id' not in preds.columns:
    preds = preds.assign(model_id=0)

def get_calib_probs(model, x, transform=None):
    """Evaluate a fitted calibration model on a set of predicted probabilities.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict_proba``; the last column of its output
        is used as the calibrated value.
    x : array-like
        One-dimensional predicted probabilities (list, tuple or ndarray).
    transform : {'log', None}, optional
        With ``'log'`` the model is fed ``np.log(x)``; any other value feeds
        ``x`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``pred_probs`` (the input), ``model_input`` (the possibly
        transformed input) and ``calibration_density`` (the model output).
    """
    # Coerce up front so that plain lists also work on the untransformed path,
    # where `.reshape` would otherwise be called on a list.
    x = np.asarray(x)

    if transform == 'log':
        model_input = np.log(x)
    else:
        model_input = x

    # predict_proba is given a single-feature 2-D array.
    calibration_density = model.predict_proba(model_input.reshape(-1, 1))[:, -1]

    df = pd.DataFrame({'pred_probs': x,
                       'model_input': model_input,
                       'calibration_density': calibration_density})
    return df
    
def get_calib_model(labels, pred_probs, weights, transform=None):
    """Fit a calibration model and return only the model.

    Delegates to ``CalibrationEvaluator.get_calibration_density_df`` with the
    given labels, predicted probabilities, weights and transform, and discards
    the first element of the returned pair.
    """
    calibration_evaluator = CalibrationEvaluator()
    _unused_first, fitted_model = calibration_evaluator.get_calibration_density_df(
        labels,
        pred_probs,
        weights,
        transform=transform,
    )
    return fitted_model

# Bootstrap the calibration curves of one experiment: for every replicate,
# group, model and fold, fit a calibration model on the resampled predictions
# and evaluate it on a regular grid and at the two decision thresholds.
# `eval_fold` and `n_bootstrap` were previously undefined names; they are taken
# from the notebook-level `args` dict.
eval_fold = args['eval_fold']
n_bootstrap = args['n_bootstrap']
grid_step = 0.025  # spacing of the probability grid the curves are evaluated on

df_to_calibrate = preds[preds.phase == eval_fold].reset_index(drop=True)
lin_calibs = []
thr_calibs = []
model_type = preds.model_type.unique()[0]
for iter_idx in range(n_bootstrap):
    # Resample with replacement inside each (group, label, model, fold) cell.
    # Seeding with the replicate index keeps every replicate reproducible.
    df_bootstrap = (df_to_calibrate
                    .groupby(['group', 'labels', 'model_id', 'fold_id'])
                    .sample(frac=1, replace=True, random_state=iter_idx))
    for group in [1, 2, 3, 4]:
        for model_id in df_bootstrap.model_id.unique():
            group_df = df_bootstrap.query("(group==@group) & (model_id==@model_id)")
            if group_df.empty:
                # Nothing to fit; `.max()` on an empty array would raise.
                continue
            max_pred_prob = group_df.pred_probs.values.max()

            # Grid shared by all folds of this group/model: a near-zero point
            # (the log transform cannot take exactly 0) followed by multiples
            # of `grid_step` up to the largest observed prediction.
            n_grid = int(max_pred_prob / grid_step)
            pred_grid = np.append([1e-15],
                                  np.linspace(grid_step, n_grid * grid_step, n_grid))

            for fold_id in group_df.fold_id.unique():
                df = group_df.query("(fold_id==@fold_id)")

                loop_kwargs = {'group': group,
                               'fold_id': fold_id,
                               'phase': eval_fold,
                               'model_type': model_type,
                               'model_id': model_id}

                model = get_calib_model(df.labels, df.pred_probs, df.weights, transform='log')

                lin_calib = (get_calib_probs(model, pred_grid, 'log')
                             .assign(**loop_kwargs))
                lin_calibs.append(lin_calib)

                thr_calib = (get_calib_probs(model, np.array([0.075, 0.2]), 'log')
                             .assign(**loop_kwargs))
                thr_calibs.append(thr_calib)
    print(iter_idx)

# lin_calibs = pd.concat(lin_calibs)
# lin_calibs.to_csv(os.path.join(aggregate_path, 'calibration_sensitivity_test_raw.csv'), index=False)

# thr_calibs = pd.concat(thr_calibs)
# thr_calibs.to_csv(os.path.join(aggregate_path, 'calibration_sensitivity_thresholds_raw.csv'), index=False)

In [None]:
# Display one with-replacement resample of `df_to_calibrate`, drawn separately
# inside each (group, labels, model_id, fold_id) cell. This is the same
# resampling expression used by the bootstrap loop over `n_bootstrap`.
(df_to_calibrate
 .groupby(['group', 'labels', 'model_id', 'fold_id'])
 .sample(frac=1, replace=True)
)

