# Find best models

This notebook will load all models and re-evaluate them on all 5 folds, without re-fit. The best model for each phenotype will be selected for further experiments

In [11]:
from _load_llm_results import *

import numpy as np
import warnings
import os
import importlib
import sys; sys.path.append('../')
from evaluate_model import read_data
from sklearn.metrics import average_precision_score, roc_auc_score

warnings.filterwarnings("ignore")

%matplotlib inline

from tqdm import tqdm
from glob import glob

data_dir = '../data'

def evaluate_all_folds(metric, cp_function, cp_df, folds=['A', 'B', 'C', 'D', 'E'], bootstrap=False, n_reps=1_000, override_runid=False):
    
    runid = cp_df['RunID']
    if override_runid:
        runid = '101' # enforcing to use the held out dataset
        # print("Overriding RunID, Using runID", runid)
        cp_df['RunID'] = runid
    
    def eval(X, y):
        try:
            prob = np.array(cp_function(X))
        except Exception:
            prob = np.zeros(shape=len(X))
        try:
            if metric in [average_precision_score, roc_auc_score]:
                return metric(y, prob)
            else:
                return metric(y, np.where(prob> 0.5), 1, 0)
        except ValueError:
            return np.nan

    for FOLD in folds:
        X_train, y_train, X_test, y_test = read_data(
            targets_rev[cp_df['target']], FOLD, runid, cp_df['scale'],
            cp_df['icd_only'], data_dir, cp_df['random_state']
        )

        entry = f"{metric.__name__}_fold_out_{FOLD}"
        if bootstrap:
            val_samples = []
            for i in tqdm(range(n_reps)):
                samples = np.random.randint(0,len(y_test)-1, size=len(y_test))
                val_samples.append( eval(X_test.iloc[samples, :], y_test.iloc[samples]) )

            cp_df[f"{entry}_mean"] = np.mean(val_samples)
            cp_df[f"{entry}_std"] = np.std(val_samples)
            cp_df[f"{entry}_ci_upper"] = np.quantile(val_samples,0.95)
            cp_df[f"{entry}_ci_lower"] = np.quantile(val_samples,0.05)
        else:
            cp_df[entry] = eval(X_test, y_test)

    return cp_df

def load_results(constraints=dict(scale=[False])):
    count = 0
    for file in tqdm(glob(f"{results_path}/**/*.json", recursive=True)):
        
        try:
            cp_df = pd.read_json(file, typ="series")
        except Exception as e:
            # print("Bad results in file", file)
            continue
            
        # unifying column name
        cp_df = cp_df.rename(index={"few_feature": "icd_only"})
            
        stay = True
        for kc, vc in constraints.items():
            if cp_df[kc] not in vc:
                stay = False
                # print("skipping", file)
                break
        if not stay:
            # print('continuing')
            continue

        # Evaluating the program in the other folds
        cp_file = file.replace('.json', '_program') \
                      .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                      .replace('/', '.')[3:]

        if 'iterative' not in file: # removing the fold from file name if not iterative
            cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

        # print(cp_file)

        try:
            cp_module = importlib.__import__(
                cp_file,
                globals(),
                locals(),
                ["predict_hyptertension"],
            )
            cp_function = cp_module.predict_hypertension

            # Evaluating on other held-out folds. notice that the LLMs did not see the
            # entire training data even during its training!
            cp_df = evaluate_all_folds(average_precision_score, cp_function, cp_df)
        except:
            continue

        results.append(cp_df)
        count += 1

    print('loaded',count,'files')
    results_df = pd.DataFrame(data=results)\

    # Beautifying it
    results_df["model"] = results_df["model"].apply(lambda m: nice_model_labels[m])
    results_df["target"] = results_df["target"].apply(lambda t: dnames_to_nice[t])

    results_df = results_df[results_df["model"].isin(order)]

    print(results_df["model"].unique())
    print(results_df["target"].unique())
    
    return results_df
    
results_df = pd.concat([
    load_results( # rich prompt, expert features
        constraints = {
            'scale': [False],
            'model': ['GPT_4o_iterative_Classifier'], 
            'prompt_richness' : [True],
            'icd_only' : [True],
        }
    ),
    load_results( # simple prompt, all features
        constraints = {
            'scale': [False],
            'model': ['GPT_4o_iterative_Classifier'], 
            'prompt_richness' : [False],
            'icd_only' : [False],
        }
    )
])

# Making it the format seaborn likes
results_df_melted = pd.melt(
    results_df, 
    id_vars=['model', 'target', 'fold', 'RunID', 'random_state', 'prompt_richness', 'icd_only', 'scale']
)

print(results_df.columns)
print(results_df.shape)

100%|██████████| 7200/7200 [02:54<00:00, 41.36it/s] 


loaded 300 files
['gpt-4o-iter']
['HTN Heuristic' 'HTN Diagnosis' 'HTN-Hypokalemia Diagnosis'
 'Htn-Hypokalemia Heuristic' 'Resistant HTN Heuristic'
 'Resistant HTN Diagnosis']


100%|██████████| 7200/7200 [04:21<00:00, 27.51it/s] 


loaded 300 files
['gpt-4o-iter']
['HTN Heuristic' 'HTN Diagnosis' 'HTN-Hypokalemia Diagnosis'
 'Htn-Hypokalemia Heuristic' 'Resistant HTN Heuristic'
 'Resistant HTN Diagnosis']
Index(['accuracy_score_train', 'precision_score_train',
       'average_precision_score_train', 'roc_auc_score_train',
       'balanced_accuracy_score_train', 'accuracy_score_test',
       'precision_score_test', 'average_precision_score_test',
       'roc_auc_score_test', 'balanced_accuracy_score_test', 'messages',
       'model', 'target', 'fold', 'RunID', 'random_state', 'representation',
       'representation_fmt', 'size', 'complexity', 'scale', 'icd_only',
       'prompt_richness', 'time', 'pred', 'pred_proba',
       'average_precision_score_fold_out_A',
       'average_precision_score_fold_out_B',
       'average_precision_score_fold_out_C',
       'average_precision_score_fold_out_D',
       'average_precision_score_fold_out_E'],
      dtype='object')
(2100, 31)


In [12]:
print(results_df_melted.variable.unique())
display(results_df_melted.sample(5))

['accuracy_score_train' 'precision_score_train'
 'average_precision_score_train' 'roc_auc_score_train'
 'balanced_accuracy_score_train' 'accuracy_score_test'
 'precision_score_test' 'average_precision_score_test'
 'roc_auc_score_test' 'balanced_accuracy_score_test' 'messages'
 'representation' 'representation_fmt' 'size' 'complexity' 'time' 'pred'
 'pred_proba' 'average_precision_score_fold_out_A'
 'average_precision_score_fold_out_B' 'average_precision_score_fold_out_C'
 'average_precision_score_fold_out_D' 'average_precision_score_fold_out_E']


Unnamed: 0,model,target,fold,RunID,random_state,prompt_richness,icd_only,scale,variable,value
40095,gpt-4o-iter,Htn-Hypokalemia Heuristic,E,1,27690,True,True,False,average_precision_score_fold_out_B,1
37469,gpt-4o-iter,Resistant HTN Diagnosis,B,1,8233,True,True,False,pred_proba,"[0.2, 0.2, 0.8, 0.8, 0.2, 0.2, 0.2, 0.2, 0.8, ..."
16812,gpt-4o-iter,HTN Heuristic,B,1,16695,True,True,False,roc_auc_score_test,0.945983
36360,gpt-4o-iter,HTN Diagnosis,B,1,1318,True,True,False,pred_proba,"[0.27272727272727204, 0.45454545454545403, 0.8..."
34575,gpt-4o-iter,HTN Diagnosis,C,1,27690,True,True,False,pred,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ..."


In [15]:
for prompt_richness in [True,False]:
    for icd_only in [True,False]:
        for metric in [
            'average_precision_score_fold_out', 
        ]:
            data = (
                results_df_melted[ ( (results_df_melted['variable'].str.contains(metric))
                                   | (results_df_melted['variable']=='size') )
                                 & (results_df_melted['prompt_richness']==prompt_richness)
                                 & (results_df_melted['icd_only']==icd_only)
                ]
                .dropna()
            )

            if len(data)==0:
                continue

            print("prompt_richness", prompt_richness, "expert_features", icd_only)
                        
            # removing the fold name from the metric
            data['variable'].replace({f'{metric}{fold}' : metric for fold in ['_A', '_B', '_C', '_D', '_E']}, inplace=True)
            data['value'] = data['value'].astype(float)
            data['Strategy'] = ['SEDI / Train' if 'iter' in v or 'gpt' not in v else 'Zero Shot' for v in data['model'].values]

            # first we group each model's performance on the held out data and take the mean.
            # then, we ignore folds and random states to get the final model.
            model_metrics_all_folds = data.groupby([
                'target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale',
                'RunID', 'fold', 'random_state', 'variable']
            )['value'].mean().reset_index()

            model_metrics_all_folds = model_metrics_all_folds.pivot_table(
                index=['target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale',
                       'RunID', 'fold', 'random_state'],
                columns='variable',
                values='value'
            ).reset_index()

            # Sorting by size, in case of ties in metric
            best_models = model_metrics_all_folds\
                .sort_values([metric, 'size'], ascending=[False, True])\
                .drop_duplicates(['target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale'])
                
            display(best_models.T)

            best_models[f'size'] = np.nan
            for metric_f in [average_precision_score, roc_auc_score]:
                best_models[f'mean_{metric_f.__name__}_fold_out_mean'] = np.nan
                best_models[f'mean_{metric_f.__name__}_fold_out_std'] = np.nan
                best_models[f'mean_{metric_f.__name__}_fold_out_ci_lower'] = np.nan
                best_models[f'mean_{metric_f.__name__}_fold_out_ci_upper'] = np.nan
            

            # Evaluating it on the held out test partition -----
            held_out_performances = []
            for i, row in best_models.iterrows():
                # print(row)
                # finding the correct file
                filename = (
                    "/".join([results_path,
                              targets_rev[dnames_to_ugly[row['target']]],
                              nice_to_ugly[row['model']] ])
                    + "/"
                    + "_".join(
                        [
                            dnames_to_ugly[row['target']],
                            nice_to_ugly[row['model']],
                            str(row['scale']),
                            str(row['icd_only']),
                            str(row['prompt_richness']),
                            str(row['RunID']),
                            row['fold'],
                            str(row['random_state']),
                        ]
                    )
                    + '.json'
                )
                # print(filename)
                cp_df = pd.read_json(filename, typ="series")

                cp_df = cp_df.rename(index={"few_feature": "icd_only"})

                # Evaluating the program in the other folds
                cp_file = filename.replace('.json', '_program') \
                            .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                            .replace('/', '.')[3:]

                if 'iterative' not in filename: # removing the fold from file name if not iterative
                    cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

                # print(cp_file)

                cp_function = None
                try:
                    cp_module = importlib.__import__(
                        cp_file, # This is going to be unique for each algorithm
                        globals(),
                        locals(),
                        ["predict_hyptertension"],
                    )
                    cp_function = cp_module.predict_hypertension
                except:
                    continue
                
                cp_df['RunID'] = '101'
                best_models.loc[i, f'size'] = cp_df['size']
                for metric_f in [average_precision_score, roc_auc_score]:
                    cp_df = evaluate_all_folds(metric_f, cp_function, cp_df, ['A'], bootstrap=True, override_runid=True)
                    
                    best_models.loc[i, f'mean_{metric_f.__name__}_fold_out_mean'] = cp_df[f'{metric_f.__name__}_fold_out_A_mean']
                    best_models.loc[i, f'mean_{metric_f.__name__}_fold_out_std'] = cp_df[f'{metric_f.__name__}_fold_out_A_std']
                    best_models.loc[i, f'mean_{metric_f.__name__}_fold_out_ci_upper'] = cp_df[f'{metric_f.__name__}_fold_out_A_ci_upper']
                    best_models.loc[i, f'mean_{metric_f.__name__}_fold_out_ci_lower'] = cp_df[f'{metric_f.__name__}_fold_out_A_ci_lower']
                    
            display(best_models.T)
    
            print("-"*120)

            # Create LaTeX table
            latex_table = best_models.to_latex(
                index=True,
                column_format="lrrr",
                escape=False
            )
            
            # Save LaTeX table to file
            filename = f"{paper_dir}/tab_best_model_{metric}_{prompt_richness}_{icd_only}.tex"
            with open(filename, 'w') as f:
                f.write(latex_table)
            
            print(f"\nLaTeX table saved to {filename}\n")

prompt_richness True expert_features True


Unnamed: 0_level_0,151,75,19,284,114,226
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
target,Htn-Hypokalemia Heuristic,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,HTN-Hypokalemia Diagnosis,Resistant HTN Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,True,True,True,True,True,True
icd_only,True,True,True,True,True,True
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,A,C,B,D,B,C
random_state,6832,16695,31658,14724,14724,24284
average_precision_score_fold_out,1,0.97889,0.968017,0.799027,0.740501,0.659152


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:04<00:00, 235.31it/s]
100%|██████████| 1000/1000 [00:04<00:00, 243.25it/s]
100%|██████████| 1000/1000 [00:07<00:00, 134.39it/s]
100%|██████████| 1000/1000 [00:08<00:00, 123.00it/s]
100%|██████████| 1000/1000 [00:04<00:00, 211.29it/s]
100%|██████████| 1000/1000 [00:05<00:00, 194.31it/s]
100%|██████████| 1000/1000 [00:02<00:00, 398.55it/s]
100%|██████████| 1000/1000 [00:02<00:00, 396.57it/s]
100%|██████████| 1000/1000 [00:05<00:00, 169.20it/s]
100%|██████████| 1000/1000 [00:05<00:00, 176.01it/s]
100%|██████████| 1000/1000 [00:35<00:00, 27.94it/s]
100%|██████████| 1000/1000 [00:36<00:00, 27.50it/s]


Unnamed: 0_level_0,151,75,19,284,114,226
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
target,Htn-Hypokalemia Heuristic,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,HTN-Hypokalemia Diagnosis,Resistant HTN Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,True,True,True,True,True,True
icd_only,True,True,True,True,True,True
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,A,C,B,D,B,C
random_state,6832,16695,31658,14724,14724,24284
average_precision_score_fold_out,1,0.97889,0.968017,0.799027,0.740501,0.659152


------------------------------------------------------------------------------------------------------------------------

LaTeX table saved to ../paper_rebuttal/tab_best_model_average_precision_score_fold_out_True_True.tex

prompt_richness False expert_features False


Unnamed: 0_level_0,66,21,295,151,234,107
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
target,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,Htn-Hypokalemia Heuristic,Resistant HTN Diagnosis,HTN-Hypokalemia Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,False,False,False,False,False,False
icd_only,False,False,False,False,False,False
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,B,C,E,A,D,A
random_state,24284,6832,16695,6832,14724,24481
average_precision_score_fold_out,0.994699,0.985126,0.828473,0.805917,0.688854,0.621716


100%|██████████| 1000/1000 [00:02<00:00, 360.04it/s]
100%|██████████| 1000/1000 [00:03<00:00, 308.65it/s]
100%|██████████| 1000/1000 [00:04<00:00, 236.84it/s]
100%|██████████| 1000/1000 [00:04<00:00, 206.79it/s]
100%|██████████| 1000/1000 [00:06<00:00, 163.17it/s]
100%|██████████| 1000/1000 [00:06<00:00, 158.77it/s]
100%|██████████| 1000/1000 [00:07<00:00, 129.91it/s]
100%|██████████| 1000/1000 [00:07<00:00, 133.38it/s]
100%|██████████| 1000/1000 [00:05<00:00, 178.30it/s]
100%|██████████| 1000/1000 [00:05<00:00, 177.32it/s]
100%|██████████| 1000/1000 [00:06<00:00, 148.31it/s]
100%|██████████| 1000/1000 [00:07<00:00, 130.25it/s]


Unnamed: 0_level_0,66,21,295,151,234,107
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
target,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,Htn-Hypokalemia Heuristic,Resistant HTN Diagnosis,HTN-Hypokalemia Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,False,False,False,False,False,False
icd_only,False,False,False,False,False,False
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,B,C,E,A,D,A
random_state,24284,6832,16695,6832,14724,24481
average_precision_score_fold_out,0.994699,0.985126,0.828473,0.805917,0.688854,0.621716


------------------------------------------------------------------------------------------------------------------------

LaTeX table saved to ../paper_rebuttal/tab_best_model_average_precision_score_fold_out_False_False.tex



In [None]:
# reminder of filename
# filename = (
#     rdir
#     + "/"
#     + "_".join(
#         [
#             targets[target],
#             name,
#             str(scale),
#             str(few_feature),
#             str(prompt_richness),
#             str(repeat),
#             str(fold),
#             str(random_state),
#         ]
#     )
# )