# Find best models

This notebook loads all models and re-evaluates them on all 5 folds, without re-fitting. The best model for each phenotype is then selected for further experiments.

In [1]:
from _load_llm_results import *

import numpy as np
import warnings
import os
import importlib
import sys; sys.path.append('../')
from evaluate_model import read_data
from sklearn.metrics import average_precision_score, roc_auc_score

warnings.filterwarnings("ignore")

%matplotlib inline

from tqdm import tqdm
from glob import glob

data_dir = '../data'

def evaluate_all_folds(metric, cp_function, cp_df, folds=['A', 'B', 'C', 'D', 'E'], bootstrap=False, n_reps=1_000, override_runid=False):
    """Score an already-fitted prediction function on the train and test split of each fold.

    Parameters
    ----------
    metric : callable
        Called as ``metric(y_true, y_hat)``. ``average_precision_score`` and
        ``roc_auc_score`` receive the raw probabilities; any other metric
        receives labels obtained by thresholding the probabilities at 0.5.
        ``metric.__name__`` is used to build the result keys.
    cp_function : callable
        Called as ``cp_function(X)``; its output is converted with ``np.array``
        and treated as one probability per row.
    cp_df : pandas.Series-like record
        Must provide 'RunID', 'target', 'scale', 'icd_only' and 'random_state'.
        It is modified in place (new entries are added) and also returned.
    folds : iterable of str
        Fold identifiers handed to ``read_data``. The default list is only
        iterated, never mutated.
    bootstrap : bool
        If True, every split is resampled with replacement ``n_reps`` times
        (NumPy's global RNG; no seed is set here) and the mean, standard
        deviation and the 5th / 95th percentiles of the metric are stored
        under ``<entry>_mean``, ``<entry>_std``, ``<entry>_ci_lower`` and
        ``<entry>_ci_upper``. Otherwise a single value is stored under
        ``<entry>``.
    n_reps : int
        Number of bootstrap resamples.
    override_runid : bool
        If True, 'RunID' is forced to '101' (both for reading the data and in
        ``cp_df``).

    Returns
    -------
    The same ``cp_df`` object, extended with
    ``{metric.__name__}_train_{fold}`` and ``{metric.__name__}_fold_out_{fold}``
    entries (or their bootstrap variants).
    """
    runid = cp_df['RunID']
    if override_runid:
        runid = '101' # enforcing to use the held out dataset
        print("Overriding RunID, Using runID", runid)
        cp_df['RunID'] = runid

    def score(X, y):
        # Best effort: a program that fails to predict is scored as all-zero probabilities.
        try:
            prob = np.array(cp_function(X))
        except Exception:
            prob = np.zeros(shape=len(X))
        try:
            if metric in [average_precision_score, roc_auc_score]:
                return metric(y, prob)
            # Threshold-based metrics need hard labels, not probabilities.
            return metric(y, np.where(prob > 0.5, 1, 0))
        except ValueError:
            # A ValueError from the metric is recorded as a missing value.
            return np.nan

    for FOLD in folds:
        X_train, y_train, X_test, y_test = read_data(
            targets_rev[cp_df['target']], FOLD, runid, cp_df['scale'],
            cp_df['icd_only'], data_dir, cp_df['random_state']
        )

        for split, X_split, y_split in (
            ('train', X_train, y_train),
            ('fold_out', X_test, y_test),
        ):
            entry = f"{metric.__name__}_{split}_{FOLD}"

            if not bootstrap:
                cp_df[entry] = score(X_split, y_split)
                continue

            n_rows = len(y_split)
            val_samples = []
            for _ in tqdm(range(n_reps)):
                # randint's upper bound is exclusive, so `n_rows` lets every row be drawn.
                samples = np.random.randint(0, n_rows, size=n_rows)
                val_samples.append(score(X_split.iloc[samples, :], y_split.iloc[samples]))

            cp_df[f"{entry}_mean"] = np.mean(val_samples)
            cp_df[f"{entry}_std"] = np.std(val_samples)
            cp_df[f"{entry}_ci_upper"] = np.quantile(val_samples, 0.95)
            cp_df[f"{entry}_ci_lower"] = np.quantile(val_samples, 0.05)

    return cp_df

def load_results(constraints=dict(scale=[False])):
    """Load result JSON files, re-score each stored program on all folds, and tabulate them.

    Every ``*.json`` found recursively under ``results_path`` is read as a
    pandas Series. Files that cannot be parsed, that do not satisfy
    ``constraints``, or whose program module cannot be imported / evaluated
    are skipped silently (only the final count is printed).

    Parameters
    ----------
    constraints : dict
        Maps a field name of the result record to the collection of accepted
        values; a record is kept only if every listed field matches. The
        default is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per kept result, with 'model' and 'target' relabelled through
        ``nice_model_labels`` / ``dnames_to_nice`` and restricted to models
        listed in ``order``.
    """
    # NOTE(review): `results` is not defined in this cell; presumably it comes from the
    # star import. If it is a module-level list, calling this function twice appends
    # the same rows again -- confirm before re-running.
    count = 0
    for file in tqdm(glob(f"{results_path}/**/*.json", recursive=True)):

        try:
            cp_df = pd.read_json(file, typ="series")
        except Exception:
            # print("Bad results in file", file)
            continue

        # unifying column name
        cp_df = cp_df.rename(index={"few_feature": "icd_only"})

        # Keep the record only if every constrained field has an accepted value.
        if any(cp_df[kc] not in vc for kc, vc in constraints.items()):
            # print("skipping", file)
            continue

        indxs = cp_df.index
        # indxs = [indx for indx in indxs if indx not in ['pred', 'pred_proba']]

        # Evaluating the program in the other folds.
        # The JSON path is turned into a dotted module name; [3:] drops the first three
        # characters (presumably the leading '../' turned into '...' -- confirm against
        # results_path).
        cp_file = file.replace('.json', '_program') \
                      .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                      .replace('/', '.')[3:]

        if 'iterative' not in file: # removing the fold from file name if not iterative
            cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

        # print(cp_file)

        try:
            # import_module returns the leaf module directly, so no fromlist is needed.
            cp_module = importlib.import_module(cp_file)
            cp_function = cp_module.predict_hypertension
            cp_df = evaluate_all_folds(average_precision_score, cp_function, cp_df[indxs])
        except Exception:
            # Best effort: programs that cannot be imported or scored are dropped.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue

        results.append(cp_df)
        count += 1

    print('loaded',count,'files')
    results_df = pd.DataFrame(data=results) #, columns=indxs)

    # Beautifying it
    results_df["model"] = results_df["model"].apply(lambda m: nice_model_labels[m])
    results_df["target"] = results_df["target"].apply(lambda t: dnames_to_nice[t])

    results_df = results_df[results_df["model"].isin(order)]

    print(results_df["model"].unique())
    print(results_df["target"].unique())
    return results_df
    
# Ordered phenotype labels (not referenced by the other cells visible in this notebook).
phenotypes_order = [
    'HTN Heuristic', 'Htn-Hypokalemia Heuristic', 'Resistant HTN Heuristic',
    'HTN Diagnosis', 'HTN-Hypokalemia Diagnosis', 'Resistant HTN Diagnosis',
]

# Ordered prompt / feature setting labels (likewise unused in the visible cells).
settings_order = [
    'Simple prompt,\nfew features', 'Simple prompt,\nall features',
    'Rich prompt,\nfew features', 'Rich prompt,\nall features',
]

# Only unscaled, rich-prompt, icd_only runs of the iterative GPT-4o classifier are loaded.
selection = dict(
    scale=[False],
    model=['GPT_4o_iterative_Classifier'],
    prompt_richness=[True],
    icd_only=[True],
)
results_df = load_results(constraints=selection)

# Persist the table, then continue from the CSV copy.
results_df.to_csv('find_best.csv', index=False)
results_df = pd.read_csv('find_best.csv')

# Making it the format seaborn likes: identifier columns stay, everything else is melted
# into (variable, value) pairs.
id_columns = [
    'model', 'target', 'fold', 'RunID',
    'random_state', 'prompt_richness', 'icd_only', 'scale',
]
results_df_melted = results_df.melt(id_vars=id_columns)

print(results_df.columns)
print(results_df.shape)

100%|██████████| 7200/7200 [06:40<00:00, 17.96it/s]  


loaded 300 files
['gpt-4o-iter']
['HTN Heuristic' 'HTN Diagnosis' 'HTN-Hypokalemia Diagnosis'
 'Htn-Hypokalemia Heuristic' 'Resistant HTN Heuristic'
 'Resistant HTN Diagnosis']
Index(['accuracy_score_train', 'precision_score_train',
       'average_precision_score_train', 'roc_auc_score_train',
       'balanced_accuracy_score_train', 'accuracy_score_test',
       'precision_score_test', 'average_precision_score_test',
       'roc_auc_score_test', 'balanced_accuracy_score_test', 'messages',
       'model', 'target', 'fold', 'RunID', 'random_state', 'representation',
       'representation_fmt', 'size', 'complexity', 'scale', 'icd_only',
       'prompt_richness', 'time', 'pred', 'pred_proba',
       'average_precision_score_train_A', 'average_precision_score_fold_out_A',
       'average_precision_score_train_B', 'average_precision_score_fold_out_B',
       'average_precision_score_train_C', 'average_precision_score_fold_out_C',
       'average_precision_score_train_D', 'average_precision

In [2]:
# Distinct measurement names available after melting.
results_df_melted['variable'].unique()

array(['accuracy_score_train', 'precision_score_train',
       'average_precision_score_train', 'roc_auc_score_train',
       'balanced_accuracy_score_train', 'accuracy_score_test',
       'precision_score_test', 'average_precision_score_test',
       'roc_auc_score_test', 'balanced_accuracy_score_test', 'messages',
       'representation', 'representation_fmt', 'size', 'complexity',
       'time', 'pred', 'pred_proba', 'average_precision_score_train_A',
       'average_precision_score_fold_out_A',
       'average_precision_score_train_B',
       'average_precision_score_fold_out_B',
       'average_precision_score_train_C',
       'average_precision_score_fold_out_C',
       'average_precision_score_train_D',
       'average_precision_score_fold_out_D',
       'average_precision_score_train_E',
       'average_precision_score_fold_out_E'], dtype=object)

In [3]:
# For every (prompt_richness, icd_only) setting: pick, per phenotype, the run with the
# highest mean fold-out score, bootstrap-evaluate its program on the held-out data
# (RunID '101'), and export the resulting table to LaTeX.
for prompt_richness in [True,False]:
    for icd_only in [True,False]:
        print("prompt_richness",prompt_richness,"expert_features",icd_only)
        for metric in [
            'average_precision_score_fold_out', 
            # 'roc_auc_score_test', 
            # 'accuracy_score_test', 
            # 'size'
        ]:
            # Rows of the chosen metric, for the baseline models or for the current setting.
            data = (
                results_df_melted[
                    (results_df_melted['variable'].str.contains(metric))
                    & (
                        (results_df_melted['model'].isin(['DT','FEAT','LR L1','RF']))
                        | 
                        (
                            (results_df_melted['prompt_richness']==prompt_richness)
                            & (results_df_melted['icd_only']==icd_only)
                        )
                    )
                ]
                .dropna()
            )

            if len(data)==0:
                continue

            # prettify metric names (kept separate from the loop variable `metric`)
            metric_label = metric.replace('_fold_out', '_validation')
            metric_label = metric_label.replace('average_precision_score', 'AUPRC')

            data = data.rename(columns={'value': metric_label}) #, 'model':'Model'})
            data[metric_label] = data[metric_label].astype(float)
            data['Model'] = data['model'].apply(lambda x: x.replace('-iter',''))
            # Anything iterative or non-GPT counts as trained; plain GPT runs are zero shot.
            data['Strategy'] = ['SEDI / Train' if 'iter' in v or 'gpt' not in v else 'Zero Shot' for v in data['model'].values]
            hue_order = ['Zero Shot', 'SEDI / Train']

            # print(data.random_state.unique())
            # display(data)

            # first we group each model's performance on the held out data and take the mean.
            # then, we ignore folds and random states to get the final model.
            model_metrics_all_folds = data.groupby([
                'target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale',
                'RunID', 'fold', 'random_state']
            )[metric_label].mean().reset_index()
            # display(model_metrics_all_folds)

            # Sorting first means drop_duplicates keeps the top-scoring run of each group.
            best_models = model_metrics_all_folds\
                .sort_values(metric_label, ascending=False)\
                .drop_duplicates(['target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale'])

            # Pre-create the result columns so rows whose program cannot be loaded stay NaN.
            best_models['size'] = np.nan
            for metric_f in [average_precision_score, roc_auc_score]:
                for split in ('train', 'fold_out'):
                    for stat in ('mean', 'std', 'ci_lower', 'ci_upper'):
                        best_models[f'mean_{metric_f.__name__}_{split}_{stat}'] = np.nan

            display(best_models.T)

            # Evaluating it on the held out test partition -----
            for i, row in best_models.iterrows():
                # print(row)
                # finding the correct file
                filename = (
                    "/".join([results_path,
                              targets_rev[dnames_to_ugly[row['target']]],
                              nice_to_ugly[row['model']] ])
                    + "/"
                    + "_".join(
                        [
                            dnames_to_ugly[row['target']],
                            nice_to_ugly[row['model']],
                            str(row['scale']),
                            str(row['icd_only']),
                            str(row['prompt_richness']),
                            str(row['RunID']),
                            row['fold'],
                            str(row['random_state']),
                        ]
                    )
                    + '.json'
                )
                # print(filename)
                cp_df = pd.read_json(filename, typ="series")

                cp_df = cp_df.rename(index={"few_feature": "icd_only"})

                # Evaluating the program in the other folds.
                # The JSON path becomes a dotted module name; [3:] drops its first three
                # characters (presumably the leading '../' -- confirm against results_path).
                cp_file = filename.replace('.json', '_program') \
                            .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                            .replace('/', '.')[3:]

                if 'iterative' not in filename: # removing the fold from file name if not iterative
                    cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

                # print(cp_file)

                try:
                    # This is going to be unique for each algorithm.
                    cp_module = importlib.import_module(cp_file)
                    cp_function = cp_module.predict_hypertension
                except Exception as err:
                    # Leave this row's held-out columns as NaN, but say why.
                    print("Skipping", cp_file, "-", repr(err))
                    continue

                cp_df['RunID'] = '101'
                best_models.loc[i, 'size'] = cp_df['size']
                for metric_f in [average_precision_score, roc_auc_score]:
                    cp_df = evaluate_all_folds(metric_f, cp_function, cp_df, ['A'], bootstrap=True, override_runid=True)

                    # Copy the bootstrap summary of fold 'A' into the table.
                    for split in ('train', 'fold_out'):
                        for stat in ('mean', 'std', 'ci_upper', 'ci_lower'):
                            best_models.loc[i, f'mean_{metric_f.__name__}_{split}_{stat}'] = \
                                cp_df[f'{metric_f.__name__}_{split}_A_{stat}']

            display(best_models.T)

            print("-"*120)

            # Create LaTeX table; the column spec must list the index plus every data
            # column, otherwise the tabular does not compile.
            latex_table = best_models.to_latex(
                index=True,
                column_format="l" + "r" * len(best_models.columns),
                escape=False
            )

            # Save LaTeX table to file
            table_path = f"{paper_dir}/tab_best_model_{metric_label}_{prompt_richness}_{icd_only}.tex"
            with open(table_path, 'w') as f:
                f.write(latex_table)

            print(f"\nLaTeX table saved to {table_path}\n")

prompt_richness True expert_features True


Unnamed: 0,150,75,19,284,114,226
target,Htn-Hypokalemia Heuristic,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,HTN-Hypokalemia Diagnosis,Resistant HTN Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,True,True,True,True,True,True
icd_only,True,True,True,True,True,True
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,A,C,B,D,B,C
random_state,1318,16695,31658,14724,14724,24284
AUPRC_validation,1,0.97889,0.968017,0.799027,0.740501,0.659152


Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:11<00:00, 86.95it/s]
100%|██████████| 1000/1000 [00:10<00:00, 95.30it/s]


Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:11<00:00, 84.78it/s]
100%|██████████| 1000/1000 [00:11<00:00, 86.90it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:16<00:00, 61.31it/s]
100%|██████████| 1000/1000 [00:17<00:00, 58.13it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:17<00:00, 56.85it/s]
100%|██████████| 1000/1000 [00:17<00:00, 58.81it/s]


Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:10<00:00, 97.87it/s]
100%|██████████| 1000/1000 [00:10<00:00, 95.21it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:11<00:00, 90.03it/s]
100%|██████████| 1000/1000 [00:08<00:00, 111.54it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:02<00:00, 368.42it/s]
100%|██████████| 1000/1000 [00:02<00:00, 482.10it/s]
  4%|▍         | 43/1000 [00:00<00:02, 427.61it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:02<00:00, 425.50it/s]
100%|██████████| 1000/1000 [00:02<00:00, 435.93it/s]
  2%|▏         | 23/1000 [00:00<00:04, 224.19it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:04<00:00, 207.68it/s]
100%|██████████| 1000/1000 [00:04<00:00, 216.46it/s]
  2%|▏         | 21/1000 [00:00<00:04, 203.32it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [00:05<00:00, 197.30it/s]
100%|██████████| 1000/1000 [00:04<00:00, 214.44it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [01:24<00:00, 11.89it/s]
100%|██████████| 1000/1000 [00:29<00:00, 33.61it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Overriding RunID, Using runID 101


100%|██████████| 1000/1000 [01:25<00:00, 11.65it/s]
100%|██████████| 1000/1000 [00:30<00:00, 32.86it/s]


Unnamed: 0,150,75,19,284,114,226
target,Htn-Hypokalemia Heuristic,HTN Heuristic,HTN Diagnosis,Resistant HTN Heuristic,HTN-Hypokalemia Diagnosis,Resistant HTN Diagnosis
model,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter,gpt-4o-iter
prompt_richness,True,True,True,True,True,True
icd_only,True,True,True,True,True,True
Strategy,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train,SEDI / Train
scale,False,False,False,False,False,False
RunID,1,1,1,1,1,1
fold,A,C,B,D,B,C
random_state,1318,16695,31658,14724,14724,24284
AUPRC_validation,1,0.97889,0.968017,0.799027,0.740501,0.659152


------------------------------------------------------------------------------------------------------------------------

LaTeX table saved to ../paper_rebuttal/tab_best_model_AUPRC_validation_True_True.tex

prompt_richness True expert_features False
prompt_richness False expert_features True
prompt_richness False expert_features False


In [4]:
# Reminder: how a result filename is assembled (compare with the lookup in the previous cell)
# filename = (
#     rdir
#     + "/"
#     + "_".join(
#         [
#             targets[target],
#             name,
#             str(scale),
#             str(few_feature),
#             str(prompt_richness),
#             str(repeat),
#             str(fold),
#             str(random_state),
#         ]
#     )
# )