# Find best models

This notebook loads all models and re-evaluates them on all 5 folds without refitting. The best model for each phenotype is then selected for further experiments.

In [None]:
from _load_llm_results import *

import numpy as np
import warnings
import os
import importlib
import sys; sys.path.append('../')
from evaluate_model import read_data
from sklearn.metrics import average_precision_score, roc_auc_score

warnings.filterwarnings("ignore")

%matplotlib inline

from tqdm import tqdm
from glob import glob

data_dir = '../data'

def bootstrap(val, n = 1000, fn=np.mean):
    """Bootstrap the statistic ``fn`` over the observations in ``val``.

    Draws ``n`` resamples of ``val`` (with replacement, same length as
    ``val``), applies ``fn`` to each, and summarises the resulting values.

    Parameters
    ----------
    val : array-like
        One-dimensional observations. Converted with ``np.asarray`` so that
        the integer-array indexing below is always positional.
    n : int
        Number of bootstrap resamples.
    fn : callable
        Statistic computed on each resample (defaults to ``np.mean``).

    Returns
    -------
    tuple
        ``(m, sd, ci_upper, ci_lower)``: the mean and standard deviation of
        the resampled statistics, followed by their 0.95 and 0.05 quantiles
        (i.e. a 90% percentile interval, upper bound first).
    """
    val = np.asarray(val)
    n_obs = len(val)

    val_samples = []
    for _ in range(n):
        # randint's upper bound is exclusive: use n_obs (not n_obs - 1) so the
        # last observation can be drawn and single-element inputs work.
        sample = np.random.randint(0, n_obs, size=n_obs)
        val_samples.append(fn(val[sample]))

    m = np.mean(val_samples)
    sd = np.std(val_samples)
    ci_upper = np.quantile(val_samples, 0.95)
    ci_lower = np.quantile(val_samples, 0.05)
    return m, sd, ci_upper, ci_lower

def evaluate_all_folds(metric, cp_function, cp_df, folds=('A', 'B', 'C', 'D', 'E'), bootstrap=False):
    """Score an already-fitted program on the test split of each fold.

    For every fold the data is loaded through ``read_data`` using the
    settings stored in ``cp_df`` (target, RunID, scale, icd_only,
    random_state) and ``cp_function`` is applied to the test features
    without any re-fitting.

    Parameters
    ----------
    metric : callable
        Scoring function called as ``metric(y_true, y_pred)``. Its
        ``__name__`` is used to build the result keys.
        ``average_precision_score`` and ``roc_auc_score`` receive the raw
        output of ``cp_function``; any other metric receives labels
        thresholded at 0.5.
    cp_function : callable
        The program to evaluate; called with the feature table.
    cp_df : pandas.Series
        Result record of the run. It is updated in place and returned.
    folds : iterable of str
        Fold identifiers to evaluate.
    bootstrap : bool
        If true, resample the test split 1000 times and store
        ``<entry>_mean``, ``<entry>_std``, ``<entry>_ci_upper`` (0.95
        quantile) and ``<entry>_ci_lower`` (0.05 quantile); otherwise store a
        single score under ``<entry>``, where
        ``<entry> = f"{metric.__name__}_fold_out_{FOLD}"``.

    Returns
    -------
    pandas.Series
        ``cp_df`` with the new entries added.
    """
    # Metrics that consume scores directly; everything else gets 0/1 labels.
    score_metrics = (average_precision_score, roc_auc_score)

    def _score(X, y):
        # Best effort: a program that crashes on this data is scored as if it
        # predicted 0 for every row instead of aborting the whole evaluation.
        try:
            prob = np.array(cp_function(X))
        except Exception:
            prob = np.zeros(shape=len(X))

        try:
            if metric in score_metrics:
                return metric(y, prob)
            # Threshold the scores into hard labels for label-based metrics.
            return metric(y, np.where(prob > 0.5, 1, 0))
        except ValueError:
            # Raised by the metric itself (e.g. a resample the metric cannot
            # score); recorded as missing.
            return np.nan

    for FOLD in folds:
        _, _, X_test, y_test = read_data(
            targets_rev[cp_df['target']], FOLD, cp_df['RunID'], cp_df['scale'],
            cp_df['icd_only'], data_dir, cp_df['random_state']
        )

        entry = f"{metric.__name__}_fold_out_{FOLD}"
        if bootstrap:
            n_test = len(X_test)
            # Index labels by position so they stay aligned with X_test.iloc.
            y_values = np.asarray(y_test)

            val_samples = []
            for _ in tqdm(range(1_000)):
                # Upper bound of randint is exclusive, so n_test (not
                # n_test - 1) is needed for the last row to be sampled.
                samples = np.random.randint(0, n_test, size=n_test)
                val_samples.append(_score(X_test.iloc[samples, :], y_values[samples]))

            cp_df[f"{entry}_mean"] = np.mean(val_samples)
            cp_df[f"{entry}_std"] = np.std(val_samples)
            cp_df[f"{entry}_ci_upper"] = np.quantile(val_samples, 0.95)
            cp_df[f"{entry}_ci_lower"] = np.quantile(val_samples, 0.05)
        else:
            cp_df[entry] = _score(X_test, y_test)

    return cp_df

def load_results(constraints=dict(scale=[False])):
    """Load result JSON files and re-score each run's program on all folds.

    Every ``*.json`` file found recursively under ``results_path`` is read as
    a pandas Series. Records that do not satisfy ``constraints`` are skipped.
    For the remaining ones, the companion ``*_program`` module is imported and
    its ``predict_hypertension`` function is evaluated with
    ``average_precision_score`` via ``evaluate_all_folds``.

    Parameters
    ----------
    constraints : dict
        Maps a record key to the collection of accepted values; a record is
        kept only if ``record[key] in accepted`` holds for every key.

    Returns
    -------
    pandas.DataFrame
        One row per successfully evaluated run, with ``model`` and ``target``
        mapped to their display names and restricted to models in ``order``.
    """
    # Collected locally so repeated calls do not accumulate rows and the
    # function does not depend on a pre-existing global list.
    results = []
    count = 0

    for file in tqdm(glob(f"{results_path}/**/*.json", recursive=True)):
        try:
            cp_df = pd.read_json(file, typ="series")
        except Exception:
            # Unreadable / malformed result file: skip it.
            continue

        # unifying column name
        cp_df = cp_df.rename(index={"few_feature": "icd_only"})

        if any(cp_df[kc] not in vc for kc, vc in constraints.items()):
            continue

        indxs = cp_df.index

        # Build the dotted module path of the program stored next to the JSON.
        # NOTE(review): the [3:] slice drops the first three characters of the
        # dotted path - presumably a leading '../' in results_path; confirm.
        cp_file = file.replace('.json', '_program') \
                      .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                      .replace('/', '.')[3:]

        if 'iterative' not in file: # removing the fold from file name if not iterative
            cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

        print(cp_file)

        # Best effort: a program that cannot be imported or evaluated is
        # reported and skipped, without hiding KeyboardInterrupt/SystemExit.
        try:
            cp_module = importlib.import_module(cp_file)
            cp_function = cp_module.predict_hypertension
            cp_df = evaluate_all_folds(average_precision_score, cp_function, cp_df[indxs])
        except Exception as err:
            print('skipping', cp_file, '-', repr(err))
            continue

        results.append(cp_df)
        count += 1

    print('loaded',count,'files')
    results_df = pd.DataFrame(data=results) #, columns=indxs)

    # Beautifying it
    results_df["model"] = results_df["model"].apply(lambda m: nice_model_labels[m])
    results_df["target"] = results_df["target"].apply(lambda t: dnames_to_nice[t])

    results_df = results_df[results_df["model"].isin(order)]

    print(results_df["model"].unique())
    print(results_df["target"].unique())
    return results_df
    
# Display order of the phenotypes (heuristic labels first, then diagnoses).
phenotypes_order = [
    'HTN Heuristic',
    'Htn-Hypokalemia Heuristic',
    'Resistant HTN Heuristic',
    'HTN Diagnosis',
    'HTN-Hypokalemia Diagnosis',
    'Resistant HTN Diagnosis',
]

# Display order of the prompt / feature-set combinations.
settings_order = [
    'Simple prompt,\nfew features',
    'Simple prompt,\nall features',
    'Rich prompt,\nfew features',
    'Rich prompt,\nall features',
]

# Only the iterative GPT-4o runs with rich prompts, the icd_only feature set
# and unscaled data are re-evaluated.
results_df = load_results(
    constraints=dict(
        scale=[False],
        model=['GPT_4o_iterative_Classifier'],
        prompt_richness=[True],
        icd_only=[True],
    )
)

# Persist the re-evaluation, then continue from the file that was written.
results_df.to_csv('find_best.csv', index=False)
results_df = pd.read_csv('find_best.csv')

# Long format (one row per run and measurement), which is what seaborn expects.
results_df_melted = results_df.melt(
    id_vars=[
        'model',
        'target',
        'fold',
        'RunID',
        'random_state',
        'prompt_richness',
        'icd_only',
        'scale',
    ]
)

print(results_df.columns)
print(results_df.shape)

 10%|█████████████████▉                                                                                                                                                                | 725/7200 [00:02<00:24, 267.63it/s]

results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_1318_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_14724_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_16695_program


 10%|██████████████████▋                                                                                                                                                                | 753/7200 [00:03<01:18, 82.15it/s]

results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_24284_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_24481_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_27690_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_31658_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_6832_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_6933_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_A_8233_program
results_paper_re

 11%|███████████████████▏                                                                                                                                                               | 774/7200 [00:09<08:38, 12.39it/s]

results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_24481_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_27690_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_31658_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_6832_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_6933_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_C_8233_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_D_1318_program
results_paper_reb

 11%|███████████████████▌                                                                                                                                                               | 789/7200 [00:14<12:55,  8.27it/s]

results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_D_8233_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_1318_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_14724_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_16695_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_24284_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_24481_program
results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_27690_program
results_paper_r

 11%|███████████████████▊                                                                                                                                                               | 799/7200 [00:17<16:01,  6.66it/s]

results_paper_rebuttal.HTN_heuristic.GPT_4o_iterative_Classifier.HTN_heuristic_GPT_4o_iterative_Classifier_False_True_True_1_E_8233_program


 27%|███████████████████████████████████████████████▉                                                                                                                                 | 1948/7200 [00:21<00:19, 262.78it/s]

results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_1318_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_14724_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_16695_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_24284_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_24481_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_27690_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_31658_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia

 27%|████████████████████████████████████████████████▊                                                                                                                                 | 1975/7200 [00:29<07:16, 11.96it/s]

results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_27690_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_31658_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_6832_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_6933_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_8233_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_D_1318_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_D_14724_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GP

 28%|█████████████████████████████████████████████████▎                                                                                                                                | 1994/7200 [00:35<11:43,  7.40it/s]

results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_24481_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_27690_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_31658_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_6832_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_6933_program
results_paper_rebuttal.htn_dx_ia.GPT_4o_iterative_Classifier.htn_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_E_8233_program


 44%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 3148/7200 [00:42<00:18, 223.18it/s]

results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_1318_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_14724_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_16695_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_24284_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_24481_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_27690_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_A_

 44%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 3171/7200 [00:48<04:44, 14.18it/s]

results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_14724_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_16695_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_24284_program
results_paper_rebuttal.htn_hypok_dx_ia.GPT_4o_iterative_Classifier.htn_hypok_dx_ia_GPT_4o_iterative_Classifier_False_True_True_1_C_24481_program


In [None]:
# List the distinct measurement names that melt stored in the 'variable' column.
results_df_melted["variable"].unique()

In [None]:
# For every (prompt_richness, icd_only) setting:
#   1. keep the per-fold validation scores of the matching runs,
#   2. pick, per target/model/setting, the run with the highest mean score,
#   3. bootstrap-evaluate that run's program with RunID '101', fold 'A',
#   4. display the table and export it as LaTeX.
for prompt_richness in [True,False]:
    for icd_only in [True,False]:
        print("prompt_richness",prompt_richness,"expert_features",icd_only)
        for metric in [
            'average_precision_score_fold_out', 
            # 'roc_auc_score_test', 
            # 'accuracy_score_test', 
            # 'size'
        ]:
            is_metric_row = results_df_melted['variable'].str.contains(metric)
            is_baseline = results_df_melted['model'].isin(['DT','FEAT','LR L1','RF'])
            is_setting = (
                (results_df_melted['prompt_richness']==prompt_richness)
                & (results_df_melted['icd_only']==icd_only)
            )
            data = results_df_melted[is_metric_row & (is_baseline | is_setting)].dropna()

            # prettify metric names (the pretty name is also used in the
            # output file name below)
            metric = metric.replace('_fold_out', '_validation')
            metric = metric.replace('average_precision_score', 'AUPRC')

            data = data.rename(columns={'value': metric}) #, 'model':'Model'})
            data[metric] = data[metric].astype(float)
            data['Model'] = data['model'].apply(lambda x: x.replace('-iter',''))
            data['Strategy'] = ['SEDI / Train' if 'iter' in v or 'gpt' not in v else 'Zero Shot' for v in data['model'].values]
            hue_order = ['Zero Shot', 'SEDI / Train']  # not used in this cell

            # first we group each model's performance on the held out data and take the mean.
            # then, we ignore folds and random states to get the final model.
            model_metrics_all_folds = data.groupby([
                'target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale',
                'RunID', 'fold', 'random_state']
            )[metric].mean().reset_index()

            # Sorting best-first and dropping duplicates keeps the top run per group.
            best_models = model_metrics_all_folds\
                .sort_values(metric, ascending=False)\
                .drop_duplicates(['target', 'model', 'prompt_richness', 'icd_only', 'Strategy', 'scale'])

            # Pre-create the output columns; rows whose program cannot be
            # imported keep NaN in all of them.
            held_out_metrics = [average_precision_score, roc_auc_score]
            best_models[f'size'] = np.nan
            for metric_f in held_out_metrics:
                for suffix in ['mean', 'std', 'ci_lower', 'ci_upper']:
                    best_models[f'mean_{metric_f.__name__}_fold_out_{suffix}'] = np.nan

            # Evaluating it on the held out test partition -----
            held_out_performances = []  # not used in this cell
            for i, row in best_models.iterrows():
                # finding the correct file
                result_file = (
                    "/".join([results_path,
                              targets_rev[dnames_to_ugly[row['target']]],
                              nice_to_ugly[row['model']] ])
                    + "/"
                    + "_".join(
                        [
                            dnames_to_ugly[row['target']],
                            nice_to_ugly[row['model']],
                            str(row['scale']),
                            str(row['icd_only']),
                            str(row['prompt_richness']),
                            str(row['RunID']),
                            row['fold'],
                            str(row['random_state']),
                        ]
                    )
                    + '.json'
                )
                cp_df = pd.read_json(result_file, typ="series")

                cp_df = cp_df.rename(index={"few_feature": "icd_only"})

                # Dotted module path of the program stored next to the JSON.
                # NOTE(review): [3:] drops the first three characters -
                # presumably a leading '../' in results_path; confirm.
                cp_file = result_file.replace('.json', '_program') \
                            .replace(cp_df['target'], targets_rev[cp_df['target']]) \
                            .replace('/', '.')[3:]

                if 'iterative' not in result_file: # removing the fold from file name if not iterative
                    cp_file = cp_file.replace(f"_{cp_df['fold']}_", '__')

                # The module path is unique for each algorithm. A program that
                # cannot be imported is reported and its row left as NaN.
                try:
                    cp_module = importlib.import_module(cp_file)
                    cp_function = cp_module.predict_hypertension
                except Exception as err:
                    print('skipping', cp_file, '-', repr(err))
                    continue

                # NOTE(review): RunID '101' is presumably the identifier of the
                # held-out test partition understood by read_data - confirm.
                cp_df['RunID'] = '101'
                best_models.loc[i, f'size'] = cp_df['size']
                for metric_f in held_out_metrics:
                    cp_df = evaluate_all_folds(metric_f, cp_function, cp_df, ['A'], bootstrap=True)
                    for suffix in ['mean', 'std', 'ci_upper', 'ci_lower']:
                        best_models.loc[i, f'mean_{metric_f.__name__}_fold_out_{suffix}'] = \
                            cp_df[f'{metric_f.__name__}_fold_out_A_{suffix}']

            display(best_models)

            print("-"*120)

            # Create LaTeX table. The column spec must declare one column for
            # the index plus one per data column, otherwise the tabular does
            # not match the rows that are written.
            latex_table = best_models.to_latex(
                index=True,
                column_format="l" + "r" * len(best_models.columns),
                escape=False
            )

            # Save LaTeX table to file
            filename = f"{paper_dir}/tab_best_model_{metric}_{prompt_richness}_{icd_only}.tex"
            with open(filename, 'w') as f:
                f.write(latex_table)

            print(f"\nLaTeX table saved to {filename}\n")