In [1]:
from _load_llm_results import *

import numpy as np
import warnings
import os
import importlib
import sys; sys.path.append('../')
from evaluate_model import read_data

from sklearn.metrics import accuracy_score, precision_score, balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")

%matplotlib inline

from tqdm import tqdm
from glob import glob

# Data location handed to read_data() when the train/test splits are loaded.
data_dir = '../data'
# Folder that receives the exported LaTeX table. It already ends with a slash,
# so paths built as f"{paper_dir}/..." contain a (harmless) double slash.
paper_dir = '../paper_rebuttal/'

In [2]:
# Load the train/test split for target 'res_htn_dx_ia', fold 'A'.
# NOTE(review): the remaining positional arguments (101, False, False, 1318) are
# presumably a run id, two boolean switches and a random seed -- confirm against
# evaluate_model.read_data before changing them.
X_train, y_train, X_test, y_test = read_data(
    'res_htn_dx_ia', 'A', 101, False, False, data_dir, 1318 )

def feature_engineering(df):
    """Append the LLM-generated threshold conditions as 0.0/1.0 indicator columns.

    Each new column is named after the condition it encodes (for example
    ``'mean_systolic>140'``) so the conditionals can be used as features
    for a logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw columns referenced below. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    """
    # (new column, source column, Series comparison method, threshold);
    # the order here is the order in which the columns are appended.
    indicator_specs = (
        ('Med_Potassium_N>0', 'Med_Potassium_N', 'gt', 0),
        ('Dx_HypoK_N>0', 'Dx_HypoK_N', 'gt', 0),
        ('mean_systolic>140', 'mean_systolic', 'gt', 140),
        ('mean_diastolic>80', 'mean_diastolic', 'gt', 80),
        ('mean_systolic<130', 'mean_systolic', 'lt', 130),
        ('mean_diastolic<75', 'mean_diastolic', 'lt', 75),
        ('high_BP_during_htn_meds_2>5', 'high_BP_during_htn_meds_2', 'gt', 5),
        ('high_BP_during_htn_meds_3>=2', 'high_BP_during_htn_meds_3', 'ge', 2),
        ('high_BP_during_htn_meds_3>5', 'high_BP_during_htn_meds_3', 'gt', 5),
        ('sum_enc_during_htn_meds_4_plus>=2', 'sum_enc_during_htn_meds_4_plus', 'ge', 2),
    )

    for indicator_name, source_name, comparison, threshold in indicator_specs:
        condition_met = getattr(df[source_name], comparison)(threshold)
        df[indicator_name] = condition_met.astype(float)

    return df

# Add the indicator columns to both splits (train first, then test).
X_train, X_test = (feature_engineering(split) for split in (X_train, X_test))

In [3]:
class LLM_model():
    """Rule-based scorer: implementation of the final model generated by GPT-4o-SEDI.

    The score is built in two stages:

    1. additive evidence -- fixed increments are summed for every threshold
       rule that fires, and the sum is capped at 1.0;
    2. multiplicative discounts -- the capped score is halved once for each
       of the two "adjustment" rules that fire.

    The rules are evaluated for all rows at once with numpy masks instead of
    a per-row ``DataFrame.iterrows()`` loop; the arithmetic is applied in the
    same order as the original rule list, so the scores are unchanged.
    """

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Score every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the raw columns ``high_BP_during_htn_meds_2``,
            ``high_BP_during_htn_meds_3``, ``sum_enc_during_htn_meds_4_plus``,
            ``mean_diastolic``, ``mean_systolic``, ``Med_Potassium_N`` and
            ``Dx_HypoK_N``.

        Returns
        -------
        numpy.ndarray
            1-D float array with one score in [0, 1] per row (note: a single
            column, not the two-column layout scikit-learn classifiers return).
        """
        bp_meds_2 = X['high_BP_during_htn_meds_2'].values
        bp_meds_3 = X['high_BP_during_htn_meds_3'].values
        enc_meds_4_plus = X['sum_enc_during_htn_meds_4_plus'].values
        diastolic = X['mean_diastolic'].values
        systolic = X['mean_systolic'].values
        potassium_meds = X['Med_Potassium_N'].values
        hypok_dx = X['Dx_HypoK_N'].values

        # Stage 1: additive evidence, accumulated in the original rule order.
        additive_rules = (
            (bp_meds_3 >= 2, 0.4),
            (enc_meds_4_plus >= 2, 0.4),
            (diastolic > 80, 0.1),
            (systolic > 140, 0.1),
            (bp_meds_2 > 5, 0.1),
            (bp_meds_3 > 5, 0.1),
        )
        prob = np.zeros(len(X), dtype=float)
        for fired, increment in additive_rules:
            prob = prob + np.where(fired, increment, 0.0)
        prob = np.minimum(1.0, prob)

        # Stage 2: each adjustment rule that fires halves the capped score.
        discount_rules = (
            (potassium_meds > 0) & (hypok_dx > 0),
            (systolic < 130) & (diastolic < 75),
        )
        for fired in discount_rules:
            prob = np.where(fired, prob * 0.5, prob)

        return prob

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Return hard 0/1 labels: 1 where the score is strictly above 0.5."""
        return (self.predict_proba(X) > 0.5).astype(int)

llm_model = LLM_model()

# Sanity check on the first five training rows: hard labels, then raw scores.
print(
    llm_model.predict(X_train)[:5],
    llm_model.predict_proba(X_train)[:5],
    sep="\n",
)

[0 0 0 0 0]
[0.  0.  0.  0.  0.1]


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Custom selector class
class BPFeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the LLM-derived indicator columns.

    Parameters
    ----------
    prob_adjust_features : bool, default=False
        When true, the four indicators behind the LLM model's probability
        adjustments are selected in addition to the six base indicators.

    Notes
    -----
    The constructor argument is stored unchanged under its own name, which is
    what ``BaseEstimator.get_params`` / ``set_params`` / ``clone`` and the
    estimator repr rely on. The selected column list is derived from it on
    demand through the read-only ``features`` property, so it stays consistent
    after ``set_params`` and is available before ``fit`` is called.
    """

    # Indicators whose rules add to the LLM model's score.
    _BASE_FEATURES = (
        'mean_systolic>140', 'mean_diastolic>80',
        'high_BP_during_htn_meds_2>5', 'high_BP_during_htn_meds_3>=2',
        'high_BP_during_htn_meds_3>5', 'sum_enc_during_htn_meds_4_plus>=2',
    )

    # Additional features used to adjust the probability. This logic cannot be
    # directly implemented by a logistic regression and would require a more
    # complex optimization system that mixes decision trees and logistic
    # regression (spoiler alert: we are working on it).
    _PROB_ADJUST_FEATURES = (
        'Med_Potassium_N>0', 'Dx_HypoK_N>0',
        'mean_systolic<130', 'mean_diastolic<75',
    )

    def __init__(self, prob_adjust_features=False):
        self.prob_adjust_features = prob_adjust_features

    @property
    def features(self):
        """List of column names selected by this step (a fresh list on each access)."""
        selected = list(self._BASE_FEATURES)
        if self.prob_adjust_features:
            selected += list(self._PROB_ADJUST_FEATURES)
        return selected

    def fit(self, X, y=None):
        """Validate that ``X`` has every selected column; nothing is learned.

        Raises
        ------
        ValueError
            If at least one selected column is missing from ``X.columns``.
        """
        # Check if all required features exist
        if not all(feature in X.columns for feature in self.features):
            raise ValueError(f"Not all required features found. Required: {self.features}")
        return self

    def transform(self, X):
        """Return the selected columns of ``X``, in the order given by ``features``."""
        return X[self.features]

# Two pipelines sharing the same layout: indicator-column selection followed
# by a logistic regression. The second one also selects the adjustment indicators.
lr_model = Pipeline(steps=[
    ('selector', BPFeatureSelector()),
    ('classifier', LogisticRegression()),
])

lr_extra_features_model = Pipeline(steps=[
    ('selector', BPFeatureSelector(prob_adjust_features=True)),
    ('classifier', LogisticRegression()),
])

print("Pipeline steps:")
for name, step in lr_model.steps:
    print(f"{name}: {step}")

selected_by_lr_model = lr_model.named_steps['selector'].features
print("\nFeatures selected by pipeline:", selected_by_lr_model)

# Train both pipelines on the training split
lr_model.fit(X_train, y_train)
lr_extra_features_model.fit(X_train, y_train)

# Report plain accuracy on the test split for each pipeline
print("="*80)
for _acc_label, _fitted_pipeline in (
        ("lr_model acc on test", lr_model),
        ("lr_extra_features_model acc on test", lr_extra_features_model)):
    print(_acc_label, accuracy_score(y_test, _fitted_pipeline.predict(X_test)))

Pipeline steps:
selector: BPFeatureSelector(prob_adjust_features=None)
classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Features selected by pipeline: ['mean_systolic>140', 'mean_diastolic>80', 'high_BP_during_htn_meds_2>5', 'high_BP_during_htn_meds_3>=2', 'high_BP_during_htn_meds_3>5', 'sum_enc_during_htn_meds_4_plus>=2']
lr_model acc on test 0.8766666666666667
lr_extra_features_model acc on test 0.86


In [5]:
def logistic_regression_summary(estimator, feature_names=None, sort=False):
    """Render a fitted logistic regression as a readable weighted sum.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``coef_`` and ``intercept_``; only the
        first row of ``coef_`` and the first intercept are reported.
    feature_names : sequence of str, optional
        Labels for the coefficients. Defaults to ``x_0, x_1, ...``.
    sort : bool, default=False
        If true, terms are listed by decreasing absolute weight.

    Returns
    -------
    str
        Multi-line text with one ``weight*feature`` term per line, followed
        by the intercept. Weights are formatted with two decimals.
    """
    # The classifier itself is expected here (newer scikit-learn versions);
    # with older versions the coefficients lived under
    # estimator.named_steps["est"].coef_[0].
    weights = estimator.coef_[0]

    labels = feature_names
    if labels is None:
        labels = [f"x_{position}" for position in range(len(weights))]

    terms = list(zip(labels, weights))
    if sort:
        # Stable sort: ties keep their original relative order.
        terms.sort(key=lambda term: abs(term[1]), reverse=True)

    pieces = ["Logistic reg. weights:\n"]
    pieces.extend(f"  {weight:.2f}*{label} +\n" for label, weight in terms)
    pieces.append(f"  {estimator.intercept_[0]:.2f} (intercept)")

    return "".join(pieces)

# Print the fitted weights of both pipelines, in feature-selection order.
for _summary_label, _summary_pipeline in (
        ("lr_model pretty:", lr_model),
        ("lr_extra_features_model pretty:", lr_extra_features_model)):
    print("="*80)
    print(
        _summary_label,
        logistic_regression_summary(
            _summary_pipeline.named_steps['classifier'],
            _summary_pipeline.named_steps['selector'].features,
            sort=False,
        ),
    )

lr_model pretty: Logistic reg. weights:
  0.90*mean_systolic>140 +
  0.70*mean_diastolic>80 +
  -0.63*high_BP_during_htn_meds_2>5 +
  2.56*high_BP_during_htn_meds_3>=2 +
  0.60*high_BP_during_htn_meds_3>5 +
  1.61*sum_enc_during_htn_meds_4_plus>=2 +
  -3.69 (intercept)
lr_extra_features_model pretty: Logistic reg. weights:
  0.54*mean_systolic>140 +
  0.25*mean_diastolic>80 +
  -0.83*high_BP_during_htn_meds_2>5 +
  2.30*high_BP_during_htn_meds_3>=2 +
  0.58*high_BP_during_htn_meds_3>5 +
  1.50*sum_enc_during_htn_meds_4_plus>=2 +
  0.19*Med_Potassium_N>0 +
  0.34*Dx_HypoK_N>0 +
  -1.21*mean_systolic<130 +
  -0.53*mean_diastolic<75 +
  -2.70 (intercept)


In [6]:
class ProbabilityAdjuster(BaseEstimator):
    """Apply the LLM model's two halving rules on top of a fitted classifier.

    Parameters
    ----------
    fitted_pipe : estimator
        An already fitted object exposing ``predict_proba(X)`` with a
        two-column output; column 1 is the score that gets adjusted. It is
        not re-fitted by this class.

    Notes
    -----
    The rules are evaluated for all rows at once with boolean masks instead
    of a per-row ``DataFrame.iterrows()`` loop. An indicator counts as "on"
    when it is non-zero, matching the truthiness test of the row-wise version.
    """

    def __init__(self, fitted_pipe):
        self.fitted_pipe = fitted_pipe

    def fit(self, X, y=None):
        """Record the indicator columns the adjustment reads; nothing is learned."""
        self.features = [
            'Med_Potassium_N>0', 'Dx_HypoK_N>0',
            'mean_systolic<130', 'mean_diastolic<75'
        ]

        return self

    def predict_proba(self, X):
        """Return the adjusted positive-class score for every row of ``X``.

        The wrapped model's column-1 probability is halved when both
        potassium indicators are on, and halved again when both low-blood-
        pressure indicators are on.

        Returns
        -------
        numpy.ndarray
            1-D array with one score per row (not the two-column layout
            returned by scikit-learn classifiers).
        """
        probabilities = self.fitted_pipe.predict_proba(X)[:, 1]

        potassium_rule = (
            (X['Med_Potassium_N>0'].values != 0) & (X['Dx_HypoK_N>0'].values != 0)
        )
        low_bp_rule = (
            (X['mean_systolic<130'].values != 0) & (X['mean_diastolic<75'].values != 0)
        )

        # Both discounts can apply to the same row (overall factor 0.25).
        probabilities = np.where(potassium_rule, probabilities * 0.5, probabilities)
        probabilities = np.where(low_bp_rule, probabilities * 0.5, probabilities)
        return probabilities

    def predict(self, X):
        """Return hard 0/1 labels: 1 where the adjusted score is strictly above 0.5."""
        probabilities = self.predict_proba(X)
        return np.where(probabilities>0.5, 1, 0)

# Wrap the already fitted plain pipeline; fit() returns the adjuster itself.
lr_prob_adjust_model = ProbabilityAdjuster(fitted_pipe=lr_model)
lr_prob_adjust_model.fit(X_train, y_train)

print(
    "lr_prob_adjust_model acc on test",
    accuracy_score(y_test, lr_prob_adjust_model.predict(X_test)),
)

lr_prob_adjust_model acc on test 0.84


In [7]:
def evaluate_all_folds(
    metric, model, target, res_dict, folds=['A', 'B', 'C', 'D', 'E'], 
    bootstrap=False, n_reps=1_000):
    """Score an already fitted ``model`` on the train and test split of each fold.

    Adapted from the "04. loading feat model" notebook.

    Parameters
    ----------
    metric : callable
        Called as ``metric(y_true, y_score_or_pred)``. For
        ``average_precision_score`` and ``roc_auc_score`` the model's
        ``predict_proba`` output is used (column 1 when it is 2-D); for any
        other metric the hard labels from ``model.predict`` are used.
    model : object
        Fitted model exposing ``predict`` and/or ``predict_proba``.
    target : str
        Target name forwarded to ``read_data``.
    res_dict : dict
        Updated in place with the results and also returned.
    folds : iterable of str
        Fold identifiers forwarded to ``read_data``. The default list is
        only iterated, never mutated.
    bootstrap : bool, default=False
        If true, each split is resampled with replacement ``n_reps`` times.
    n_reps : int, default=1000
        Number of bootstrap replicates.

    Returns
    -------
    dict
        ``res_dict``. Without bootstrap, one key per split:
        ``"<metric>_train_<fold>"`` and ``"<metric>_fold_out_<fold>"``.
        With bootstrap, each of those keys is suffixed with ``_mean``,
        ``_std``, ``_ci_upper`` (95th percentile) and ``_ci_lower``
        (5th percentile), i.e. a 90% percentile interval.

    Notes
    -----
    Resampling uses numpy's global random state; seed it beforehand for
    reproducible intervals.
    """

    def _score(X, y):
        # Ranking metrics need scores; every other metric gets hard labels.
        if metric in [average_precision_score, roc_auc_score]:
            probs = model.predict_proba(X)
            if probs.ndim == 2:
                probs = probs[:, 1]
            return metric(y, probs)
        return metric(y, model.predict(X))

    def _record(entry, X, y):
        # Store either the plain score or the bootstrap summary under `entry`.
        if not bootstrap:
            res_dict[entry] = _score(X, y)
            return

        n_rows = len(y)
        val_samples = []
        for _ in tqdm(range(n_reps)):
            # `high` is exclusive in np.random.randint, so every row
            # (including the last one) can be drawn.
            samples = np.random.randint(0, n_rows, size=n_rows)
            val_samples.append(_score(X.iloc[samples, :], y.iloc[samples]))

        res_dict[f"{entry}_mean"] = np.mean(val_samples)
        res_dict[f"{entry}_std"] = np.std(val_samples)
        res_dict[f"{entry}_ci_upper"] = np.quantile(val_samples, 0.95)
        res_dict[f"{entry}_ci_lower"] = np.quantile(val_samples, 0.05)

    for FOLD in folds:
        X_train, y_train, X_test, y_test = read_data(
            target, FOLD, 101, False, False, data_dir, 1318 )

        X_train = feature_engineering(X_train)
        X_test  = feature_engineering(X_test)

        _record(f"{metric.__name__}_train_{FOLD}", X_train, y_train)
        _record(f"{metric.__name__}_fold_out_{FOLD}", X_test, y_test)

    return res_dict

In [8]:
# Four models are compared; `models` and `model_names` are kept in matching order.
models = [llm_model, lr_model, lr_prob_adjust_model, lr_extra_features_model]
model_names = ['llm_model', 'lr_model', 'lr_prob_adjust_model', 'lr_extra_features_model']

# NOTE(review): the bootstrap below draws from numpy's global random state,
# which is not seeded here, so the intervals vary between runs.
final_performances = []
for model, model_name in zip(models, model_names):

    # Metadata columns stored next to the metrics of each model
    held_out_performances = dict(
        size=50,
        target='ResHtndx',
        scale=False,
        RunID=101,
        model=model_name,
    )
    for metric_f in (average_precision_score, roc_auc_score):
        held_out_performances = evaluate_all_folds(
            metric_f, model, 'res_htn_dx_ia', held_out_performances,
            folds=['A'], bootstrap=True, n_reps=1_000)
    final_performances.append(held_out_performances)

final_performances_df = pd.DataFrame(final_performances)

100%|██████████| 1000/1000 [01:55<00:00,  8.66it/s]
100%|██████████| 1000/1000 [00:33<00:00, 29.68it/s]
100%|██████████| 1000/1000 [01:41<00:00,  9.83it/s]
100%|██████████| 1000/1000 [01:22<00:00, 12.06it/s]
100%|██████████| 1000/1000 [00:08<00:00, 120.66it/s]
100%|██████████| 1000/1000 [00:07<00:00, 140.89it/s]
100%|██████████| 1000/1000 [00:09<00:00, 110.00it/s]
100%|██████████| 1000/1000 [00:07<00:00, 140.56it/s]
100%|██████████| 1000/1000 [02:25<00:00,  6.89it/s]
100%|██████████| 1000/1000 [00:52<00:00, 18.93it/s]
100%|██████████| 1000/1000 [02:25<00:00,  6.85it/s]
100%|██████████| 1000/1000 [00:57<00:00, 17.53it/s]
100%|██████████| 1000/1000 [00:09<00:00, 111.07it/s]
100%|██████████| 1000/1000 [00:07<00:00, 131.56it/s]
100%|██████████| 1000/1000 [00:09<00:00, 110.83it/s]
100%|██████████| 1000/1000 [00:07<00:00, 125.28it/s]


In [9]:
# Show the results with one column per model (transposed view).
display(final_performances_df.T)

# Destination of the exported table inside the paper folder
filename = f"{paper_dir}/tab_parameter_optimization.tex"

# Render the (untransposed) results frame as LaTeX ...
latex_table = final_performances_df.to_latex(
    index=True, column_format="lrrr", escape=False,
)

# ... and write it to disk
with open(filename, 'w') as f:
    f.write(latex_table)

print(f"\nLaTeX table saved to {filename}\n")

Unnamed: 0,0,1,2,3
size,50,50,50,50
target,ResHtndx,ResHtndx,ResHtndx,ResHtndx
scale,False,False,False,False
RunID,101,101,101,101
model,llm_model,lr_model,lr_prob_adjust_model,lr_extra_features_model
average_precision_score_train_A_mean,0.630814,0.716887,0.707128,0.732727
average_precision_score_train_A_std,0.0520861,0.0446501,0.0477249,0.0435491
average_precision_score_train_A_ci_upper,0.712883,0.787245,0.781795,0.798565
average_precision_score_train_A_ci_lower,0.540292,0.640169,0.628298,0.659403
average_precision_score_fold_out_A_mean,0.801947,0.859289,0.83984,0.873314



LaTeX table saved to ../paper_rebuttal//tab_parameter_optimization.tex



In [10]:
# Show the raw per-model result dictionaries as the cell output.
final_performances

[{'size': 50,
  'target': 'ResHtndx',
  'scale': False,
  'RunID': 101,
  'model': 'llm_model',
  'average_precision_score_train_A_mean': 0.630813734169819,
  'average_precision_score_train_A_std': 0.05208610515433557,
  'average_precision_score_train_A_ci_upper': 0.7128831900674248,
  'average_precision_score_train_A_ci_lower': 0.5402915795932827,
  'average_precision_score_fold_out_A_mean': 0.8019468077794136,
  'average_precision_score_fold_out_A_std': 0.04989353255045758,
  'average_precision_score_fold_out_A_ci_upper': 0.877432594842803,
  'average_precision_score_fold_out_A_ci_lower': 0.7139905760481584,
  'roc_auc_score_train_A_mean': 0.9371522989981884,
  'roc_auc_score_train_A_std': 0.013710024181619884,
  'roc_auc_score_train_A_ci_upper': 0.958886956835677,
  'roc_auc_score_train_A_ci_lower': 0.9128962705257702,
  'roc_auc_score_fold_out_A_mean': 0.9509844540293343,
  'roc_auc_score_fold_out_A_std': 0.011544820068897437,
  'roc_auc_score_fold_out_A_ci_upper': 0.96897528640441