## Notebook for loading the FEAT model from the original paper and getting new predictions

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

import numpy as np
import pandas as pd

from _load_llm_results import *
import numpy as np
import warnings
import os
import importlib
import sys; sys.path.append('../')
from evaluate_model import read_data
from sklearn.metrics import average_precision_score, roc_auc_score

warnings.filterwarnings("ignore")

%matplotlib inline

from tqdm import tqdm
from glob import glob

# Directory handed to `read_data` as the location of the input data.
data_dir = '../data'
# Directory where the generated LaTeX table is written (note the trailing slash).
paper_dir = '../paper_rebuttal/'

"""Model features
(reversed)
[(sum_enc_during_htn_meds_3>=1.500000)]
[(median_enc_during_htn_meds_4_plus>=1.250000)]
[sd_enc_during_htn_meds_2]
[(mean_systolic>=128.641357)]
[(max.CALCIUM>=10.150000)]
[(re_htn_spec_sum>=40.500000)]
"""

# Hardcoding the best FEAT model from original paper
class FeatTransformer(BaseEstimator, TransformerMixin):
    """Build the six hard-coded FEAT features and standardize them.

    Five features are threshold indicators (``column >= threshold``) and one
    (``sd_enc_during_htn_meds_2``) is passed through unchanged. The resulting
    matrix is standardized with a ``StandardScaler`` fitted in :meth:`fit`.

    Attributes
    ----------
    ss : StandardScaler
        Scaler fitted on the constructed features.
    feature_names : list of str
        Short display labels, one per constructed feature, in output order.
    """

    # Single source of truth for the model definition:
    # (source column name, positional index in the design matrix, threshold).
    # A threshold of None means the raw column is used as-is.
    _FEATURE_SPECS = (
        ('sum_enc_during_htn_meds_3', 279, 1.5),
        ('median_enc_during_htn_meds_4_plus', 284, 1.25),
        ('sd_enc_during_htn_meds_2', 286, None),
        ('mean_systolic', 19, 128.641357),
        ('max.CALCIUM', 89, 10.15),
        ('re_htn_spec_sum', 307, 40.5),
    )

    def __init__(self):
        self.ss = StandardScaler()
        # NOTE: these labels are rounded shorthand; the thresholds actually
        # applied are the exact `>=` values listed in _FEATURE_SPECS.
        self.feature_names = [
            'sum_enc_during_htn_meds_3>1',            # 279
            'median_enc_during_htn_meds_4_plus>1.25', # 284
            'sd_enc_during_htn_meds_2',               # 286
            'mean_systolic>128.6',                    # 19
            'max.CALCIUM>10.1',                       # 89
            're_htn_spec_sum>40']                     # 307

    def fit(self, X, y=None):
        """Fit the internal scaler on the features constructed from ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.ss.fit(self.feat_model(X))
        return self

    def feat_model(self, X):
        """Return the (n_samples, 6) matrix of un-standardized FEAT features.

        Parameters
        ----------
        X : pandas.DataFrame or array-like with at least 308 columns
            For a DataFrame whose (unique) column labels include every source
            column name, columns are selected by name, so a reordered frame
            still yields the right features. Otherwise the hard-coded
            positional indices are used.
        """
        names = [name for name, _, _ in self._FEATURE_SPECS]
        positions = [position for _, position, _ in self._FEATURE_SPECS]

        if isinstance(X, pd.DataFrame):
            by_name = X.columns.is_unique and all(name in X.columns for name in names)
            raw = X[names].to_numpy() if by_name else X.to_numpy()[:, positions]
        else:
            raw = np.asarray(X)[:, positions]

        Phi = []
        for j, (_, _, threshold) in enumerate(self._FEATURE_SPECS):
            column = raw[:, j]
            Phi.append(column if threshold is None else column >= threshold)

        # Stacking boolean indicators with the float column promotes the
        # indicators to 0.0 / 1.0.
        return np.array(Phi).transpose()

    def transform(self, X):
        """Construct the FEAT features from ``X`` and standardize them."""
        return self.ss.transform(self.feat_model(X))

# Two-stage estimator: hard-coded FEAT feature construction followed by an
# L2-penalised logistic regression on the standardized features.
ft_lr_estimator = Pipeline(steps=[
    ('prep', FeatTransformer()),
    (
        'est',
        LogisticRegression(
            penalty='l2',
            C=1.0,
            intercept_scaling=1.0,
            solver='liblinear',
        ),
    ),
])

In [8]:
# Read the train/test split for target 'res_htn_dx_ia', fold 'A'.
# NOTE(review): the remaining positional arguments (101, False, False, 1318)
# presumably correspond to the RunID, scaling-related flags and random_state
# recorded in `held_out_performances` further down — confirm against
# evaluate_model.read_data.
X_train, y_train, X_test, y_test = read_data(
    'res_htn_dx_ia', 'A', 101, False, False, data_dir, 1318 )

# Uncomment to list each column's position next to its name; useful for
# checking the positional indices hard-coded in FeatTransformer.
# for i, c in enumerate(X_train.columns):
#     print(i, c)

In [9]:

# Raw columns the FEAT model is built from.
feat_features = [
    'sum_enc_during_htn_meds_3',
    'median_enc_during_htn_meds_4_plus',
    'sd_enc_during_htn_meds_2',
    'mean_systolic',
    'max.CALCIUM',
    're_htn_spec_sum',
]

# Show outcome prevalence and summary statistics of those columns per split.
for split_label, X_split, y_split in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f"{split_label} (prevalence is {np.sum(y_split)/len(y_split)})")
    display(X_split[feat_features].describe())

train (prevalence is 0.11457174638487208)


Unnamed: 0,sum_enc_during_htn_meds_3,median_enc_during_htn_meds_4_plus,sd_enc_during_htn_meds_2,mean_systolic,max.CALCIUM,re_htn_spec_sum
count,899.0,899.0,899.0,899.0,899.0,899.0
mean,28.489433,0.438265,11.30945,124.776104,9.776085,42.770857
std,86.790102,3.178054,9.162714,11.837503,0.402713,76.00509
min,0.0,0.0,0.25,97.428571,8.6,0.0
25%,0.0,0.0,9.591663,115.834734,9.5,0.0
50%,0.0,0.0,9.591663,124.4,9.8,11.0
75%,0.0,0.0,9.591663,132.190476,10.0,51.0
max,799.0,40.0,94.407323,170.4,12.4,710.0


test (prevalence is 0.24333333333333335)


Unnamed: 0,sum_enc_during_htn_meds_3,median_enc_during_htn_meds_4_plus,sd_enc_during_htn_meds_2,mean_systolic,max.CALCIUM,re_htn_spec_sum
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,48.013333,0.61,12.720709,126.534801,9.812333,65.153333
std,110.224853,3.158505,13.226885,12.009932,0.470125,87.554683
min,0.0,0.0,0.235702,93.833333,8.8,0.0
25%,0.0,0.0,9.417902,118.446154,9.5,2.0
50%,0.0,0.0,9.417902,126.895672,9.8,28.0
75%,52.25,0.0,9.417902,134.941525,10.0,100.0
max,1066.0,27.0,103.012439,166.72,14.1,565.0


In [10]:
# Bare expression: renders the pipeline's repr as this cell's output.
ft_lr_estimator

In [11]:
def _score_predictions(metric, model, X, y):
    """Score ``model`` on ``(X, y)``: ranking metrics get probabilities, others get labels."""
    if metric in (average_precision_score, roc_auc_score):
        return metric(y, model.predict_proba(X)[:, 1])
    return metric(y, model.predict(X))


def _store_performance(res_dict, entry, metric, model, X, y, bootstrap, n_reps, rng):
    """Write the point estimate, or bootstrap summary statistics, for one split into ``res_dict``."""
    if not bootstrap:
        res_dict[entry] = _score_predictions(metric, model, X, y)
        return

    n_rows = len(y)
    val_samples = []
    for _ in tqdm(range(n_reps)):
        # `randint` excludes the upper bound, so `n_rows` (not `n_rows - 1`)
        # is required for every row to be eligible for resampling.
        rows = rng.randint(0, n_rows, size=n_rows)
        val_samples.append(_score_predictions(metric, model, X.iloc[rows, :], y.iloc[rows]))

    res_dict[f"{entry}_mean"] = np.mean(val_samples)
    res_dict[f"{entry}_std"] = np.std(val_samples)
    # 5th / 95th percentiles, i.e. a 90% percentile interval.
    res_dict[f"{entry}_ci_upper"] = np.quantile(val_samples, 0.95)
    res_dict[f"{entry}_ci_lower"] = np.quantile(val_samples, 0.05)


def evaluate_all_folds(metric, target, res_dict, folds=('A', 'B', 'C', 'D', 'E'), bootstrap=False, n_reps=1_000,
                       random_state=None):
    """Fit ``ft_lr_estimator`` on each fold and record train / held-out performance.

    Parameters
    ----------
    metric : callable
        Scoring function called as ``metric(y_true, y_score_or_pred)``.
        ``average_precision_score`` and ``roc_auc_score`` receive the
        positive-class probability; any other metric receives hard predictions.
    target : str
        Target name forwarded to ``read_data``.
    res_dict : dict
        Result record; must contain ``'RunID'``. Updated in place and returned.
    folds : iterable of str
        Fold identifiers forwarded to ``read_data``.
    bootstrap : bool
        If True, resample rows with replacement ``n_reps`` times and store the
        ``_mean``, ``_std``, ``_ci_upper`` and ``_ci_lower`` of the scores
        instead of a single point estimate.
    n_reps : int
        Number of bootstrap resamples.
    random_state : int or None
        Seed for the bootstrap resampling. ``None`` (default) draws from
        NumPy's global random state, as before.

    Returns
    -------
    dict
        ``res_dict`` with keys ``"<metric>_train_<fold>"`` and
        ``"<metric>_fold_out_<fold>"`` (plus the suffixes above when
        bootstrapping).

    Notes
    -----
    ``read_data`` is called with the fixed arguments ``101, False, False`` and
    ``1318`` regardless of the contents of ``res_dict``.
    """
    print("Using runID", res_dict['RunID'])
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for FOLD in folds:
        X_train, y_train, X_test, y_test = read_data(
            target, FOLD, 101, False, False, data_dir, 1318 )

        # Only the feature construction is hard-coded; the scaler and the
        # logistic regression are (re)fitted on this fold's training split.
        ft_lr_estimator.fit(X_train, y_train)

        _store_performance(
            res_dict, f"{metric.__name__}_train_{FOLD}", metric,
            ft_lr_estimator, X_train, y_train, bootstrap, n_reps, rng)
        _store_performance(
            res_dict, f"{metric.__name__}_fold_out_{FOLD}", metric,
            ft_lr_estimator, X_test, y_test, bootstrap, n_reps, rng)

    return res_dict

In [12]:
# Metadata describing this run; performance entries are added to it below.
held_out_performances = {
    'size' : 45,
    'target' : 'ResHtndx',
    'model' : 'FEAT',
    'scale' : False,
    'RunID' : 101,
    'fold' : 'A',
    'random_state' : 1318 
}

# Seed NumPy's global RNG so the bootstrap resampling is repeatable across runs.
np.random.seed(held_out_performances['random_state'])

# We will update the dictionary with different performances
for metric_f in [average_precision_score, roc_auc_score]:
    held_out_performances = evaluate_all_folds(
        metric_f, 'res_htn_dx_ia', held_out_performances, ['A'],
        bootstrap=True, n_reps=1_000)

# One-row frame: one column per metadata field / performance entry.
held_out_performances = pd.DataFrame.from_records([held_out_performances])
print("-"*120)

display(held_out_performances.T)

# Create LaTeX table
# The column spec is derived from the frame (index column + one per data
# column) so it always matches the number of columns actually written.
# NOTE(review): with escape=False the underscores in the column names are
# emitted verbatim — confirm the consuming LaTeX document handles that.
latex_table = held_out_performances.to_latex(
    index=True,
    column_format="l" + "r" * held_out_performances.shape[1],
    escape=False
)

# Save LaTeX table to file
# os.path.join avoids the doubled separator when paper_dir ends with '/'.
filename = os.path.join(paper_dir, "tab_feat.tex")
with open(filename, 'w') as f:
    f.write(latex_table)

print(f"\nLaTeX table saved to {filename}\n")

Using runID 101


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:04<00:00, 209.75it/s]
100%|██████████| 1000/1000 [00:02<00:00, 400.81it/s]


Using runID 101


100%|██████████| 1000/1000 [00:05<00:00, 183.83it/s]
100%|██████████| 1000/1000 [00:02<00:00, 362.09it/s]

------------------------------------------------------------------------------------------------------------------------





Unnamed: 0,0
size,45
target,ResHtndx
model,FEAT
scale,False
RunID,101
fold,A
random_state,1318
average_precision_score_train_A_mean,0.79832
average_precision_score_train_A_std,0.041959
average_precision_score_train_A_ci_upper,0.865575



LaTeX table saved to ../paper_rebuttal//tab_feat.tex

