## Model inference experiment

**Claim: Conditional on test_hat, yield_hat_lasso will not add any value. However, conditional on test_hat, yield_hat_cnn is going to add value**

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
import statsmodels.api as sm
from multiprocessing import Pool
import os
import warnings

import matplotlib.pyplot as plt

### Build Test_hat model

In [2]:
############################
## Load data
############################

df_train = pd.read_csv('train_ids_labels_untested_with_covars.csv')
df_val = pd.read_csv('val_ids_labels_untested_with_covars.csv')
df_test = pd.read_csv('test_ids_labels_untested_with_covars.csv')
df_all = pd.read_csv('all_ids_labels_untested_with_covars.csv')

df_train_t = pd.read_csv('train_ids_labels_with_covars.csv')
df_val_t = pd.read_csv('val_ids_labels_with_covars.csv')
df_test_t = pd.read_csv('test_ids_labels_with_covars.csv')
df_all_t = pd.read_csv('all_ids_labels_tested_with_covars.csv')

print(df_train.shape)
print(df_train_t.shape)
df_train_full = pd.concat([df_train, df_train_t])
df_val_full = pd.concat([df_val, df_val_t])
print(df_train_full.shape)
print(df_test.columns)

(27126, 81)
(3548, 81)
(30674, 81)
Index(['Unnamed: 0', 'patient_ngsci_id', 'ecg_id', 'date', 'p-r-t_axes',
       'p_axes', 'r_axes', 't_axes', 'pr_interval', 'pr_interval_units',
       'qrs_duration', 'qrs_duration_units', 'qtqtc', 'qt_interval',
       'qt_interval_units', 'qtc_interval', 'qtc_interval_units', 'vent_rate',
       'vent_rate_units', 'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
       'ed_enc_id', 'start_datetime', 'end_datetime', 'age_at_admit',
       'macetrop_030_pos', 'death_030_day', 'macetrop_pos_or_death_030',
       'stent_010_day', 'cabg_010_day', 'stent_or_cabg_010_day', 'ami_day_of',
       'days_to_ami', 

In [3]:
############################
## Covariate Spec
############################

## Lets use age, sex and human annotated features for now
human_waveform_age_sex_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female','ste_std_twi']

groundtruth_ami = 'stent_or_cabg_010_day'
adverse_event = 'macetrop_pos_or_death_030' #'macetrop_030_pos'
testing = 'test_010_day'

In [4]:
############################
## Train tested_hat LASSO
############################

############ Train Lasso for testing

print(f"LASSO")

variables_incl_y = human_waveform_age_sex_vars + [testing] #+ ['ecg_id']

df_train_rel = df_train_full[variables_incl_y].dropna()
df_val_rel = df_val_full[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[human_waveform_age_sex_vars]
y_train = df_train_rel[testing]

# Prepare the validation data
X_val = df_val_rel[human_waveform_age_sex_vars]
y_val = df_val_rel[testing]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize the LASSO Logistic Regression classifier
lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

# Train the model
lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

LASSO
AUC Score on Validation Set: 0.7124326321347942


In [5]:
###################################
## Obtain tested_hat for test set
###################################
variables_incl_y = human_waveform_age_sex_vars + [testing] + ['ecg_id'] + [groundtruth_ami]
df_test_rel = df_test_t[variables_incl_y].dropna() 

# Prepare the validation data
X_test = df_test_rel[human_waveform_age_sex_vars]
y_test = df_test_rel[testing]

# Scale the data using StandardScaler
X_test_scaled = scaler.transform(X_test)

# Predict on validation set
y_test_pred = lasso_classifier.predict_proba(X_test_scaled)[:, 1]  # get probabilities for the positive class

In [6]:
df_test_rel['tested_hat'] = y_test_pred
df_test_rel.head()

Unnamed: 0,r_axes,qrs_duration,qt_interval,qtc_interval,vent_rate,has_bbb,has_afib,has_st,has_pacemaker,has_lvh,...,has_prolonged_qt,has_lead_reversal,has_poor_or_quality,age_at_admit,female,ste_std_twi,test_010_day,ecg_id,stent_or_cabg_010_day,tested_hat
0,-59.0,140,430,411,55,0,0,1,0,0,...,0,0,0,69.0,0,False,True,ecge9460b970b.npy,False,0.364928
1,91.0,72,278,387,117,0,0,1,0,0,...,0,0,0,65.0,1,False,True,ecgdd0d198786.npy,False,0.108585
2,18.0,86,398,423,68,0,0,0,0,0,...,0,0,0,50.0,1,False,True,ecgd3f0db6ae6.npy,False,0.095654
3,3.0,86,388,406,66,0,0,0,0,0,...,0,0,0,66.0,1,True,True,ecg61e94cc9e1.npy,False,0.250523
4,-22.0,98,424,467,73,0,0,1,0,0,...,0,0,0,64.0,1,False,True,ecg49de641e8e.npy,False,0.16172


### Build ami_hat model

Using LASSO due to performance in tabular baseline comparision experiment

In [7]:
variables_incl_y = human_waveform_age_sex_vars + [groundtruth_ami] #+ ['ecg_id']

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_val_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[human_waveform_age_sex_vars]
y_train = df_train_rel[groundtruth_ami]

# Prepare the validation data
X_val = df_val_rel[human_waveform_age_sex_vars]
y_val = df_val_rel[groundtruth_ami]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize the LASSO Logistic Regression classifier
lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=0.04, random_state=42)

# Train the model
lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

AUC Score on Validation Set: 0.7329421910129663


In [8]:
###################################
## Obtain ami_hat_lasso for test set
###################################

# Prepare the validation data
X_test = df_test_rel[human_waveform_age_sex_vars]
y_test = df_test_rel[groundtruth_ami]

# Scale the data using StandardScaler
X_test_scaled = scaler.transform(X_test)

# Predict on validation set
y_test_pred = lasso_classifier.predict_proba(X_test_scaled)[:, 1]  # get probabilities for the positive class

df_test_rel['ami_hat_lasso'] = y_test_pred
print(df_test_rel.columns)

df_test_rel['curve_idx'] = df_test_rel['ecg_id'].str.replace('.npy', '', regex=False)
display(df_test_rel.head())

Index(['r_axes', 'qrs_duration', 'qt_interval', 'qtc_interval', 'vent_rate',
       'has_bbb', 'has_afib', 'has_st', 'has_pacemaker', 'has_lvh',
       'has_normal', 'has_normal_ecg', 'has_normal_sinus', 'has_depress',
       'has_st_eleva', 'has_twave', 'has_aberran_bbb', 'has_jpoint_repol',
       'has_jpoint_eleva', 'has_twave_inver', 'has_twave_abnormal',
       'has_nonspecific', 'has_rhythm_disturbance', 'has_prolonged_qt',
       'has_lead_reversal', 'has_poor_or_quality', 'age_at_admit', 'female',
       'ste_std_twi', 'test_010_day', 'ecg_id', 'stent_or_cabg_010_day',
       'tested_hat', 'ami_hat_lasso'],
      dtype='object')


Unnamed: 0,r_axes,qrs_duration,qt_interval,qtc_interval,vent_rate,has_bbb,has_afib,has_st,has_pacemaker,has_lvh,...,has_poor_or_quality,age_at_admit,female,ste_std_twi,test_010_day,ecg_id,stent_or_cabg_010_day,tested_hat,ami_hat_lasso,curve_idx
0,-59.0,140,430,411,55,0,0,1,0,0,...,0,69.0,0,False,True,ecge9460b970b.npy,False,0.364928,0.281386,ecge9460b970b
1,91.0,72,278,387,117,0,0,1,0,0,...,0,65.0,1,False,True,ecgdd0d198786.npy,False,0.108585,0.124512,ecgdd0d198786
2,18.0,86,398,423,68,0,0,0,0,0,...,0,50.0,1,False,True,ecgd3f0db6ae6.npy,False,0.095654,0.078457,ecgd3f0db6ae6
3,3.0,86,388,406,66,0,0,0,0,0,...,0,66.0,1,True,True,ecg61e94cc9e1.npy,False,0.250523,0.164228,ecg61e94cc9e1
4,-22.0,98,424,467,73,0,0,1,0,0,...,0,64.0,1,False,True,ecg49de641e8e.npy,False,0.16172,0.1536,ecg49de641e8e


### Load ami_hat_cnn model

1D CNN which also takes age and sex as inputs

In [9]:
cnn_preds = pd.read_csv('/home/ngsci/project/NEJM_benchmark/ECG_img_benchmark/benchmark_acs_cnn1d/new/vf5qykna/checkpoints/cnn1d_test_val_predictions.csv')
merge_df = cnn_preds[['curve_idx','preds', 'probas']]
cnn_preds.head(10)

Unnamed: 0,curve_idx,preds,probas,split
0,ecgcafc2054f7,-0.282537,0.25417,val
1,ecg7bad30ef89,0.349121,0.610641,val
2,ecg9009d2ec15,1.222714,0.800381,val
3,ecg220ea4e100,-0.597965,0.213872,val
4,ecge982edb75e,-0.331532,0.274634,val
5,ecg754c29543f,-0.416654,0.286446,val
6,ecg69839ecaa4,-0.44578,0.287223,val
7,ecga93479b230,-0.35313,0.341117,val
8,ecg8fd599859a,-0.44578,0.287223,val
9,ecg8b7736a8d8,-0.35313,0.341117,val


In [10]:
# Merge in predictions
df_test_rel = pd.merge(df_test_rel, merge_df, on='curve_idx', how='left')

In [21]:
df_val_t['curve_idx'] = df_val_t['ecg_id'].str.replace('.npy', '', regex=False)
df_test_t['curve_idx'] = df_test_t['ecg_id'].str.replace('.npy', '', regex=False)

df_val_t = pd.merge(df_val_t, merge_df, on='curve_idx', how='left')
df_test_t = pd.merge(df_test_t, merge_df, on='curve_idx', how='left')

In [11]:
display(df_test_rel.head())

Unnamed: 0,r_axes,qrs_duration,qt_interval,qtc_interval,vent_rate,has_bbb,has_afib,has_st,has_pacemaker,has_lvh,...,female,ste_std_twi,test_010_day,ecg_id,stent_or_cabg_010_day,tested_hat,ami_hat_lasso,curve_idx,preds,probas
0,-59.0,140,430,411,55,0,0,1,0,0,...,0,False,True,ecge9460b970b.npy,False,0.364928,0.281386,ecge9460b970b,0.949625,0.849488
1,91.0,72,278,387,117,0,0,1,0,0,...,1,False,True,ecgdd0d198786.npy,False,0.108585,0.124512,ecgdd0d198786,0.028079,0.491539
2,18.0,86,398,423,68,0,0,0,0,0,...,1,False,True,ecgd3f0db6ae6.npy,False,0.095654,0.078457,ecgd3f0db6ae6,-0.42161,0.316153
3,3.0,86,388,406,66,0,0,0,0,0,...,1,True,True,ecg61e94cc9e1.npy,False,0.250523,0.164228,ecg61e94cc9e1,-0.147706,0.439178
4,-22.0,98,424,467,73,0,0,1,0,0,...,1,False,True,ecg49de641e8e.npy,False,0.16172,0.1536,ecg49de641e8e,0.00362,0.39905


### Test hypothesis 1: Lasso predictor of AMI does not add predictive power after conditioning on test_hat

In [12]:
# Define your inputs and target
X = df_test_rel[['ami_hat_lasso', 'tested_hat']]
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.455057
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2200
Method:                              MLE   Df Model:                            2
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                  0.1108
Time:                           00:54:07   Log-Likelihood:                -1002.5
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 5.554e-55
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -2.8791      0.129    -22.260      0.000      -3.133      -2.

In [13]:
# Define your inputs and target
X = df_test_rel['tested_hat'] #'ami_hat_lasso', 
#X = df_test_rel['ami_hat_lasso'] #'ami_hat_lasso', 
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.480017
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2201
Method:                              MLE   Df Model:                            1
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                 0.06204
Time:                           00:54:09   Log-Likelihood:                -1057.5
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 2.823e-32
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.3620      0.109    -21.595      0.000      -2.576      -2.148
teste

In [14]:
# Define your inputs and target
X = df_test_rel['ami_hat_lasso'] #'ami_hat_lasso', 
#X = df_test_rel['ami_hat_lasso'] #'ami_hat_lasso', 
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.455160
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2201
Method:                              MLE   Df Model:                            1
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                  0.1106
Time:                           00:54:11   Log-Likelihood:                -1002.7
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 3.501e-56
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -2.8938      0.128    -22.675      0.000      -3.144      -2.

### Takeaway hypothesis 1: tested_hat does not add additional information to ami_hat_lasso prediction. ami_hat_lasso seems to re-implement the human judgement. 

### Hypothesis 2: The waveform-based predictior is complimentary to human judgement. I.e. human judgement + cnn_hat achieve higher AUCs than each alone.

In [15]:
# Define your inputs and target
X = df_test_rel[['probas', 'tested_hat']]
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.465335
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2200
Method:                              MLE   Df Model:                            2
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                 0.09073
Time:                           00:54:14   Log-Likelihood:                -1025.1
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 3.781e-45
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.2088      0.165    -19.460      0.000      -3.532      -2.886
proba

In [16]:
# Define your inputs and target
X = df_test_rel['tested_hat'] #'ami_hat_lasso', 
#X = df_test_rel['ami_hat_lasso'] #'ami_hat_lasso', 
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.480017
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2201
Method:                              MLE   Df Model:                            1
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                 0.06204
Time:                           00:54:19   Log-Likelihood:                -1057.5
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 2.823e-32
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.3620      0.109    -21.595      0.000      -2.576      -2.148
teste

In [17]:
# Define your inputs and target
X = df_test_rel['probas'] #'ami_hat_lasso', 
#X = df_test_rel['ami_hat_lasso'] #'ami_hat_lasso', 
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.476762
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2201
Method:                              MLE   Df Model:                            1
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                 0.06840
Time:                           00:54:21   Log-Likelihood:                -1050.3
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 2.068e-35
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.9365      0.158    -18.635      0.000      -3.245      -2.628
proba

### Q3: What is with the complimentarity of the ami_hat_lasso and cnn_hat predictor?

In [18]:
# Define your inputs and target
X = df_test_rel[['probas','ami_hat_lasso', 'tested_hat']]
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.449061
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2199
Method:                              MLE   Df Model:                            3
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                  0.1225
Time:                           00:54:31   Log-Likelihood:                -989.28
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 1.355e-59
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -3.3958      0.173    -19.686      0.000      -3.734      -3.

In [19]:
# Define your inputs and target
X = df_test_rel[['probas','ami_hat_lasso']]
y = df_test_rel['stent_or_cabg_010_day']

# Add a constant to the input features for the intercept
X = sm.add_constant(X)

# Create the model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Print the summary results
print(result.summary())

# Get the predicted probabilities
y_pred_prob = result.predict(X)

# Calculate the AUC
auc = roc_auc_score(y, y_pred_prob)

# Print the AUC
print(f"AUC: {auc}")

Optimization terminated successfully.
         Current function value: 0.449295
         Iterations 6
                             Logit Regression Results                            
Dep. Variable:     stent_or_cabg_010_day   No. Observations:                 2203
Model:                             Logit   Df Residuals:                     2200
Method:                              MLE   Df Model:                            2
Date:                   Tue, 20 Feb 2024   Pseudo R-squ.:                  0.1221
Time:                           00:54:36   Log-Likelihood:                -989.80
converged:                          True   LL-Null:                       -1127.4
Covariance Type:               nonrobust   LLR p-value:                 1.703e-60
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -3.4104      0.172    -19.843      0.000      -3.747      -3.

#### Trial out of curiosity - cnn_hat and troponing in lasso

In [26]:
variables_incl_y = ['age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
       'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
       'agi_100k_to_200k', 'agi_above_200k', 'maxtrop_sameday', 'probas'] + [groundtruth_ami] #+ ['ecg_id']

if 'maxtrop_sameday' in variables_incl_y:
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_val_t['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_val_t['maxtrop_sameday'] = imputer.fit_transform(df_val_t[['maxtrop_sameday']])
        df_test_t['maxtrop_sameday'] = imputer.transform(df_test_t[['maxtrop_sameday']])

#df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_val_t[variables_incl_y].dropna() 
df_test_rel = df_test_t[variables_incl_y].dropna()

# Prepare the training data
X_train = df_val_rel[['age_at_admit', 'female','maxtrop_sameday', 'probas']]
y_train = df_val_rel[groundtruth_ami]

# Prepare the validation data
X_val = df_test_rel[['age_at_admit', 'female','maxtrop_sameday', 'probas']]
y_val = df_test_rel[groundtruth_ami]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize the LASSO Logistic Regression classifier
lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=0.04, random_state=42)

# Train the model
lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

AUC Score on Validation Set: 0.7447182117675816


#####################################################################################################################################
#####################################################################################################################################
#####################################################################################################################################
#####################################################################################################################################

### other baseline training code

In [4]:
display(df_train.head())
print(df_train.columns)

Unnamed: 0.1,Unnamed: 0,patient_ngsci_id,ecg_id,date,p-r-t_axes,p_axes,r_axes,t_axes,pr_interval,pr_interval_units,...,race_other,agi_under_25k,agi_25k_to_50k,agi_50k_to_75k,agi_75k_to_100k,agi_100k_to_200k,agi_above_200k,ste_std_twi,female,split
0,15,pat000c942d,ecgf319c6c805.npy,2111-01-17T22:38:52Z,74 57 60,74.0,57.0,60.0,164.0,ms,...,0,0.33373,0.214158,0.154075,0.107674,0.166568,0.023795,False,1,train
1,16,pat000d398d,ecgab490959f0.npy,2111-03-16T05:04:59Z,67 76 72,67.0,76.0,72.0,140.0,ms,...,1,0.356511,0.216182,0.141593,0.082174,0.117573,0.085967,False,0,train
2,25,pat0010462d,ecg5558d790bc.npy,2111-08-14T03:13:00Z,75 94 72,75.0,94.0,72.0,138.0,ms,...,0,0.286545,0.166113,0.125415,0.098007,0.227575,0.096346,False,1,train
3,28,pat00116b06,ecga3eac1846c.npy,2113-06-11T11:23:02Z,56 13 40,56.0,13.0,40.0,134.0,ms,...,0,0.348425,0.259843,0.153543,0.094488,0.116142,0.027559,False,1,train
4,29,pat00116b06,ecga41e48c1ac.npy,2113-06-11T11:23:02Z,56 13 40,56.0,13.0,40.0,134.0,ms,...,0,0.348425,0.259843,0.153543,0.094488,0.116142,0.027559,False,1,train


Index(['Unnamed: 0', 'patient_ngsci_id', 'ecg_id', 'date', 'p-r-t_axes',
       'p_axes', 'r_axes', 't_axes', 'pr_interval', 'pr_interval_units',
       'qrs_duration', 'qrs_duration_units', 'qtqtc', 'qt_interval',
       'qt_interval_units', 'qtc_interval', 'qtc_interval_units', 'vent_rate',
       'vent_rate_units', 'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
       'ed_enc_id', 'start_datetime', 'end_datetime', 'age_at_admit',
       'macetrop_030_pos', 'death_030_day', 'macetrop_pos_or_death_030',
       'stent_010_day', 'cabg_010_day', 'stent_or_cabg_010_day', 'ami_day_of',
       'days_to_ami', 'maxtrop_sameday', 'tn_group_sameda

In [17]:
# Set the display option to show all rows
pd.set_option('display.max_rows', None)
#print(df_all.isna().sum())
print(df_all.dtypes)

#Define X and y variables
human_waveform_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality', 'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female','ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
        'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_agi_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
       'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
       'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_agi_tropt_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
       'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
       'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi', 'maxtrop_sameday'] #'p_axes','vent_rate_units','pr_interval','t_axes',

groundtruth_ami = 'stent_or_cabg_010_day'
adverse_event = 'macetrop_pos_or_death_030' #'macetrop_030_pos'

input_spec_list = [human_waveform_vars, human_waveform_age_sex_vars, human_waveform_age_sex_race_vars, 
                   human_waveform_age_sex_race_agi_vars, human_waveform_age_sex_race_agi_tropt_vars]

input_spec_name = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 
                   'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)']

# relevant_vars = [ 'r_axes',
#        'qrs_duration',  'qt_interval',
#         'qtc_interval',  'vent_rate',
#         'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
#        'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
#        'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
#        'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
#        'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
#        'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
#         'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
#        'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
#        'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi', 'stent_or_cabg_010_day', 'maxtrop_sameday', 'p_axes'] 

Unnamed: 0                     int64
patient_ngsci_id              object
ecg_id                        object
date                          object
p-r-t_axes                    object
p_axes                       float64
r_axes                       float64
t_axes                       float64
pr_interval                  float64
pr_interval_units             object
qrs_duration                   int64
qrs_duration_units            object
qtqtc                         object
qt_interval                    int64
qt_interval_units             object
qtc_interval                   int64
qtc_interval_units            object
vent_rate                      int64
vent_rate_units               object
has_bbb                        int64
has_afib                       int64
has_st                         int64
has_pacemaker                  int64
has_lvh                        int64
has_normal                     int64
has_normal_ecg                 int64
has_normal_sinus               int64
h

## Analysis in tested set

### LASSO

In [22]:

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"LASSO: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [adverse_event]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[adverse_event]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[adverse_event]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the LASSO Logistic Regression classifier
    lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

    # Train the model
    lasso_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

    # Calculate AUC
    auc_score = roc_auc_score(y_val, y_val_pred)

    print(f"AUC Score on Validation Set: {auc_score}")

LASSO: human ECG labels
AUC Score on Validation Set: 0.7402671136584839
LASSO: human ECG labels + age + sex
AUC Score on Validation Set: 0.7677348940431914
LASSO: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.7768609443250689
LASSO: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.7821165874073925
LASSO: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.7821123666777009


### SVM

In [25]:

warnings.filterwarnings("ignore")#category=ConvergenceWarning

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"SVM: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [adverse_event]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[adverse_event]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[adverse_event]
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Non-Linear Support Vector Machine (SVM) classifier with a kernel (e.g., 'rbf')
    svm_classifier = LinearSVC(random_state=42, C=0.01)

    # Train the model
    svm_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = svm_classifier.decision_function(X_val_scaled)  # get probabilities for non-linear SVM

    # Calculate AUC
    auc_score = roc_auc_score(y_val, y_val_pred)

    print(f"AUC Score on Validation Set: {auc_score}")

warnings.resetwarnings()

SVM: human ECG labels
AUC Score on Validation Set: 0.7435152293810918
SVM: human ECG labels + age + sex
AUC Score on Validation Set: 0.7760992401766025
SVM: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.784394821352166
SVM: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.7884612485762574
SVM: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.7884612485762574


### Random Forest

In [42]:

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"Random Forest: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [adverse_event]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[adverse_event]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[adverse_event]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=250, min_samples_leaf=40, max_depth=12, random_state=42)

    # Train the model
    rf_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = rf_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

    # Calculate AUC
    auc_score = roc_auc_score(y_val, y_val_pred)

    print(f"AUC Score on Validation Set: {auc_score}")


Random Forest: human ECG labels
AUC Score on Validation Set: 0.7535214123499513
Random Forest: human ECG labels + age + sex
AUC Score on Validation Set: 0.7908484420329038
Random Forest: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.7960166869631043
Random Forest: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.7952991321576792
Random Forest: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.800374861092771


### Gradient Boosted Trees

In [51]:
for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"Gradient Boosted Trees: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [adverse_event]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[adverse_event]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[adverse_event]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Gradient Boosting classifier
    gb_classifier = GradientBoostingClassifier(learning_rate=0.05, n_estimators=250, min_samples_leaf=50, max_depth=8, random_state=42) #learning_rate=0.02, n_estimators=250, min_samples_leaf=50, max_depth=8, random_state=42)

    # Train the model
    gb_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = gb_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

    # Calculate AUC
    auc_score = roc_auc_score(y_val, y_val_pred)

    print(f"AUC Score on Validation Set: {auc_score}")

Gradient Boosted Trees: human ECG labels
AUC Score on Validation Set: 0.7084903564493302
Gradient Boosted Trees: human ECG labels + age + sex
AUC Score on Validation Set: 0.7664407022724173
Gradient Boosted Trees: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.7746531189904129
Gradient Boosted Trees: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.7633066040551565
Gradient Boosted Trees: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.7727486778564241


### Summary table

In [50]:
# Define the data with updated AUC scores
data = {
    'Model Class': ['LASSO', 'LASSO', 'LASSO', 'LASSO', 'LASSO', 'SVM', 'SVM', 'SVM', 'SVM', 'SVM',
                    'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest',
                    'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees'],
    'Input Specification': ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)'],
    'AUC Score on Validation Set': [0.7402671136584839, 0.7677348940431914, 0.7768609443250689, 0.7821165874073925, 0.7821123666777009,
                                    0.7435152293810918, 0.7760992401766025, 0.784394821352166, 0.7884612485762574, 0.7884612485762574,
                                    0.7535214123499513, 0.7908484420329038, 0.7960166869631043, 0.7952991321576792, 0.800374861092771,
                                    0.7380225549061741, 0.774073465659838, 0.7843267910626713, 0.7913162707320737, 0.7894193542162979]
}

# Create a DataFrame
summary_df = pd.DataFrame(data)

# Specify the desired order of rows and columns
desired_row_order = ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees']
desired_column_order = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)']

# Filter the DataFrame to match the desired row order
summary_df_filtered = summary_df[summary_df['Model Class'].isin(desired_row_order)]

# Pivot the DataFrame
summary_df_pivot = summary_df_filtered.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# Reorder the rows and columns in the DataFrame
summary_df_pivot = summary_df_pivot.reindex(desired_row_order)[desired_column_order]

# Print the summary table with the desired ordering
display(summary_df_pivot)


Input Specification,human ECG labels,human ECG labels + age + sex,human ECG labels + age + sex + race,human ECG labels + age + sex + agi,human ECG labels + age + sex + agi + tropt (KNN imputed)
Model Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LASSO,0.740267,0.767735,0.776861,0.782117,0.782112
SVM,0.743515,0.776099,0.784395,0.788461,0.788461
Random Forest,0.753521,0.790848,0.796017,0.795299,0.800375
Gradient Boosted Trees,0.738023,0.774073,0.784327,0.791316,0.789419
