## Baseline training - Models only using metadata

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from multiprocessing import Pool
import os
import warnings

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

from confidenceinterval import roc_auc_score, accuracy_score

import joblib

In [12]:
############################
## Load data
############################

df_train = pd.read_csv('train_ids_labels_with_covars_all.csv')
df_val = pd.read_csv('val_ids_labels_with_covars_all.csv')
df_test = pd.read_csv('test_ids_labels_with_covars_all.csv')
df_all = pd.read_csv('all_ids_labels_tested_with_covars_all.csv')

In [13]:
display(df_train.head())
print(df_train.columns)

Unnamed: 0.1,Unnamed: 0,patient_ngsci_id,ecg_id,date,p-r-t_axes,p_axes,r_axes,t_axes,pr_interval,pr_interval_units,...,race_white,race_other,agi_under_25k,agi_25k_to_50k,agi_50k_to_75k,agi_75k_to_100k,agi_100k_to_200k,agi_above_200k,female,split
0,26,pat001162d6,ecg162c83f05d,2114-06-21T21:04:34Z,45 59 0,45.0,59.0,0.0,224.0,ms,...,1,0,0.274465,0.181957,0.137615,0.106269,0.212538,0.087156,1,train
1,27,pat001162d6,ecg86065367ee,2114-06-21T21:04:34Z,45 59 0,45.0,59.0,0.0,224.0,ms,...,1,0,0.274465,0.181957,0.137615,0.106269,0.212538,0.087156,1,train
2,89,pat004815f1,ecgdd0d198786,2112-05-05T08:36:38Z,72 91 40,72.0,91.0,40.0,144.0,ms,...,1,0,0.254731,0.084425,0.061135,0.049491,0.142649,0.407569,1,train
3,333,pat00bc255b,ecge81f90e55f,2110-03-16T00:48:37Z,21 63 82,21.0,63.0,82.0,166.0,ms,...,1,0,0.283726,0.230193,0.171306,0.110278,0.162741,0.041756,0,train
4,523,pat011385ba,ecg7bad30ef89,2112-03-31T15:26:35Z,43 64 48,43.0,64.0,48.0,162.0,ms,...,1,0,0.233711,0.191926,0.157224,0.109065,0.21813,0.089943,0,train


Index(['Unnamed: 0', 'patient_ngsci_id', 'ecg_id', 'date', 'p-r-t_axes',
       'p_axes', 'r_axes', 't_axes', 'pr_interval', 'pr_interval_units',
       'qrs_duration', 'qrs_duration_units', 'qtqtc', 'qt_interval',
       'qt_interval_units', 'qtc_interval', 'qtc_interval_units', 'vent_rate',
       'vent_rate_units', 'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
       'ecg_id_new', 'ed_enc_id', 'start_datetime', 'end_datetime',
       'age_at_admit', 'macetrop_030_pos', 'death_030_day',
       'macetrop_pos_or_death_030', 'stent_010_day', 'cabg_010_day',
       'stent_or_cabg_010_day', 'ami_day_of', 'days_to_ami', 'maxtrop_sameday',
  

In [14]:
# Set the display option to show all rows
pd.set_option('display.max_rows', None)
#print(df_all.isna().sum())
print(df_all.dtypes)

#Define X and y variables
human_waveform_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality', 'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female','ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
        'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_agi_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
       'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
       'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi'] #'p_axes', 'maxtrop_sameday','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_agi_tropt_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
       'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
       'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi', 'maxtrop_sameday'] #'p_axes','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_tropt_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'age_at_admit', 'female', 'ste_std_twi', 'maxtrop_sameday'] #'p_axes','vent_rate_units','pr_interval','t_axes',

human_waveform_age_sex_race_tropt_vars = [ 'r_axes',
       'qrs_duration',  'qt_interval', #'p-r-t_axes', 'qtqtc',
        'qtc_interval',  'vent_rate',
        'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
        'race_black', 'race_hispanic', 'race_white', 'race_other',
        'age_at_admit', 'female', 'ste_std_twi', 'maxtrop_sameday'] #'p_axes','vent_rate_units','pr_interval','t_axes',

ste = [
       'has_depress', 'has_st_eleva', 'has_twave_inver'] #'p_axes','vent_rate_units','pr_interval','t_axes',

groundtruth_ami = 'stent_or_cabg_010_day'
adverse_event = 'macetrop_pos_or_death_030'

# input_spec_list = [human_waveform_vars, human_waveform_age_sex_vars, human_waveform_age_sex_race_vars, 
#                    human_waveform_age_sex_race_agi_vars, human_waveform_age_sex_race_agi_tropt_vars, human_waveform_age_sex_tropt_vars, human_waveform_age_sex_race_tropt_vars]

# input_spec_name = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 
#                    'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)', 'human ECG labels + age + sex + tropt (KNN imputed)', 
#                    'human ECG labels + age + sex + race + tropt (KNN imputed)']

# input_spec_list = [human_waveform_vars, human_waveform_age_sex_vars, human_waveform_age_sex_race_vars, 
#                    human_waveform_age_sex_race_agi_vars, human_waveform_age_sex_race_agi_tropt_vars, human_waveform_age_sex_race_tropt_vars, human_waveform_age_sex_tropt_vars,]

# input_spec_name = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 
#                    'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)', 
#                    'human ECG labels + age + sex + race + tropt (KNN imputed)', 'human ECG labels + age + sex + tropt (KNN imputed)']

# input_spec_list = [human_waveform_age_sex_vars, human_waveform_age_sex_race_vars, 
#                    human_waveform_age_sex_race_agi_vars, human_waveform_age_sex_race_agi_tropt_vars, human_waveform_age_sex_tropt_vars, human_waveform_age_sex_race_tropt_vars,
#                   human_waveform_vars, ste]

input_spec_list = [ste]

input_spec_name = ['St elevation, T-wave inversion, ST depression']

# relevant_vars = [ 'r_axes',
#        'qrs_duration',  'qt_interval',
#         'qtc_interval',  'vent_rate',
#         'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
#        'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
#        'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
#        'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
#        'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
#        'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
#         'age_at_admit', 'female', 'race_black', 'race_hispanic', 'race_white', 'race_other',
#        'agi_under_25k', 'agi_25k_to_50k', 'agi_50k_to_75k', 'agi_75k_to_100k',
#        'agi_100k_to_200k', 'agi_above_200k', 'ste_std_twi', 'stent_or_cabg_010_day', 'maxtrop_sameday', 'p_axes'] 

Unnamed: 0                     int64
patient_ngsci_id              object
ecg_id                        object
date                          object
p-r-t_axes                    object
p_axes                       float64
r_axes                       float64
t_axes                       float64
pr_interval                  float64
pr_interval_units             object
qrs_duration                   int64
qrs_duration_units            object
qtqtc                         object
qt_interval                    int64
qt_interval_units             object
qtc_interval                   int64
qtc_interval_units            object
vent_rate                      int64
vent_rate_units               object
has_bbb                        int64
has_afib                       int64
has_st                         int64
has_pacemaker                  int64
has_lvh                        int64
has_normal                     int64
has_normal_ecg                 int64
has_normal_sinus               int64
h

## Analysis in tested set

### LASSO

In [None]:
variables, name = input_spec_list[-1], input_spec_name[-1]

print(f"LASSO: {name}")

if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
    # Compute the median of the 'maxtrop_sameday' column
    median_maxtrop = df_train['maxtrop_sameday'].median()

    # Replace missing values with the median
    #imputer = SimpleImputer(strategy='median')
    imputer = KNNImputer(n_neighbors=5) 
    df_train_t['maxtrop_sameday'] = imputer.fit_transform(df_train_t[['maxtrop_sameday']])
    df_val_t['maxtrop_sameday'] = imputer.transform(df_val_t[['maxtrop_sameday']])

variables_incl_y = variables + [adverse_event]

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_test_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[variables]
y_train = df_train_rel[adverse_event]

# Prepare the validation data
X_val = df_val_rel[variables]
y_val = df_val_rel[adverse_event]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize the LASSO Logistic Regression classifier
lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=0.07, random_state=42)

# Train the model
lasso_classifier.fit(X_train_scaled, y_train)

joblib.dump(lasso_classifier, "structured_feature_logreg_mace.pkl")


# Predict on validation set
y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# # Calculate AUC
# auc_score = roc_auc_score(y_val, y_val_pred)

# print(f"AUC Score on Validation Set: {auc_score}")

auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95)
# ,
#                         method='bootstrap_bca',
#                         n_resamples=1000)
    
print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')

# Step 2: Calculate the optimal threshold using Youden's J statistic
fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
youden_j = tpr - fpr
optimal_idx = np.argmax(youden_j)
optimal_threshold = thresholds[optimal_idx]

# Step 3: Binarize the predictions based on the optimal threshold
y_val_class = (y_val_pred >= optimal_threshold).astype(int)

acc, ci_acc = accuracy_score(y_val, y_val_class,
                    confidence_level=0.95)
# ,
#                     method='bootstrap_bca',
#                     n_resamples=1000)

print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')

df_test_t['preds_ste_sti_twi_logist'] = y_val_pred
df_test_t['binary_preds_ste_sti_twi_logist'] = y_val_class
df_test_t.to_csv('test_ids_labels_untested_with_covars_all.csv')

In [21]:
# Assuming df_train and df_val are your training and validation DataFrames
# input_vars and output are defined as provided

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"LASSO: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [groundtruth_ami]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[groundtruth_ami]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[groundtruth_ami]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the LASSO Logistic Regression classifier
    lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=0.07, random_state=42)

    # Train the model
    lasso_classifier.fit(X_train_scaled, y_train)
    
    joblib.dump(lasso_classifier, "structured_feature_logreg_acs.pkl")

    # Predict on validation set
    y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

    # Calculate AUC
#     auc_score = roc_auc_score(y_val, y_val_pred)

#     print(f"AUC Score on Validation Set: {auc_score}")
    
    auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')
    
    # Step 2: Calculate the optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]

    # Step 3: Binarize the predictions based on the optimal threshold
    y_val_class = (y_val_pred >= optimal_threshold).astype(int)
    
    acc, ci_acc = roc_auc_score(y_val, y_val_class,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')

LASSO: St elevation, T-wave inversion, ST depression
Test AUC Score: 0.5209991931915283 (0.5123868370868708, 0.5296115492961858)
Test Accuracy Score: 0.8771692115274375 (0.8724493045938633, 0.8817381371696147)


ImportError: cannot import name 'SequenceNotStr' from 'pandas._typing' (/opt/venv/default/lib/python3.10/site-packages/pandas/_typing.py)

### SVM

In [16]:

warnings.filterwarnings("ignore")#category=ConvergenceWarning

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"SVM: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [groundtruth_ami]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[groundtruth_ami]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[groundtruth_ami]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Non-Linear Support Vector Machine (SVM) classifier with a kernel (e.g., 'rbf')
    svm_classifier = LinearSVC(random_state=42, C=0.05)

    # Train the model
    svm_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = svm_classifier.decision_function(X_val_scaled)  # get probabilities for non-linear SVM

#     # Calculate AUC
#     auc_score = roc_auc_score(y_val, y_val_pred)

#     print(f"AUC Score on Validation Set: {auc_score}")
    
    auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')
    
    # Step 2: Calculate the optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]

    # Step 3: Binarize the predictions based on the optimal threshold
    y_val_class = (y_val_pred >= optimal_threshold).astype(int)
    
    acc, ci_acc = roc_auc_score(y_val, y_val_class,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')

warnings.resetwarnings()

SVM: St elevation, T-wave inversion, ST depression
Test AUC Score: 0.5883535271622037 (0.5624532170548846, 0.6160186151969017)
Test Accuracy Score: 0.581600751371318 (0.5566160026926681, 0.6077904836131369)


### Random Forest

In [7]:

for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"Random Forest: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [groundtruth_ami]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[groundtruth_ami]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[groundtruth_ami]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=200, min_samples_leaf=75,max_depth=5, random_state=42)

    # Train the model
    rf_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = rf_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

    # Calculate AUC
#     auc_score = roc_auc_score(y_val, y_val_pred)

#     print(f"AUC Score on Validation Set: {auc_score}")
    
    auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')
    
    # Step 2: Calculate the optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]

    # Step 3: Binarize the predictions based on the optimal threshold
    y_val_class = (y_val_pred >= optimal_threshold).astype(int)
    
    acc, ci_acc = roc_auc_score(y_val, y_val_class,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')


Random Forest: human ECG labels + age + sex
AUC Score on Validation Set: 0.6606830518697225
Random Forest: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.6428716827503016
Random Forest: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.6275250090626519
Random Forest: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.7186432246072209
Random Forest: human ECG labels + age + sex + tropt (KNN imputed)
AUC Score on Validation Set: 0.7353023220747888
Random Forest: human ECG labels + age + sex + race + tropt (KNN imputed)
AUC Score on Validation Set: 0.7255710946924004
Random Forest: human ECG labels
AUC Score on Validation Set: 0.6398032267792522
Random Forest: St elevation, twi, st depression
AUC Score on Validation Set: 0.5688546064535586


### Gradient Boosted Trees

In [8]:
for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"Gradient Boosted Trees: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [groundtruth_ami]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_val[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[groundtruth_ami]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[groundtruth_ami]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Initialize the Gradient Boosting classifier
    gb_classifier = GradientBoostingClassifier( learning_rate=0.02, n_estimators=200, min_samples_leaf=50, max_depth=7, random_state=42)

    # Train the model
    gb_classifier.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_val_pred = gb_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

#     # Calculate AUC
#     auc_score = roc_auc_score(y_val, y_val_pred)

#     print(f"AUC Score on Validation Set: {auc_score}")
    
    auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')
    
    # Step 2: Calculate the optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]

    # Step 3: Binarize the predictions based on the optimal threshold
    y_val_class = (y_val_pred >= optimal_threshold).astype(int)
    
    acc, ci_acc = roc_auc_score(y_val, y_val_class,
                        confidence_level=0.95,
                        method='bootstrap_bca',
                        n_resamples=1000)
    
    print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')

Gradient Boosted Trees: human ECG labels + age + sex
AUC Score on Validation Set: 0.6631992611580217
Gradient Boosted Trees: human ECG labels + age + sex + race
AUC Score on Validation Set: 0.6610675512665862
Gradient Boosted Trees: human ECG labels + age + sex + agi
AUC Score on Validation Set: 0.6653278366100283
Gradient Boosted Trees: human ECG labels + age + sex + agi + tropt (KNN imputed)
AUC Score on Validation Set: 0.7384692216917464
Gradient Boosted Trees: human ECG labels + age + sex + tropt (KNN imputed)
AUC Score on Validation Set: 0.7596040033172496
Gradient Boosted Trees: human ECG labels + age + sex + race + tropt (KNN imputed)
AUC Score on Validation Set: 0.759741593787696
Gradient Boosted Trees: human ECG labels
AUC Score on Validation Set: 0.6176982810615199
Gradient Boosted Trees: St elevation, twi, st depression
AUC Score on Validation Set: 0.5688376432448734


### Summary table

In [9]:
# # Define the data with updated AUC scores
# data = {
#     'Model Class': ['LASSO', 'LASSO', 'LASSO', 'LASSO', 'LASSO', 'SVM', 'SVM', 'SVM', 'SVM', 'SVM',
#                     'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest',
#                     'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees'],
#     'Input Specification': ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
#                             'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
#                             'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
#                             'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)'],
#     'AUC Score on Validation Set': [0.7066463395854508, 0.7297206566263205, 0.7400597492993317, 0.7411108050693185, 0.8028468218615151,
#                                     0.6963565246850842, 0.7202808833040747, 0.728156087344852, 0.7185059250746372, 0.7870613972601872,
#                                     0.6974929933167022, 0.7192244910530043, 0.7408173950537436, 0.7488705474876607, 0.8137788457581024,
#                                     0.6548307616495734, 0.6753395546521298, 0.7007299270072993, 0.7125096165718464, 0.8048698636425518]
# }

# # Create a DataFrame
# summary_df = pd.DataFrame(data)

# # Specify the desired order of rows and columns
# desired_row_order = ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees']
# desired_column_order = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)']

# # Filter the DataFrame to match the desired row order
# summary_df_filtered = summary_df[summary_df['Model Class'].isin(desired_row_order)]

# # Pivot the DataFrame
# summary_df_pivot = summary_df_filtered.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# # Reorder the rows and columns in the DataFrame
# summary_df_pivot = summary_df_pivot.reindex(desired_row_order)[desired_column_order]

# # Print the summary table with the desired ordering
# display(summary_df_pivot)


In [10]:
import pandas as pd

# Define the data with updated AUC scores
data = {
    'Model Class': ['LASSO', 'LASSO', 'LASSO', 'LASSO', 'LASSO', 'LASSO', 'LASSO',
                    'SVM', 'SVM', 'SVM', 'SVM', 'SVM', 'SVM', 'SVM',
                    'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest', 'Random Forest',
                    'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees', 'Gradient Boosted Trees'],
    'Input Specification': ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels + age + sex + tropt (KNN imputed)', 'human ECG labels + age + sex + race + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels + age + sex + tropt (KNN imputed)', 'human ECG labels + age + sex + race + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels + age + sex + tropt (KNN imputed)', 'human ECG labels + age + sex + race + tropt (KNN imputed)',
                            'human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)',
                            'human ECG labels + age + sex + tropt (KNN imputed)', 'human ECG labels + age + sex + race + tropt (KNN imputed)'],
    'AUC Score on Validation Set': [0.7270976694135304, 0.7665902075722821, 0.7687225667601167, 0.7722616001201459, 0.8135293753883394, 0.8116784187126553, 0.8121938096451732,
                                    0.7203154645597938, 0.7625237164783507, 0.7653328802424036, 0.7704829690182486, 0.8171602564291329, 0.8138022824455582, 0.8128932687678758,
                                    0.7333814742446124, 0.7585591708435986, 0.7559652252711466, 0.7349427392685821, 0.8235562609876115, 0.8513295386967972, 0.8473763203352873,
                                    0.6781666808257583, 0.7161243734601987, 0.7228754283125195, 0.7114318274845621, 0.8218836410109928, 0.8381899017359047, 0.8273921785178262]
}

# Create a DataFrame
summary_df = pd.DataFrame(data)

# Specify the desired order of rows and columns
desired_row_order = ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees']
desired_column_order = ['human ECG labels', 'human ECG labels + age + sex', 'human ECG labels + age + sex + race', 'human ECG labels + age + sex + agi', 'human ECG labels + age + sex + agi + tropt (KNN imputed)', 
                        'human ECG labels + age + sex + tropt (KNN imputed)', 'human ECG labels + age + sex + race + tropt (KNN imputed)']

# Filter the DataFrame to match the desired row order
summary_df_filtered = summary_df[summary_df['Model Class'].isin(desired_row_order)]

# Pivot the DataFrame
summary_df_pivot = summary_df_filtered.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# Reorder the rows and columns in the DataFrame
summary_df_pivot = summary_df_pivot.reindex(desired_row_order)[desired_column_order]

# Print the summary table with the desired ordering
display(summary_df_pivot)

Input Specification,human ECG labels,human ECG labels + age + sex,human ECG labels + age + sex + race,human ECG labels + age + sex + agi,human ECG labels + age + sex + agi + tropt (KNN imputed),human ECG labels + age + sex + tropt (KNN imputed),human ECG labels + age + sex + race + tropt (KNN imputed)
Model Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LASSO,0.727098,0.76659,0.768723,0.772262,0.813529,0.811678,0.812194
SVM,0.720315,0.762524,0.765333,0.770483,0.81716,0.813802,0.812893
Random Forest,0.733381,0.758559,0.755965,0.734943,0.823556,0.85133,0.847376
Gradient Boosted Trees,0.678167,0.716124,0.722875,0.711432,0.821884,0.83819,0.827392


In [11]:
summary_df_pivot[['human ECG labels','human ECG labels + age + sex + tropt (KNN imputed)']]

Input Specification,human ECG labels,human ECG labels + age + sex + tropt (KNN imputed)
Model Class,Unnamed: 1_level_1,Unnamed: 2_level_1
LASSO,0.727098,0.811678
SVM,0.720315,0.813802
Random Forest,0.733381,0.85133
Gradient Boosted Trees,0.678167,0.83819


## Analysis in untested population

Evaluate how well AMI models trained above do in predicting Mace_death_30 in the untested population. Only done for Tropt_imputed model for now.

In [20]:
############################
## Load data tested
############################

df_train_t = pd.read_csv('train_ids_labels_untested_with_covars_all.csv')
df_val_t = pd.read_csv('val_ids_labels_untested_with_covars_all.csv')
df_test_t = pd.read_csv('test_ids_labels_untested_with_covars_all.csv')
df_all_t = pd.read_csv('all_ids_labels_untested_with_covars_all.csv')

### Lasso test

In [13]:
variables, name = input_spec_list[-1], input_spec_name[-1]

print(f"LASSO: {name}")

if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
    # Compute the median of the 'maxtrop_sameday' column
    median_maxtrop = df_train['maxtrop_sameday'].median()

    # Replace missing values with the median
    #imputer = SimpleImputer(strategy='median')
    imputer = KNNImputer(n_neighbors=5) 
    df_train_t['maxtrop_sameday'] = imputer.fit_transform(df_train_t[['maxtrop_sameday']])
    df_val_t['maxtrop_sameday'] = imputer.transform(df_val_t[['maxtrop_sameday']])

variables_incl_y = variables + [adverse_event]

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_train_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[variables]
y_train = df_train_rel[adverse_event]

# Prepare the validation data
X_val = df_val_rel[variables]
y_val = df_val_rel[adverse_event]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#     # Initialize the LASSO Logistic Regression classifier
#     lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

#     # Train the model
#     lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = lasso_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

LASSO: St elevation, twi, st depression
AUC Score on Validation Set: 0.5274126257863443


### SVM Test

In [14]:
variables, name = input_spec_list[-1], input_spec_name[-1]

print(f"SVM: {name}")

if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
    # Compute the median of the 'maxtrop_sameday' column
    median_maxtrop = df_train['maxtrop_sameday'].median()

    # Replace missing values with the median
    #imputer = SimpleImputer(strategy='median')
    imputer = KNNImputer(n_neighbors=5) 
    df_train_t['maxtrop_sameday'] = imputer.fit_transform(df_train_t[['maxtrop_sameday']])
    df_val_t['maxtrop_sameday'] = imputer.transform(df_val_t[['maxtrop_sameday']])

variables_incl_y = variables + [adverse_event]

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_train_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[variables]
y_train = df_train_rel[adverse_event]

# Prepare the validation data
X_val = df_val_rel[variables]
y_val = df_val_rel[adverse_event]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#     # Initialize the LASSO Logistic Regression classifier
#     lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

#     # Train the model
#     lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = svm_classifier.decision_function(X_val_scaled)  # get probabilities for non-linear SVM

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

SVM: St elevation, twi, st depression
AUC Score on Validation Set: 0.5274126257863443


### Random Forest Test

In [15]:
variables, name = input_spec_list[-1], input_spec_name[-1]

print(f"Random Forest: {name}")

if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
    # Compute the median of the 'maxtrop_sameday' column
    median_maxtrop = df_train['maxtrop_sameday'].median()

    # Replace missing values with the median
    #imputer = SimpleImputer(strategy='median')
    imputer = KNNImputer(n_neighbors=5) 
    df_train_t['maxtrop_sameday'] = imputer.fit_transform(df_train_t[['maxtrop_sameday']])
    df_val_t['maxtrop_sameday'] = imputer.transform(df_val_t[['maxtrop_sameday']])

variables_incl_y = variables + [adverse_event]

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_train_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[variables]
y_train = df_train_rel[adverse_event]

# Prepare the validation data
X_val = df_val_rel[variables]
y_val = df_val_rel[adverse_event]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#     # Initialize the LASSO Logistic Regression classifier
#     lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

#     # Train the model
#     lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = rf_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

Random Forest: St elevation, twi, st depression
AUC Score on Validation Set: 0.5273914211928595


### Gradient Boosted Tree Test

In [16]:
variables, name = input_spec_list[-1], input_spec_name[-1]

print(f"Gradient Boosted Trees: {name}")

if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
    # Compute the median of the 'maxtrop_sameday' column
    median_maxtrop = df_train['maxtrop_sameday'].median()

    # Replace missing values with the median
    #imputer = SimpleImputer(strategy='median')
    imputer = KNNImputer(n_neighbors=5) 
    df_train_t['maxtrop_sameday'] = imputer.fit_transform(df_train_t[['maxtrop_sameday']])
    df_val_t['maxtrop_sameday'] = imputer.transform(df_val_t[['maxtrop_sameday']])

variables_incl_y = variables + [adverse_event]

df_train_rel = df_train_t[variables_incl_y].dropna()
df_val_rel = df_train_t[variables_incl_y].dropna() 

# Prepare the training data
X_train = df_train_rel[variables]
y_train = df_train_rel[adverse_event]

# Prepare the validation data
X_val = df_val_rel[variables]
y_val = df_val_rel[adverse_event]

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#     # Initialize the LASSO Logistic Regression classifier
#     lasso_classifier = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=42)

#     # Train the model
#     lasso_classifier.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred = gb_classifier.predict_proba(X_val_scaled)[:, 1]  # get probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred)

print(f"AUC Score on Validation Set: {auc_score}")

Gradient Boosted Trees: St elevation, twi, st depression
AUC Score on Validation Set: 0.5274073681296189


In [17]:
# # Define the data for the new table
# adverse_event_data = {
#     'Model Class': ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees'],
#     'Input Specification': ['human ECG labels + age + sex + race + tropt (KNN imputed)'] * 4,
#     'AUC Score on Validation Set': [
#         0.6205558850177413,
#         0.6051718104474512,
#         0.6307197244884056,
#         0.5623672827896413
#     ]
# }

# # Create a DataFrame for the untested patient population model performance
# adverse_event_df = pd.DataFrame(adverse_event_data)

# # Pivot the DataFrame for better readability
# adverse_event_df_pivot = adverse_event_df.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# # Display the summary table
# display(adverse_event_df_pivot)

import pandas as pd

# Define the updated data for the new table
adverse_event_data = {
    'Model Class': ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees'],
    'Input Specification': ['human ECG labels + age + sex + tropt (KNN imputed)', 
                            'human ECG labels + age + sex + tropt (KNN imputed)', 
                            'human ECG labels + age + sex + tropt (KNN imputed)', 
                            'human ECG labels + age + sex + tropt (KNN imputed)'],
    'AUC Score on Validation Set': [
        0.6033697663004493,  # Updated AUC for LASSO
        0.585304960924227,   # Updated AUC for SVM
        0.6084229055763204,  # Updated AUC for Random Forest
        0.4973938866236267   # Updated AUC for Gradient Boosted Trees
    ]
}

# Create a DataFrame for the updated model performance
adverse_event_df = pd.DataFrame(adverse_event_data)

# Pivot the DataFrame for better readability
adverse_event_df_pivot = adverse_event_df.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# Display the updated summary table
display(adverse_event_df_pivot)


Input Specification,human ECG labels + age + sex + tropt (KNN imputed)
Model Class,Unnamed: 1_level_1
Gradient Boosted Trees,0.497394
LASSO,0.60337
Random Forest,0.608423
SVM,0.585305


In [18]:
import pandas as pd

# Define the updated data for the table
data_untested = {
    'Model Class': ['LASSO', 'SVM', 'Random Forest', 'Gradient Boosted Trees'],
    'Input Specification': ['human ECG labels'] * 4,  # Same input specification for all models
    'AUC Score on Validation Set': [
        0.508169668139087,   # Updated AUC for LASSO
        0.5012620342054592,  # Updated AUC for SVM
        0.5692929078415827,  # Updated AUC for Random Forest
        0.47613980443236303  # Updated AUC for Gradient Boosted Trees
    ]
}

# Create a DataFrame from the updated data
untested_df = pd.DataFrame(data_untested)

# Pivot the DataFrame to align models and their scores
untested_df_pivot = untested_df.pivot(index='Model Class', columns='Input Specification', values='AUC Score on Validation Set')

# Display the updated table
display(untested_df_pivot)


Input Specification,human ECG labels
Model Class,Unnamed: 1_level_1
Gradient Boosted Trees,0.47614
LASSO,0.50817
Random Forest,0.569293
SVM,0.501262


### Off-topic waveform processing

In [19]:
# Assuming 'arr' is your original numpy array with shape [leads, time]
short_lead_arr = waveform_arr[:12]
long_lead_arr = waveform_arr[12:]
short_lead_arr = np.nan_to_num(short_lead_arr)


plot_ecg_waveform(short_lead_arr)
print(short_lead_arr[0])

compressed_arr = np.zeros((3, 5000))

# Sum the specified channels for each new channel
compressed_arr[0, :] = short_lead_arr[0, :] + short_lead_arr[3, :] + short_lead_arr[6, :] + short_lead_arr[9, :]
compressed_arr[1, :] = short_lead_arr[1, :] + short_lead_arr[4, :] + short_lead_arr[7, :] + short_lead_arr[10, :]
compressed_arr[2, :] = short_lead_arr[2, :] + short_lead_arr[5, :] + short_lead_arr[8, :] + short_lead_arr[11, :]

compressed_arr = np.concatenate((compressed_arr, long_lead_arr), axis=0)

plot_ecg_waveform(compressed_arr)
print(compressed_arr[0])

print(compressed_arr.shape)

NameError: name 'waveform_arr' is not defined

In [None]:
import os
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool

def process_file(filepath):
    filename = os.path.basename(filepath).split('.')[0]
    npz_file = np.load(filepath)
    waveform_arr = npz_file['waveform']
    
    short_lead_arr = waveform_arr[:12]
    long_lead_arr = waveform_arr[12:]
    short_lead_arr = np.nan_to_num(short_lead_arr)

    compressed_arr = np.zeros((3, 5000))

    # Sum the specified channels for each new channel
    compressed_arr[0, :] = short_lead_arr[0, :] + short_lead_arr[3, :] + short_lead_arr[6, :] + short_lead_arr[9, :]
    compressed_arr[1, :] = short_lead_arr[1, :] + short_lead_arr[4, :] + short_lead_arr[7, :] + short_lead_arr[10, :]
    compressed_arr[2, :] = short_lead_arr[2, :] + short_lead_arr[5, :] + short_lead_arr[8, :] + short_lead_arr[11, :]

    compressed_arr = np.concatenate((compressed_arr, long_lead_arr), axis=0)
    
    savepath = os.path.join(waveform_path, f'{filename}.npy')
    np.save(savepath, compressed_arr)


os.chdir('/home/ngsci')

waveform_path = '/home/ngsci/project/NEJM_benchmark/waveforms_all_channel'
# npz_files = [os.path.join(waveform_path, f) for f in os.listdir(waveform_path) if f.endswith('.npz')]

# Use multiprocessing to process files
with Pool(processes=os.cpu_count()) as pool:
    list(tqdm(pool.imap(process_file, npz_files), total=len(npz_files)))
