## Baseline training - Models only using metadata

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from multiprocessing import Pool
import os
import warnings

from sklearn.metrics import roc_curve
from confidenceinterval import roc_auc_score, accuracy_score

import matplotlib.pyplot as plt
import joblib

In [3]:
############################
## Load data
############################

df_train = pd.read_csv('path/to/training_data')
df_val = pd.read_csv('path/to/validation_data') 
df_test = pd.read_csv('path/to/test_data')
df_all = pd.read_csv('path/to/all_structured_features')

In [1]:
# Configure everything
ste_plus = ['has_depress', 'has_st_eleva', 'has_twave_inver', 'has_twave_abnormal'] 
groundtruth_ami = 'stent_or_cabg_010_day'
input_spec_list = [ste_plus] 
input_spec_name = ['St elevation, T-wave inversion, ST depression, T-wave abnormal'] 

## Develop logistic regression baseline model based on ECG features

In [6]:
for variables, name in zip(input_spec_list, input_spec_name):
    
    print(f"LASSO: {name}")
    
    if name == 'human ECG labels + age + sex + agi + tropt (KNN imputed)' or 'human ECG labels + age + sex + tropt (KNN imputed)' or 'human ECG labels + age + sex + race + tropt (KNN imputed)':
        # Compute the median of the 'maxtrop_sameday' column
        median_maxtrop = df_train['maxtrop_sameday'].median()

        # Replace missing values with the median
        #imputer = SimpleImputer(strategy='median')
        imputer = KNNImputer(n_neighbors=5) 
        df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
        df_val['maxtrop_sameday'] = imputer.transform(df_val[['maxtrop_sameday']])
    
    variables_incl_y = variables + [groundtruth_ami]
    
    df_train_rel = df_train[variables_incl_y].dropna()
    df_val_rel = df_test[variables_incl_y].dropna() 

    # Prepare the training data
    X_train = df_train_rel[variables]
    y_train = df_train_rel[groundtruth_ami]

    # Prepare the validation data
    X_val = df_val_rel[variables]
    y_val = df_val_rel[groundtruth_ami]
    
    # Initialize the LASSO Logistic Regression classifier
    logistic_classifier = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

    # Train the model
    logistic_classifier.fit(X_train, y_train)
    
    joblib.dump(logistic_classifier, "structured_feature_logreg_acs.pkl")

    # Predict on validation set
    y_val_pred = logistic_classifier.predict_proba(X_val)[:, 1]  # get probabilities for the positive class
    
    auc, ci = roc_auc_score(y_val, y_val_pred,
                        confidence_level=0.95)
    
    print(f'Test AUC Score: {auc} ({ci[0]}, {ci[1]})')
    
    # Step 2: Calculate the optimal threshold using Youden's J statistic
    fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]

    # Step 3: Binarize the predictions based on the optimal threshold
    y_val_class = (y_val_pred >= optimal_threshold).astype(int)
    
    acc, ci_acc = accuracy_score(y_val, y_val_class,
                        confidence_level=0.95)
    
    print(f'Test Accuracy Score: {acc} ({ci_acc[0]}, {ci_acc[1]})')

df_test['preds_ste_sti_twi_logist'] = y_val_pred
df_test['binary_preds_ste_sti_twi_logist'] = y_val_class
df_test.to_csv('test_ids_labels_with_covars_all_final.csv') #In case you aim to use this as baseline on the validation proportion of the public Nightingale dataset

LASSO: St elevation, T-wave inversion, ST depression, T-wave abnormal
Test AUC Score: 0.5282750129699707 (0.4965039726284047, 0.5600460533115368)
Test Accuracy Score: 0.6295971978984238 (0.6011982387694503, 0.657127202085731)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['maxtrop_sameday'] = imputer.fit_transform(df_train[['maxtrop_sameday']])
