In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
from scipy import stats
from multiprocessing import Pool
import neurokit2 as nk
from biosppy.signals import ecg
import biosppy
import os

import matplotlib.pyplot as plt

### Some playing around

In [3]:
ed_encounter = pd.read_csv('/home/ngsci/datasets/ed-bwh-ecg/v1/ed-encounter.csv')
ecg_metadata = pd.read_csv('/home/ngsci/datasets/ed-bwh-ecg/v1/ecg-metadata.csv')
ecg_npy = pd.read_csv('/home/ngsci/datasets/ed-bwh-ecg/v1/ecg-npy-index.csv')
patient = pd.read_csv('/home/ngsci/datasets/ed-bwh-ecg/v1/patient.csv')
ecg_to_ed_enc = pd.read_csv('/home/ngsci/datasets/ed-bwh-ecg/v1/ecg-ed-enc.csv')

In [4]:
#Remove ecg string from ID
ecg_metadata['ecg_id_new'] = ecg_metadata['ecg_id'].str[3:]
ecg_npy['ecg_id_new'] = ecg_npy['ecg_id'].str[3:]
ecg_to_ed_enc['ecg_id_new'] = ecg_to_ed_enc['ecg_id'].str[3:]

### Assess quality of guideline-based AMI labels

In [5]:
# Create dataset that has ECG metadata and testing information
###########################################
## Get encounter ID into the dataframe 
###########################################

# Check if the columns in ecg_to_ed_enc (except 'ecg_id') are not in ecg_metadata
ecg_to_ed_enc_new = ecg_to_ed_enc.drop('ecg_id', axis=1)
columns_to_check = [col for col in ecg_to_ed_enc.columns if col != 'ecg_id' and col != 'ecg_id_new']
if not any(col in ecg_metadata.columns for col in columns_to_check):
    # Merge the DataFrames
    merged_df = pd.merge(ecg_metadata, ecg_to_ed_enc_new, on='ecg_id_new', how='left')
    print('done')
else:
    print("Merge not performed: Columns from ecg_to_ed_enc already exist in ecg_metadata.")

###########################################
## Get encounter information into dataframe 
###########################################

# Check if the columns in ecg_to_ed_enc (except 'ecg_id') are not in ecg_metadata
ed_encounter_new = ed_encounter.drop('patient_ngsci_id', axis=1)
columns_to_check = [col for col in ed_encounter.columns if col not in ['ed_enc_id','patient_ngsci_id']]
if not any(col in merged_df.columns for col in columns_to_check):
    # Merge the DataFrames
    ecg_analysis_df = pd.merge(merged_df, ed_encounter_new, on='ed_enc_id', how='left')
    print('done')
else:
    print("Merge not performed: Columns from merged_df already exist in ed_encounter.")
    
###########################################
## Get patient info into dataframe 
###########################################

# Check if the columns in ecg_to_ed_enc (except 'ecg_id') are not in ecg_metadata
columns_to_check = [col for col in patient.columns if col not in ['ed_enc_id','patient_ngsci_id']]
if not any(col in merged_df.columns for col in columns_to_check):
    # Merge the DataFrames
    ecg_analysis_df = pd.merge(ecg_analysis_df, patient, on='patient_ngsci_id', how='left')
    print('done')
else:
    print("Merge not performed: Columns from merged_df already exist in ed_encounter.")

done
done
done


In [6]:
###############################################################
## Filter out the frail and chronically ill patients
###############################################################

ecg_analysis_df_tested = ecg_analysis_df[ecg_analysis_df['cath_010_day']==True]
ecg_analysis_df_included_all = ecg_analysis_df[ecg_analysis_df['exclude_modeling']==False]
ecg_analysis_df_tested_all = ecg_analysis_df_included_all[ecg_analysis_df_included_all['cath_010_day']==True]
## Restrict sample to patients that were tested
ecg_analysis_df_included_tested_all = ecg_analysis_df_included_all[ecg_analysis_df_included_all['cath_010_day']==True]
ecg_analysis_df_included_untested_all = ecg_analysis_df_included_all[ecg_analysis_df_included_all['cath_010_day']==False]

## Construct Table 1 (encounter level, as dataset overview)

Rationale demographics e.g. age may change between different patient encounters

In [7]:
ecg_ed_enc_meta = ecg_to_ed_enc.merge(ecg_metadata, on='ecg_id', how='left')
ecg_ed_enc_meta_merge = ecg_ed_enc_meta[['ed_enc_id','has_twave_inver', 'has_depress', 'has_st_eleva','has_afib']] #'ecg_id', 


In [8]:
encounter_w_demographic_df = (
    ed_encounter
    .merge(    
        patient, 
        on='patient_ngsci_id', 
        how='left'
    )
    # .merge(ecg_metadata,
    #        on='ed_enc_id',
    #        how='left')
    .merge(
        ( # merge in ECG counts for each encounter
            ecg_to_ed_enc
            .groupby('ed_enc_id')
            .agg(ecg_cnt=pd.NamedAgg(column="ecg_id", aggfunc="count"))
            .reset_index()
        ),
        on='ed_enc_id',
        how='left'
    )
    
)

encounter_w_demographic_df = encounter_w_demographic_df.merge(
        ( # merge in ECG characteristics
            ecg_ed_enc_meta_merge
            .groupby('ed_enc_id')
            .mean()
            .reset_index()
        ),
        on='ed_enc_id',
        how='left'
    )

In [9]:
def get_stats_for_ed_encounters(df):
    '''Get statistics for demographics and key variables.'''
    output = pd.Series()
    # counts
    enc_counts = pd.Series()
    enc_counts.loc['Visits'] = df['ed_enc_id'].nunique()
    enc_counts.loc['Patients'] = df['patient_ngsci_id'].nunique()
    enc_counts.loc['ECGs'] = df['ecg_cnt'].sum().astype(int)
    enc_counts = enc_counts.apply('{:,d}'.format)
    output = pd.concat([output,enc_counts])
    
    # demographics - age
    demographics = pd.Series()
    age_mean = df['age_at_admit'].mean()
    age_sem = df['age_at_admit'].sem()
    demographics.loc['Age Mean (years)'] = f"{age_mean:.2f} ({age_sem:.3f})"
    
    # demographics - sex
    sex_stats = df['sex'].value_counts(normalize=True, dropna=False)
    female_pct = sex_stats.get('Female', 0)  # Use .get to avoid KeyError if 'Female' is not a category
    female_sem = np.sqrt(female_pct * (1 - female_pct) / len(df))
    demographics.loc['Female'] = f"{female_pct:.3g} ({female_sem:.3f})"
    
    # demographics - race
    for race in ['black', 'hispanic', 'white', 'other']:
        mean = df[f'race_{race}'].mean()
        sem = df[f'race_{race}'].sem()
        demographics.loc[race.capitalize()] = f"{mean:.3g} ({sem:.3f})"
    
    output.loc['Demographics'] = ''
    output = pd.concat([output, demographics])
    
    # key variables
    key_variables = pd.Series()
    for var, label in [('stent_or_cabg_010_day', 'Positive Test')]:
        mean = round(df[var].replace({True: 1, False: 0}).mean(),4)
        sem = df[var].replace({True: 1, False: 0}).sem()
        key_variables.loc[label] = f"{mean:.3g} ({sem:.3f})"
    
    output.loc['Outcomes'] = ''
    output = pd.concat([output, key_variables])
    
    return output


In [10]:
included_df = encounter_w_demographic_df.query('exclude_modeling == False') 
#included_df = encounter_w_demographic_df.query('exclude == False')
tested_df = included_df.query('cath_010_day == True')
untested_df = included_df.query('cath_010_day == False')

ecg_analysis_df['ecg_cnt'] = 1
ecg_analysis_df_included_all['ecg_cnt'] = 1
ecg_analysis_df_tested_all['ecg_cnt'] = 1
ecg_analysis_df_included_untested_all['ecg_cnt'] = 1

column_subsets = {
    #'All': ecg_analysis_df, #encounter_w_demographic_df,
    'Total': ecg_analysis_df_included_all, #included_df,
    'Tested': ecg_analysis_df_tested_all, #tested_df,
    'Untested': ecg_analysis_df_included_untested_all #untested_df,
}

table1 = pd.DataFrame()
for col_name, col_subset in column_subsets.items():
    table1[col_name] = get_stats_for_ed_encounters(col_subset)

table1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_analysis_df_included_all['ecg_cnt'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_analysis_df_tested_all['ecg_cnt'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_analysis_df_included_untested_all['ecg_cnt'] = 1


Unnamed: 0,Total,Tested,Untested
Visits,51158,1952,49206
Patients,35595,1758,34465
ECGs,73392,3925,69467
Demographics,,,
Age Mean (years),54.91 (0.070),63.51 (0.227),54.43 (0.073)
Female,0.561 (0.002),0.381 (0.008),0.571 (0.002)
Black,0.233 (0.002),0.145 (0.006),0.238 (0.002)
Hispanic,0.165 (0.001),0.0968 (0.005),0.169 (0.001)
White,0.522 (0.002),0.692 (0.007),0.513 (0.002)
Other,0.0791 (0.001),0.0652 (0.004),0.0799 (0.001)
