In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy.stats import mode
from scipy import stats
from multiprocessing import Pool
import neurokit2 as nk
from biosppy.signals import ecg
import biosppy
import os

import matplotlib.pyplot as plt

## Generate "naive" training / test set

In [2]:
train_angio = pd.read_csv("/home/ngsci/project/NEJM_benchmark/train_ids_labels_with_covars_all.csv")
val_angio = pd.read_csv("/home/ngsci/project/NEJM_benchmark/val_ids_labels_with_covars_all.csv")
test_angio = pd.read_csv("/home/ngsci/project/NEJM_benchmark/test_ids_labels_with_covars_all.csv")

train_untested = pd.read_csv("/home/ngsci/project/NEJM_benchmark/train_ids_labels_untested_with_covars_all.csv")
val_untested = pd.read_csv("/home/ngsci/project/NEJM_benchmark/val_ids_labels_untested_with_covars_all.csv")
test_untested = pd.read_csv("/home/ngsci/project/NEJM_benchmark/test_ids_labels_untested_with_covars_all.csv")

In [5]:
print(train_angio.shape)
print(val_angio.shape)
print(test_angio.shape)
print(test_angio.columns)

print(train_untested.shape)
print(val_untested.shape)
print(test_untested.shape)

train_angio.head()

(5043, 82)
(1978, 82)
(2954, 82)
Index(['Unnamed: 0', 'patient_ngsci_id', 'ecg_id', 'date', 'p-r-t_axes',
       'p_axes', 'r_axes', 't_axes', 'pr_interval', 'pr_interval_units',
       'qrs_duration', 'qrs_duration_units', 'qtqtc', 'qt_interval',
       'qt_interval_units', 'qtc_interval', 'qtc_interval_units', 'vent_rate',
       'vent_rate_units', 'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
       'ecg_id_new', 'ed_enc_id', 'start_datetime', 'end_datetime',
       'age_at_admit', 'macetrop_030_pos', 'death_030_day',
       'macetrop_pos_or_death_030', 'stent_010_day', 'cabg_010_day',
       'stent_or_cabg_010_day', 'ami_day_of', 'da

Unnamed: 0.1,Unnamed: 0,patient_ngsci_id,ecg_id,date,p-r-t_axes,p_axes,r_axes,t_axes,pr_interval,pr_interval_units,...,race_other,agi_under_25k,agi_25k_to_50k,agi_50k_to_75k,agi_75k_to_100k,agi_100k_to_200k,agi_above_200k,ste_std_twi,female,split
0,26,pat001162d6,ecg162c83f05d,2114-06-21T21:04:34Z,45 59 0,45.0,59.0,0.0,224.0,ms,...,0,0.274465,0.181957,0.137615,0.106269,0.212538,0.087156,False,1,train
1,27,pat001162d6,ecg86065367ee,2114-06-21T21:04:34Z,45 59 0,45.0,59.0,0.0,224.0,ms,...,0,0.274465,0.181957,0.137615,0.106269,0.212538,0.087156,False,1,train
2,89,pat004815f1,ecgdd0d198786,2112-05-05T08:36:38Z,72 91 40,72.0,91.0,40.0,144.0,ms,...,0,0.254731,0.084425,0.061135,0.049491,0.142649,0.407569,False,1,train
3,235,pat009bc9fe,ecgcafc2054f7,2110-03-17T05:44:48Z,23 17 28,23.0,17.0,28.0,172.0,ms,...,0,0.418807,0.289609,0.138285,0.066377,0.072303,0.014619,False,1,train
4,276,pat00ab4be9,ecgde93ebc234,2114-08-16T08:39:34Z,65 4 40,65.0,4.0,40.0,156.0,ms,...,0,0.469675,0.315089,0.120562,0.048817,0.038462,0.007396,False,1,train


### Filter down to data to use

- Positive = positive based on angiography
- Negative = untested no MACE_death_010

In [6]:
train_positives = train_angio[train_angio['stent_or_cabg_010_day']==1]
val_positives = val_angio[val_angio['stent_or_cabg_010_day']==1]
test_positives = test_angio[test_angio['stent_or_cabg_010_day']==1]

train_negatives = train_untested[train_untested['macetrop_pos_or_death_030']==0]
val_negatives = val_untested[val_untested['macetrop_pos_or_death_030']==0]
test_negatives = test_untested[test_untested['macetrop_pos_or_death_030']==0]

train_positives['ecg_cnt'] = np.NaN
val_positives['ecg_cnt'] = np.NaN
test_positives['ecg_cnt'] = np.NaN

print(train_positives.shape)
print(train_negatives.shape)

print(val_positives.shape)
print(val_negatives.shape)

print(test_positives.shape)
print(test_negatives.shape)

(847, 83)
(40392, 83)
(320, 83)
(15790, 83)
(536, 83)
(24478, 83)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_positives['ecg_cnt'] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_positives['ecg_cnt'] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_positives['ecg_cnt'] = np.NaN


In [7]:
train_naive = pd.concat([train_positives, train_negatives], axis=0, ignore_index=True)
val_naive = pd.concat([val_positives, val_negatives], axis=0, ignore_index=True)
test_naive = pd.concat([test_positives, test_negatives], axis=0, ignore_index=True)
all_naive = pd.concat([train_naive,val_naive, test_naive], axis=0, ignore_index=True)

In [8]:
train_naive.to_csv("/home/ngsci/project/NEJM_benchmark/train_ids_labels_with_covars_all_NAIVE.csv")
test_naive.to_csv("/home/ngsci/project/NEJM_benchmark/val_ids_labels_with_covars_all_NAIVE.csv")
test_naive.to_csv("/home/ngsci/project/NEJM_benchmark/test_ids_labels_with_covars_all_NAIVE.csv")
all_naive.to_csv("/home/ngsci/project/NEJM_benchmark/all_ids_labels_NAIVE.csv")