### Select patients for CFR model: Split patients in train, val and test sets ###

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [2]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_data_dir = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr')
cfr_meta_date = '200425'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200425


In [3]:
match_view_filename = 'pet_match365_diff_files_'+cfr_meta_date+'.parquet'
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))

print('Total number of patients      {}'.format(len(files_cfr.mrn.unique())))
print('Total number of echo studies  {}'.format(len(files_cfr.study.unique())))
print('Total number of PET studies   {}'.format(len(files_cfr.petmrn_identifier.unique())))
print('Total number of echos         {}'.format(len(files_cfr.filename.unique())))

files_cfr.head(2)

Total number of patients      2755
Total number of echo studies  4327
Total number of PET studies   2956
Total number of echos         204907


Unnamed: 0,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,35169325,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113,2017-05-22,2017-06-21,35169325_2017-05-22,30,30.0,1.0,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b090131...,2017-06-21 09:04:50,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,BWH,iE33,Philips Medical Systems,0.0,33.333,77.0,71.0,0.044038,0.044038,8.544426e-11,1.134038e-13,5.271967e-14,1.336331e-13,1.708275e-15,4.455052e-13,1.0,1.499724e-11,5.666772e-14,6.379312e-14,6.884242e-12,1.625494e-14,1.684185e-12,1.107935e-14,2.403445e-12,6.817977e-16,1.88851e-14,5.909166e-11,1.349804e-14,8.245356e-14,4.105062e-13,4.618445e-11,4.207604e-15,4.772874e-14,9.679026e-17,8.89431e-18,5.028371e-13,a4c,1.0
1,35169325,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113,2017-05-22,2017-06-21,35169325_2017-05-22,30,30.0,1.0,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b090131...,2017-06-21 09:04:50,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,BWH,iE33,Philips Medical Systems,0.0,49.49,52.0,71.0,0.04404,0.04404,2.206956e-11,2.73909e-11,1.041237e-11,1.219875e-10,7.501796e-11,1.225508e-11,2.75189e-09,4.431975e-12,7.776415e-12,1.314174e-11,6.367319e-12,5.739965e-12,6.897379e-10,1.617736e-11,0.9999999,3.547227e-12,8.979309e-12,3.594578e-11,9.016331e-11,9.644979e-11,2.262643e-12,1.0363e-09,4.691505e-11,7.552487e-08,8.309125e-13,1.391146e-14,1.978723e-10,other,1.0


### Filter data sets: GLOBAL and NON-DEFECT variables ###

#### GLOBAL VARIABLES ####

Notebook 3/17/2020: global_pet_cfr
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data

File used: post_2018_pets_with_clinical_cfr_all.csv

* After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion

Merge:

After combining, 3038 PETs
File saved as pets_with_echos_global_all.parquet
Use notes:

* Variables to use: rest_global_mbf, stress_global_mbf, global_cfr_calc
* Other variables- myocardial_perfusion, segmental data, perfusion data, TID, gated SPECT results, calcium score, height, weight
* For "cleaner" data, would exclude those with post-2018==1

In [7]:
global_pet_file = 'pets_with_echos_global_all.parquet'
global_pet = pd.read_parquet(os.path.join(pet_data_dir, global_pet_file))
global_pet = global_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(global_pet.petmrn_identifier.unique())}')
print(os.path.join(pet_data_dir, global_pet_file))
global_pet.head()

PET studies: 3031
/mnt/obi0/phi/pet/pet_cfr/pets_with_echos_global_all.parquet


Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,rest_global_mbf,stress_global_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,...,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,height_in,weight_lb,reportID,subjectID
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,0.644,0.881,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,0.537,0.388,0.607,0.823,0.964,0.766,0.445,1.34,1.504,...,0.529,0.348,0.558,0.562,1.325,1.446,0.636,0.502,1.136,1.186,1.415,0.979,0.67,1.174,1.676,18.0,18.0,0.0,1.09,normal,,67.0,133.0,69718.0,2540.0
1,12853099,2006-01-25,12853099_2006-01-25,0,normal,2.109661,1.532,3.232,1.082,0.94,1.135,1.224,1.332,1.945,1.482,1.977,1.475,1.656,1.509,1.444,1.801,2.18,1.677,1.359,1.977,...,2.069,2.657,2.626,3.712,3.981,3.359,2.215,4.01,3.89,3.883,3.249,2.609,3.483,3.443,3.037,0.0,0.0,0.0,,not assessed,0.0,64.0,160.0,69969.0,3379.0
2,20710471,2006-01-23,20710471_2006-01-23,0,abnormal,1.229572,1.028,1.264,0.872,0.892,1.126,1.884,1.061,1.057,1.286,1.657,0.735,0.995,1.153,1.252,1.38,0.966,0.86,0.943,1.101,...,1.864,0.91,1.32,1.936,1.283,1.617,1.33,1.266,1.366,1.726,1.533,1.477,1.484,1.037,1.537,20.0,18.0,2.0,0.97,normal,6602.0,71.0,200.0,69993.0,3542.0
3,12627030,2006-02-10,12627030_2006-02-10,0,abnormal,1.588915,1.732,2.752,1.984,1.499,1.556,1.333,1.134,1.292,2.499,1.883,1.703,1.794,1.873,1.933,1.767,2.056,1.662,1.804,1.754,...,0.64,2.197,1.413,3.395,4.533,1.301,2.149,2.391,3.3,4.714,3.184,2.567,4.154,4.662,3.595,16.0,0.0,16.0,0.79,normal,331.0,68.0,217.0,70008.0,2155.0
4,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,1.465,3.892,0.945,0.813,0.892,0.97,1.401,1.217,1.614,1.58,1.303,1.957,1.879,1.794,1.421,1.773,1.677,1.891,1.846,...,2.654,2.434,3.313,3.502,4.081,4.34,3.978,4.17,4.426,4.944,4.343,4.143,4.748,4.544,4.328,0.0,0.0,0.0,0.94,normal,0.0,64.0,146.0,70012.0,3417.0


In [8]:
global_pet[global_pet.petmrn_identifier=='1414556_2018-10-30']

Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,rest_global_mbf,stress_global_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,...,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,height_in,weight_lb,reportID,subjectID
3858,1414556,2018-10-30,1414556_2018-10-30,0,normal,1.809645,0.788,1.426,0.839083,0.799958,0.810667,0.721833,0.783875,0.781667,0.803083,0.918875,0.917708,0.872042,0.849458,0.767792,0.693972,0.796028,0.812972,0.736222,0.642786,...,1.5275,1.390125,1.45075,1.4535,1.446542,1.720542,1.705208,1.582792,1.645125,1.386583,1.128667,1.501056,1.447167,1.197389,1.21775,0.0,0.0,0.0,,normal,145.0,63.0,189.0,166220.0,7521.0
555,1414556,2018-10-30,1414556_2018-10-30,1,normal,2.1,0.69,1.45,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
# Add echo data to the pet studies (inner join, to keep only keys in both dataframes)
global_pet_echo = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(global_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(global_pet_echo.study.unique())}')

PET studies:  2239
ECHO studies: 3064


In [16]:
lobal_pet_noecho_18 = global_pet_echo[global_pet_echo['post-2018']==0]
print(f'PET studies:  {len(lobal_pet_noecho_18.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(lobal_pet_noecho_18.study.unique())}')

PET studies:  2100
ECHO studies: 2883


In [10]:
# PET studies without echo data
global_pet_echo_all = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left' )
global_pet_noecho = global_pet_echo_all.loc[global_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(global_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(global_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        3031
PET studies without echo: 792


In [7]:
# Global_pet_echo table with the variables to use (drop rows with na in any of those variables)
global_pet_echo.head(2)
global_pet_variables_target = ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
global_pet_variables = global_pet_variables_target.copy()
global_pet_variables.extend(list(files_cfr.columns))
global_pet_echo_variables = global_pet_echo[global_pet_variables].dropna(axis=0)
print(f'PET target variables: {global_pet_variables_target}')
print(f'PET studies:          {len(global_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(global_pet_echo_variables.study.unique())}')
print(f'global shape:         {global_pet_echo_variables.shape}')

PET target variables: ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
PET studies:          2172
ECHO studies:         2908
global shape:         (136091, 54)


#### Unaffected (non-defect) variables ####
Notebook 2/26/2020: clean_pet_cfr_petdata4_addingseg7
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* All segments used (segment 7 data now available)
* MBF and CFR calculated for unaffected segments
Perfusion defect segments are those with a perfusion score >0 at rest or stress
Unaffected segments are those not in a perfusion defect, and not adjacent to a perfusion defect
* Of note, PETs with no unaffected segments were excluded from the dataset
* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data
* After excluding 295 for having no "unaffected segments", 2576 remaining
File used: post_2018_pets_with_clinical_cfr_all.csv

* Clinical global MBF and CFR values for PETs after 10/2018 were obtained from EPIC
These values were also used for unaffected MBF and CFR
After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion
After keeping only studies with normal/probably normal perfusion, 83 PETs remaining

Merge:

Total 2659 PETs
File created: non_defect_plus_p2018.parquet
Use notes:

* Variables to use: rest_mbf_unaff, stress_mbf_unaff, unaffected_cfr
* Also available: perfusion data, segmental data, gated SPECT results, calcium score, height, weight
* For clean data, would exclude PETs with post-2018==1, and would limit to PETs with myocardial_perfusion of 'normal' or 'probably normal'- if this is done, would use global_cfr_calc instead of unaffected_cfr because it is slightly more accurate in PETs with normal/prob normal perfusion (could consider looking at whether normal/probably normal PETs are still scored as having small perfusion defects)

In [8]:
nondefect_pet_file = 'non_defect_plus_p2018.parquet'
nondefect_pet = pd.read_parquet(os.path.join(pet_data_dir, nondefect_pet_file))
nondefect_pet = nondefect_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(nondefect_pet.petmrn_identifier.unique())}')
print(os.path.join(pet_data_dir, nondefect_pet_file))

PET studies: 2655
/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018.parquet


In [9]:
# Add echo data to the pet studies (inner join)
nondefect_pet_echo = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(nondefect_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(nondefect_pet_echo.study.unique())}')

PET studies:  1986
ECHO studies: 2678


In [10]:
# Disqualified PET studies
nondefect_pet_echo_all = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left')
nondefect_pet_noecho = nondefect_pet_echo_all.loc[nondefect_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(nondefect_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(nondefect_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        2655
PET studies without echo: 669


In [11]:
nondefect_variables_target = ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
nondefect_variables = nondefect_variables_target.copy()
nondefect_variables.extend(list(files_cfr.columns))
nondefect_pet_echo_variables = nondefect_pet_echo[nondefect_variables].dropna(axis=0)
print(f'PET target variables: {nondefect_variables_target}')
print(f'PET studies:          {len(nondefect_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(nondefect_pet_echo_variables.study.unique())}')
print(f'nondefect shape:      {nondefect_pet_echo_variables.shape}')

PET target variables: ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
PET studies:          1923
ECHO studies:         2538
nondefect shape:      (117341, 54)


In [12]:
# Complete list of unique petmrn_identifier
petmrn_identifier_list = list(global_pet.petmrn_identifier.unique())+list(nondefect_pet.petmrn_identifier.unique())
petmrn_identifier_set = list(set(petmrn_identifier_list))

In [13]:
print(len(petmrn_identifier_list))
print(len(petmrn_identifier_set))

5686
3031


In [14]:
# Let's filter Rahuls list of missing echos
missing_echo = pd.read_parquet(os.path.join(meta_dir, 'mrn_pet_missing_echo_file.parquet'))
print(f'petmrn_identifier in original list: {len(missing_echo.petmrn_identifier.unique())}')
missing_echo.tail()

petmrn_identifier in original list: 674


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement,study,echo_study_date
20145,7766041,NaT,2014-05-15,,7766041_2014-05-15,1.0,,NaT
20274,22087456,NaT,2014-11-25,,22087456_2014-11-25,1.0,,NaT
20313,17170440,NaT,2015-03-09,,17170440_2015-03-09,1.0,,NaT
20423,1825777,NaT,2016-12-08,,1825777_2016-12-08,1.0,,NaT
20434,26949750,NaT,2017-08-08,,26949750_2017-08-08,1.0,,NaT


In [32]:
missing_echo_filtered = missing_echo[missing_echo.petmrn_identifier.isin(petmrn_identifier_set)].\
                        drop(columns=['pet_measurement'], axis=1).reset_index(drop=True)
print(f'petmrn_identifier in filtered list:   {len(missing_echo_filtered.petmrn_identifier.unique())}')
missing_no_echo_date = missing_echo_filtered.loc[missing_echo_filtered.echo_date.isnull()]
print(f'petmrn_identifier without echo dates: {len(missing_no_echo_date.petmrn_identifier.unique())}')

missing_echo_filtered_file = 'mrn_pet_missing_echo_file_filtered.parquet'
missing_echo_filtered.to_parquet(os.path.join(meta_dir, missing_echo_filtered_file))

petmrn_identifier in filtered list:   460
petmrn_identifier without echo dates: 13


In [35]:
print(missing_echo_filtered.shape)
print(missing_no_echo_date.shape)
missing_echo_filtered.tail(20)

(700, 7)
(13, 7)


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,study,echo_study_date
680,32030512,2016-01-25,2016-01-27,2.0,32030512_2016-01-27,,NaT
681,32299455,2016-08-20,2016-08-19,1.0,32299455_2016-08-19,,NaT
682,33476755,2016-08-19,2016-08-10,9.0,33476755_2016-08-10,,NaT
683,33657214,2017-01-25,2017-02-02,8.0,33657214_2017-02-02,,NaT
684,38235537,2018-08-16,2018-09-25,40.0,38235537_2018-09-25,,NaT
685,38235537,2018-09-22,2018-09-25,3.0,38235537_2018-09-25,,NaT
686,40902504,2019-11-05,2019-11-06,1.0,40902504_2019-11-06,,NaT
687,5291745,NaT,2008-07-15,,5291745_2008-07-15,,NaT
688,20725511,NaT,2007-07-10,,20725511_2007-07-10,,NaT
689,13752563,NaT,2008-12-23,,13752563_2008-12-23,,NaT


### Split the patients in train, validate and test sets ###
Although each view might have a little different patient pupulation distribution, because not all views are in each study. However, we want the same MRNS in each data set and for all views so that we can directly compare the performance of the algorithm for the same patients. We can expand the data frame above to add the splits.

In [16]:
def patientsplit(patient_list):

    train_test_split = 0.87
    train_eval_split = 0.90

    # Take a test set from all patients
    patient_list_train = np.random.choice(patient_list,
                                          size = int(np.floor(train_test_split*len(patient_list))),
                                          replace = False)
    patient_list_test = list(set(patient_list).difference(patient_list_train))
    train_test_intersection = set(patient_list_train).intersection(set(patient_list_test)) # This should be empty
    print('Intersection of patient_list_train and patient_list_test:', train_test_intersection)

    # Further separate some patients for evaluation from the training list
    patient_list_eval = np.random.choice(patient_list_train,
                                         size = int(np.ceil((1-train_eval_split)*len(patient_list_train))),
                                         replace = False)

    patient_list_train = set(patient_list_train).difference(patient_list_eval)
    train_eval_intersection = set(patient_list_train).intersection(set(patient_list_eval))
    print('Intersection of patient_list_train and patient_list_eval:', train_eval_intersection)

    # Show the numbers
    print('total patients:', len(patient_list))
    print()
    print('patients in set:', np.sum([len(patient_list_train),
                                     len(patient_list_eval),
                                     len(patient_list_test)]))
    print()
    print('patients in train:', len(patient_list_train))
    print('patients in eval:', len(patient_list_eval))
    print('patients in test:', len(patient_list_test))

    return patient_list_train, patient_list_eval, patient_list_test

In [26]:
#dataset = global_pet_echo_variables
#dataset_filename = 'global_pet_echo_dataset_'+cfr_meta_date+'.parquet'
#global_pet_echo_variables.head()

dataset= nondefect_pet_echo_variables
dataset_filename = 'nondefect_pet_echo_dataset_'+cfr_meta_date+'.parquet'
nondefect_pet_echo_variables.head()

Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,1.540882,3.274,2.124757,0,12853099,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-25,2006-01-24,12853099_2006-01-25,-1,1.0,1.0,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f2535...,2006-01-24 13:44:03,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,BWH,,Philips_Medical_Systems,0.0,25.0,97.0,73.0,0.039398,0.031519,...,4.407525e-09,8.573169e-11,1.166759e-11,7.554371e-12,0.9999748,7.057634e-13,2.44146e-12,7.972685e-14,3.058423e-13,1.048971e-11,7.276347e-10,4.649608e-11,1.237479e-13,6.410189e-15,3.820212e-13,7.770167e-13,6.201503e-13,9.144326e-13,2.30578e-10,9.06146e-12,1.248531e-14,7.712218e-16,3.759167e-15,a4c_laocc,1.0
1,1.540882,3.274,2.124757,0,12853099,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-25,2006-01-24,12853099_2006-01-25,-1,1.0,1.0,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f2535...,2006-01-24 13:44:03,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,BWH,,Philips_Medical_Systems,0.0,20.0,131.0,66.0,0.068052,0.054441,...,7.227913e-13,4.504239e-10,2.230969e-13,1.970521e-10,3.43304e-11,4.051892e-10,4.290476e-16,3.727703e-15,4.383825e-13,8.343084e-12,5.577484e-09,0.6681287,6.095825e-11,9.798778e-14,2.052378e-13,0.3318713,7.513965e-12,2.744428e-12,1.211765e-11,4.715875e-10,1.040652e-10,1.281145e-13,2.245519e-12,plax_far,1.0
2,1.540882,3.274,2.124757,0,12853099,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-25,2006-01-24,12853099_2006-01-25,-1,1.0,1.0,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f2535...,2006-01-24 13:44:03,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,BWH,,Philips_Medical_Systems,0.0,33.0,85.0,65.0,0.039398,0.031519,...,3.590285e-07,2.287998e-08,3.574695e-13,1.08566e-08,1.097643e-05,1.644205e-10,7.856313e-13,2.112234e-12,5.009047e-10,1.26611e-08,1.102441e-09,8.857064e-09,4.317589e-11,3.826693e-13,1.456313e-09,1.10033e-08,1.801396e-09,4.251072e-11,9.330494e-07,9.014583e-08,9.979393e-12,5.026618e-18,1.605093e-11,a2c_laocc,1.0
3,1.540882,3.274,2.124757,0,12853099,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-25,2006-01-24,12853099_2006-01-25,-1,1.0,1.0,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f2535...,2006-01-24 13:44:03,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,BWH,,Philips_Medical_Systems,0.0,29.0,93.0,66.0,0.05,0.04,...,2.553372e-07,6.899153e-06,3.371127e-07,3.390787e-08,4.934135e-10,1.721103e-07,5.929068e-07,4.284833e-09,4.399919e-09,0.0001108345,0.8858027,2.854507e-06,2.012875e-09,5.679172e-11,3.642607e-07,1.692721e-07,0.00644248,0.004996649,0.0007213246,0.1014473,2.431672e-05,6.709786e-11,2.973797e-09,other,1.0
4,1.540882,3.274,2.124757,0,12853099,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-25,2006-01-24,12853099_2006-01-25,-1,1.0,1.0,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f2535...,2006-01-24 13:44:03,4a14f25359c230bf_4903a44516dacae1fafd6299fae6_...,BWH,,Philips_Medical_Systems,0.0,28.0,95.0,65.0,0.05,0.04,...,2.059672e-10,9.161177e-10,1.414507e-12,4.80657e-10,3.759216e-10,3.009975e-10,1.420308e-13,1.171174e-13,1.540204e-09,3.884519e-07,3.384351e-12,1.502973e-10,5.340333e-11,3.614473e-13,1.545933e-11,4.061539e-08,6.402495e-08,5.323219e-14,2.324908e-05,0.9999758,1.440222e-09,2.352892e-14,2.045466e-13,psax_pap,1.0


In [20]:
# Get a patient list
patient_list = list(dataset.sample(frac=1).mrn.unique())
patient_list_train, patient_list_eval, patient_list_test = patientsplit(patient_list)

patient_split = {'train': patient_list_train,
                 'eval': patient_list_eval,
                 'test': patient_list_test}

print('Patient IDs in train:', len(patient_split['train']))
print('Patient IDs in eval:', len(patient_split['eval']))
print('Patient IDs in test:', len(patient_split['test']))

print()

print('Intersection train-test:', set(patient_split['train']).intersection(set(patient_split['test'])))
print('Intersection train-eval:', set(patient_split['train']).intersection(set(patient_split['eval'])))
print('Intersection eval-test:', set(patient_split['eval']).intersection(set(patient_split['test'])))

Intersection of patient_list_train and patient_list_test: set()
Intersection of patient_list_train and patient_list_eval: set()
total patients: 2118

patients in set: 2118

patients in train: 1657
patients in eval: 185
patients in test: 276
Patient IDs in train: 1657
Patient IDs in eval: 185
Patient IDs in test: 276

Intersection train-test: set()
Intersection train-eval: set()
Intersection eval-test: set()


In [21]:
# Add mode column to files_cfr
split_list = []
for dset in patient_split.keys():
    dset_mrn_list = list(patient_split[dset])
    split_list.append(pd.DataFrame({'mrn': dset_mrn_list,
                                    'mode': [dset]*len(dset_mrn_list)}))

split_df = pd.concat(split_list, ignore_index = True)

dataset_split = dataset.merge(right = split_df, on = 'mrn', how = 'left').\
                    sample(frac = 1).\
                    reset_index(drop = True)

In [22]:
print(f'PET studies:          {len(dataset_split.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(dataset_split.study.unique())}')

PET studies:          2172
ECHO studies:         2908


In [23]:
# Prepare the final set that we will use for writing TFR files. We don't want any rows with NAs in some columns.
dataset_split_tfr = dataset_split.dropna(axis=0)

print('Dropped {} rows.'.format(dataset_split.shape[0]-dataset_split_tfr.shape[0]))
# Add some other information that we need and shuffle the whole thing
dataset_split_tfr = dataset_split_tfr.assign(rate = np.round(1/dataset_split_tfr.frame_time*1e3, decimals = 1))
dataset_split_tfr = dataset_split_tfr.sample(frac = 1)

Dropped 0 rows.


In [24]:
dataset_split_tfr.to_parquet(os.path.join(meta_dir, dataset_filename))
print('Saved to file:', dataset_filename)
print(dataset_split_tfr.shape)
dataset_split_tfr.head()

Saved to file: global_pet_echo_dataset_200425.parquet
(136091, 56)


Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,mode,rate
7163,2.261,2.737,1.210526,0,7465362,4b7c2a2a36f8d0b8_4903a58584260b5dfa1534ccfd1b,2010-08-05,2010-08-06,7465362_2010-08-05,1,1.0,1.0,4b7c2a2a36f8d0b8_4903a58584260b5dfa1534ccfd1b_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c2a2a3...,2010-08-06 11:45:47,4b7c2a2a36f8d0b8_4903a58584260b5dfa1534ccfd1b_...,BWH,iE33,Philips Medical Systems,0.0,33.333,50.0,76.0,0.033199,0.033199,...,1.79584e-07,6.797597e-11,1.799599e-09,2.69053e-09,2.093088e-05,4.48019e-10,3.158428e-07,8.832543e-10,1.669577e-06,7.128636e-10,0.0001142001,2.124635e-07,3.091842e-06,7.614702e-08,3.364367e-06,1.421245e-06,2.887435e-10,4.200002e-05,0.1246638,0.875097,5.155806e-08,subcostal,1.0,train,30.0
75640,0.604,1.259,2.084437,0,15568728,4a1346a2f4c42d98_4903a44b32e959f81bc28679eaaf,2008-03-05,2008-05-17,15568728_2008-03-05,73,73.0,1.0,4a1346a2f4c42d98_4903a44b32e959f81bc28679eaaf_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a1346a2f...,2008-05-17 15:00:32,4a1346a2f4c42d98_4903a44b32e959f81bc28679eaaf_...,BWH,Vivid i,GEMS Ultrasound,0.0,40.046911,77.0,61.0,0.046036,0.046036,...,6.218592e-18,6.560172e-13,2.676037e-15,7.163441e-15,4.595214e-18,1.385017e-11,8.231996e-15,1.300199e-12,3.260629e-13,1.419476e-13,3.098349e-14,7.502518e-17,1.488115e-15,1.470302e-14,7.055224e-14,4.059716e-12,1.018078e-10,4.271447e-10,2.099176e-11,7.102247e-11,1.0,suprasternal,1.0,test,25.0
25877,0.739,0.703,0.951286,0,22906382,49017c5d0869c97d_4903a44b32e95a3575ef8764eabd,2008-12-05,2008-05-28,22906382_2008-12-05,-191,191.0,1.0,49017c5d0869c97d_4903a44b32e95a3575ef8764eabd_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/49017c5d0...,2008-05-28 09:42:12,49017c5d0869c97d_4903a44b32e95a3575ef8764eabd_...,BWH,Vivid i,GEMS Ultrasound,0.0,59.255327,22.0,170.0,0.040921,0.040921,...,9.977381e-13,5.233795e-14,8.428437e-14,4.277898e-13,3.612634e-11,6.791633e-15,5.593776e-12,5.63138e-13,1.0,4.585516e-13,1.958458e-11,5.281277e-14,4.163003e-12,4.95912e-12,7.21276e-10,3.507588e-11,2.525605e-12,3.51983e-09,3.330906e-13,1.364477e-13,8.466453e-16,other,1.0,eval,16.9
12708,0.665,1.213,1.82406,0,31812357,48b4ce3121ac6fb3_4903a58dd54a6ac449c7d6610683,2018-07-24,2018-09-13,31812357_2018-07-24,51,51.0,1.0,48b4ce3121ac6fb3_4903a58dd54a6ac449c7d6610683_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b4/48b4ce312...,2018-09-13 11:20:53,48b4ce3121ac6fb3_4903a58dd54a6ac449c7d6610683_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,20.089485,150.0,62.0,0.030652,0.030652,...,1.961298e-05,1.401434e-09,4.204748e-10,1.948465e-10,3.113817e-09,4.002137e-08,3.810604e-06,3.995776e-06,1.84297e-08,4.358957e-10,1.210147e-08,9.516044e-10,3.362392e-09,7.186424e-08,2.105646e-08,3.518612e-09,3.629746e-08,0.9998868,4.26094e-09,5.099114e-07,6.268984e-08,psax_pap,1.0,train,49.8
23985,0.793,1.253,1.580076,0,1122282,4b7a8654cace3b4c_4903a444c238ffb8998440627841,2008-03-12,2007-12-10,1122282_2008-03-12,-93,93.0,1.0,4b7a8654cace3b4c_4903a444c238ffb8998440627841_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a8654c...,2007-12-10 14:39:48,4b7a8654cace3b4c_4903a444c238ffb8998440627841_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,33.408682,79.0,94.0,0.0,0.0,...,4.048523e-14,4.070859e-12,4.281287e-16,6.672903e-13,9.668659e-09,1.743081e-12,8.258511e-11,5.909715e-12,1.705368e-11,1.505724e-10,5.444362e-10,3.613306e-13,2.77083e-13,1.0,2.476557e-12,3.200939e-12,3.070307e-09,3.74251e-13,2.871008e-14,5.641428e-14,2.857186e-11,plax_plax,1.0,train,29.9


In [25]:
minrate = 20
minframes = 40
minduration = minframes/minrate
print('Minmum rate:      {}'.format(minrate))
print('Minimum frames:   {}'.format(minframes))
print('Minimum duration: {}'.format(minduration))
#minduration = 1.5
#minframes = int(np.ceil(minrate*minduration))

maxrows = dataset_split_tfr.shape[0]
rows_disqualified = dataset_split_tfr[(dataset_split_tfr.rate < minrate) | (dataset_split_tfr.number_of_frames<minframes)].shape[0]
rows_qualified = dataset_split_tfr[(dataset_split_tfr.rate >= minrate) & (dataset_split_tfr.number_of_frames>=minframes)].shape[0]

print('Disqualified Rows {}, fraction:{:.1f}'.format(rows_disqualified, 
                                                     np.round(rows_disqualified/maxrows*100), decimals=1))

print('Qualified Rows {}, fraction:{:.1f}'.format(rows_qualified, 
                                                     np.round(rows_qualified/maxrows*100), decimals=1))

Minmum rate:      20
Minimum frames:   40
Minimum duration: 2.0
Disqualified Rows 35676, fraction:26.0
Qualified Rows 100415, fraction:74.0


In [4]:
df = pd.read_parquet(os.path.join(meta_dir, 'nondefect_pet_echo_dataset_200425.parquet'))
df.head()

Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,mode,rate
69000,0.832965,1.494424,1.794101,0,8759110,4b735ead86aeb4c6_4903a582ec70ba1475e182cfb98d,2017-06-13,2017-01-10,8759110_2017-06-13,-154,154.0,1.0,4b735ead86aeb4c6_4903a582ec70ba1475e182cfb98d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b735ead8...,2017-01-10 11:05:42,4b735ead86aeb4c6_4903a582ec70ba1475e182cfb98d_...,BWH,iE33,Philips Medical Systems,0.0,33.333,106.0,89.0,0.066275,0.066275,...,6.98697e-13,1.0,7.899704e-16,2.591669e-13,6.319677e-11,1.44318e-13,1.213088e-14,1.62449e-14,1.165333e-16,1.448864e-14,3.511176e-15,1.708671e-13,3.345947e-12,2.287968e-14,5.056855e-11,9.939989e-15,9.886744e-15,9.766502e-13,1.297512e-11,6.940029e-14,1.423959e-12,a4c_far,1.0,train,30.0
9991,1.965941,2.553118,1.298674,0,13003751,4a1536892bf9e07d_4903a444c30d501d823722750cc3,2007-09-19,2007-05-23,13003751_2007-09-19,-119,119.0,1.0,4a1536892bf9e07d_4903a444c30d501d823722750cc3_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a15/4a1536892...,2007-05-23 10:03:41,4a1536892bf9e07d_4903a444c30d501d823722750cc3_...,BWH,iE33,Philips Medical Systems,0.0,33.333,63.0,79.0,0.030868,0.030868,...,5.73301e-11,1.364234e-14,1.0,1.538719e-15,3.192212e-13,3.45715e-14,2.939989e-18,5.88786e-14,6.393981000000001e-17,4.114795e-15,1.218932e-15,1.555034e-16,4.569246e-15,1.98109e-15,1.392797e-12,9.113056000000001e-17,3.975404e-16,7.665577e-14,1.145365e-14,6.2901200000000004e-18,2.206669e-16,a4c_laocc,1.0,eval,30.0
101155,1.0666,0.9618,0.901744,0,7906787,4b7c27305e9279d4_4903a44ab12fcfca7a76184d77ab,2009-04-14,2009-04-13,7906787_2009-04-14,-1,1.0,1.0,4b7c27305e9279d4_4903a44ab12fcfca7a76184d77ab_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27305...,2009-04-13 09:14:09,4b7c27305e9279d4_4903a44ab12fcfca7a76184d77ab_...,BWH,iE33,Philips Medical Systems,0.0,33.333,83.0,66.0,0.028615,0.028615,...,2.913878e-11,2.232255e-08,0.003652254,6.395344e-10,1.80295e-11,3.425683e-10,3.274602e-09,3.781384e-10,3.070201e-06,7.118768e-10,8.688748e-08,4.682648e-10,1.370893e-07,1.110834e-06,1.796136e-08,4.815169e-08,4.313185e-07,2.721958e-05,0.9954194,2.503532e-09,0.0001399971,rvinf,1.0,train,30.0
90763,0.66,1.01,1.53,1,7235799,4b7c2c72ecc28fe9_4903a58dd44a9df84091c351f0a1,2018-10-29,2018-10-29,7235799_2018-10-29,0,0.0,1.0,4b7c2c72ecc28fe9_4903a58dd44a9df84091c351f0a1_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c2c72e...,2018-10-29 09:07:53,4b7c2c72ecc28fe9_4903a58dd44a9df84091c351f0a1_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,20.080128,105.0,88.0,0.030652,0.030652,...,4.23927e-19,6.432208e-20,1.373409e-20,2.9910509999999996e-19,4.0081979999999997e-19,1.852411e-18,1.838755e-15,1.671054e-15,3.319786e-25,1.172666e-21,2.764478e-18,4.61428e-22,4.281082e-20,2.902495e-15,1.9422960000000003e-17,4.452581e-16,1.0,1.189285e-20,1.310458e-18,5.3943469999999996e-24,5.5132150000000004e-18,psax_mv,1.0,eval,49.8
14171,0.517647,0.778765,1.504432,0,9777111,4b729493c5477970_4903a44ab097f149a16a8be93276,2009-10-14,2009-10-13,9777111_2009-10-14,-1,1.0,1.0,4b729493c5477970_4903a44ab097f149a16a8be93276_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b72/4b729493c...,2009-10-13 11:23:00,4b729493c5477970_4903a44ab097f149a16a8be93276_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.194278,90.0,56.0,0.0,0.0,...,9.877962e-11,2.041331e-11,7.048488e-13,5.179256e-14,1.053245e-11,1.214161e-12,9.448951e-10,2.634516e-13,1.0,1.442799e-14,3.671445e-14,4.570046e-12,9.962909e-13,3.356667e-12,5.466734e-14,6.717785e-11,6.392168e-12,1.941048e-09,3.630017e-16,6.672974e-11,2.599256e-11,other,1.0,train,24.9
