### Select patients for CFR model: Split patients in train, val and test sets ###

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [3]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_data_dir = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr')
cfr_meta_date = '200519'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200519


In [3]:
match_view_filename = 'pet_match365_diff_files_'+cfr_meta_date+'.parquet'
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))

print('Total number of patients      {}'.format(len(files_cfr.mrn.unique())))
print('Total number of echo studies  {}'.format(len(files_cfr.study.unique())))
print('Total number of PET studies   {}'.format(len(files_cfr.petmrn_identifier.unique())))
print('Total number of echos         {}'.format(len(files_cfr.filename.unique())))

files_cfr.head(2)

Total number of patients      3257
Total number of echo studies  6443
Total number of PET studies   3590
Total number of echos         302202


Unnamed: 0,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,35156678,48b09010a2219aad_4903a582edf3bd118ffb3386065b,2018-10-15,2017-12-06,35156678_2018-10-15,-313,313.0,1.0,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,2017-12-06 13:11:41,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,47.769231,66.0,60.0,0.028951,0.028951,3.446593e-08,6.452001e-09,2.939033e-08,0.003095016,9.225302e-08,3.967397e-08,7.041133e-10,3.144316e-09,1.198068e-13,4.564848e-10,1.454091e-09,1.773068e-08,7.812481e-08,3.755003e-08,0.9968956,7.881907e-10,5.391607e-10,2.151894e-10,1.15716e-08,4.940982e-08,7.734493e-12,1.395856e-06,3.054052e-09,7.720808e-06,5.067335e-09,2.187016e-13,2.230961e-08,other,1.0
1,35156678,48b09010a2219aad_4903a582edf3bd118ffb3386065b,2018-10-15,2017-12-06,35156678_2018-10-15,-313,313.0,1.0,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,2017-12-06 13:11:41,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,20.087146,154.0,60.0,0.020448,0.020448,4.212547e-13,1.0,5.081462e-15,5.16278e-16,3.739126e-14,4.737708e-15,3.574842e-17,1.105575e-13,7.188538e-16,3.057809e-15,4.562448e-17,4.610617e-16,1.488447e-12,2.472197e-12,2.470155e-16,1.77298e-16,7.891783e-16,6.224149e-17,6.330659e-14,5.080497e-13,2.415044e-15,4.977857e-15,6.916603e-13,5.833864e-15,2.562479e-15,6.839081e-22,8.41705e-17,a2c_laocc,1.0


### Filter data sets: GLOBAL and NON-DEFECT variables ###

#### GLOBAL VARIABLES ####

Notebook 3/17/2020: global_pet_cfr
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data

File used: post_2018_pets_with_clinical_cfr_all.csv

* After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion

Merge:

After combining, 3038 PETs
File saved as pets_with_echos_global_all.parquet
Use notes:

* Variables to use: rest_global_mbf, stress_global_mbf, global_cfr_calc
* Other variables- myocardial_perfusion, segmental data, perfusion data, TID, gated SPECT results, calcium score, height, weight
* For "cleaner" data, would exclude those with post-2018==1

In [4]:
global_pet_file = 'pets_with_echos_global_all.parquet'
global_pet = pd.read_parquet(os.path.join(pet_data_dir, global_pet_file))
global_pet = global_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(global_pet.petmrn_identifier.unique())}')
print(os.path.join(pet_data_dir, global_pet_file))
global_pet.head()

PET studies: 3031
/mnt/obi0/phi/pet/pet_cfr/pets_with_echos_global_all.parquet


Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,rest_global_mbf,stress_global_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,...,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,height_in,weight_lb,reportID,subjectID
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,0.644,0.881,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,0.537,0.388,0.607,0.823,0.964,0.766,0.445,1.34,1.504,...,0.529,0.348,0.558,0.562,1.325,1.446,0.636,0.502,1.136,1.186,1.415,0.979,0.67,1.174,1.676,18.0,18.0,0.0,1.09,normal,,67.0,133.0,69718.0,2540.0
1,12853099,2006-01-25,12853099_2006-01-25,0,normal,2.109661,1.532,3.232,1.082,0.94,1.135,1.224,1.332,1.945,1.482,1.977,1.475,1.656,1.509,1.444,1.801,2.18,1.677,1.359,1.977,...,2.069,2.657,2.626,3.712,3.981,3.359,2.215,4.01,3.89,3.883,3.249,2.609,3.483,3.443,3.037,0.0,0.0,0.0,,not assessed,0.0,64.0,160.0,69969.0,3379.0
2,20710471,2006-01-23,20710471_2006-01-23,0,abnormal,1.229572,1.028,1.264,0.872,0.892,1.126,1.884,1.061,1.057,1.286,1.657,0.735,0.995,1.153,1.252,1.38,0.966,0.86,0.943,1.101,...,1.864,0.91,1.32,1.936,1.283,1.617,1.33,1.266,1.366,1.726,1.533,1.477,1.484,1.037,1.537,20.0,18.0,2.0,0.97,normal,6602.0,71.0,200.0,69993.0,3542.0
3,12627030,2006-02-10,12627030_2006-02-10,0,abnormal,1.588915,1.732,2.752,1.984,1.499,1.556,1.333,1.134,1.292,2.499,1.883,1.703,1.794,1.873,1.933,1.767,2.056,1.662,1.804,1.754,...,0.64,2.197,1.413,3.395,4.533,1.301,2.149,2.391,3.3,4.714,3.184,2.567,4.154,4.662,3.595,16.0,0.0,16.0,0.79,normal,331.0,68.0,217.0,70008.0,2155.0
4,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,1.465,3.892,0.945,0.813,0.892,0.97,1.401,1.217,1.614,1.58,1.303,1.957,1.879,1.794,1.421,1.773,1.677,1.891,1.846,...,2.654,2.434,3.313,3.502,4.081,4.34,3.978,4.17,4.426,4.944,4.343,4.143,4.748,4.544,4.328,0.0,0.0,0.0,0.94,normal,0.0,64.0,146.0,70012.0,3417.0


In [5]:
global_pet[global_pet.petmrn_identifier=='1414556_2018-10-30']

Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,rest_global_mbf,stress_global_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,...,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,height_in,weight_lb,reportID,subjectID
3858,1414556,2018-10-30,1414556_2018-10-30,0,normal,1.809645,0.788,1.426,0.839083,0.799958,0.810667,0.721833,0.783875,0.781667,0.803083,0.918875,0.917708,0.872042,0.849458,0.767792,0.693972,0.796028,0.812972,0.736222,0.642786,...,1.5275,1.390125,1.45075,1.4535,1.446542,1.720542,1.705208,1.582792,1.645125,1.386583,1.128667,1.501056,1.447167,1.197389,1.21775,0.0,0.0,0.0,,normal,145.0,63.0,189.0,166220.0,7521.0
555,1414556,2018-10-30,1414556_2018-10-30,1,normal,2.1,0.69,1.45,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# Add echo data to the pet studies (inner join, to keep only keys that are in both dataframes)
global_pet_echo = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(global_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(global_pet_echo.study.unique())}')

PET studies:  2745
ECHO studies: 4642


In [7]:
lobal_pet_noecho_18 = global_pet_echo[global_pet_echo['post-2018']==0]
print(f'PET studies:  {len(lobal_pet_noecho_18.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(lobal_pet_noecho_18.study.unique())}')

PET studies:  2594
ECHO studies: 4362


In [8]:
# PET studies without echo data
global_pet_echo_all = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left' )
global_pet_noecho = global_pet_echo_all.loc[global_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(global_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(global_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        3031
PET studies without echo: 286


In [9]:
# Global_pet_echo table with the variables to use (drop rows with na in any of those variables)
global_pet_echo.head(2)
global_pet_variables_target = ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
global_pet_variables = global_pet_variables_target.copy()
global_pet_variables.extend(list(files_cfr.columns))
global_pet_echo_variables = global_pet_echo[global_pet_variables].dropna(axis=0)
print(f'PET target variables: {global_pet_variables_target}')
print(f'PET studies:          {len(global_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(global_pet_echo_variables.study.unique())}')
print(f'global shape:         {global_pet_echo_variables.shape}')

PET target variables: ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
PET studies:          2670
ECHO studies:         4348
global shape:         (207344, 54)


In [22]:
global_pet_echo_variables.head()

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,0.644,0.881,1.368012,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,63.0,0.041843,0.041843,...,1.819508e-11,7.633399e-12,1.0,3.672189e-09,7.103154e-11,1.210577e-12,1.399852e-11,1.144677e-11,8.98265e-12,8.32667e-10,7.452399e-10,1.369981e-14,5.375912e-12,1.17887e-12,3.158863e-13,1.88389e-12,1.424565e-08,2.138163e-11,2.131893e-10,6.526593e-11,7.639662e-12,7.831371e-12,1.303826e-11,a4c,1.0
1,0.644,0.881,1.368012,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.041843,0.041843,...,0.0002866421,7.490995e-07,9.376452e-05,1.396282e-05,1.219092e-06,9.432758e-07,2.021102e-05,1.061805e-05,0.05039539,3.551894e-05,0.004092602,1.081343e-05,1.715862e-05,8.474183e-05,4.208984e-06,1.155519e-06,6.586986e-05,0.04604153,0.00181708,0.0001203537,2.055605e-06,0.02958272,7.959471e-06,a2c,1.0
2,0.644,0.881,1.368012,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.035325,0.035325,...,2.0331950000000002e-18,8.236164000000001e-17,6.861483e-20,1.2716380000000002e-17,3.9456199999999997e-19,9.706387000000001e-17,5.120881e-17,5.822363e-17,3.20279e-19,1.796597e-17,2.184731e-18,5.4184930000000004e-18,1.007037e-13,5.866526e-18,5.051431e-21,1.0,1.89044e-16,1.783125e-18,5.040794e-18,3.030318e-18,9.761003e-18,2.707398e-20,8.752964e-21,plax_plax,1.0
3,0.644,0.881,1.368012,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,127.0,0.035325,0.035325,...,3.372928e-13,4.704709e-13,9.757279e-18,1.413125e-12,2.663489e-15,8.321568e-15,1.503405e-10,2.906337e-13,4.043358e-15,2.23134e-12,2.7421e-15,2.610509e-15,1.003766e-13,7.184594e-15,3.124145e-14,3.251909e-13,7.110169e-13,5.55089e-12,4.177675e-14,1.0,1.335149e-09,2.782359e-10,6.012166e-15,psax_pap,1.0
4,0.644,0.881,1.368012,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,172.0,0.035325,0.035325,...,5.217069e-13,4.014975e-11,2.399355e-14,2.186645e-12,2.078839e-13,1.633007e-13,6.202011e-12,1.073491e-12,5.27399e-12,2.718212e-11,6.351976e-14,2.021436e-13,3.99631e-11,3.98581e-12,1.505163e-14,1.385028e-09,2.759727e-08,7.027128e-07,4.654197e-09,0.9999992,7.084056e-08,4.606314e-09,1.715498e-13,psax_pap,1.0


In [25]:
# Save global_pet_echo table
global_pet_echo_variables_file = 'global_pet_echo_variables.parquet'
global_pet_echo_variables.to_parquet(os.path.join(meta_dir, global_pet_echo_variables_file))

#### Unaffected (non-defect) variables ####
Notebook 2/26/2020: clean_pet_cfr_petdata4_addingseg7
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* All segments used (segment 7 data now available)
* MBF and CFR calculated for unaffected segments
Perfusion defect segments are those with a perfusion score >0 at rest or stress
Unaffected segments are those not in a perfusion defect, and not adjacent to a perfusion defect
* Of note, PETs with no unaffected segments were excluded from the dataset
* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data
* After excluding 295 for having no "unaffected segments", 2576 remaining
File used: post_2018_pets_with_clinical_cfr_all.csv

* Clinical global MBF and CFR values for PETs after 10/2018 were obtained from EPIC
These values were also used for unaffected MBF and CFR
After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion
After keeping only studies with normal/probably normal perfusion, 83 PETs remaining

Merge:

Total 2659 PETs
File created: non_defect_plus_p2018.parquet
Use notes:

* Variables to use: rest_mbf_unaff, stress_mbf_unaff, unaffected_cfr
* Also available: perfusion data, segmental data, gated SPECT results, calcium score, height, weight
* For clean data, would exclude PETs with post-2018==1, and would limit to PETs with myocardial_perfusion of 'normal' or 'probably normal'- if this is done, would use global_cfr_calc instead of unaffected_cfr because it is slightly more accurate in PETs with normal/prob normal perfusion (could consider looking at whether normal/probably normal PETs are still scored as having small perfusion defects)

In [26]:
nondefect_pet_file = 'non_defect_plus_p2018.parquet'
nondefect_pet = pd.read_parquet(os.path.join(pet_data_dir, nondefect_pet_file))
nondefect_pet = nondefect_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(nondefect_pet.petmrn_identifier.unique())}')
print(os.path.join(pet_data_dir, nondefect_pet_file))

PET studies: 2655
/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018.parquet


In [27]:
# Add echo data to the pet studies (inner join)
nondefect_pet_echo = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(nondefect_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(nondefect_pet_echo.study.unique())}')

PET studies:  2400
ECHO studies: 3932


In [28]:
# Disqualified PET studies
nondefect_pet_echo_all = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left')
nondefect_pet_noecho = nondefect_pet_echo_all.loc[nondefect_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(nondefect_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(nondefect_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        2655
PET studies without echo: 255


In [29]:
nondefect_variables_target = ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
nondefect_variables = nondefect_variables_target.copy()
nondefect_variables.extend(list(files_cfr.columns))
nondefect_pet_echo_variables = nondefect_pet_echo[nondefect_variables].dropna(axis=0)
print(f'PET target variables: {nondefect_variables_target}')
print(f'PET studies:          {len(nondefect_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(nondefect_pet_echo_variables.study.unique())}')
print(f'nondefect shape:      {nondefect_pet_echo_variables.shape}')

PET target variables: ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
PET studies:          2329
ECHO studies:         3673
nondefect shape:      (172943, 54)


In [40]:
nondefect_pet_echo_variables.head()

Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,63.0,0.041843,0.041843,...,1.819508e-11,7.633399e-12,1.0,3.672189e-09,7.103154e-11,1.210577e-12,1.399852e-11,1.144677e-11,8.98265e-12,8.32667e-10,7.452399e-10,1.369981e-14,5.375912e-12,1.17887e-12,3.158863e-13,1.88389e-12,1.424565e-08,2.138163e-11,2.131893e-10,6.526593e-11,7.639662e-12,7.831371e-12,1.303826e-11,a4c,1.0
1,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.041843,0.041843,...,0.0002866421,7.490995e-07,9.376452e-05,1.396282e-05,1.219092e-06,9.432758e-07,2.021102e-05,1.061805e-05,0.05039539,3.551894e-05,0.004092602,1.081343e-05,1.715862e-05,8.474183e-05,4.208984e-06,1.155519e-06,6.586986e-05,0.04604153,0.00181708,0.0001203537,2.055605e-06,0.02958272,7.959471e-06,a2c,1.0
2,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.035325,0.035325,...,2.0331950000000002e-18,8.236164000000001e-17,6.861483e-20,1.2716380000000002e-17,3.9456199999999997e-19,9.706387000000001e-17,5.120881e-17,5.822363e-17,3.20279e-19,1.796597e-17,2.184731e-18,5.4184930000000004e-18,1.007037e-13,5.866526e-18,5.051431e-21,1.0,1.89044e-16,1.783125e-18,5.040794e-18,3.030318e-18,9.761003e-18,2.707398e-20,8.752964e-21,plax_plax,1.0
3,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,127.0,0.035325,0.035325,...,3.372928e-13,4.704709e-13,9.757279e-18,1.413125e-12,2.663489e-15,8.321568e-15,1.503405e-10,2.906337e-13,4.043358e-15,2.23134e-12,2.7421e-15,2.610509e-15,1.003766e-13,7.184594e-15,3.124145e-14,3.251909e-13,7.110169e-13,5.55089e-12,4.177675e-14,1.0,1.335149e-09,2.782359e-10,6.012166e-15,psax_pap,1.0
4,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,172.0,0.035325,0.035325,...,5.217069e-13,4.014975e-11,2.399355e-14,2.186645e-12,2.078839e-13,1.633007e-13,6.202011e-12,1.073491e-12,5.27399e-12,2.718212e-11,6.351976e-14,2.021436e-13,3.99631e-11,3.98581e-12,1.505163e-14,1.385028e-09,2.759727e-08,7.027128e-07,4.654197e-09,0.9999992,7.084056e-08,4.606314e-09,1.715498e-13,psax_pap,1.0


In [33]:
# Save global_pet_echo table
nondefect_pet_echo_variables_file = 'nondefect_pet_echo_variables.parquet'
nondefect_pet_echo_variables.to_parquet(os.path.join(meta_dir, nondefect_pet_echo_variables_file))

In [34]:
# Complete list of unique petmrn_identifier
petmrn_identifier_list = list(global_pet.petmrn_identifier.unique())+list(nondefect_pet.petmrn_identifier.unique())
petmrn_identifier_set = list(set(petmrn_identifier_list))

In [35]:
print(len(petmrn_identifier_list))
print(len(petmrn_identifier_set))

5686
3031


In [37]:
# Let's filter Rahuls list of missing echos
missing_echo = pd.read_parquet(os.path.join(meta_dir, 'mrn_pet_missing_echo_file.parquet'))
print(f'petmrn_identifier in original list: {len(missing_echo.petmrn_identifier.unique())}')
missing_echo.tail()

petmrn_identifier in original list: 245


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement,study,echo_study_date
28467,21687652,NaT,2010-12-15,,21687652_2010-12-15,1.0,,NaT
28602,26881094,NaT,2012-06-11,,26881094_2012-06-11,1.0,,NaT
28797,18982686,NaT,2013-06-26,,18982686_2013-06-26,1.0,,NaT
28805,23588965,NaT,2013-07-19,,23588965_2013-07-19,1.0,,NaT
28888,4007589,NaT,2013-12-11,,4007589_2013-12-11,1.0,,NaT


In [38]:
missing_echo_filtered = missing_echo[missing_echo.petmrn_identifier.isin(petmrn_identifier_set)].\
                        drop(columns=['pet_measurement'], axis=1).reset_index(drop=True)
print(f'petmrn_identifier in filtered list:   {len(missing_echo_filtered.petmrn_identifier.unique())}')
missing_no_echo_date = missing_echo_filtered.loc[missing_echo_filtered.echo_date.isnull()]
print(f'petmrn_identifier without echo dates: {len(missing_no_echo_date.petmrn_identifier.unique())}')

missing_echo_filtered_file = 'mrn_pet_missing_echo_file_filtered.parquet'
missing_echo_filtered.to_parquet(os.path.join(meta_dir, missing_echo_filtered_file))

petmrn_identifier in filtered list:   75
petmrn_identifier without echo dates: 2


In [39]:
print(missing_echo_filtered.shape)
print(missing_no_echo_date.shape)
missing_echo_filtered.tail(20)

(79, 7)
(2, 7)


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,study,echo_study_date
59,24713273,2010-03-16,2010-03-29,13.0,24713273_2010-03-29,,NaT
60,25017179,2019-03-20,2019-10-21,215.0,25017179_2019-10-21,,NaT
61,25048414,2010-08-11,2010-07-21,21.0,25048414_2010-07-21,,NaT
62,25232216,2010-09-22,2010-09-23,1.0,25232216_2010-09-23,,NaT
63,25428244,2010-12-16,2010-12-17,1.0,25428244_2010-12-17,,NaT
64,25795402,2011-05-07,2011-06-20,44.0,25795402_2011-06-20,,NaT
65,26008045,2011-06-24,2011-07-15,21.0,26008045_2011-07-15,,NaT
66,26396218,2012-01-19,2012-01-20,1.0,26396218_2012-01-20,,NaT
67,26620864,2013-12-14,2013-07-17,150.0,26620864_2013-07-17,,NaT
68,26828475,2012-10-24,2012-10-02,22.0,26828475_2012-10-02,,NaT


### Split the patients in train, validate and test sets ###
Although each view might have a little different patient pupulation distribution, because not all views are in each study. However, we want the same MRNS in each data set and for all views so that we can directly compare the performance of the algorithm for the same patients. We can expand the data frame above to add the splits.

In [41]:
def patientsplit(patient_list):

    train_test_split = 0.87
    train_eval_split = 0.90

    # Take a test set from all patients
    patient_list_train = np.random.choice(patient_list,
                                          size = int(np.floor(train_test_split*len(patient_list))),
                                          replace = False)
    patient_list_test = list(set(patient_list).difference(patient_list_train))
    train_test_intersection = set(patient_list_train).intersection(set(patient_list_test)) # This should be empty
    print('Intersection of patient_list_train and patient_list_test:', train_test_intersection)

    # Further separate some patients for evaluation from the training list
    patient_list_eval = np.random.choice(patient_list_train,
                                         size = int(np.ceil((1-train_eval_split)*len(patient_list_train))),
                                         replace = False)

    patient_list_train = set(patient_list_train).difference(patient_list_eval)
    train_eval_intersection = set(patient_list_train).intersection(set(patient_list_eval))
    print('Intersection of patient_list_train and patient_list_eval:', train_eval_intersection)

    # Show the numbers
    print('total patients:', len(patient_list))
    print()
    print('patients in set:', np.sum([len(patient_list_train),
                                     len(patient_list_eval),
                                     len(patient_list_test)]))
    print()
    print('patients in train:', len(patient_list_train))
    print('patients in eval:', len(patient_list_eval))
    print('patients in test:', len(patient_list_test))

    return patient_list_train, patient_list_eval, patient_list_test

In [50]:
#dataset = global_pet_echo_variables
#dataset_filename = 'global_pet_echo_dataset_'+cfr_meta_date+'.parquet'
#global_pet_echo_variables.head()

dataset= nondefect_pet_echo_variables
dataset_filename = 'nondefect_pet_echo_dataset_'+cfr_meta_date+'.parquet'
nondefect_pet_echo_variables.head()

Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,63.0,0.041843,0.041843,...,1.819508e-11,7.633399e-12,1.0,3.672189e-09,7.103154e-11,1.210577e-12,1.399852e-11,1.144677e-11,8.98265e-12,8.32667e-10,7.452399e-10,1.369981e-14,5.375912e-12,1.17887e-12,3.158863e-13,1.88389e-12,1.424565e-08,2.138163e-11,2.131893e-10,6.526593e-11,7.639662e-12,7.831371e-12,1.303826e-11,a4c,1.0
1,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.041843,0.041843,...,0.0002866421,7.490995e-07,9.376452e-05,1.396282e-05,1.219092e-06,9.432758e-07,2.021102e-05,1.061805e-05,0.05039539,3.551894e-05,0.004092602,1.081343e-05,1.715862e-05,8.474183e-05,4.208984e-06,1.155519e-06,6.586986e-05,0.04604153,0.00181708,0.0001203537,2.055605e-06,0.02958272,7.959471e-06,a2c,1.0
2,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,66.0,0.035325,0.035325,...,2.0331950000000002e-18,8.236164000000001e-17,6.861483e-20,1.2716380000000002e-17,3.9456199999999997e-19,9.706387000000001e-17,5.120881e-17,5.822363e-17,3.20279e-19,1.796597e-17,2.184731e-18,5.4184930000000004e-18,1.007037e-13,5.866526e-18,5.051431e-21,1.0,1.89044e-16,1.783125e-18,5.040794e-18,3.030318e-18,9.761003e-18,2.707398e-20,8.752964e-21,plax_plax,1.0
3,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,127.0,0.035325,0.035325,...,3.372928e-13,4.704709e-13,9.757279e-18,1.413125e-12,2.663489e-15,8.321568e-15,1.503405e-10,2.906337e-13,4.043358e-15,2.23134e-12,2.7421e-15,2.610509e-15,1.003766e-13,7.184594e-15,3.124145e-14,3.251909e-13,7.110169e-13,5.55089e-12,4.177675e-14,1.0,1.335149e-09,2.782359e-10,6.012166e-15,psax_pap,1.0
4,0.6824,1.1168,1.636577,0,7924277,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba,2008-08-15,2008-08-14,7924277_2008-08-15,-1,1.0,1.0,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7c/4b7c27321...,2008-08-14 11:57:38,4b7c2732122e43dc_4903a44b32e4622fe55093efdeba_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,172.0,0.035325,0.035325,...,5.217069e-13,4.014975e-11,2.399355e-14,2.186645e-12,2.078839e-13,1.633007e-13,6.202011e-12,1.073491e-12,5.27399e-12,2.718212e-11,6.351976e-14,2.021436e-13,3.99631e-11,3.98581e-12,1.505163e-14,1.385028e-09,2.759727e-08,7.027128e-07,4.654197e-09,0.9999992,7.084056e-08,4.606314e-09,1.715498e-13,psax_pap,1.0


In [51]:
# Get a patient list
patient_list = list(dataset.sample(frac=1).mrn.unique())
patient_list_train, patient_list_eval, patient_list_test = patientsplit(patient_list)

patient_split = {'train': patient_list_train,
                 'eval': patient_list_eval,
                 'test': patient_list_test}

print('Patient IDs in train:', len(patient_split['train']))
print('Patient IDs in eval:', len(patient_split['eval']))
print('Patient IDs in test:', len(patient_split['test']))

print()

print('Intersection train-test:', set(patient_split['train']).intersection(set(patient_split['test'])))
print('Intersection train-eval:', set(patient_split['train']).intersection(set(patient_split['eval'])))
print('Intersection eval-test:', set(patient_split['eval']).intersection(set(patient_split['test'])))

Intersection of patient_list_train and patient_list_test: set()
Intersection of patient_list_train and patient_list_eval: set()
total patients: 2245

patients in set: 2245

patients in train: 1757
patients in eval: 196
patients in test: 292
Patient IDs in train: 1757
Patient IDs in eval: 196
Patient IDs in test: 292

Intersection train-test: set()
Intersection train-eval: set()
Intersection eval-test: set()


In [52]:
# Add mode column to files_cfr
split_list = []
for dset in patient_split.keys():
    dset_mrn_list = list(patient_split[dset])
    split_list.append(pd.DataFrame({'mrn': dset_mrn_list,
                                    'mode': [dset]*len(dset_mrn_list)}))

split_df = pd.concat(split_list, ignore_index = True)

dataset_split = dataset.merge(right = split_df, on = 'mrn', how = 'left').\
                    sample(frac = 1).\
                    reset_index(drop = True)

In [53]:
print(f'PET studies:          {len(dataset_split.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(dataset_split.study.unique())}')

PET studies:          2329
ECHO studies:         3673


In [54]:
# Prepare the final set that we will use for writing TFR files. We don't want any rows with NAs in some columns.
dataset_split_tfr = dataset_split.dropna(axis=0)

print('Dropped {} rows.'.format(dataset_split.shape[0]-dataset_split_tfr.shape[0]))
# Add some other information that we need and shuffle the whole thing
dataset_split_tfr = dataset_split_tfr.assign(rate = np.round(1/dataset_split_tfr.frame_time*1e3, decimals = 1))
dataset_split_tfr = dataset_split_tfr.sample(frac = 1)

Dropped 0 rows.


In [55]:
dataset_split_tfr.to_parquet(os.path.join(meta_dir, dataset_filename))
print('Saved to file:', dataset_filename)
print(dataset_split_tfr.shape)
dataset_split_tfr.head()

Saved to file: nondefect_pet_echo_dataset_200519.parquet
(172943, 56)


Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,mode,rate
82893,0.7852,2.935,3.737901,0,5615307,4b7e4f5003c53ff1_4903a585855e2f3d40e9483eb4e7,2010-01-19,2010-10-12,5615307_2010-01-19,266,266.0,1.0,4b7e4f5003c53ff1_4903a585855e2f3d40e9483eb4e7_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7e/4b7e4f500...,2010-10-12 15:06:57,4b7e4f5003c53ff1_4903a585855e2f3d40e9483eb4e7_...,BWH,iE33,Philips Medical Systems,0.0,33.333,56.0,116.0,0.041843,0.041843,...,0.00373305,7.165111e-08,8.401705e-10,3.811551e-08,0.987301,2.214152e-05,3.693566e-10,7.789717e-10,3.322451e-09,3.937793e-10,2.019399e-07,2.790909e-07,1.833952e-11,0.002590995,8.677904e-09,0.006285981,2.165729e-08,5.902916e-08,3.256602e-10,4.231703e-12,6.517973e-05,a4c_rv,1.0,eval,30.0
139500,1.141182,0.931455,0.816219,0,32458259,48b79820661c6f07_4903a5835834ae16731e8aae7517,2016-03-23,2016-03-18,32458259_2016-03-23,-5,5.0,1.0,48b79820661c6f07_4903a5835834ae16731e8aae7517_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b7/48b798206...,2016-03-18 14:02:05,48b79820661c6f07_4903a5835834ae16731e8aae7517_...,BWH,CX50,Philips Medical Systems,0.0,50.702,69.0,52.0,0.041869,0.041869,...,4.4107e-11,1.076906e-09,1.671425e-13,4.540347e-09,7.202248e-12,1.32434e-11,7.452849e-11,1.478087e-11,0.9999425,4.249902e-10,1.316529e-08,1.47908e-11,4.143725e-10,3.623753e-10,1.196839e-11,2.645966e-10,1.677805e-11,2.763652e-05,4.271944e-13,1.06494e-10,3.414919e-10,other,1.0,test,19.7
9812,1.0342,0.7096,0.686134,0,1621101,4b7a8193fd636cc9_4903a44b32efc286a1e36d633f5a,2009-02-23,2008-03-24,1621101_2009-02-23,-336,336.0,1.0,4b7a8193fd636cc9_4903a44b32efc286a1e36d633f5a_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a8193f...,2008-03-24 09:36:38,4b7a8193fd636cc9_4903a44b32efc286a1e36d633f5a_...,BWH,Vivid i,GEMS Ultrasound,0.0,37.358165,71.0,72.0,0.046036,0.046036,...,2.0563e-12,5.310765e-09,1.485672e-13,3.007412e-11,5.516751e-13,5.349539e-13,1.806909e-11,8.49987e-07,1.201431e-12,4.962722e-09,2.856872e-09,5.496921e-13,8.484688e-12,1.250863e-09,1.933631e-10,1.005891e-13,2.523843e-08,0.999994,1.878893e-09,9.496106000000001e-17,3.308092e-14,psax_pap,1.0,train,26.8
20446,0.4805,0.8545,1.778356,0,405811,4b7b42ea7990f522_4903a585842c23b85b4779bb7711,2010-09-22,2010-02-05,405811_2010-09-22,-229,229.0,1.0,4b7b42ea7990f522_4903a585842c23b85b4779bb7711_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b42ea7...,2010-02-05 13:18:09,4b7b42ea7990f522_4903a585842c23b85b4779bb7711_...,BWH,iE33,Philips Medical Systems,0.0,33.333,56.0,67.0,0.037481,0.037481,...,0.9673775,5.568245e-05,1.228608e-11,2.452318e-06,0.01805651,4.07781e-07,9.314117e-08,2.982378e-05,2.577516e-06,2.294358e-06,1.649656e-08,0.0001200273,2.154523e-05,9.768465e-05,0.006101391,3.044464e-05,0.001009268,3.529356e-06,4.929144e-08,1.492394e-07,3.961655e-07,a4c,1.0,train,30.0
127554,0.86213,1.418431,1.645264,0,22247878,490177d46d5efaa3_4903a58359a01354c3981dc8dccf,2017-05-23,2016-12-19,22247878_2017-05-23,-155,155.0,1.0,490177d46d5efaa3_4903a58359a01354c3981dc8dccf_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/490177d46...,2016-12-19 13:28:47,490177d46d5efaa3_4903a58359a01354c3981dc8dccf_...,BWH,iE33,Philips Medical Systems,0.0,33.333,70.0,125.0,0.082844,0.082844,...,4.102466e-11,0.9998125,4.148305e-12,5.395332e-09,2.892619e-10,2.678648e-09,9.171661e-09,1.869904e-10,0.000175072,5.225467e-07,3.942199e-10,3.015358e-09,1.529065e-09,1.748127e-07,2.021181e-10,6.103316e-10,6.084948e-11,3.691091e-09,3.013436e-10,1.113953e-05,3.331915e-09,a4c_far,1.0,eval,30.0


In [56]:
minrate = 20
minframes = 40
minduration = minframes/minrate
print('Minmum rate:      {}'.format(minrate))
print('Minimum frames:   {}'.format(minframes))
print('Minimum duration: {}'.format(minduration))
#minduration = 1.5
#minframes = int(np.ceil(minrate*minduration))

maxrows = dataset_split_tfr.shape[0]
rows_disqualified = dataset_split_tfr[(dataset_split_tfr.rate < minrate) | (dataset_split_tfr.number_of_frames<minframes)].shape[0]
rows_qualified = dataset_split_tfr[(dataset_split_tfr.rate >= minrate) & (dataset_split_tfr.number_of_frames>=minframes)].shape[0]

print('Disqualified Rows {}, fraction:{:.1f}'.format(rows_disqualified, 
                                                     np.round(rows_disqualified/maxrows*100), decimals=1))

print('Qualified Rows {}, fraction:{:.1f}'.format(rows_qualified, 
                                                     np.round(rows_qualified/maxrows*100), decimals=1))

Minmum rate:      20
Minimum frames:   40
Minimum duration: 2.0
Disqualified Rows 46520, fraction:27.0
Qualified Rows 126423, fraction:73.0


In [4]:
df = pd.read_parquet(os.path.join(meta_dir, 'global_pet_echo_dataset_200519.parquet'))
df.head()

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,mode,rate
143360,0.489,0.517,1.05726,0,20196192,4903a58c1dc8a997_4903a585842f4017d3be5d4e6311,2010-09-22,2010-01-15,20196192_2010-09-22,-250,250.0,1.0,4903a58c1dc8a997_4903a585842f4017d3be5d4e6311_...,/mnt/obi0/phi/echo/npyFiles/BWH/4903/4903a58c1...,2010-01-15 09:00:24,4903a58c1dc8a997_4903a585842f4017d3be5d4e6311_...,BWH,iE33,Philips Medical Systems,0.0,33.333,42.0,87.0,0.035325,0.035325,...,3.1670380000000004e-17,1.458876e-15,2.26394e-14,2.439137e-13,1.27287e-14,7.978906e-15,5.648506e-09,2.596805e-13,1.312029e-13,1.067549e-14,2.279238e-15,2.595446e-15,7.252949e-13,1.706543e-11,5.655046e-14,5.717229e-08,3.881268e-10,4.083272e-11,5.261172e-09,4.863208e-17,1.0,suprasternal,1.0,train,30.0
97349,0.89,0.95,1.067416,0,24478307,4907bd6f073eeb36_4903a44ab0957d4f820fa5856798,2009-12-23,2009-12-18,24478307_2009-12-23,-5,5.0,1.0,4907bd6f073eeb36_4903a44ab0957d4f820fa5856798_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907bd6f0...,2009-12-18 13:20:01,4907bd6f073eeb36_4903a44ab0957d4f820fa5856798_...,BWH,iE33,Philips Medical Systems,0.0,48.476,43.0,48.0,0.0332,0.0332,...,2.409861e-09,1.647152e-09,9.204727e-11,1.487199e-11,1.823151e-09,1.831744e-11,1.798087e-11,7.657828e-15,1.0,1.032708e-12,4.584639e-10,3.379228e-12,4.596527e-11,1.533004e-11,7.693103e-13,3.048389e-10,6.473896e-09,1.520418e-08,1.591869e-15,4.757059e-11,6.396654e-09,other,1.0,train,20.6
20245,1.15,1.337,1.162609,0,11320694,4a1750ea3b0c754f_4903a58792a3687e4d23d9a0e607,2013-03-01,2012-12-03,11320694_2013-03-01,-88,88.0,1.0,4a1750ea3b0c754f_4903a58792a3687e4d23d9a0e607_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1750ea3...,2012-12-03 11:01:18,4a1750ea3b0c754f_4903a58792a3687e4d23d9a0e607_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,48.333332,59.0,69.0,0.064005,0.064005,...,4.150084e-08,6.373574e-11,1.845399e-10,1.718454e-10,7.417859e-14,8.081975e-12,7.104746e-11,3.527057e-09,0.9999989,4.890339e-12,2.905603e-09,3.244673e-10,4.749769e-09,2.374563e-12,1.570561e-10,2.921829e-10,8.684407e-10,1.270551e-08,1.545107e-12,1.027591e-06,6.883822e-10,other,1.0,train,20.7
47725,1.299,1.684,1.296382,0,4512133,4b7f09264f0d0d72_4903a44b32e9589f835cbb533dcd,2007-05-07,2008-05-05,4512133_2007-05-07,364,364.0,1.0,4b7f09264f0d0d72_4903a44b32e9589f835cbb533dcd_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7f/4b7f09264...,2008-05-05 12:49:28,4b7f09264f0d0d72_4903a44b32e9589f835cbb533dcd_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.575756,56.0,84.0,0.046036,0.046036,...,3.613773e-10,1.423286e-11,1.080313e-13,6.186913e-13,5.101235e-14,6.961296e-13,1.893432e-12,3.106272e-14,0.9999999,1.057082e-13,3.353844e-12,2.200249e-15,2.221128e-12,3.406275e-13,4.057523e-17,1.372358e-11,9.969681e-12,8.245678e-12,9.20372e-12,1.793095e-13,2.662584e-12,other,1.0,train,24.6
71015,0.654,0.846,1.293578,0,8326175,4b735a632bc105c8_4903a581a68027eccef06d111bef,2015-06-08,2014-11-21,8326175_2015-06-08,-199,199.0,1.0,4b735a632bc105c8_4903a581a68027eccef06d111bef_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b735a632...,2014-11-21 15:45:24,4b735a632bc105c8_4903a581a68027eccef06d111bef_...,BWH,iE33,Philips Medical Systems,0.0,33.333,103.0,69.0,0.033177,0.033177,...,2.551467e-13,2.050063e-13,1.471773e-11,5.848888e-11,5.00674e-12,2.175025e-10,9.369477e-13,1.2077e-12,1.334596e-11,6.578644e-10,1.008648e-08,4.906414e-10,6.751207e-13,0.9999989,9.643591e-11,2.291166e-10,2.475981e-10,4.158672e-09,1.03588e-06,4.027625e-13,2.919244e-14,plax_plax,1.0,train,30.0
