### Consort data ###

In [19]:
import os
import numpy as np
import pandas as pd
import glob

# Raise the pandas display limits (rows, columns, line width) for this notebook
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [5]:
# Root directories: echo/CFR metadata and the PET tables
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_data_dir = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr')

# Tag of the metadata snapshot; it is part of the directory and file names below
cfr_meta_date = '200617'
meta_dir = os.path.join(cfr_data_root, f'metadata_{cfr_meta_date}')
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200617


In [6]:
# Echo/PET match table for the selected metadata snapshot
match_view_filename = f'pet_match365_diff_files_{cfr_meta_date}.parquet'
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))

# (message template, column) pairs: each line reports the number of distinct values
for template, column in (
    ('Total number of patients      {}', 'mrn'),
    ('Total number of echo studies  {}', 'study'),
    ('Total number of PET studies   {}', 'petmrn_identifier'),
    ('Total number of echos         {}', 'filename'),
):
    print(template.format(len(files_cfr[column].unique())))

files_cfr.head(2)

Total number of patients      3268
Total number of echo studies  6460
Total number of PET studies   3603
Total number of echos         307566


Unnamed: 0,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,pet_measurement,difference(days),filename,dir,datetime,file_base,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer,max_view,sum_views
0,35156678,48b09010a2219aad_4903a582edf3bd118ffb3386065b,2018-10-15,2017-12-06,35156678_2018-10-15,-313,1.0,313.0,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,2017-12-06 13:11:41,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,47.769231,66.0,60.0,0.028951,0.028951,3.446593e-08,6.452001e-09,2.939033e-08,0.003095016,9.225302e-08,3.967397e-08,7.041133e-10,3.144316e-09,1.198068e-13,4.564848e-10,1.454091e-09,1.773068e-08,7.812481e-08,3.755003e-08,0.9968956,7.881907e-10,5.391607e-10,2.151894e-10,1.15716e-08,4.940982e-08,7.734493e-12,1.395856e-06,3.054052e-09,7.720808e-06,5.067335e-09,2.187016e-13,2.230961e-08,2017.0,20171206131141,BWH,Vivid E95,GE Vingmed Ultrasound,other,1.0
1,35156678,48b09010a2219aad_4903a582edf3bd118ffb3386065b,2018-10-15,2017-12-06,35156678_2018-10-15,-313,1.0,313.0,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,2017-12-06 13:11:41,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,48b09010a2219aad_4903a582edf3bd118ffb3386065b_...,20.087146,154.0,60.0,0.020448,0.020448,4.212547e-13,1.0,5.081462e-15,5.16278e-16,3.739126e-14,4.737708e-15,3.574842e-17,1.105575e-13,7.188538e-16,3.057809e-15,4.562448e-17,4.610617e-16,1.488447e-12,2.472197e-12,2.470155e-16,1.77298e-16,7.891783e-16,6.224149e-17,6.330659e-14,5.080497e-13,2.415044e-15,4.977857e-15,6.916603e-13,5.833864e-15,2.562479e-15,6.839081e-22,8.41705e-17,2017.0,20171206131141,BWH,Vivid E95,GE Vingmed Ultrasound,a2c_laocc,1.0


In [7]:
# Load the two PET tables and collect, for each, the identifiers flagged post-2018
global_pet_file = 'pets_with_echos_global_all_withtracer.parquet'
global_pet = (
    pd.read_parquet(os.path.join(pet_data_dir, global_pet_file))
    .astype({'pet_date': 'datetime64[ns]'})
)
is_post18_global = global_pet['post-2018'] == 1
post18_id_list_global = list(global_pet.loc[is_post18_global, 'petmrn_identifier'])
print(f'post-2018 petmrn_identifiers for global data: {len(post18_id_list_global)}')

nondefect_pet_file = 'non_defect_plus_p2018.parquet'
nondefect_pet = (
    pd.read_parquet(os.path.join(pet_data_dir, nondefect_pet_file))
    .astype({'pet_date': 'datetime64[ns]'})
)
is_post18_nondefect = nondefect_pet['post-2018'] == 1
post18_id_list_nondefect = list(nondefect_pet.loc[is_post18_nondefect, 'petmrn_identifier'])
print(f'post-2018 petmrn_identifiers for nondefect data: {len(post18_id_list_nondefect)}')

# Union of both lists, de-duplicated: the PET studies we want to exclude
post18_id_list = list(set(post18_id_list_global) | set(post18_id_list_nondefect))
print(f'total post-2018 petmrn_identifiers: {len(post18_id_list)}')

post-2018 petmrn_identifiers for global data: 160
post-2018 petmrn_identifiers for nondefect data: 83
total post-2018 petmrn_identifiers: 164


In [8]:
# Starting numbers without the post-2018 identifiers
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))
is_post18 = files_cfr['petmrn_identifier'].isin(post18_id_list)
files_cfr = files_cfr.loc[~is_post18]

# (message template, column) pairs: each line reports the number of distinct values
for template, column in (
    ('Total number of patients      {}', 'mrn'),
    ('Total number of echo studies  {}', 'study'),
    ('Total number of PET studies   {}', 'petmrn_identifier'),
    ('Total number of echos         {}', 'filename'),
):
    print(template.format(len(files_cfr[column].unique())))

print()
# Videos without frame-time: count the echo studies they belong to
files_cfr_failed = files_cfr.loc[files_cfr.frame_time.isna()]
failed_studies = files_cfr_failed.study.unique()
print(f'Failed ECHO studies: {len(failed_studies)}')
# The file names are not unique. Therefore, we cannot easily tell how many videos.
#print(f'Failed ECHO videos : {len(files_cfr_failed.filename.unique())}')
#print(f'Failed Patients    : {len(files_cfr_failed.mrn.unique())}')

Total number of patients      3118
Total number of echo studies  6174
Total number of PET studies   3448
Total number of echos         293060

Failed ECHO studies: 189


In [9]:
# Starting numbers without the post-2018 identifiers and without the failed videos
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))
# Keep rows that are not in the post-2018 exclusion list AND have a frame_time
keep_rows = (~files_cfr['petmrn_identifier'].isin(post18_id_list)) & files_cfr.frame_time.notnull()
files_cfr = files_cfr.loc[keep_rows]

print()
for template, column in (
    ('Total number of patients      {}', 'mrn'),
    ('Total number of echo studies  {}', 'study'),
    ('Total number of PET studies   {}', 'petmrn_identifier'),
    ('Total number of echos         {}', 'filename'),
):
    print(template.format(len(files_cfr[column].unique())))


Total number of patients      3107
Total number of echo studies  6002
Total number of PET studies   3425
Total number of echos         292895


### Filter data sets: GLOBAL and NON-DEFECT variables ###

### Global variables ###:

Notebook 3/17/2020: global_pet_cfr
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data
File used: post_2018_pets_with_clinical_cfr_all.csv

After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion
Merge:

After combining, 3038 PETs
File saved as pets_with_echos_global_all.parquet
Use notes:

NOTE- there are petmrn_identifiers that have two rows- the one with post_2018==0 is the one to use, and the one with post_2018==1 should be excluded
Variables to use: rest_global_mbf, stress_global_mbf, global_cfr_calc
Other variables- myocardial_perfusion, segmental data, perfusion data, TID, gated SPECT results, calcium score, height, weight
For "cleaner" data, would exclude those with post-2018==1

#### Update 6/14/2020 ####

Created a revised version of the above that includes CABG cases (from pre-2018), and excludes post-2018 cases that are already in the pre-2018 dataset (n=7). Total 3718 rows. File at /mnt/obi0/phi/pet/pet_cfr/pets_with_echos_global_all_withcabg.parquet
Update 6/17/2020:

Created a revised version that has a column tracer_obi for the tracer used. Also, the 7 post-2018 duplicate cases are excluded, so there are 3031 studies.
File used- aiCohort_withPerfusion_addRadiopharm.xlsx
The ammonia cases before 7/25/2011 have the value 'listed as ammonia', and the rubidium cases after 7/25/2011 have the value 'listed as rubidium'. For the remaining discrepant values (i.e. missing values, FDG, sestamibi), the tracer was assumed to be the tracer in use at the time.
Rubidium 1,740, ammonia 1,276, listed as ammonia 4, listed as rubidium 2
File saved as /mnt/obi0/phi/pet/pet_cfr/pets_with_echos_global_all_withtracer.parquet

In [10]:
# Number of distinct PET studies in the global table, its source file, and a preview
global_pet_ids = global_pet.petmrn_identifier.unique()
global_pet_path = os.path.join(pet_data_dir, global_pet_file)
print(f'PET studies: {len(global_pet_ids)}')
print(global_pet_path)
global_pet.head()

PET studies: 3031
/mnt/obi0/phi/pet/pet_cfr/pets_with_echos_global_all_withtracer.parquet


Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,rest_global_mbf,stress_global_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,stress_seg1_mbf,stress_seg2_mbf,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,height_in,weight_lb,reportID,subjectID,radiopharmaceutical,radiopharmaceutical2,tracer_obi
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,0.644,0.881,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,0.537,0.388,0.607,0.823,0.964,0.766,0.445,1.34,1.504,0.691,0.707,0.529,0.348,0.558,0.562,1.325,1.446,0.636,0.502,1.136,1.186,1.415,0.979,0.67,1.174,1.676,18.0,18.0,0.0,1.09,normal,,67.0,133.0,69718.0,2540.0,Rubidium-82,,rubidium
1,12853099,2006-01-25,12853099_2006-01-25,0,normal,2.109661,1.532,3.232,1.082,0.94,1.135,1.224,1.332,1.945,1.482,1.977,1.475,1.656,1.509,1.444,1.801,2.18,1.677,1.359,1.977,4.291,3.144,2.069,2.657,2.626,3.712,3.981,3.359,2.215,4.01,3.89,3.883,3.249,2.609,3.483,3.443,3.037,0.0,0.0,0.0,,not assessed,0.0,64.0,160.0,69969.0,3379.0,Rubidium-82,,rubidium
2,20710471,2006-01-23,20710471_2006-01-23,0,abnormal,1.229572,1.028,1.264,0.872,0.892,1.126,1.884,1.061,1.057,1.286,1.657,0.735,0.995,1.153,1.252,1.38,0.966,0.86,0.943,1.101,1.213,0.826,1.864,0.91,1.32,1.936,1.283,1.617,1.33,1.266,1.366,1.726,1.533,1.477,1.484,1.037,1.537,20.0,18.0,2.0,0.97,normal,6602.0,71.0,200.0,69993.0,3542.0,Rubidium-82,,rubidium
3,12627030,2006-02-10,12627030_2006-02-10,0,abnormal,1.588915,1.732,2.752,1.984,1.499,1.556,1.333,1.134,1.292,2.499,1.883,1.703,1.794,1.873,1.933,1.767,2.056,1.662,1.804,1.754,2.94,0.664,0.64,2.197,1.413,3.395,4.533,1.301,2.149,2.391,3.3,4.714,3.184,2.567,4.154,4.662,3.595,16.0,0.0,16.0,0.79,normal,331.0,68.0,217.0,70008.0,2155.0,Rubidium-82,,rubidium
4,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,1.465,3.892,0.945,0.813,0.892,0.97,1.401,1.217,1.614,1.58,1.303,1.957,1.879,1.794,1.421,1.773,1.677,1.891,1.846,2.559,2.759,2.654,2.434,3.313,3.502,4.081,4.34,3.978,4.17,4.426,4.944,4.343,4.143,4.748,4.544,4.328,0.0,0.0,0.0,0.94,normal,0.0,64.0,146.0,70012.0,3417.0,Rubidium-82,,rubidium


In [68]:
# Add echo data to the pet studies (inner join, to keep only keys in both dataframes)
merge_keys = ['mrn', 'pet_date', 'petmrn_identifier']
global_pet_echo = global_pet.merge(files_cfr, how='inner', on=merge_keys)

# Here, we can already filter the tracer
#global_pet_echo = global_pet_echo[global_pet_echo.tracer_obi=='ammonia']

n_pet_studies = len(global_pet_echo.petmrn_identifier.unique())
n_echo_studies = len(global_pet_echo.study.unique())
print(f'PET studies:  {n_pet_studies}')
print(f'ECHO studies: {n_echo_studies}')

PET studies:  2579
ECHO studies: 4239


In [69]:
# Distinct PET study counts are needed twice (report lines and the exclusion count)
n_pet_all = len(files_cfr.petmrn_identifier.unique())
n_pet_global = len(global_pet_echo.petmrn_identifier.unique())

# Counts for the full match table
print(f'All data:    patients:     {len(files_cfr.mrn.unique())}')
print(f'All data:    PET studies:  {n_pet_all}')
print(f'All data:    ECHO studies: {len(files_cfr.study.unique())}')
print(f'All data:    videos:       {len(files_cfr.filename.unique())}')
print()
# Counts after the inner join with the global PET table
print(f'Global data: patients:     {len(global_pet_echo.mrn.unique())}')
print(f'Global data: PET studies:  {n_pet_global}')
print(f'Global data: ECHO studies: {len(global_pet_echo.study.unique())}')
print(f'Global data: videos:       {len(global_pet_echo.filename.unique())}')

excluded_pet_studies = n_pet_all - n_pet_global
print(f'Excluded PET studies: {excluded_pet_studies}')

All data:    patients:     3107
All data:    PET studies:  3425
All data:    ECHO studies: 6002
All data:    videos:       292895

Global data: patients:     2440
Global data: PET studies:  2579
Global data: ECHO studies: 4239
Global data: videos:       202795
Excluded PET studies: 846


In [53]:
# Find out which echo studies do not have a4c views
# Get all studies WITH a4c views (at least one video whose max_view is 'a4c')
is_a4c_video = global_pet_echo.max_view == 'a4c'
a4c_study_list = list(global_pet_echo.loc[is_a4c_video, 'study'].unique())
print(len(a4c_study_list))

# Remove all studies from global list that have a4c views
in_a4c_study = global_pet_echo.study.isin(a4c_study_list)
global_pet_echo_no_a4c = global_pet_echo[~in_a4c_study]
print(f'Number of ECHO studies without a4c view: {len(global_pet_echo_no_a4c.study.unique())}')
print(f'For this number of patients:             {len(global_pet_echo_no_a4c.mrn.unique())}')
print(f'With this number of videos:              {len(global_pet_echo_no_a4c.filename.unique())}')

1630
Number of ECHO studies without a4c view: 164
For this number of patients:             143
With this number of videos:              5304


In [70]:
# Exclusions and filters
# A4C VIEW: keep videos classified as a4c that also have a frame_time
is_a4c = global_pet_echo.max_view == 'a4c'
has_frame_time = global_pet_echo.frame_time.notnull()
global_pet_echo_a4c = global_pet_echo.loc[is_a4c & has_frame_time]
# Then keep only rows whose PET study is flagged post-2018 == 0
global_a4c_pre18 = global_pet_echo_a4c.loc[global_pet_echo_a4c['post-2018'] == 0]

# Same four counts before and after the filters; the second group gets a heading
for heading, frame in (
    (None, global_pet_echo),
    ('After a4c and post-2018 filters:', global_a4c_pre18),
):
    if heading is not None:
        print()
        print(heading)
    print(f'Global data: patients:     {len(frame.mrn.unique())}')
    print(f'Global data: PET studies:  {len(frame.petmrn_identifier.unique())}')
    print(f'Global data: ECHO studies: {len(frame.study.unique())}')
    print(f'Global data: videos:       {len(frame.filename.unique())}')

Global data: patients:     2440
Global data: PET studies:  2579
Global data: ECHO studies: 4239
Global data: videos:       202795

After a4c and post-2018 filters:
Global data: patients:     2284
Global data: PET studies:  2405
Global data: ECHO studies: 3678
Global data: videos:       11070


In [55]:
# Global table restricted to the variables to use: the PET target variables plus
# every column of the echo match table, taken from the a4c / pre-2018 rows.
# Rows with a missing value in any of the PET target variables are dropped.
global_pet_variables_target = ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
global_pet_variables = global_pet_variables_target + list(files_cfr.columns)

global_pet_echo_variables = (
    global_a4c_pre18[global_pet_variables]
    .dropna(subset=global_pet_variables_target)
)

print(f'Global data: patients:     {len(global_pet_echo_variables.mrn.unique())}')
print(f'Global data: PET studies:  {len(global_pet_echo_variables.petmrn_identifier.unique())}')
print(f'Global data: ECHO studies: {len(global_pet_echo_variables.study.unique())}')
print(f'Global data: videos:       {len(global_pet_echo_variables.filename.unique())}')

Global data: patients:     941
Global data: PET studies:  974
Global data: ECHO studies: 1630
Global data: videos:       5405


#### Unaffected (non-defect) variables ####
Notebook 2/26/2020: clean_pet_cfr_petdata4_addingseg7
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* All segments used (segment 7 data now available)
* MBF and CFR calculated for unaffected segments
Perfusion defect segments are those with a perfusion score >0 at rest or stress
Unaffected segments are those not in a perfusion defect, and not adjacent to a perfusion defect
* Of note, PETs with no unaffected segments were excluded from the dataset
* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data
* After excluding 295 for having no "unaffected segments", 2576 remaining
File used: post_2018_pets_with_clinical_cfr_all.csv

* Clinical global MBF and CFR values for PETs after 10/2018 were obtained from EPIC
These values were also used for unaffected MBF and CFR
After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion
After keeping only studies with normal/probably normal perfusion, 83 PETs remaining

Merge:

Total 2659 PETs
File created: non_defect_plus_p2018.parquet
Use notes:

* Variables to use: rest_mbf_unaff, stress_mbf_unaff, unaffected_cfr
* Also available: perfusion data, segmental data, gated SPECT results, calcium score, height, weight
* For clean data, would exclude PETs with post-2018==1, and would limit to PETs with myocardial_perfusion of 'normal' or 'probably normal'- if this is done, would use global_cfr_calc instead of unaffected_cfr because it is slightly more accurate in PETs with normal/prob normal perfusion (could consider looking at whether normal/probably normal PETs are still scored as having small perfusion defects)

In [17]:
# Number of distinct PET studies in the non-defect table, its source file, and the table
nondefect_pet_ids = nondefect_pet.petmrn_identifier.unique()
nondefect_pet_path = os.path.join(pet_data_dir, nondefect_pet_file)
print(f'PET studies: {len(nondefect_pet_ids)}')
print(nondefect_pet_path)
display(nondefect_pet)

PET studies: 2655
/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018.parquet


Unnamed: 0,mrn,pet_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,stress_seg1_mbf,stress_seg2_mbf,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,stress_seg17_mbf,cfr_seg_1,cfr_seg_2,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_7,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,rest_basal_anterior,rest_basal_anteroseptal,rest_basal_inferoseptal,rest_basal_inferior,rest_basal_inferolateral,rest_basal_anterolateral,rest_mid_anterior,rest_mid_anteroseptal,rest_mid_inferoseptal,rest_mid_inferior,rest_mid_inferolateral,rest_mid_anterolateral,rest_apical_anterior,rest_apical_septal,rest_apical_inferior,rest_apical_lateral,rest_apex,stress_basal_anterior,stress_basal_anteroseptal,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,1.636577,0.682400,1.116800,5,"[1, 2, 7, 8, 13]",1.202844,"[4, 5, 10, 11, 15]",0.644,0.115952,0.881,0.162315,0.448,0.380,0.435,0.830,0.402,0.334,0.730,0.890,0.537,0.388,0.607,0.823,0.964,0.766,0.445,1.340,1.504,0.691,0.707,0.529,0.348,0.558,0.562,1.325,1.446,0.636,0.502,1.136,1.186,1.415,0.979,0.670,1.174,1.676,1.542411,1.860526,1.216092,0.419277,1.388060,1.682635,1.815068,1.624719,1.184358,1.293814,1.871499,1.441069,1.467842,1.278068,1.505618,0.876119,1.114362,18.0,18.0,0.0,1.09,normal,,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,67.0,133.0,69718.0,2540.0
1,12853099,2006-01-25,12853099_2006-01-25,0,normal,2.109661,2.124757,1.540882,3.274000,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.000000,[],1.532,0.123958,3.232,0.436719,1.082,0.940,1.135,1.224,1.332,1.945,1.482,1.977,1.475,1.656,1.509,1.444,1.801,2.180,1.677,1.359,1.977,4.291,3.144,2.069,2.657,2.626,3.712,3.981,3.359,2.215,4.010,3.890,3.883,3.249,2.609,3.483,3.443,3.037,3.965804,3.344681,1.822907,2.170752,1.971471,1.908483,2.686235,1.699039,1.501695,2.421498,2.577866,2.689058,1.803998,1.196789,2.076923,2.533481,1.536166,0.0,0.0,0.0,,not assessed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,160.0,69969.0,3379.0
2,20710471,2006-01-23,20710471_2006-01-23,0,abnormal,1.229572,1.011962,1.309667,1.325333,3,"[2, 8, 13]",1.181455,"[4, 5, 6, 10, 11, 15]",1.028,0.086979,1.264,0.092843,0.872,0.892,1.126,1.884,1.061,1.057,1.286,1.657,0.735,0.995,1.153,1.252,1.380,0.966,0.860,0.943,1.101,1.213,0.826,1.864,0.910,1.320,1.936,1.283,1.617,1.330,1.266,1.366,1.726,1.533,1.477,1.484,1.037,1.537,1.391055,0.926009,1.655417,0.483015,1.244109,1.831599,0.997667,0.975860,1.809524,1.272362,1.184735,1.378594,1.110870,1.528986,1.725581,1.099682,1.396004,20.0,18.0,2.0,0.97,normal,6602.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,2.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,71.0,200.0,69993.0,3542.0
3,12627030,2006-02-10,12627030_2006-02-10,0,abnormal,1.588915,1.248541,2.056000,2.567000,1,[14],1.708358,"[1, 3, 5, 12, 16]",1.732,0.103225,2.752,1.735979,1.984,1.499,1.556,1.333,1.134,1.292,2.499,1.883,1.703,1.794,1.873,1.933,1.767,2.056,1.662,1.804,1.754,2.940,0.664,0.640,2.197,1.413,3.395,4.533,1.301,2.149,2.391,3.300,4.714,3.184,2.567,4.154,4.662,3.595,1.481855,0.442962,0.411311,1.648162,1.246032,2.627709,1.813926,0.690919,1.261891,1.332776,1.761879,2.438696,1.801924,1.248541,2.499398,2.584257,2.049601,16.0,0.0,16.0,0.79,normal,331.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,68.0,217.0,70008.0,2155.0
4,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,2.613463,1.469000,3.839176,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.000000,[],1.465,0.149121,3.892,0.655387,0.945,0.813,0.892,0.970,1.401,1.217,1.614,1.580,1.303,1.957,1.879,1.794,1.421,1.773,1.677,1.891,1.846,2.559,2.759,2.654,2.434,3.313,3.502,4.081,4.340,3.978,4.170,4.426,4.944,4.343,4.143,4.748,4.544,4.328,2.707937,3.393604,2.975336,2.509278,2.364739,2.877568,2.528501,2.746835,3.052955,2.130812,2.355508,2.755853,3.056298,2.336717,2.831246,2.402961,2.344529,0.0,0.0,0.0,0.94,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,146.0,70012.0,3417.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,11700127,2018-11-02,11700127_2018-11-02,1,probably normal,3.770000,3.770000,0.620000,2.340000,17,,,,0.620,,2.340,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
555,1414556,2018-10-30,1414556_2018-10-30,1,normal,2.100000,2.100000,0.690000,1.450000,17,,,,0.690,,1.450,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
557,7235799,2018-10-29,7235799_2018-10-29,1,probably normal,1.530000,1.530000,0.660000,1.010000,17,,,,0.660,,1.010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
573,28944601,2018-10-11,28944601_2018-10-11,1,normal,1.760000,1.760000,0.490000,0.860000,17,,,,0.490,,0.860,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
# Add echo data to the pet studies (inner join: only keys present in both frames)
merge_keys = ['mrn', 'pet_date', 'petmrn_identifier']
nondefect_pet_echo = nondefect_pet.merge(files_cfr, how='inner', on=merge_keys)
n_pet_studies = len(nondefect_pet_echo.petmrn_identifier.unique())
n_echo_studies = len(nondefect_pet_echo.study.unique())
print(f'PET studies:  {n_pet_studies}')
print(f'ECHO studies: {n_echo_studies}')

PET studies:  1986
ECHO studies: 2678


In [10]:
# Disqualified PET studies
# The left join keeps every PET row; rows without a matching echo get a null `study`
merge_keys = ['mrn', 'pet_date', 'petmrn_identifier']
nondefect_pet_echo_all = nondefect_pet.merge(files_cfr, how='left', on=merge_keys)
has_no_echo = nondefect_pet_echo_all.study.isna()
nondefect_pet_noecho = nondefect_pet_echo_all.loc[has_no_echo]
print(f'PET studies total:        {len(nondefect_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(nondefect_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        2655
PET studies without echo: 669


In [11]:
# Non-defect table restricted to the PET target variables plus every echo column
nondefect_variables_target = ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
nondefect_variables = nondefect_variables_target + list(files_cfr.columns)
# NOTE(review): dropna() without `subset` discards a row when ANY selected column
# (PET target or echo metadata) is null, whereas the global-variables cell limits
# dropna to the PET target columns -- confirm which behaviour is intended here.
nondefect_pet_echo_variables = nondefect_pet_echo[nondefect_variables].dropna()
print(f'PET target variables: {nondefect_variables_target}')
print(f'PET studies:          {len(nondefect_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(nondefect_pet_echo_variables.study.unique())}')
print(f'nondefect shape:      {nondefect_pet_echo_variables.shape}')

PET target variables: ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
PET studies:          1923
ECHO studies:         2538
nondefect shape:      (117341, 54)


In [12]:
# Identifiers from both PET tables, concatenated (may contain duplicates) ...
petmrn_identifier_list = [
    *global_pet.petmrn_identifier.unique(),
    *nondefect_pet.petmrn_identifier.unique(),
]
# ... and the de-duplicated version (stored as a list despite the `_set` name)
petmrn_identifier_set = list(set(petmrn_identifier_list))

In [13]:
# Size of the concatenated identifier list, then of its de-duplicated version
for identifiers in (petmrn_identifier_list, petmrn_identifier_set):
    print(len(identifiers))

5686
3031


In [14]:
# Let's filter Rahul's list of missing echos
missing_echo_path = os.path.join(meta_dir, 'mrn_pet_missing_echo_file.parquet')
missing_echo = pd.read_parquet(missing_echo_path)
n_missing_ids = len(missing_echo.petmrn_identifier.unique())
print(f'petmrn_identifier in original list: {n_missing_ids}')
missing_echo.tail()

petmrn_identifier in original list: 674


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement,study,echo_study_date
20145,7766041,NaT,2014-05-15,,7766041_2014-05-15,1.0,,NaT
20274,22087456,NaT,2014-11-25,,22087456_2014-11-25,1.0,,NaT
20313,17170440,NaT,2015-03-09,,17170440_2015-03-09,1.0,,NaT
20423,1825777,NaT,2016-12-08,,1825777_2016-12-08,1.0,,NaT
20434,26949750,NaT,2017-08-08,,26949750_2017-08-08,1.0,,NaT


In [32]:
# Keep only the missing-echo rows whose PET study appears in one of the PET tables
in_pet_tables = missing_echo.petmrn_identifier.isin(petmrn_identifier_set)
missing_echo_filtered = (
    missing_echo[in_pet_tables]
    .drop(columns=['pet_measurement'])
    .reset_index(drop=True)
)
print(f'petmrn_identifier in filtered list:   {len(missing_echo_filtered.petmrn_identifier.unique())}')

# Subset of the filtered rows that have no echo date at all
has_no_echo_date = missing_echo_filtered.echo_date.isna()
missing_no_echo_date = missing_echo_filtered.loc[has_no_echo_date]
print(f'petmrn_identifier without echo dates: {len(missing_no_echo_date.petmrn_identifier.unique())}')

# Save the filtered list next to the other metadata files
missing_echo_filtered_file = 'mrn_pet_missing_echo_file_filtered.parquet'
missing_echo_filtered.to_parquet(os.path.join(meta_dir, missing_echo_filtered_file))

petmrn_identifier in filtered list:   460
petmrn_identifier without echo dates: 13


In [35]:
# Shapes of the filtered list and of its no-echo-date subset, then the last 20 rows
for frame in (missing_echo_filtered, missing_no_echo_date):
    print(frame.shape)
missing_echo_filtered.tail(20)

(700, 7)
(13, 7)


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,study,echo_study_date
680,32030512,2016-01-25,2016-01-27,2.0,32030512_2016-01-27,,NaT
681,32299455,2016-08-20,2016-08-19,1.0,32299455_2016-08-19,,NaT
682,33476755,2016-08-19,2016-08-10,9.0,33476755_2016-08-10,,NaT
683,33657214,2017-01-25,2017-02-02,8.0,33657214_2017-02-02,,NaT
684,38235537,2018-08-16,2018-09-25,40.0,38235537_2018-09-25,,NaT
685,38235537,2018-09-22,2018-09-25,3.0,38235537_2018-09-25,,NaT
686,40902504,2019-11-05,2019-11-06,1.0,40902504_2019-11-06,,NaT
687,5291745,NaT,2008-07-15,,5291745_2008-07-15,,NaT
688,20725511,NaT,2007-07-10,,20725511_2007-07-10,,NaT
689,13752563,NaT,2008-12-23,,13752563_2008-12-23,,NaT


### Names for patient lists that were converted to TFR files ###

In [16]:
#dataset = global_pet_echo_variables
#dataset_filename = 'global_pet_echo_dataset_'+cfr_meta_date+'.parquet'

#dataset= nondefect_pet_echo_variables
#dataset_filename = 'nondefect_pet_echo_dataset_'+cfr_meta_date+'.parquet'
#nondefect_pet_echo_variables.head()

In [27]:
def _print_counts(frame):
    """Print the patient, echo-study, PET-study and video counts of one frame.

    The first three figures are the numbers of distinct values in the
    ``mrn``, ``study`` and ``petmrn_identifier`` columns; the video count is
    the number of rows. A blank line is printed after the four figures.
    """
    print(f'Patients:     {len(frame.mrn.unique())}')
    print(f'ECHO studies: {len(frame.study.unique())}')
    print(f'PET studies:  {len(frame.petmrn_identifier.unique())}')
    print(f'Videos:       {frame.shape[0]}')
    print()


def numbers_print(dset, mode_list):
    """Print cohort sizes for the whole data set and for each requested split.

    Parameters
    ----------
    dset : pandas.DataFrame
        Table providing the columns ``mrn``, ``study`` and
        ``petmrn_identifier``. The column ``dset_mode`` is only read when
        ``mode_list`` is non-empty. The row count is reported as the number
        of videos.
    mode_list : iterable of str
        Values of ``dset_mode`` to report, in the order they are printed.
        A value that matches no row yields a block of zeros.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Overall figures first, then one identically formatted block per split.
    _print_counts(dset)

    for m in mode_list:
        print(m)
        _print_counts(dset[dset['dset_mode'] == m])

In [30]:
# Preview the first rows of dset.
# NOTE(review): in the visible part of this notebook dset is only assigned in
# a later cell (pd.read_parquet of the global PET/echo data set), so this cell
# appears to rely on out-of-order execution — confirm and move it below that load.
dset.head()

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,tracer_obi,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,pet_measurement,difference(days),filename,dir,datetime,file_base,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer,max_view,sum_views,dset_mode,rate,dur
2236,0.74,2.613,3.531081,0,rubidium,22302947,490176ecfed54843_4903a444c23a29accc9891d29685,2008-04-17,2007-10-31,22302947_2008-04-17,-169,1.0,169.0,490176ecfed54843_4903a444c23a29accc9891d29685_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/490176ecf...,2007-10-31 13:12:32,490176ecfed54843_4903a444c23a29accc9891d29685_...,490176ecfed54843_4903a444c23a29accc9891d29685_...,40.322581,63.0,75.0,0.048593,0.048593,1.526044e-07,6.288687e-09,9.698587e-08,1.122459e-07,2.441809e-07,1.198979e-07,0.71647,1.192437e-07,9.70748e-10,2.404599e-10,0.003660227,7.116121e-11,0.2797258,6.446593e-07,3.062847e-08,1.440944e-09,8.717898e-05,2.040459e-08,5.293421e-10,2.148711e-11,7.354546e-06,1.033843e-08,5.604565e-08,1.32573e-05,1.419972e-05,3.58138e-10,2.09198e-05,2007.0,20071031131232,BWH,Vivid7,GE Vingmed Ultrasound,a4c,1.0,eval,24.8,2.540323
274,0.955,2.308,2.416754,0,rubidium,19317163,4a1fe5efc43dec66_4903a44b32e8ddd345993135cebc,2008-08-01,2008-04-14,19317163_2008-08-01,-109,1.0,109.0,4a1fe5efc43dec66_4903a44b32e8ddd345993135cebc_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1f/4a1fe5efc...,2008-04-14 13:46:36,4a1fe5efc43dec66_4903a44b32e8ddd345993135cebc_...,4a1fe5efc43dec66_4903a44b32e8ddd345993135cebc_...,33.333,91.0,51.0,0.065784,0.065784,4.855964e-11,1.207787e-10,1.780659e-09,9.496988e-11,8.827991e-10,3.3681e-09,0.979295,0.02044863,2.538073e-09,8.161536e-10,2.676549e-06,2.534934e-08,8.560859e-13,1.130816e-09,2.296065e-09,1.084421e-09,1.520502e-11,3.056828e-08,1.652438e-09,3.760336e-08,7.733466e-07,4.376413e-06,3.777619e-11,0.0002479796,4.573167e-12,3.731746e-13,1.490915e-09,2008.0,20080414134636,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,3.033303
8830,0.609,1.35,2.216749,0,ammonia,27151703,4904ba6c53618c7f_4903a58793bfceb2ce3868c8151b,2013-03-21,2012-08-06,27151703_2013-03-21,-227,1.0,227.0,4904ba6c53618c7f_4903a58793bfceb2ce3868c8151b_...,/mnt/obi0/phi/echo/npyFiles/BWH/4904/4904ba6c5...,2012-08-06 08:37:10,4904ba6c53618c7f_4903a58793bfceb2ce3868c8151b_...,4904ba6c53618c7f_4903a58793bfceb2ce3868c8151b_...,40.119048,85.0,55.0,0.053153,0.053153,1.040321e-13,7.938899e-13,6.542746e-15,4.358271e-17,9.511394e-16,3.07085e-14,1.0,2.724828e-11,5.122407e-16,2.229038e-13,1.316427e-12,5.678736e-15,2.227005e-14,2.773552e-14,1.712471e-11,2.891193e-13,5.739325e-14,1.812508e-15,2.653622e-15,1.278631e-12,1.612062e-12,7.903387e-15,2.619207e-15,1.164287e-12,7.852771e-13,8.872656e-18,1.058232e-13,2012.0,20120806083710,BWH,Vivid E9,GE Vingmed Ultrasound,a4c,1.0,eval,24.9,3.410119
10233,0.902,1.653,1.832594,0,ammonia,4183273,4b7f0d5bca3a6ac1_4903a58dd547fecf9f5f4492bb1e,2017-11-07,2018-04-27,4183273_2017-11-07,171,1.0,171.0,4b7f0d5bca3a6ac1_4903a58dd547fecf9f5f4492bb1e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7f/4b7f0d5bc...,2018-04-27 11:14:51,4b7f0d5bca3a6ac1_4903a58dd547fecf9f5f4492bb1e_...,4b7f0d5bca3a6ac1_4903a58dd547fecf9f5f4492bb1e_...,39.317,61.0,75.0,0.038882,0.038882,2.332095e-10,3.806092e-09,7.480858e-10,2.43249e-06,5.350698e-09,1.141113e-09,0.999665,3.392791e-09,5.445255e-08,2.732179e-10,0.0003302598,1.617755e-08,5.155525e-07,1.301765e-06,1.411343e-07,3.785148e-09,7.247441e-08,4.78543e-10,7.27334e-11,2.186505e-09,4.516802e-10,7.925628e-11,2.74294e-07,1.584705e-08,3.680352e-10,7.610151e-14,7.395368e-10,2018.0,20180427111451,BWH,Affiniti 70C,Philips Medical Systems,a4c,1.0,test,25.4,2.398337
7480,0.632,1.245,1.969937,0,rubidium,23855372,49004692a66dceb9_4903a44ab12e6bfea1c3eeda8650,2009-05-06,2009-05-06,23855372_2009-05-06,0,1.0,0.0,49004692a66dceb9_4903a44ab12e6bfea1c3eeda8650_...,/mnt/obi0/phi/echo/npyFiles/BWH/4900/49004692a...,2009-05-06 07:55:21,49004692a66dceb9_4903a44ab12e6bfea1c3eeda8650_...,49004692a66dceb9_4903a44ab12e6bfea1c3eeda8650_...,33.333,31.0,65.0,0.039485,0.039485,0.000844653,3.157553e-06,2.194643e-08,1.116456e-06,1.410417e-10,6.501203e-08,0.998814,5.295115e-05,4.470071e-09,3.981303e-09,1.266734e-05,3.402177e-08,5.156852e-08,9.454626e-09,3.872496e-09,7.032413e-08,2.078618e-09,5.728752e-08,6.096406e-09,4.154108e-08,4.196633e-07,9.960001e-06,1.258057e-10,0.0002595027,1.58763e-06,4.449165e-12,6.150728e-09,2009.0,20090506075521,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,1.033323


In [58]:
# Distinct PET studies overall and per tracer. "Excluded" is computed as the
# total minus the ammonia studies, i.e. every study that is not ammonia
# (rubidium and any other tracer value alike).
n_pet_studies = len(dset['petmrn_identifier'].unique())
n_pet_ammonia_studies = len(
    dset.loc[dset['tracer_obi'] == 'ammonia', 'petmrn_identifier'].unique()
)
n_pet_rubidium_studies = len(
    dset.loc[dset['tracer_obi'] == 'rubidium', 'petmrn_identifier'].unique()
)

print(
    f'Total PET studies:    {n_pet_studies}',
    f'Ammonia PET studies:  {n_pet_ammonia_studies}',
    f'Rubidium PET studies: {n_pet_rubidium_studies}',
    f'Excluded PET studies: {n_pet_studies - n_pet_ammonia_studies}',
    sep='\n',
)

Total PET studies:    2408
Ammonia PET studies:  977
Rubidium PET studies: 1425
Excluded PET studies: 1431


In [48]:
# Scratch arithmetic on hard-coded counts. 1431 equals the "Excluded PET
# studies" figure printed by the tracer-count cell above.
# NOTE(review): the origin of 3425 and 831 is not shown in this part of the
# notebook — confirm and replace the literals with computed variables.
3425-1431-831

1163

In [72]:
# Load the global PET/echo data set of this metadata snapshot and print the
# cohort sizes overall and per split.
# NOTE(review): meta_dir was derived from cfr_meta_date in the configuration
# cell at the top of the notebook; re-assigning the date here does not update
# meta_dir, so the two have to be kept in sync by hand.
cfr_meta_date = '200617'
dataset_filename = f'global_pet_echo_dataset_{cfr_meta_date}.parquet'
dset = pd.read_parquet(os.path.join(meta_dir, dataset_filename))

# Optional tracer filter, currently disabled:
#dset_ammonia = dset[dset.tracer_obi=='ammonia']

mode_list = ['train', 'eval', 'test']
numbers_print(dset=dset, mode_list=mode_list)

Patients:     2287
ECHO studies: 3681
PET studies:  2408
Videos:       11077

train
Patients:     1748
ECHO studies: 2811
PET studies:  1844
Videos:       8420

eval
Patients:     195
ECHO studies: 300
PET studies:  204
Videos:       944

test
Patients:     344
ECHO studies: 570
PET studies:  360
Videos:       1713



In [63]:
# Scratch sum of three hard-coded counts.
# NOTE(review): none of these numbers appears in the outputs visible in this
# part of the notebook; presumably a train/eval/test breakdown — confirm the
# source and replace the literals with computed values.
4106+439+867

5412

In [73]:
# TFR videos: read every parquet table matching cfr_a4c_*.parquet under
# <cfr_data_root>/tfr_200617/cfr, stack them and print the cohort sizes
# overall and per split.
tfr_dir = os.path.join(cfr_data_root, 'tfr_200617', 'cfr')
tfr_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_a4c_*.parquet')))
if not tfr_files:
    # pd.concat on an empty list fails with a bare "No objects to concatenate";
    # raise the same exception type with a message that names the directory.
    raise ValueError(f'No cfr_a4c_*.parquet files found in {tfr_dir}')

# Files are read in sorted filename order; the original row indices are kept.
file_df_list = [pd.read_parquet(tfr_path) for tfr_path in tfr_files]
tfr_df = pd.concat(file_df_list)
numbers_print(tfr_df, mode_list)

Patients:     1709
ECHO studies: 2490
PET studies:  1782
Videos:       7445

train
Patients:     1310
ECHO studies: 1907
PET studies:  1365
Videos:       5716

eval
Patients:     148
ECHO studies: 208
PET studies:  154
Videos:       634

test
Patients:     251
ECHO studies: 375
PET studies:  263
Videos:       1095



In [67]:
# Scratch sum of three hard-coded counts.
# NOTE(review): none of these numbers appears in the outputs visible in this
# part of the notebook; presumably a train/eval/test breakdown — confirm the
# source and replace the literals with computed values.
998+97+202

1297