### Compare MRNs in PET and ECHO reports ###
This notebook just compares PET studies and ECHO videos on the file system. 

In [1]:
import os
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [2]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200320'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200320


#### PET REPORTS ####

In [3]:
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018.parquet')
pet_echo_df = pd.read_parquet(pet_echo_file)

pet = pet_echo_df.rename(columns = {'pet_date': 'cfr_study_date'})
pet = pet.astype({'cfr_study_date': 'datetime64[ns]'})
#pet = pet.merge(right = rpet, on = ['mrn', 'cfr_study_date'], how='left')
print(pet.shape)

# Remove rows or columns with all NAs
pet2 = pet.dropna(how='all')

# Remove rows in pet df with missing values in very important columns
pet2 = pet.dropna(subset=['mrn', 'cfr_study_date', 'unaffected_cfr', 'rest_mbf_unaff'], axis=0)

print(pet2.shape)

# NEW PET_REPORT_ID: unique combination of mrn and study date
ID = pet2.groupby(['mrn', 'cfr_study_date']).ngroup()
pet2 = pet2.assign(mrnstudyid=ID)
print('Unique mrn_cfr_date_id values: {}'.format(len(pet2.mrnstudyid.unique())))
print('model_cfr values: {}'.format(len(pet2.unaffected_cfr.unique())))
print('Unique patients: {}'.format(len(pet2.mrn.unique())))

(2659, 112)
(2659, 112)
Unique mrn_cfr_date_id values: 2655
model_cfr values: 2650
Unique patients: 2376


In [4]:
pet2.head(2)

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,...,rest_apical_inferior,rest_apical_lateral,rest_apex,stress_basal_anterior,stress_basal_anteroseptal,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,mrnstudyid
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,1.636577,0.6824,1.1168,5,"[1, 2, 7, 8, 13]",1.202844,"[4, 5, 10, 11, 15]",0.644,0.115952,0.881,0.162315,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,...,4.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,67.0,133.0,69718.0,2540.0,698
1,12853099,2006-01-25,12853099_2006-01-25,0,normal,2.109661,2.124757,1.540882,3.274,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.532,0.123958,3.232,0.436719,1.082,0.94,1.135,1.224,1.332,1.945,1.482,1.977,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,160.0,69969.0,3379.0,1084


#### NPY ECHO DATA ####

In [5]:
echo_files_name = 'echo_npyFiles_BWH_'+cfr_meta_date+'.parquet'
echo = pd.read_parquet(os.path.join(meta_dir, echo_files_name))
echo = echo.astype({'mrn': 'int64'})
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)
print('Echo file: {}'.format(echo_files_name))

Echo file: echo_npyFiles_BWH_200320.parquet


In [6]:
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()
echo.head()

Number of npy files: 1186743
Unique echo studies: 23646
Unique patients:     13422



Unnamed: 0,filename,dir,study,mrn,datetime,echo_study_date,echo_study_time
0,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
1,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
2,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
3,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
4,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24


#### PET STUDIES + NPY ECHO DATA ####

In [7]:
pet_echo = pet.merge(right = echo, on = 'mrn', how = 'left')
pet_echo.tail()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,...,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,filename,dir,study,datetime,echo_study_date,echo_study_time
160411,11827664,2018-10-09,11827664_2018-10-09,1,normal,2.38,2.38,0.8,1.9,17,,,,0.8,,1.9,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175b3ed...,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97,2019-09-27 10:38:57,2019-09-27,10:38:57
160412,11827664,2018-10-09,11827664_2018-10-09,1,normal,2.38,2.38,0.8,1.9,17,,,,0.8,,1.9,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175b3ed...,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97,2019-09-27 10:38:57,2019-09-27,10:38:57
160413,11827664,2018-10-09,11827664_2018-10-09,1,normal,2.38,2.38,0.8,1.9,17,,,,0.8,,1.9,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175b3ed...,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97,2019-09-27 10:38:57,2019-09-27,10:38:57
160414,11827664,2018-10-09,11827664_2018-10-09,1,normal,2.38,2.38,0.8,1.9,17,,,,0.8,,1.9,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175b3ed...,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97,2019-09-27 10:38:57,2019-09-27,10:38:57
160415,11827664,2018-10-09,11827664_2018-10-09,1,normal,2.38,2.38,0.8,1.9,17,,,,0.8,,1.9,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175b3ed...,4a175b3edaaf29b9_4903a58c1bc619d185b1c9479b97,2019-09-27 10:38:57,2019-09-27,10:38:57


In [8]:
# Filter pet studies with current echo
pet_echo_nofiles = pet_echo.loc[pet_echo.filename.isnull()].reset_index(drop = True)
petecho = pet_echo.loc[~pet_echo.study.isnull()].reset_index(drop = True)

# Save those
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH_'+cfr_meta_date+'.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH_'+cfr_meta_date+'.parquet'

pet_echo_nofiles.to_parquet(os.path.join(meta_dir, pet_echo_nofiles_name))
petecho.to_parquet(os.path.join(meta_dir, pet_echo_files_name))

In [9]:
pet_echo_nofiles.tail()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,...,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,filename,dir,study,datetime,echo_study_date,echo_study_time
769,5950324,2018-10-26,5950324_2018-10-26,0,normal,2.920573,2.936366,0.770028,2.261084,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],0.768,0.005487,2.243,0.070517,0.698292,0.805042,0.701417,0.795375,0.783125,0.647125,0.794792,0.907542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,223.0,166129.0,7473.0,,,,NaT,,
770,12224283,2020-01-17,12224283_2020-01-17,1,normal,2.88,2.88,0.8,2.3,17,,,,0.8,,2.3,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,NaT,,
771,8911976,2019-11-08,8911976_2019-11-08,1,normal,3.17,3.17,0.69,2.19,17,,,,0.69,,2.19,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,NaT,,
772,40902504,2019-11-06,40902504_2019-11-06,1,normal,2.11,2.11,0.65,1.37,17,,,,0.65,,1.37,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,NaT,,
773,25017179,2019-10-21,25017179_2019-10-21,1,normal,3.5,3.5,0.62,2.17,17,,,,0.62,,2.17,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,NaT,,


In [10]:
print('PET with echo videos:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print('Unique echo studies: {}'.format(len(petecho.study.unique())))
print('Unique patients:     {}'.format(len(petecho.mrn.unique())))

PET with echo videos:
Unique echo studies: 2954
Unique patients:     1662


In [11]:
print('PET without ECHO:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print('Unique CFR studies: {}'.format(len(pet_echo_nofiles.reportID.unique())))
print('Unique patients:     {}'.format(len(pet_echo_nofiles.mrn.unique())))

PET without ECHO:
Unique CFR studies: 771
Unique patients:     714


In [12]:
pet_echo_nofiles.head()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,...,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,filename,dir,study,datetime,echo_study_date,echo_study_time
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,1.636577,0.6824,1.1168,5,"[1, 2, 7, 8, 13]",1.202844,"[4, 5, 10, 11, 15]",0.644,0.115952,0.881,0.162315,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,...,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,67.0,133.0,69718.0,2540.0,,,,NaT,,
1,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,2.613463,1.469,3.839176,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.465,0.149121,3.892,0.655387,0.945,0.813,0.892,0.97,1.401,1.217,1.614,1.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,146.0,70012.0,3417.0,,,,NaT,,
2,20711909,2006-02-13,20711909_2006-02-13,0,normal,1.670179,1.665151,0.957235,1.593941,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],0.949,0.07038,1.585,0.101941,0.764,0.702,0.623,0.592,0.752,0.762,1.177,1.221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,140.0,70056.0,3544.0,,,,NaT,,
3,4465266,2006-03-13,4465266_2006-03-13,0,equivocal,1.06099,1.040927,2.4678,2.5688,10,"[1, 2, 3, 4, 5, 6, 7, 8, 12, 13]",1.138175,[15],2.607,0.431242,2.766,0.397648,2.475,1.686,1.183,1.547,2.078,3.004,3.069,3.187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,66.0,140.0,70105.0,3230.0,,,,NaT,,
4,20835062,2006-03-13,20835062_2006-03-13,0,abnormal,1.049759,1.094421,0.466,0.51,1,[5],0.999487,"[2, 7, 8, 9, 13, 14, 15, 16, 17]",0.623,0.167041,0.654,0.088593,1.178,0.701,0.551,0.464,0.466,0.527,1.803,0.923,...,0.0,0.0,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,70.0,180.0,70148.0,3554.0,,,,NaT,,


#### ECHO STUDIES ON DCM BUT NOT CONVERTED TO NPY ####

In [13]:
dcm_studies_file = 'echo_deIdentifyedEcho_BWH_dcm_'+cfr_meta_date+'.parquet'
dcm = pd.read_parquet(os.path.join(meta_dir, dcm_studies_file))
dcm = dcm.astype({'mrn': 'int64'})
print(dcm.shape)
dcm.head()

OSError: Passed non-file path: /mnt/obi0/andreas/data/cfr/metadata_200320/echo_deIdentifyedEcho_BWH_dcm_200320.parquet

In [46]:
rm_cols = list(echo.columns)
rm_cols.remove('mrn')
pet_echo_nofiles_x = pet_echo_nofiles.drop(columns = rm_cols)

# These are CFR studies without .npy files
pet_echo_nofiles_x.head()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,...,rest_apical_septal,rest_apical_inferior,rest_apical_lateral,rest_apex,stress_basal_anterior,stress_basal_anteroseptal,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID
0,7924277,2008-08-15,7924277_2008-08-15,0,abnormal,1.368012,1.636577,0.6824,1.1168,5,"[1, 2, 7, 8, 13]",1.202844,"[4, 5, 10, 11, 15]",0.644,0.115952,0.881,0.162315,0.448,0.38,0.435,0.83,0.402,0.334,0.73,0.89,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,67.0,133.0,69718.0,2540.0
1,15324312,2006-02-23,15324312_2006-02-23,0,normal,2.656655,2.613463,1.469,3.839176,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.465,0.149121,3.892,0.655387,0.945,0.813,0.892,0.97,1.401,1.217,1.614,1.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,146.0,70012.0,3417.0
2,20711909,2006-02-13,20711909_2006-02-13,0,normal,1.670179,1.665151,0.957235,1.593941,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],0.949,0.07038,1.585,0.101941,0.764,0.702,0.623,0.592,0.752,0.762,1.177,1.221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,140.0,70056.0,3544.0
3,4465266,2006-03-13,4465266_2006-03-13,0,equivocal,1.06099,1.040927,2.4678,2.5688,10,"[1, 2, 3, 4, 5, 6, 7, 8, 12, 13]",1.138175,[15],2.607,0.431242,2.766,0.397648,2.475,1.686,1.183,1.547,2.078,3.004,3.069,3.187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,66.0,140.0,70105.0,3230.0
4,20835062,2006-03-13,20835062_2006-03-13,0,abnormal,1.049759,1.094421,0.466,0.51,1,[5],0.999487,"[2, 7, 8, 9, 13, 14, 15, 16, 17]",0.623,0.167041,0.654,0.088593,1.178,0.701,0.551,0.464,0.466,0.527,1.803,0.923,...,3.0,4.0,4.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,70.0,180.0,70148.0,3554.0


In [47]:
# Merge CFR studies (without .npy) with _dcm data
pet_echo_nofiles_dcm = pet_echo_nofiles_x.merge(right = dcm, on = 'mrn', how = 'left')

# Keep just the relevant columns
cols = ['mrn', 'cfr_study_date', 'petmrn_identifier',
        'unaffected_cfr', 'filename', 'dir', 'study', 'datetime']

pet_echo_nofiles_dcm = pet_echo_nofiles_dcm[cols]
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr,filename,dir,study,datetime
0,7924277,2008-08-15,7924277_2008-08-15,1.636577,,,,NaT
1,15324312,2006-02-23,15324312_2006-02-23,2.613463,Image-25.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a13/4...,4a134023a879892a_4903a44517b58fa56642e3a22964,2006-12-21 16:06:22
2,15324312,2006-02-23,15324312_2006-02-23,2.613463,Image-7.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a13/4...,4a134023a879892a_4903a44517b58fa56642e3a22964,2006-12-21 16:06:22
3,15324312,2006-02-23,15324312_2006-02-23,2.613463,Image-22.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a13/4...,4a134023a879892a_4903a44517b58fa56642e3a22964,2006-12-21 16:06:22
4,15324312,2006-02-23,15324312_2006-02-23,2.613463,Image-50.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a13/4...,4a134023a879892a_4903a44517b58fa56642e3a22964,2006-12-21 16:06:22


In [48]:
# We have many files for each mrn. 
# But we only want to know the mrn numbers for which have/not have .dcm data
#pet_echo_nofiles_dcm = pet_echo_nofiles_dcm.dropna(axis=1, how='any')
pet_echo_nofiles_dcm = pet_echo_nofiles_dcm.groupby('mrn').first().reset_index(drop = False)
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr,filename,dir,study,datetime
0,81182,2007-08-17,81182_2007-08-17,1.709905,,,,NaT
1,142232,2008-09-12,142232_2008-09-12,1.406022,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
2,240762,2007-05-14,240762_2007-05-14,1.488136,,,,NaT
3,337014,2015-01-13,337014_2015-01-13,1.161245,Image-36.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b3f6be661e_4903a5805096e33b7bded49feda1,2015-01-13 08:42:42
4,355958,2009-05-04,355958_2009-05-04,1.796596,Image-57.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b511f6fc45_4903a44ab12e6bfc035b257db8da,2009-05-04 13:30:47


In [49]:
#pet_echo_nofiles = pet_echo.loc[pet_echo.filename.isnull()].reset_index(drop = True)
df_pet_echo_nodcm = pet_echo_nofiles_dcm.loc[pet_echo_nofiles_dcm.filename.isnull()].\
                    drop(columns = ['filename', 'dir', 'study', 'datetime']).\
                    reset_index(drop=True)

# PET CFR studies where we have .dcm files, but not yet converted to .npy
df_pet_echo_dcm = pet_echo_nofiles_dcm.loc[~pet_echo_nofiles_dcm.filename.isnull()].\
                    drop(columns = 'filename').\
                    reset_index(drop=True)

# Save those
df_pet_echo_nodcm_file = 'pet_echo_deIdentifyedEcho_BWH_nodcm_'+cfr_meta_date+'.parquet'
df_pet_echo_dcm_file = 'pet_echo_deIdentifyedEcho_BWH_dcm_'+cfr_meta_date+'.parquet'
df_pet_echo_nodcm.to_parquet(os.path.join(meta_dir, df_pet_echo_nodcm_file))
df_pet_echo_dcm.to_parquet(os.path.join(meta_dir, df_pet_echo_dcm_file))

In [50]:
print(len(df_pet_echo_dcm.study.unique()))
df_pet_echo_dcm.head()

373


Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr,dir,study,datetime
0,142232,2008-09-12,142232_2008-09-12,1.406022,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,337014,2015-01-13,337014_2015-01-13,1.161245,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b3f6be661e_4903a5805096e33b7bded49feda1,2015-01-13 08:42:42
2,355958,2009-05-04,355958_2009-05-04,1.796596,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b511f6fc45_4903a44ab12e6bfc035b257db8da,2009-05-04 13:30:47
3,813071,2008-04-09,813071_2008-04-09,2.113199,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20
4,1114701,2011-03-07,1114701_2011-03-07,1.657001,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a8657f190974a_4903a58584283852121803e13c25,2010-06-22 10:49:51


In [51]:
print(df_pet_echo_nodcm.shape)

(341, 4)


In [52]:
print(len(df_pet_echo_nodcm.mrn.unique()))
df_pet_echo_nodcm.head()

341


Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr
0,81182,2007-08-17,81182_2007-08-17,1.709905
1,240762,2007-05-14,240762_2007-05-14,1.488136
2,358002,2006-03-13,358002_2006-03-13,1.116887
3,371260,2010-01-27,371260_2010-01-27,1.428194
4,529537,2009-09-30,529537_2009-09-30,0.918612


In [53]:
print('MRNs that are not available as .dcm files: {}'.format(len(df_pet_echo_nodcm.mrn.unique())))

MRNs that are not available as .dcm files: 341


In [54]:
df_pet_echo_dcm.head()

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr,dir,study,datetime
0,142232,2008-09-12,142232_2008-09-12,1.406022,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,337014,2015-01-13,337014_2015-01-13,1.161245,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b3f6be661e_4903a5805096e33b7bded49feda1,2015-01-13 08:42:42
2,355958,2009-05-04,355958_2009-05-04,1.796596,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b45b511f6fc45_4903a44ab12e6bfc035b257db8da,2009-05-04 13:30:47
3,813071,2008-04-09,813071_2008-04-09,2.113199,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20
4,1114701,2011-03-07,1114701_2011-03-07,1.657001,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a8657f190974a_4903a58584283852121803e13c25,2010-06-22 10:49:51


In [55]:
print('Studies that are yet available as .dcm files: {}'.format(len(df_pet_echo_dcm.study.unique())))
print('MRNs that are available as .dcm files: {}'.format(len(df_pet_echo_dcm.mrn.unique())))

Studies that are yet available as .dcm files: 373
MRNs that are available as .dcm files: 373


In [56]:
print('MRNs that are NOT available as .dcm files: {}'.format(len(df_pet_echo_nodcm.mrn.unique())))

MRNs that are NOT available as .dcm files: 341


In [57]:
df_pet_echo_nodcm

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,unaffected_cfr
0,81182,2007-08-17,81182_2007-08-17,1.709905
1,240762,2007-05-14,240762_2007-05-14,1.488136
2,358002,2006-03-13,358002_2006-03-13,1.116887
3,371260,2010-01-27,371260_2010-01-27,1.428194
4,529537,2009-09-30,529537_2009-09-30,0.918612
5,661447,2007-04-05,661447_2007-04-05,1.330137
6,994186,2012-07-02,994186_2012-07-02,1.66656
7,1074152,2009-07-07,1074152_2009-07-07,1.217633
8,1241975,2010-07-30,1241975_2010-07-30,1.615243
9,1688415,2009-04-27,1688415_2009-04-27,1.946004
