### Match ECHO videos with PET CFR labels ###
This notebook assigns PET-derived CFR labels to the echo videos. The goal is a data set that holds a PET CFR value for each echo video, which we will use for training.

In [2]:
import os
import pickle
import numpy as np
import pandas as pd

# Widen the pandas display limits so that long and wide frames are shown in full
for option_name, option_value in (('display.max_rows', 1000),
                                  ('display.max_columns', 100),
                                  ('display.width', 2000)):
    pd.set_option(option_name, option_value)

In [3]:
# Root folder of the CFR data and the date tag of the metadata snapshot
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200320'
# Metadata tables of this snapshot are read from / written to <root>/metadata_<date tag>
meta_dir = os.path.join(cfr_data_root, f'metadata_{cfr_meta_date}')
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200320


#### PET DATA ####

In [8]:
# Load the table with the PET CFR measurements
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018.parquet')
pet_echo_df = pd.read_parquet(pet_echo_file)

# Rename the PET date to cfr_study_date and store it as a datetime so that
# date differences to the echo studies can be computed further down
pet = pet_echo_df.rename(columns = {'pet_date': 'cfr_study_date'})
pet = pet.astype({'cfr_study_date': 'datetime64[ns]'})

# Remove rows in which every value is missing.
# The result is bound to pet2, the name that the print below relies on
# (previously only `pet` was defined here, so a fresh kernel raised NameError)
pet2 = pet.dropna(how='all')
print(f'Shape {pet2.shape}')

Shape (2659, 112)


In [5]:
# NEW PET_REPORT_ID: one integer id for every unique (mrn, study date) pair
ID = pet2.groupby(['mrn', 'cfr_study_date']).ngroup()
pet2 = pet2.assign(mrnstudyid=ID)
# Summary counts: ids, distinct unaffected_cfr values and patients
print(f'Unique mrn_cfr_date_id values: {len(pet2.mrnstudyid.unique())}')
print(f'model_cfr values: {len(pet2.unaffected_cfr.unique())}')
print(f'Unique patients: {len(pet2.mrn.unique())}')

Unique mrn_cfr_date_id values: 2655
model_cfr values: 2650
Unique patients: 2376


#### NPY ECHO FILES ####
This data frame contains the file names, MRNs and acquisition dates of the .npy files of all echo studies.

In [6]:
# Read the table that lists the .npy files of the echo studies
echo_files_name = f'echo_npyFiles_BWH_{cfr_meta_date}.parquet'
echo = pd.read_parquet(os.path.join(meta_dir, echo_files_name))
# Split the acquisition timestamp into a date part and a time part
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

# MRNs as integers (the PET table is merged on mrn) and dates as datetimes
echo = echo.astype({'mrn': 'int64', 'echo_study_date': 'datetime64[ns]'})

print(echo_files_name)
print(f'Number of npy files: {len(echo.filename.unique())}')
print(f'Unique echo studies: {len(echo.study.unique())}')
print(f'Unique patients:     {len(echo.mrn.unique())}')
print()

# For matching dates with PET, we only need the studies
# So we can greatly reduce this data frame to the unique
# (study, mrn, echo_study_date) combinations
echo = (
    echo[['study', 'mrn', 'echo_study_date']]
    .drop_duplicates()
    .reset_index(drop = True)
    .sort_values(by = 'mrn', ascending = True)
)

echo_npyFiles_BWH_200320.parquet
Number of npy files: 1186743
Unique echo studies: 23646
Unique patients:     13422



In [7]:
# Preview the reduced echo table (unique study / mrn / echo_study_date rows)
echo.head()

Unnamed: 0,study,mrn,echo_study_date
3372,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,9241,2010-01-26
3377,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,9241,2010-04-02
3452,4b7b463e0550e9de_4903a58051d59d426fe452739a29,41723,2015-11-03
3383,4b7b463fc2f49a36_4903a58dd5465407d53d20ba3790,51599,2018-05-22
3392,4b7b463fc2f49a36_4903a58793b43bb4bb1078d62f4e,51599,2012-03-08


#### PET STUDIES + NPY ECHO DATA ####

In [8]:
# Left-join the echo studies onto the PET reports by patient: every PET report
# is paired with every echo study of the same mrn; PET reports without any
# echo study are kept with a missing study
pet_echo = (
    pet2
    .merge(right = echo, on = 'mrn', how = 'left')
    .sort_values(by='mrn')
)
pet_echo.iloc[5:20]

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,stress_seg1_mbf,stress_seg2_mbf,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,...,cfr_seg_15,cfr_seg_16,cfr_seg_17,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,rest_basal_anterior,rest_basal_anteroseptal,rest_basal_inferoseptal,rest_basal_inferior,rest_basal_inferolateral,rest_basal_anterolateral,rest_mid_anterior,rest_mid_anteroseptal,rest_mid_inferoseptal,rest_mid_inferior,rest_mid_inferolateral,rest_mid_anterolateral,rest_apical_anterior,rest_apical_septal,rest_apical_inferior,rest_apical_lateral,rest_apex,stress_basal_anterior,stress_basal_anteroseptal,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,mrnstudyid,study,echo_study_date
1712,92742,2010-02-24,92742_2010-02-24,0,probably abnormal,1.336032,1.267473,0.4793,0.6075,10,"[1, 2, 3, 4, 5, 6, 7, 8, 12, 13]",2.650873,[15],0.494,0.040876,0.66,0.05127,0.355,0.399,0.605,0.304,0.444,0.492,0.503,0.608,1.225,0.483,0.424,0.462,0.621,0.577,0.401,0.46,0.564,0.443,0.491,0.818,0.662,0.486,1.012,0.438,0.843,0.863,0.919,0.585,0.415,0.467,0.976,1.063,0.578,...,2.650873,1.256522,1.471631,3.0,0.0,3.0,,normal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,66.0,200.0,101417.0,1646.0,2,4b7b4633cc96d8c2_4903a585855f5450c473b5bd5d51,2010-11-26
2255,92742,2010-11-29,92742_2010-11-29,0,abnormal,1.097118,1.601858,0.8075,1.2935,2,"[4, 5]",0.993763,"[7, 8, 13, 14, 15, 16, 17]",0.937,0.062661,1.028,0.269503,1.121,1.46,0.822,0.638,0.977,0.909,1.044,0.913,1.126,1.244,1.019,1.009,1.208,1.103,0.726,0.597,0.502,1.468,1.801,1.391,1.483,1.104,1.262,1.115,1.869,2.161,1.453,1.11,0.52,0.693,0.86,0.663,0.464,...,0.913223,0.777219,0.778884,15.0,2.0,13.0,1.07,abnormal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,66.0,164.0,107029.0,1646.0,3,4b7b4633cc96d8c2_4903a584a1befac2ae418ace04c3,2011-04-13
2254,92742,2010-11-29,92742_2010-11-29,0,abnormal,1.097118,1.601858,0.8075,1.2935,2,"[4, 5]",0.993763,"[7, 8, 13, 14, 15, 16, 17]",0.937,0.062661,1.028,0.269503,1.121,1.46,0.822,0.638,0.977,0.909,1.044,0.913,1.126,1.244,1.019,1.009,1.208,1.103,0.726,0.597,0.502,1.468,1.801,1.391,1.483,1.104,1.262,1.115,1.869,2.161,1.453,1.11,0.52,0.693,0.86,0.663,0.464,...,0.913223,0.777219,0.778884,15.0,2.0,13.0,1.07,abnormal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,66.0,164.0,107029.0,1646.0,3,4b7b4633cc96d8c2_4903a5858427dadde17abcd03be6,2010-09-27
2253,92742,2010-11-29,92742_2010-11-29,0,abnormal,1.097118,1.601858,0.8075,1.2935,2,"[4, 5]",0.993763,"[7, 8, 13, 14, 15, 16, 17]",0.937,0.062661,1.028,0.269503,1.121,1.46,0.822,0.638,0.977,0.909,1.044,0.913,1.126,1.244,1.019,1.009,1.208,1.103,0.726,0.597,0.502,1.468,1.801,1.391,1.483,1.104,1.262,1.115,1.869,2.161,1.453,1.11,0.52,0.693,0.86,0.663,0.464,...,0.913223,0.777219,0.778884,15.0,2.0,13.0,1.07,abnormal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,66.0,164.0,107029.0,1646.0,3,4b7b4633cc96d8c2_4903a585855f5450c473b5bd5d51,2010-11-26
850,142232,2008-09-12,142232_2008-09-12,0,normal,1.443478,1.406022,0.701353,0.986118,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],0.69,0.016864,0.996,0.077585,0.58,0.539,0.602,0.425,0.663,0.51,0.839,0.839,0.714,0.758,0.759,0.795,0.727,0.878,0.724,0.738,0.833,0.647,0.379,0.579,0.893,0.936,0.803,0.935,0.991,1.034,1.043,1.18,1.189,1.243,1.509,1.249,0.944,...,1.725138,1.279133,1.452581,0.0,0.0,0.0,0.9,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,185.0,90848.0,434.0,4,,NaT
2087,146456,2010-08-23,146456_2010-08-23,0,normal,1.529131,1.516989,1.045647,1.586235,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.047,0.046015,1.601,0.202165,0.687,0.914,0.839,0.952,1.116,0.706,0.964,1.032,1.075,1.102,1.12,1.369,1.252,1.277,1.456,0.864,1.051,0.884,0.861,1.262,0.912,1.315,1.387,2.11,1.485,1.409,2.117,1.791,1.635,1.988,2.008,2.293,1.805,...,1.574863,2.08912,1.621313,0.0,0.0,0.0,,normal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,170.0,105164.0,3677.0,5,4b7b475df0c06b82_4903a58584260812ef8076acd261,2010-08-31
2086,146456,2010-08-23,146456_2010-08-23,0,normal,1.529131,1.516989,1.045647,1.586235,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.047,0.046015,1.601,0.202165,0.687,0.914,0.839,0.952,1.116,0.706,0.964,1.032,1.075,1.102,1.12,1.369,1.252,1.277,1.456,0.864,1.051,0.884,0.861,1.262,0.912,1.315,1.387,2.11,1.485,1.409,2.117,1.791,1.635,1.988,2.008,2.293,1.805,...,1.574863,2.08912,1.621313,0.0,0.0,0.0,,normal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,170.0,105164.0,3677.0,5,4b7b475df0c06b82_4903a585842609311ccba48ea0c6,2010-08-23
2621,191122,2012-01-20,191122_2012-01-20,0,abnormal,2.415118,2.080237,0.9555,1.987667,6,"[1, 2, 3, 4, 5, 6]",2.200055,"[13, 14, 15, 17]",0.807,0.668415,1.949,0.206753,0.452,0.737,0.821,0.652,2.577,0.494,0.802,0.926,1.322,1.01,0.726,0.649,0.681,0.965,1.14,3.655,0.823,1.225,1.493,1.721,2.056,2.142,3.289,1.702,2.101,2.569,2.437,2.053,1.954,1.813,2.198,1.927,1.902,...,1.690351,0.520383,2.432564,12.0,12.0,0.0,,normal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,3.0,62.0,246.0,114826.0,1459.0,7,4b7b4750ff437953_4903a58051d4fbaec473a17fc610,2015-10-28
2622,191122,2012-01-20,191122_2012-01-20,0,abnormal,2.415118,2.080237,0.9555,1.987667,6,"[1, 2, 3, 4, 5, 6]",2.200055,"[13, 14, 15, 17]",0.807,0.668415,1.949,0.206753,0.452,0.737,0.821,0.652,2.577,0.494,0.802,0.926,1.322,1.01,0.726,0.649,0.681,0.965,1.14,3.655,0.823,1.225,1.493,1.721,2.056,2.142,3.289,1.702,2.101,2.569,2.437,2.053,1.954,1.813,2.198,1.927,1.902,...,1.690351,0.520383,2.432564,12.0,12.0,0.0,,normal,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,3.0,62.0,246.0,114826.0,1459.0,7,4b7b4750ff437953_4903a44ab129ee8b9d4653dcee8a,2009-02-07
1093,191122,2009-02-09,191122_2009-02-09,0,abnormal,1.918107,1.333333,1.325,1.766667,3,"[5, 6, 11]",2.167903,"[8, 9, 13, 14, 17]",1.099,0.423529,2.108,0.551395,1.066,2.099,1.57,0.673,0.661,2.11,1.207,2.631,1.087,1.511,1.204,0.462,0.888,1.042,1.037,2.59,1.094,1.146,0.841,1.412,2.325,1.318,1.81,2.046,2.474,3.159,2.19,2.172,1.608,2.521,3.275,2.897,1.632,...,2.793635,0.630116,2.913163,10.0,10.0,0.0,0.92,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,65.0,229.0,92982.0,1459.0,6,4b7b4750ff437953_4903a58051d4fbaec473a17fc610,2015-10-28


In [9]:
# Inspect all merged rows of one echo study (the merge on mrn pairs it with
# every PET report of the same patient)
pet_echo[pet_echo.study=='4b7a857e1fbeb850_4903a58c1bcca937f65d91c9e305']

Unnamed: 0,mrn,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,unaffected_segments,perf_seg_cfr,perf_segs,rest_global_mbf,var_rest_mbf,stress_global_mbf,var_stress_mbf,rest_seg1_mbf,rest_seg2_mbf,rest_seg3_mbf,rest_seg4_mbf,rest_seg5_mbf,rest_seg6_mbf,rest_seg7_mbf,rest_seg8_mbf,rest_seg9_mbf,rest_seg10_mbf,rest_seg11_mbf,rest_seg12_mbf,rest_seg13_mbf,rest_seg14_mbf,rest_seg15_mbf,rest_seg16_mbf,rest_seg17_mbf,stress_seg1_mbf,stress_seg2_mbf,stress_seg3_mbf,stress_seg4_mbf,stress_seg5_mbf,stress_seg6_mbf,stress_seg7_mbf,stress_seg8_mbf,stress_seg9_mbf,stress_seg10_mbf,stress_seg11_mbf,stress_seg12_mbf,stress_seg13_mbf,stress_seg14_mbf,stress_seg15_mbf,stress_seg16_mbf,...,cfr_seg_15,cfr_seg_16,cfr_seg_17,summed_stress_score,summed_rest_score,summed_difference_score,TID_ratio,gated_spect_results,agatston_coronary_calcium_score,rest_basal_anterior,rest_basal_anteroseptal,rest_basal_inferoseptal,rest_basal_inferior,rest_basal_inferolateral,rest_basal_anterolateral,rest_mid_anterior,rest_mid_anteroseptal,rest_mid_inferoseptal,rest_mid_inferior,rest_mid_inferolateral,rest_mid_anterolateral,rest_apical_anterior,rest_apical_septal,rest_apical_inferior,rest_apical_lateral,rest_apex,stress_basal_anterior,stress_basal_anteroseptal,stress_basal_inferoseptal,stress_basal_inferior,stress_basal_inferolateral,stress_basal_anterolateral,stress_mid_anterior,stress_mid_anteroseptal,stress_mid_inferoseptal,stress_mid_inferior,stress_mid_inferolateral,stress_mid_anterolateral,stress_apical_anterior,stress_apical_septal,stress_apical_inferior,stress_apical_lateral,stress_apex,height_in,weight_lb,reportID,subjectID,mrnstudyid,study,echo_study_date
4194,1252980,2019-04-02,1252980_2019-04-02,1,normal,1.59,1.59,1.03,1.64,17,,,,1.03,,1.64,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87,4b7a857e1fbeb850_4903a58c1bcca937f65d91c9e305,2019-03-20
225,1252980,2007-10-01,1252980_2007-10-01,0,normal,1.827052,1.822454,1.920294,3.499647,17,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0.0,[],1.937,0.158027,3.539,0.625532,1.718,1.316,1.082,1.148,1.995,1.796,2.116,2.011,1.797,2.001,2.278,2.192,2.188,2.175,2.366,2.207,2.259,3.144,2.604,1.66,2.277,2.786,3.272,3.912,4.049,3.296,3.698,4.484,4.272,3.818,3.925,4.351,3.791,...,1.838969,1.717716,1.839309,0.0,0.0,0.0,0.91,normal,1940.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,132.0,72981.0,1246.0,86,4b7a857e1fbeb850_4903a58c1bcca937f65d91c9e305,2019-03-20


In [12]:
# Split the merged table on the study column:
# rows without a study are PET reports for which we don't have .npy files,
# rows with a study are the PET/echo pairs used for the matching below
pet_echo_nofiles = pet_echo[pet_echo.study.isnull()].reset_index(drop = True)
petecho = pet_echo[pet_echo.study.notnull()].reset_index(drop = True)

In [13]:
# Size of the subset of PET reports that have at least one echo study
print('PET with echo videos:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print(f'Unique echo studies: {len(petecho.study.unique())}')
print(f'Unique patients:     {len(petecho.mrn.unique())}')

PET with echo videos:
Unique echo studies: 2954
Unique patients:     1662


In [20]:
# Size of the subset of PET reports for which no echo study was found
print('PET without ECHO:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print(f'Unique CFR studies: {len(pet_echo_nofiles.reportID.unique())}')
print(f'Unique patients:     {len(pet_echo_nofiles.mrn.unique())}')

PET without ECHO:
Unique CFR studies: 771
Unique patients:     714


### Match ECHO study with PET reportID  ###
Goal:
Find a CFR label for each echo video taken from the closest PET measurement.
Requirement: For each echo study, we pick the CFR measurement that is closest in time (before or after the echo) and keep the match only if the two dates are no more than 365 days apart.
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [16]:
# 1 PICK A PATIENT
# (a fixed MRN is used; the commented lines draw a random one instead)
#np.random.seed(112)
#mrn = np.random.choice(petecho.mrn.unique(), size = 1,)[0]
mrn = 191122
print(f'MRN {mrn}')
# All PET/echo pairs of this patient, restricted to the columns used for the matching
petecho_mrn = petecho.loc[petecho.mrn == mrn].reset_index(drop=True)
matching_columns = ['mrn', 'mrnstudyid', 'cfr_study_date', 'echo_study_date', 'study']
print(petecho_mrn[matching_columns])

MRN 191122
      mrn  mrnstudyid cfr_study_date echo_study_date                                          study
0  191122           7     2012-01-20      2015-10-28  4b7b4750ff437953_4903a58051d4fbaec473a17fc610
1  191122           7     2012-01-20      2009-02-07  4b7b4750ff437953_4903a44ab129ee8b9d4653dcee8a
2  191122           6     2009-02-09      2015-10-28  4b7b4750ff437953_4903a58051d4fbaec473a17fc610
3  191122           6     2009-02-09      2009-02-07  4b7b4750ff437953_4903a44ab129ee8b9d4653dcee8a


In [15]:
# ALL ECHO STUDIES FOR THIS PATIENT
# (unique study identifiers, then the unique echo dates)
print(list(petecho_mrn.study.unique()))
print(list(petecho_mrn.echo_study_date.unique()))

['4b7b4750ff437953_4903a58051d4fbaec473a17fc610', '4b7b4750ff437953_4903a44ab129ee8b9d4653dcee8a']
[numpy.datetime64('2015-10-28T00:00:00.000000000'), numpy.datetime64('2009-02-07T00:00:00.000000000')]


In [16]:
# EACH ECHO STUDY FOR THIS PATIENT HAS A DATE
# Take the first echo study of the patient and read its date from the first matching row
echo_study = petecho_mrn.study.unique()[0]
echo_study_date = petecho_mrn.loc[petecho_mrn.study == echo_study, 'echo_study_date'].values[0]
print(echo_study)
print(echo_study_date)

4b7b4750ff437953_4903a58051d4fbaec473a17fc610
2015-10-28T00:00:00.000000000


In [17]:
# One row per PET study (mrnstudyid) of this patient with its study date
pet_dates = petecho_mrn[['mrnstudyid', 'cfr_study_date']].groupby('mrnstudyid').first().reset_index(drop=False)

In [18]:
# Show the PET studies of this patient
pet_dates

Unnamed: 0,mrnstudyid,cfr_study_date
0,6,2009-02-09
1,7,2012-01-20


In [19]:
# For each cfr date for this patient, calculate the time difference to the echo date
# days     = cfr date - echo date (negative: the PET study was before the echo)
# days_abs = absolute value of that difference
pet_dates = (
    petecho_mrn[['mrnstudyid', 'cfr_study_date']]
    .groupby('mrnstudyid')
    .first()
    .reset_index(drop=False)
    .assign(days=lambda dates: (dates.cfr_study_date-echo_study_date).dt.days)
    .assign(days_abs=lambda dates: np.abs(dates.days))
)
print()
print(pet_dates)


   mrnstudyid cfr_study_date  days  days_abs
0           6     2009-02-09 -2452      2452
1           7     2012-01-20 -1377      1377


In [20]:
# Now, we can find the PET study that is closest in time to the echo
# (smallest absolute day difference) and the corresponding mrnstudyid.
# Note: the printed labels below say "ReportID" and "earliest", but the value
# is the mrnstudyid of the closest PET study.
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]
print('ReportID of earliest PET study {}'.format(ser.mrnstudyid))
print('cfr_days_after_echo {}'.format(ser.days))

ReportID of earliest PET study 7
cfr_days_after_echo -1377


In [70]:
# Is there at least one pet study per mrn?
# Counts the non-missing mrnstudyid entries (rows of petecho) for every mrn
d = petecho[['mrn', 'mrnstudyid']].groupby('mrn').count().reset_index(drop=False)

In [66]:
# Walk-through of the matching steps for a single MRN and a single echo study.
# The function header and both loops are commented out, so only one patient
# (position m in the sorted MRN list) and that patient's first study are processed.
#def get_earliest_cfr_reports(df_cfr_echo):
# Loop through all MRNs in df_pet_echo
mrn_list = sorted(list(petecho.mrn.unique()))
cfr_echo_days = pd.DataFrame()

# Fixed position in the sorted MRN list that is inspected here
m = 200
mrn = mrn_list[m]
#for m, mrn in enumerate(mrn_list):

# Progress message, printed for every 200th MRN
if (m+1)%200==0:
    print('Processing MRN {} of {}'.format(m+1, len(mrn_list)))

# All PET/echo pairs of this patient and the list of the patient's echo studies
df_mrn = petecho[petecho.mrn == mrn]
echo_study_list = list(df_mrn.study.unique())

# Loop through all echo studies for this MRN
study = echo_study_list[0]
#for study in echo_study_list:
# Date of this echo study, taken from its first row
echo_study_date = df_mrn[df_mrn.study == study].echo_study_date.values[0]

# Find all pet dates for this patient and add the difference to the echo dates
# days_post_cfr = echo date - cfr date (negative: the echo was before the PET study)
pet_dates = df_mrn[['mrnstudyid', 'cfr_study_date']].groupby('mrnstudyid').first().reset_index(drop=False)
pet_dates = pet_dates.assign(days_post_cfr=(echo_study_date-pet_dates.cfr_study_date).dt.days)
pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days_post_cfr))
# Row of the PET study with the smallest absolute day difference
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]

# Collect the mrnstudyid and the day difference of the closest pet study
study_dict = {'mrn': [mrn],
              'study': [study],
              'echo_study_date': [echo_study_date],
              'mrnstudyid': [ser.mrnstudyid],
              'days_post_cfr': [ser.days_post_cfr]}
cfr_echo_days = pd.concat([cfr_echo_days, pd.DataFrame(study_dict)], ignore_index=True)
print(pet_dates)

   mrnstudyid cfr_study_date  days_post_cfr  days_abs
0         318     2008-08-26           3394      3394
1         319     2017-12-21            -10        10


In [14]:
def get_earliest_cfr_reports(df_cfr_echo):
    """ 
    Match every echo study with the CFR (PET) study that is closest in time.

    Despite the function name, the selected PET study is not the earliest one:
    it is the one with the smallest absolute number of days between the PET
    date and the echo date, before or after the echo. If several PET studies
    are equally close, the one with the smallest mrnstudyid is taken.

    args: df_cfr_echo: pd.DataFrame()
          with columns: mrn, study, mrnstudyid, cfr_study_date, echo_study_date
          Expected to hold one row for each pair of echo study and PET study
          of a patient.
    returns: cfr_echo_days: pd.DataFrame()
          with columns: mrn, study, echo_study_date, mrnstudyid, days_post_cfr
          One row per (mrn, study). days_post_cfr is the echo date minus the
          cfr date in days (negative: the echo was done before the PET study).
          For an empty input, an empty frame with these columns is returned.
    """

    output_columns = ['mrn', 'study', 'echo_study_date', 'mrnstudyid', 'days_post_cfr']

    # Collect one record per echo study and build the data frame once at the
    # end; growing a frame with pd.concat inside the loop copies it every time.
    records = []

    # Loop through all MRNs in ascending order
    mrn_groups = df_cfr_echo.groupby('mrn', sort=True)
    mrn_count = mrn_groups.ngroups
    for m, (mrn, df_mrn) in enumerate(mrn_groups):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, mrn_count))

        # All PET studies of this patient, one row per mrnstudyid, sorted by id.
        # This does not depend on the echo study, so it is computed once per MRN.
        pet_dates = (df_mrn[['mrnstudyid', 'cfr_study_date']]
                     .groupby('mrnstudyid')
                     .first()
                     .reset_index(drop=False))

        # Loop through all echo studies for this MRN (in order of appearance)
        for study in df_mrn.study.unique():
            echo_study_date = df_mrn.loc[df_mrn.study == study, 'echo_study_date'].values[0]

            # Signed day difference between this echo and every PET study
            days_post_cfr = (echo_study_date - pet_dates.cfr_study_date).dt.days
            # idxmin returns the first minimum, i.e. the smallest mrnstudyid on ties
            closest = days_post_cfr.abs().idxmin()

            records.append({'mrn': mrn,
                            'study': study,
                            'echo_study_date': echo_study_date,
                            'mrnstudyid': pet_dates.at[closest, 'mrnstudyid'],
                            'days_post_cfr': days_post_cfr.at[closest]})

    cfr_echo_days = pd.DataFrame(records, columns=output_columns)

    return cfr_echo_days

In [15]:
# Run the matching for all PET/echo pairs: one row per echo study with the
# mrnstudyid of the selected PET study and the day difference
cfr_echo_days = get_earliest_cfr_reports(petecho)

Processing MRN 200 of 1662
Processing MRN 400 of 1662
Processing MRN 600 of 1662
Processing MRN 800 of 1662
Processing MRN 1000 of 1662
Processing MRN 1200 of 1662
Processing MRN 1400 of 1662
Processing MRN 1600 of 1662


In [30]:
# After the match, do we have the same number of patients
# and echo studies as in petecho?
print(f'Patients after match     {len(cfr_echo_days.mrn.unique())}')
print(f'Echo studies after match {len(cfr_echo_days.study.unique())}')

Patients after match     1662
Echo studies after match 2954


In [31]:
# We can merge this back with the original PET reports
# to attach all PET columns of the selected PET study to every echo study.
# NOTE(review): when this notebook was run, pet2 had 2659 rows but only 2655
# unique mrnstudyid values, so this merge can return more than one row for an
# echo study.
pet_cfr_echo_days = cfr_echo_days.merge(right = pet2, on = ['mrnstudyid', 'mrn'], how = 'left')

In [32]:
# After merging the PET columns back in, do we still have
# the same number of patients and echo studies?
print(f'Patients after match     {len(pet_cfr_echo_days.mrn.unique())}')
print(f'Echo studies after match {len(pet_cfr_echo_days.study.unique())}')

Patients after match     1662
Echo studies after match 2954


In [34]:
# Next, we keep only those echo studies that are at most 365 days away
# (earlier or later) from the matched CFR measurement.
# The result is sorted by study so that we know how well we matched the dates
pet_cfr_echo_days_365 = (
    pet_cfr_echo_days[np.abs(pet_cfr_echo_days.days_post_cfr) <= 365]
    .sort_values(by = 'study')
)

In [36]:
# How many patients and echo studies are left after the 365 day filter?
print(f'Patients after 365 filter     {len(pet_cfr_echo_days_365.mrn.unique())}')
print(f'Echo studies after 365 filter {len(pet_cfr_echo_days_365.study.unique())}')

Patients after 365 filter     1577
Echo studies after 365 filter 2292


In [54]:
# Patients who dropped out because of the 365 time limit:
# MRNs present after the match but missing after the filter
mrn_before_filter = set(cfr_echo_days.mrn.unique())
mrn_after_filter = set(pet_cfr_echo_days_365.mrn.unique())
mrn_365_dropped = list(mrn_before_filter - mrn_after_filter)
print(f'MRNs lost after 365 filter {len(mrn_365_dropped)}')
# Keep the list of dropped MRNs next to the other metadata files
mrn_dropped_file = os.path.join(meta_dir, 'mrn_365_dropped.pkl')
with open(mrn_dropped_file, 'wb') as fl:
    pickle.dump(mrn_365_dropped, fl)

MRNs lost after 365 filter 85


In [55]:
# To match with filenames, we really only need one study per row. 
# First drop every column that has a missing value in any row,
# then keep the first row of each study
pet_cfr_echo_days_noNA = pet_cfr_echo_days_365.dropna(axis='columns', how='any')
pet_cfr_echo_days_study = (
    pet_cfr_echo_days_noNA
    .groupby('study')
    .first()
    .reset_index(drop=False)
)

print(f'Original shape: {cfr_echo_days.shape} now: {pet_cfr_echo_days_365.shape}')

print(f'Original number of studies: {len(cfr_echo_days.study.unique())} '
      f'now: {len(pet_cfr_echo_days_study.study.unique())}')

Original shape: (2954, 5) now: (2297, 116)
Original number of studies: 2954 now: 2292


In [56]:
# Preview the matched dates: days_post_cfr is the echo date minus the cfr date
d = pet_cfr_echo_days_study[['mrn', 'study', 'cfr_study_date', 'echo_study_date', 'days_post_cfr']]
d.head(10)

Unnamed: 0,mrn,study,cfr_study_date,echo_study_date,days_post_cfr
0,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-04-25,2018-11-02,191
1,35014216,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,2018-05-08,2019-04-25,352
2,35029545,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,2017-07-24,2017-05-16,-69
3,35583509,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,2018-08-08,2017-09-06,-336
4,35401116,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,2019-04-11,2019-04-09,-2
5,35401116,48b095533cbfdf01_4903a58dd5441181fcd696f7d50c,2019-04-11,2018-07-25,-260
6,35647072,48b097cb6874ce59_4903a582edf18da17812c4897c25,2017-11-01,2017-10-25,-7
7,35995307,48b0984d74745d55_4903a582edf18da3b9aa6d4547eb,2017-10-27,2017-10-27,0
8,34940536,48b1a5aeaf91d4d2_4903a582edf18da48d51c09a959b,2017-06-26,2017-10-20,116
9,34530097,48b1a9c06ce2b304_4903a582ec7226d2e7bd522f25ce,2017-04-18,2017-03-31,-18


In [57]:
# Preview the columns that are left after removing columns with missing values
pet_cfr_echo_days_study.head(2)

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661


In [58]:
# Shape and first rows of the table with one row per matched echo study
print(pet_cfr_echo_days_study.shape)
pet_cfr_echo_days_study.head()

(2292, 16)


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2617,-69,2017-07-24,35029545_2017-07-24,0,abnormal,2.698341,2.554037,0.599508,1.531167,5,0.663,1.789
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2620,-336,2018-08-08,35583509_2018-08-08,0,abnormal,1.99466,2.02646,0.544319,1.103042,3,0.749,1.494
4,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,35401116,2019-04-09,2619,-2,2019-04-11,35401116_2019-04-11,1,normal,1.11,1.11,0.79,0.88,17,0.79,0.88


In [59]:
# Summary of the final match compared with all echo studies in petecho
print(f'Number of echo studies within 365 days of cfr {len(pet_cfr_echo_days_study.study.unique())}')
print(f'Number of PET studies {len(pet_cfr_echo_days_study.mrnstudyid.unique())}')
print(f'Total number of studies in petecho df {len(petecho.study.unique())}')
print(f'Unique patients:     {len(pet_cfr_echo_days_study.mrn.unique())}')

Number of echo studies within 365 days of cfr 2292
Number of PET studies 1638
Total number of studies in petecho df 2954
Unique patients:     1577


In [60]:
# We can now save the data frame with the echo studies and the matched CFR measurements
# as a .parquet file in the metadata folder
match_filename = f'pet_match365_diff_{cfr_meta_date}.parquet'
pet_cfr_echo_days_study.to_parquet(os.path.join(meta_dir, match_filename))

# Save also as .csv file
match_filename_csv = f'pet_match365_diff_{cfr_meta_date}.csv'
pet_cfr_echo_days_study.to_csv(os.path.join(meta_dir, match_filename_csv))

In [61]:
# First rows of the table that was written to disk
pet_cfr_echo_days_study.head()

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2617,-69,2017-07-24,35029545_2017-07-24,0,abnormal,2.698341,2.554037,0.599508,1.531167,5,0.663,1.789
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2620,-336,2018-08-08,35583509_2018-08-08,0,abnormal,1.99466,2.02646,0.544319,1.103042,3,0.749,1.494
4,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,35401116,2019-04-09,2619,-2,2019-04-11,35401116_2019-04-11,1,normal,1.11,1.11,0.79,0.88,17,0.79,0.88


#### Filter the large echo_BWH_npy_feather_files dataframe for easier metadata collection ####

In [65]:
# With the names of the matched echo studies, we can now get the file names
# of the meta files that we will need

# Input: table of all meta files; output: the subset for the matched studies
meta_files_name = 'echo_BWH_npy_feather_files_'+cfr_meta_date+'.parquet'
meta_files_cfr_name = 'echo_BWH_npy_feather_files_cfr_'+cfr_meta_date+'.parquet'

meta_files_df = pd.read_parquet(os.path.join(meta_dir, meta_files_name))
# Keep only the rows whose study has a matched CFR measurement
meta_files_df_matched = meta_files_df[meta_files_df.study.isin(pet_cfr_echo_days_study.study.unique())]
print(meta_files_df_matched.shape)
print('Unique studies in meta data: {}'.format(len(meta_files_df_matched.study.unique())))
# (a discarded `meta_files_df_matched.head(2)` call was removed here: its
# result was neither displayed nor used)
meta_files_df_matched.to_parquet(os.path.join(meta_dir, meta_files_cfr_name))

(603984, 8)
Unique studies in meta data: 2291
