### Match ECHO videos with PET CFR labels ###
We need this to assign the CFR labels to the videos. The goal is to have a data set with PET CFR values for each video that we will use to train.

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 2000)

In [2]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200219'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200219


#### CFR REPORTS ####

In [3]:
# Rahul's PET studies
pet_echo_file = os.path.join(cfr_data_root, 'bwh_2020-Feb-4_pet_365_diff.txt')
pet_echo_df = pd.read_csv(pet_echo_file, sep = '\t') # Contains all PET MRNs use this one.
pet = pet_echo_df.rename(columns = {'MRN': 'mrn'}).reset_index(drop = True)
print('Unique patients:   {}'.format(len(pet.mrn.unique())))

# Let's rename the columns to allow matching the dates
pet = pet.rename(columns = {'pet_date': 'cfr_study_date',
                            'echo_date': 'rahul_echo_date'})
pet = pet.astype({'cfr_study_date': 'datetime64[ns]',
                  'rahul_echo_date': 'datetime64[ns]'})
rpet = pet.copy()
print(list(rpet.columns))

Unique patients:   3567
['mrn', 'rahul_echo_date', 'cfr_study_date', 'difference(days)']


In [175]:
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018_noseg7.parquet')
pet_echo_df = pd.read_parquet(pet_echo_file)
pet = pet_echo_df.rename(columns = {'study_date': 'cfr_study_date'})
pet = pet.astype({'cfr_study_date': 'datetime64[ns]'})
pet = pet.merge(right = rpet, on = ['mrn', 'cfr_study_date'])

# Remove rows or columns with all NAs
pet = pet.dropna(how='all')

# Remove rows in pet df with missing values in very important columns
pet = pet.dropna(subset=['mrn', 'cfr_study_date', 'model_cfr'], axis=0)

# Remove colums where some rows have missing values
drop_cols = ['unaffected_cfr', 'perf_segs', 'perf_seg_cfr', 'unaffected_segments']
pet2 = pet.drop(columns=drop_cols)

pet2.head(2)

Unnamed: 0,reportID,subjectID,mrn,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days)
0,69718.0,2540.0,7924277,2008-08-15,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,67.0,133.0,18.0,18.0,0.0,excellent,moderately dilated,abnormal,30.0,25.0,202.0,190.0,132.0,188.0,141.0,1.09,normal,Rb-82 Dipyridamole,One-day study,76.0,160.0,55.0,152.0,44.0,75.0,,False,...,1.216092,0.419277,1.38806,1.682635,1.624719,1.184358,1.293814,1.871499,1.441069,1.467842,1.278068,1.505618,0.876119,1.114362,0.695471,0.115952,0.888437,0.161178,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,8.0,0.0,0.0,0.6705,4.0,1.368012,1.623875,,,,,,,,,,2008-08-14,1
1,69969.0,3379.0,12853099,2006-01-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,160.0,0.0,0.0,0.0,excellent,normal,normal,,72.0,,,,59.0,17.0,,not assessed,Rb-82 Dipyridamole,One-day study,82.0,146.0,89.0,144.0,84.0,75.0,0.0,False,...,1.822907,2.170752,1.971471,1.908483,1.699039,1.501695,2.421498,2.577866,2.689058,1.803998,1.196789,2.076923,2.533481,1.536166,1.540882,0.123958,3.229812,0.430427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.544562,16.0,2.109661,2.201288,,,,,,,,,,2006-01-24,1


In [176]:
# NEW PET_REPORT_ID: unique combination of mrn and study date
ID = pet2.groupby(['mrn', 'cfr_study_date']).ngroup()
pet2 = pet2.assign(mrnstudydateid=ID)
print('Unique mrn_cfr_date_id values: {}'.format(len(pet2.mrnstudydateid.unique())))
print('model_cfr values: {}'.format(len(pet2.model_cfr.unique())))
print('Unique patients: {}'.format(len(pet2.mrn.unique())))

Unique mrn_cfr_date_id values: 2472
model_cfr values: 2467
Unique patients: 2343


In [177]:
# PET CFR REPORTS (OUTDATED)
getStress_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
getStress_df = pd.read_excel(getStress_reports_file)
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
getStress_df = getStress_df[keep_cols]

getStress_df = getStress_df.astype({'report_date': 'datetime64[ns]',
                                    'study_date': 'datetime64[ns]'})

pet_cfr = getStress_df.rename(columns = {'report_date': 'cfr_report_date',
                                         'study_date': 'cfr_study_date'})
pet_cfr.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606
3,73048,16480857,10,0728923B,2007-11-26,2007-10-29,1.1075
4,100873,7633498,11,EVS0266850,2010-01-29,2010-09-24,1.776545


In [178]:
# Let's see where the mrn-date combination did not match
df_nomatch = pet2.loc[pet2.rahul_echo_date.isnull()]
print(df_nomatch.shape)
df_nomatch.head()

(0, 357)


Unnamed: 0,reportID,subjectID,mrn,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days),mrnstudydateid


#### NPY ECHO FILES ####
This is a data frame that contains the filenames, MRNs and dates of all echo studies

In [179]:
echo_files_name = 'echo_npyFiles_BWH_'+cfr_meta_date+'.parquet'
echo = pd.read_parquet(os.path.join(meta_dir, echo_files_name))
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

echo = echo.astype({'mrn': 'int64',
                    'echo_study_date': 'datetime64[ns]'})

print(echo_files_name)
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()

# For matching dates with PET, we only need the studies
# So we can greatly reduce this data frame
echo = echo[['study', 'mrn', 'echo_study_date']].drop_duplicates().reset_index(drop = True).\
    sort_values(by = 'mrn', ascending = True)

echo_npyFiles_BWH_200219.parquet
Number of npy files: 1056913
Unique echo studies: 21045
Unique patients:     12520



In [180]:
echo.head()

Unnamed: 0,study,mrn,echo_study_date
2979,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,9241,2010-01-26
2984,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,9241,2010-04-02
3053,4b7b463e0550e9de_4903a58051d59d426fe452739a29,41723,2015-11-03
2999,4b7b463fc2f49a36_4903a58793b43bb4bb1078d62f4e,51599,2012-03-08
2990,4b7b463fc2f49a36_4903a58dd5465407d53d20ba3790,51599,2018-05-22


#### PET STUDIES + NPY ECHO DATA ####

In [181]:
pet_echo = pet2.merge(right = echo, on = 'mrn', how = 'left')
pet_echo.head()

Unnamed: 0,reportID,subjectID,mrn,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days),mrnstudydateid,study,echo_study_date
0,69718.0,2540.0,7924277,2008-08-15,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,67.0,133.0,18.0,18.0,0.0,excellent,moderately dilated,abnormal,30.0,25.0,202.0,190.0,132.0,188.0,141.0,1.09,normal,Rb-82 Dipyridamole,One-day study,76.0,160.0,55.0,152.0,44.0,75.0,,False,...,1.682635,1.624719,1.184358,1.293814,1.871499,1.441069,1.467842,1.278068,1.505618,0.876119,1.114362,0.695471,0.115952,0.888437,0.161178,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,8.0,0.0,0.0,0.6705,4.0,1.368012,1.623875,,,,,,,,,,2008-08-14,1,639,,NaT
1,69969.0,3379.0,12853099,2006-01-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,160.0,0.0,0.0,0.0,excellent,normal,normal,,72.0,,,,59.0,17.0,,not assessed,Rb-82 Dipyridamole,One-day study,82.0,146.0,89.0,144.0,84.0,75.0,0.0,False,...,1.908483,1.699039,1.501695,2.421498,2.577866,2.689058,1.803998,1.196789,2.076923,2.533481,1.536166,1.540882,0.123958,3.229812,0.430427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.544562,16.0,2.109661,2.201288,,,,,,,,,,2006-01-24,1,995,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-24
2,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-04-11,78,1600,4903a37dd868ef56_4903a44516df7db647ae31be78ef,2006-04-11
3,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-04-11,78,1600,4903a37dd868ef56_4903a44516d8ca53ad30d0e53fe4,2006-03-06
4,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-03-06,42,1600,4903a37dd868ef56_4903a44516df7db647ae31be78ef,2006-04-11


In [119]:
pet_echo[pet_echo.study=='4b7a857e1fbeb850_4903a58c1bcca937f65d91c9e305']

Unnamed: 0,reportID,subjectID,mrn,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,unaffected_segments,unaffected_cfr,perf_segs,perf_seg_cfr,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days),mrnstudydateid,study,echo_study_date
8095,,,1252980,2019-04-02,,,,,,,,,NaT,,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.03,,,1.59,1.64,,,,,normal,,1.59223301,0.00223301,2019-03-20,13,80,4b7a857e1fbeb850_4903a58c1bcca937f65d91c9e305,2019-03-20


In [182]:
# Filter NAN in study from merged PET report (we don't have .npy files from those studies)
pet_echo_nofiles = pet_echo.loc[pet_echo.study.isnull()].reset_index(drop = True)
petecho = pet_echo.loc[~pet_echo.study.isnull()].reset_index(drop = True)

In [183]:
print('PET with ECHO:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print('Unique echo studies: {}'.format(len(petecho.study.unique())))
print('Unique patients:     {}'.format(len(petecho.mrn.unique())))

PET with ECHO:
Unique echo studies: 2751
Unique patients:     1626


In [184]:
print('PET without ECHO:')
#print('Number of npy files: {}'.format(len(petecho.filename.unique())))
print('Unique CFR studies: {}'.format(len(pet_echo_nofiles.reportID.unique())))
print('Unique patients:     {}'.format(len(pet_echo_nofiles.mrn.unique())))

PET without ECHO:
Unique CFR studies: 743
Unique patients:     717


### Match ECHO study with PET reportID  ###
Goal:
Find a CFR label for each echo video taken from the closest PET measurement.
Requirement: We want to pick the most recent CFR measurments for each echo that is not older than 365 days.
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [185]:
petecho.head()

Unnamed: 0,reportID,subjectID,mrn,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days),mrnstudydateid,study,echo_study_date
0,69969.0,3379.0,12853099,2006-01-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,160.0,0.0,0.0,0.0,excellent,normal,normal,,72.0,,,,59.0,17.0,,not assessed,Rb-82 Dipyridamole,One-day study,82.0,146.0,89.0,144.0,84.0,75.0,0.0,False,...,1.908483,1.699039,1.501695,2.421498,2.577866,2.689058,1.803998,1.196789,2.076923,2.533481,1.536166,1.540882,0.123958,3.229812,0.430427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.544562,16.0,2.109661,2.201288,,,,,,,,,,2006-01-24,1,995,4a14f25359c230bf_4903a44516dacae1fafd6299fae6,2006-01-24
1,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-04-11,78,1600,4903a37dd868ef56_4903a44516df7db647ae31be78ef,2006-04-11
2,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-04-11,78,1600,4903a37dd868ef56_4903a44516d8ca53ad30d0e53fe4,2006-03-06
3,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-03-06,42,1600,4903a37dd868ef56_4903a44516df7db647ae31be78ef,2006-04-11
4,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,1.309667,3.0,1.229572,1.004246,,,,,,,,,,2006-03-06,42,1600,4903a37dd868ef56_4903a44516d8ca53ad30d0e53fe4,2006-03-06


In [186]:
# Pick a patient
#np.random.seed(112)
#mrn = np.random.choice(petecho.mrn.unique(), size = 1,)[0]
mrn = 15467822
print('MRN {}'.format(mrn))
petecho_mrn = petecho[petecho.mrn == mrn].reset_index(drop=True)
print(petecho_mrn[['mrn', 'mrnstudydateid', 'cfr_study_date', 'echo_study_date', 'study', 'model_cfr']].iloc[8:12])

MRN 15467822
        mrn  mrnstudydateid cfr_study_date echo_study_date                                          study  model_cfr
8  15467822            1179     2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932   1.366817
9  15467822            1179     2012-10-11      2014-08-29  4a1347300a20f7e7_4903a581a723bbe47f2a139a23e4   1.366817


In [187]:
petecho.columns

Index(['reportID', 'subjectID', 'mrn', 'cfr_study_date', 'Reason_post_MI_eval', 'Reason_post_CABG', 'Reason_post_PCI', 'Reason_myocardial_viability', 'reason_postHeartTransplant', 'hx_recent_MI_yes',
       ...
       'calc_res', 'Regional defect(yes/no)', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'rahul_echo_date', 'difference(days)', 'mrnstudydateid', 'study', 'echo_study_date'], dtype='object', length=359)

In [188]:
print(petecho['Unnamed: 7'].unique())
print(petecho['Unnamed: 8'].unique())
print(petecho['Unnamed: 9'].unique())

[None 'from images' 'from image']
[None '2.375' '1.755102041' '1.53030303' '2.101449275' '2.155172414'
 '1.016260163' '3.830188679' '2' '1.113924051' '2.025641026' '1.617647059'
 '1.59223301' '2.05952381' '2.203703704' '2.745762712' '2.541666667'
 '2.053571429' '2.132352941' '2.614285714' '2.623188406' '1.349514563'
 '4.111111111' '2.820512821' '1.524271845' '2.61038961' '2.03030303'
 '2.581818182' '1.969230769' '2.12244898' '2.460526316' '3.58974359'
 '1.595505618' '2.139240506' '1.473282443' '1.366972477' '3.729166667'
 '2.592592593' '2.416666667' '2.196721311' '3' '2.268041237' '2.3'
 '1.873417722' '3.774193548']
[None '-0.005' '-0.004897959' '0.00030303' '0.001449275' '-0.004827586'
 '-0.003739837' '0.000188679' '0' '0.003924051' '-0.004358974'
 '-0.002352941' '0.00223301' '-0.00047619' '0.003703704' '0.005762712'
 '0.001666667' '0.003571429' '0.002352941' '0.004285714' '0.003188406'
 '-0.000485437' '0.001111111' '0.000512821' '0.004271845' '0.00038961'
 '0.001818182' '-0.000769231

In [189]:
# Pick a study
echo_study = petecho_mrn.study.unique()[2]
echo_study_date = petecho_mrn[petecho_mrn.study == echo_study].echo_study_date.values[0]
print(echo_study)
print(echo_study_date)

# Now, list the reportIDs with the corresponding dates
pet_dates = petecho_mrn[['reportID', 'cfr_study_date']].groupby('reportID').first().reset_index(drop=False)
pet_dates = pet_dates.assign(days=(pet_dates.cfr_study_date-echo_study_date).dt.days)
pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days))
print()
print(pet_dates)

# Now, we can find the earliest pet_study_date and the corresponding reportID
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]
print('ReportID of earliest PET study {}'.format(ser.reportID))
print('cfr_days_after_echo {}'.format(ser.days))

4a1347300a20f7e7_4903a581a72ff2f12e988b7e2f8d
2014-04-24T00:00:00.000000000

   reportID cfr_study_date  days  days_abs
0  119959.0     2012-10-11  -560       560
ReportID of earliest PET study 119959.0
cfr_days_after_echo -560


In [190]:
# Now, we can come up with a function that determines the reportID and the dates of the earliest CFR
# For any echo date for a given patient.

def get_earliest_cfr_reports(df_cfr_echo):
    """ 
    Determine the earliest CFR measurement for all studies
    args: df_cfr_echo: pd.DataFrame() 
          with columns: mrn, study, reportID, cfr_study_date, echo_study_date
    returns: cfr_echo_days: pd.DataFrame()
          with columns: mrn, study, reportID, cfr_days_after_echo
    """

    # Loop through all MRNs in df_pet_echo
    mrn_list = sorted(list(df_cfr_echo.mrn.unique()))
    cfr_echo_days = pd.DataFrame()
    for m, mrn in enumerate(mrn_list):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, len(mrn_list)))

        df_mrn = df_cfr_echo[df_cfr_echo.mrn == mrn]
        echo_study_list = list(df_mrn.study.unique())

        # Loop through all studies for this MRN
        for study in echo_study_list:
            echo_study_date = df_mrn[df_mrn.study == study].echo_study_date.values[0]

            # Find all pet dates for this patient and add the difference to the echo dates
            pet_dates = df_mrn[['mrnstudydateid', 'cfr_study_date']].groupby('mrnstudydateid').first().reset_index(drop=False)
            pet_dates = pet_dates.assign(days_post_cfr=(echo_study_date-pet_dates.cfr_study_date).dt.days)
            pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days_post_cfr))
            ser = pet_dates.loc[pet_dates.days_abs.idxmin()]

            # We can return the reportID and the days of the earliest pet study
            study_dict = {'mrn': [mrn],
                          'study': [study],
                          'echo_study_date': [echo_study_date],
                          'mrnstudydateid': [ser.mrnstudydateid],
                          'days_post_cfr': [ser.days_post_cfr]}
            cfr_echo_days = pd.concat([cfr_echo_days, pd.DataFrame(study_dict)], ignore_index=True)
    
    return cfr_echo_days

In [191]:
cfr_echo_days = get_earliest_cfr_reports(petecho)

Processing MRN 200 of 1626
Processing MRN 400 of 1626
Processing MRN 600 of 1626
Processing MRN 800 of 1626
Processing MRN 1000 of 1626
Processing MRN 1200 of 1626
Processing MRN 1400 of 1626
Processing MRN 1600 of 1626


In [192]:
cfr_echo_days.dtypes

mrn                         int64
study                      object
echo_study_date    datetime64[ns]
mrnstudydateid              int64
days_post_cfr               int64
dtype: object

In [193]:
# We can merge this back with the original PET reports
pet_cfr_echo_days = cfr_echo_days.merge(right = pet2, on = ['mrnstudydateid', 'mrn'], how = 'left')

In [199]:
pet_cfr_echo_days.study

0       4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5
1       4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5
2       4b7b463ae45ad0e0_4903a585842a266f561aee3f563e
3       4b7b463ae45ad0e0_4903a585842a266f561aee3f563e
4       4b7b4633cc96d8c2_4903a5858427dadde17abcd03be6
                            ...                      
6492    4f264df68f974061_4903a58c1a3846ed8e66b1070caa
6493    4f264df7bb568062_4903a58c1a395137157141a56e7d
6494    4f264df7bb568062_4903a58c1a395137157141a56e7d
6495    4f264df7bb568062_4903a58c1a395137157141a56e7d
6496    4f264cfb260d2b99_4903a67b11f085db5b5a55a51d1f
Name: study, Length: 6497, dtype: object

In [201]:
# To match with filenames, we really only need one study per row. 
pet_cfr_echo_days_study = pet_cfr_echo_days.groupby('study').first().reset_index(drop=False)


print('Original shape: {} now: {}'.format(pet_cfr_echo_days.shape,
                                          pet_cfr_echo_days_study.shape))

print('Original number of studies: {} now: {}'.format(len(pet_cfr_echo_days.study.unique()),
                                                      len(pet_cfr_echo_days_study.study.unique())))

Original shape: (6497, 360) now: (2751, 360)
Original number of studies: 2751 now: 2751


In [202]:
# Next, we will filter those echo studies that occurred earlier or later than 365 days after CFR
pet_cfr_echo_days = pet_cfr_echo_days.assign(abs_cfr_days = np.abs(pet_cfr_echo_days.days_post_cfr))
pet_cfr_echo_days = pet_cfr_echo_days[pet_cfr_echo_days.abs_cfr_days<=365].drop(columns = ['abs_cfr_days'])
# Sort this by study so that we know how well we matched the dates
pet_cfr_echo_days = pet_cfr_echo_days.sort_values(by = 'study')
pet_cfr_echo_days.head(4)

Unnamed: 0,mrn,study,echo_study_date,mrnstudydateid,days_post_cfr,reportID,subjectID,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,...,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days)
6429,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2017-10-25,182
6428,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2018-11-02,191
6427,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2017-05-16,344
6425,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2018-04-24,1


In [203]:
pet_cfr_echo_days_study.head()

Unnamed: 0,study,mrn,echo_study_date,mrnstudydateid,days_post_cfr,reportID,subjectID,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,...,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days)
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2018-04-24,1
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2433,352,162416.0,7843.0,2018-05-08,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,69.0,155.0,9.0,0.0,9.0,excellent,normal,abnormal,59.0,49.0,131.0,102.0,42.0,121.0,62.0,,normal,Regadenoson,One-day,92.0,109.0,66.0,117.0,...,2.206746,1.108902,1.926145,2.607426,2.729797,3.016283,1.637713,2.419572,3.089375,2.707036,3.617701,3.260995,3.157073,3.064116,0.650694,0.006229,1.651436,0.12937,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617024,7.0,2.644905,2.996385,,,,,,,,,,2019-04-25,352
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2434,-69,156019.0,7376.0,2017-07-24,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,143.0,7.0,5.0,2.0,excellent,normal,abnormal,36.0,39.0,171.0,135.0,87.0,139.0,85.0,,normal,Regadenoson,One-day,96.0,93.0,55.0,125.0,...,2.895005,3.287303,2.982176,2.312197,2.515621,2.766535,2.789369,2.898802,2.700145,1.987649,2.848762,3.335913,2.583818,2.4914,0.683994,0.013291,1.844327,0.127143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.574958,4.0,2.698341,2.631967,,,,,,,,,,2017-07-21,3
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2437,-336,164488.0,7590.0,2018-08-08,False,False,False,False,False,False,0.0,True,NaT,False,NaT,False,2015-07-12,True,NaT,False,False,True,False,66.0,207.0,11.0,7.0,4.0,excellent,mildly dilated,abnormal,49.0,50.0,172.0,165.0,84.0,152.0,76.0,,normal,Regadenoson,One-day,95.0,132.0,72.0,130.0,...,1.740763,1.986156,1.537804,2.199859,1.725213,2.520402,2.549419,1.970909,2.453504,1.90599,2.358209,2.094356,1.764772,2.212575,0.785589,0.120003,1.541073,0.36054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,6.0,0.544319,3.0,1.99466,2.063722,,,,,,,,,,2017-09-06,336
4,48b095533cbfdf01_4903a582ec760da4fd52ba61f21c,35401116,2017-07-07,2436,-643,,,2019-04-11,,,,,,,,,NaT,,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.79,,,1.11,0.88,,,,,normal,,1.113924051,0.003924051,2019-04-09,2


In [204]:
print(pet_cfr_echo_days_study.shape)
pet_cfr_echo_days_study.head()

(2751, 360)


Unnamed: 0,study,mrn,echo_study_date,mrnstudydateid,days_post_cfr,reportID,subjectID,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,...,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days)
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2018-04-24,1
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2433,352,162416.0,7843.0,2018-05-08,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,69.0,155.0,9.0,0.0,9.0,excellent,normal,abnormal,59.0,49.0,131.0,102.0,42.0,121.0,62.0,,normal,Regadenoson,One-day,92.0,109.0,66.0,117.0,...,2.206746,1.108902,1.926145,2.607426,2.729797,3.016283,1.637713,2.419572,3.089375,2.707036,3.617701,3.260995,3.157073,3.064116,0.650694,0.006229,1.651436,0.12937,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617024,7.0,2.644905,2.996385,,,,,,,,,,2019-04-25,352
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2434,-69,156019.0,7376.0,2017-07-24,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,143.0,7.0,5.0,2.0,excellent,normal,abnormal,36.0,39.0,171.0,135.0,87.0,139.0,85.0,,normal,Regadenoson,One-day,96.0,93.0,55.0,125.0,...,2.895005,3.287303,2.982176,2.312197,2.515621,2.766535,2.789369,2.898802,2.700145,1.987649,2.848762,3.335913,2.583818,2.4914,0.683994,0.013291,1.844327,0.127143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.574958,4.0,2.698341,2.631967,,,,,,,,,,2017-07-21,3
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2437,-336,164488.0,7590.0,2018-08-08,False,False,False,False,False,False,0.0,True,NaT,False,NaT,False,2015-07-12,True,NaT,False,False,True,False,66.0,207.0,11.0,7.0,4.0,excellent,mildly dilated,abnormal,49.0,50.0,172.0,165.0,84.0,152.0,76.0,,normal,Regadenoson,One-day,95.0,132.0,72.0,130.0,...,1.740763,1.986156,1.537804,2.199859,1.725213,2.520402,2.549419,1.970909,2.453504,1.90599,2.358209,2.094356,1.764772,2.212575,0.785589,0.120003,1.541073,0.36054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,6.0,0.544319,3.0,1.99466,2.063722,,,,,,,,,,2017-09-06,336
4,48b095533cbfdf01_4903a582ec760da4fd52ba61f21c,35401116,2017-07-07,2436,-643,,,2019-04-11,,,,,,,,,NaT,,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.79,,,1.11,0.88,,,,,normal,,1.113924051,0.003924051,2019-04-09,2


In [205]:
print('Number of echo studies within 365 days of cfr {}'.format(len(pet_cfr_echo_days_study.study.unique())))
print('Total number of studies in petecho df {}'.format(len(petecho.study.unique())))
print('Unique patients:     {}'.format(len(pet_cfr_echo_days_study.mrn.unique())))

Number of echo studies within 365 days of cfr 2751
Total number of studies in petecho df 2751
Unique patients:     1626


In [206]:
# We can now save the data frame with the echo studies and the matched CFR measurements
match_filename = 'BWH_2020-Feb-4_pet_match365_diff_'+cfr_meta_date+'.parquet'
pet_cfr_echo_days_study.to_parquet(os.path.join(meta_dir, match_filename))

# Save also as .csv file
match_filename_csv = 'BWH_2020-Feb-4_pet_match365_diff_'+cfr_meta_date+'.csv'
pet_cfr_echo_days_study.to_csv(os.path.join(meta_dir, match_filename_csv))

In [207]:
pet_cfr_echo_days_study.head()

Unnamed: 0,study,mrn,echo_study_date,mrnstudydateid,days_post_cfr,reportID,subjectID,cfr_study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,...,cfr_seg_3,cfr_seg_4,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9,rahul_echo_date,difference(days)
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2435,191,162094.0,7817.0,2018-04-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,2015-01-01,True,NaT,False,False,False,False,65.0,186.0,0.0,0.0,0.0,excellent,normal,normal,64.0,63.0,130.0,102.0,37.0,96.0,36.0,,normal,Regadenoson,One-day,98.0,148.0,65.0,133.0,...,1.499309,1.594322,1.888417,2.027922,1.645216,1.687712,1.599403,1.651093,1.726783,1.733784,1.87044,1.876251,1.704592,1.823339,0.934766,0.008917,1.596918,0.039836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93646,16.0,1.716143,1.707419,,,,,,,,,,2018-04-24,1
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2433,352,162416.0,7843.0,2018-05-08,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,69.0,155.0,9.0,0.0,9.0,excellent,normal,abnormal,59.0,49.0,131.0,102.0,42.0,121.0,62.0,,normal,Regadenoson,One-day,92.0,109.0,66.0,117.0,...,2.206746,1.108902,1.926145,2.607426,2.729797,3.016283,1.637713,2.419572,3.089375,2.707036,3.617701,3.260995,3.157073,3.064116,0.650694,0.006229,1.651436,0.12937,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617024,7.0,2.644905,2.996385,,,,,,,,,,2019-04-25,352
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2434,-69,156019.0,7376.0,2017-07-24,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,143.0,7.0,5.0,2.0,excellent,normal,abnormal,36.0,39.0,171.0,135.0,87.0,139.0,85.0,,normal,Regadenoson,One-day,96.0,93.0,55.0,125.0,...,2.895005,3.287303,2.982176,2.312197,2.515621,2.766535,2.789369,2.898802,2.700145,1.987649,2.848762,3.335913,2.583818,2.4914,0.683994,0.013291,1.844327,0.127143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.574958,4.0,2.698341,2.631967,,,,,,,,,,2017-07-21,3
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2437,-336,164488.0,7590.0,2018-08-08,False,False,False,False,False,False,0.0,True,NaT,False,NaT,False,2015-07-12,True,NaT,False,False,True,False,66.0,207.0,11.0,7.0,4.0,excellent,mildly dilated,abnormal,49.0,50.0,172.0,165.0,84.0,152.0,76.0,,normal,Regadenoson,One-day,95.0,132.0,72.0,130.0,...,1.740763,1.986156,1.537804,2.199859,1.725213,2.520402,2.549419,1.970909,2.453504,1.90599,2.358209,2.094356,1.764772,2.212575,0.785589,0.120003,1.541073,0.36054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,6.0,0.544319,3.0,1.99466,2.063722,,,,,,,,,,2017-09-06,336
4,48b095533cbfdf01_4903a582ec760da4fd52ba61f21c,35401116,2017-07-07,2436,-643,,,2019-04-11,,,,,,,,,NaT,,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.79,,,1.11,0.88,,,,,normal,,1.113924051,0.003924051,2019-04-09,2


#### Filter the large echo_BWH_npy_feather_files dataframe for easier metadata collection ####

In [208]:
# With the names of the matched echo studies, we can now get the file names
# of the meta files that we will need

meta_files_name = 'echo_BWH_npy_feather_files_'+cfr_meta_date+'.parquet'
meta_files_cfr_name = 'echo_BWH_npy_feather_files_cfr_'+cfr_meta_date+'.parquet'

meta_files_df = pd.read_parquet(os.path.join(meta_dir, meta_files_name))
meta_files_df_matched = meta_files_df[meta_files_df.study.isin(pet_cfr_echo_days_study.study.unique())]
print(meta_files_df_matched.shape)
print('Unique studies in meta data: {}'.format(len(meta_files_df_matched.study.unique())))
meta_files_df_matched.head(2)
meta_files_df_matched.to_parquet(os.path.join(meta_dir, meta_files_cfr_name))

(728753, 8)
Unique studies in meta data: 2751
