### Match ECHO videos with PET CFR labels ###
We need this to assign the CFR labels to the videos. The goal is to have a data set with PET CFR values for each video that we will use to train.

In [1]:
import os
import numpy as np
import pandas as pd

# Widen the pandas display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.width = 2000

In [2]:
# Date tag shared by all metadata file names used in this notebook.
cfr_meta_date = '200208'

# Root of the CFR data and the dated metadata directory below it.
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
meta_dir = os.path.join(
    cfr_data_root,
    'metadata_' + cfr_meta_date,
)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200208


#### CFR REPORTS ####

In [3]:
# Rahul's PET studies: tab-separated table with MRN, PET date and echo date columns.
pet_echo_file = os.path.join(cfr_data_root, 'bwh_2020-Feb-4_pet_365_diff.txt')
pet_echo_df = pd.read_csv(pet_echo_file, sep='\t')  # Contains all PET MRNs use this one.

# Rename all columns in a single pass so that the dates can be matched later on.
pet = pet_echo_df.rename(columns={'MRN': 'mrn',
                                  'pet_date': 'cfr_study_date',
                                  'echo_date': 'rahul_echo_date'}).reset_index(drop=True)
print('Unique patients:   {}'.format(len(pet['mrn'].unique())))

# Parse both date columns into pandas timestamps.
pet = pet.astype({'cfr_study_date': 'datetime64[ns]', 'rahul_echo_date': 'datetime64[ns]'})

Unique patients:   3567


In [4]:
# Load the PET CFR parquet table.
# NOTE(review): this rebinds pet_echo_file and pet_echo_df, which the previous cell used for the
# tab-separated PET/echo file; `pet` has already been derived from that earlier frame.
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/non_defect_plus_p2018_noseg7.parquet')
pet_echo_df = pd.read_parquet(pet_echo_file)

In [5]:
# Preview the first rows of the PET CFR parquet table.
pet_echo_df.head()

Unnamed: 0,reportID,subjectID,mrn,study_date,Reason_post_MI_eval,Reason_post_CABG,Reason_post_PCI,Reason_myocardial_viability,reason_postHeartTransplant,hx_recent_MI_yes,troponin,hx_prior_MI_yes,hx_prior_transplant,hx_prior_transplant_yes,hx_CABG,hx_CABG_yes,hx_PCI,hx_PCI_yes,hx_CHF,hx_CHF_yes,hx_cardiomyopathy_yes,hx_CAD_yes,hx_diabetes_yes,height_in,weight_lb,summed_stress_score,summed_rest_score,summed_difference_score,cvdb_rptCard_NuclearScoring_image_quality,LV_size,myocardial_perfusion,LV_ejection_fraction_stress,LV_ejection_fraction_rest,LV_mass,end_diastolic_volume_stress,end_systolic_volume_stress,end_diastolic_volume_rest,end_systolic_volume_rest,TID_ratio,gated_spect_results,stress_protocol,imaging_protocol,peak_HR,peak_systolic_BP,peak_diastolic_BP,rest_systolic_BP,rest_diastolic_BP,rest_HR,agatston_coronary_calcium_score,agatston_score_uninterpretable,...,cfr_seg_5,cfr_seg_6,cfr_seg_8,cfr_seg_9,cfr_seg_10,cfr_seg_11,cfr_seg_12,cfr_seg_13,cfr_seg_14,cfr_seg_15,cfr_seg_16,cfr_seg_17,ave_rest_mbf_segments,var_rest_mbf,ave_stress_mbf_segments,var_stress_mbf,perf_seg_1,perf_seg_2,perf_seg_3,perf_seg_4,perf_seg_5,perf_seg_6,perf_seg_7,perf_seg_8,perf_seg_9,perf_seg_10,perf_seg_11,perf_seg_12,perf_seg_13,perf_seg_14,perf_seg_15,perf_seg_16,perf_seg_17,unaffected_segments,unaffected_cfr,perf_segs,perf_seg_cfr,model_rest_mbf,num_unaffected_segs,global_CFR,model_cfr,Stress,"Regional defect(normal, probably normal, abnormal)",CABG/Transplant,Comment,calc_res,Regional defect(yes/no),Unnamed: 7,Unnamed: 8,Unnamed: 9
0,69718.0,2540.0,7924277,2008-08-15,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,67.0,133.0,18.0,18.0,0.0,excellent,moderately dilated,abnormal,30.0,25.0,202.0,190.0,132.0,188.0,141.0,1.09,normal,Rb-82 Dipyridamole,One-day study,76.0,160.0,55.0,152.0,44.0,75.0,,False,...,1.38806,1.682635,1.624719,1.184358,1.293814,1.871499,1.441069,1.467842,1.278068,1.505618,0.876119,1.114362,0.695471,0.115952,0.888437,0.161178,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,0.0,8.0,6.0,0.0,0.0,0.0,8.0,0.0,0.0,"[1, 2, 8, 13]",1.623875,"[4, 5, 10, 11, 15]",1.295654,0.6705,4.0,1.368012,1.623875,,,,,,,,,
1,69969.0,3379.0,12853099,2006-01-25,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,160.0,0.0,0.0,0.0,excellent,normal,normal,,72.0,,,,59.0,17.0,,not assessed,Rb-82 Dipyridamole,One-day study,82.0,146.0,89.0,144.0,84.0,75.0,0.0,False,...,1.971471,1.908483,1.699039,1.501695,2.421498,2.577866,2.689058,1.803998,1.196789,2.076923,2.533481,1.536166,1.540882,0.123958,3.229812,0.430427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 1...",2.201288,[],0.0,1.544562,16.0,2.109661,2.201288,,,,,,,,,
2,69993.0,3542.0,20710471,2006-01-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,True,True,71.0,200.0,20.0,18.0,2.0,good,normal,abnormal,,42.0,145.0,,,139.0,81.0,0.97,normal,Rb-82 Dipyridamole,One-day study,73.0,142.0,80.0,121.0,71.0,71.0,6602.0,False,...,1.244109,1.831599,0.97586,1.809524,1.272362,1.184735,1.378594,1.11087,1.528986,1.725581,1.099682,1.396004,1.130588,0.086979,1.402625,0.098134,0.0,0.0,0.0,6.0,8.0,2.0,0.0,0.0,0.0,6.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,"[2, 8, 13]",1.004246,"[4, 5, 6, 10, 11, 15]",1.290234,1.309667,3.0,1.229572,1.004246,,,,,,,,,
3,70008.0,2155.0,12627030,2006-02-10,False,False,False,False,False,False,0.0,True,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,68.0,217.0,16.0,0.0,16.0,good,normal,abnormal,,62.0,183.0,,,180.0,69.0,0.79,normal,Rb-82 Dipyridamole,One-day study,70.0,157.0,70.0,148.0,77.0,64.0,331.0,False,...,1.246032,2.627709,0.690919,1.261891,1.332776,1.761879,2.438696,1.801924,1.248541,2.499398,2.584257,2.049601,1.736824,0.103225,2.704125,1.641842,2.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,[14],1.248541,"[1, 3, 5, 12, 16]",1.63243,2.056,1.0,1.588915,1.248541,,,,,,,,,
4,70012.0,3417.0,15324312,2006-02-23,False,False,False,False,False,False,0.0,False,NaT,False,NaT,False,NaT,False,NaT,False,False,False,False,64.0,146.0,0.0,0.0,0.0,excellent,normal,normal,,54.0,126.0,,,114.0,53.0,0.94,normal,Rb-82 Dipyridamole,One-day study,100.0,132.0,69.0,138.0,46.0,82.0,0.0,False,...,2.364739,2.877568,2.746835,3.052955,2.130812,2.355508,2.755853,3.056298,2.336717,2.831246,2.402961,2.344529,1.469,0.149121,3.824063,0.694937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 1...",2.677636,[],0.0,1.459937,16.0,2.656655,2.677636,,,,,,,,,


In [4]:
# Preview the renamed PET/echo date table (up to 200 rows).
pet.head(200)

Unnamed: 0,mrn,rahul_echo_date,cfr_study_date,difference(days)
0,17666439,2017-03-14,2017-07-20,128
1,12287447,2019-05-16,2020-01-23,252
2,12287447,2020-01-23,2020-01-23,0
3,31953938,2017-11-09,2018-08-20,284
4,31953938,2018-08-13,2018-08-20,7
5,31953938,2018-02-12,2018-08-20,189
6,3573623,2019-11-26,2020-01-21,56
7,3573623,2020-01-02,2020-01-21,19
8,12224283,2019-11-22,2020-01-17,56
9,41128695,2019-12-08,2020-01-10,33


In [5]:
# PET CFR REPORTS
# Only these columns of the stress-test export are needed for the matching.
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']

getStress_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
getStress_df = (
    pd.read_excel(getStress_reports_file)
    .loc[:, keep_cols]
    .astype({'study_date': 'datetime64[ns]', 'report_date': 'datetime64[ns]'})
)

# Prefix the date columns with "cfr_" so they cannot be confused with echo dates after merging.
pet_cfr = getStress_df.rename(columns={'study_date': 'cfr_study_date',
                                       'report_date': 'cfr_report_date'})
pet_cfr.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606
3,73048,16480857,10,0728923B,2007-11-26,2007-10-29,1.1075
4,100873,7633498,11,EVS0266850,2010-01-29,2010-09-24,1.776545


In [6]:
# Attach Rahul's echo dates to every CFR report (left join keeps reports without a match).
pet2 = pd.merge(pet_cfr, pet, how='left', on=['mrn', 'cfr_study_date'])

print('Number of CFR report numbers: {}'.format(len(pet2['report_number'].unique())))
print('Unique patients:   {}'.format(len(pet2['mrn'].unique())))
print('Unique cfr values: {}'.format(len(pet2['cfr'].unique())))

pet2.tail()

Number of CFR report numbers: 3350
Unique patients:   3072
Unique cfr values: 3414


Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days)
5942,154266,30951032,7296,E4895329,2017-05-08,2017-05-02,1.782776,2017-04-25,13.0
5943,154859,35294396,7326,E2892404,2017-06-01,2016-08-12,2.522388,NaT,
5944,155419,34940536,7337,E6256840,2017-06-26,2017-10-20,3.483159,2017-10-20,116.0
5945,157204,32221723,7422,E6107354,2017-09-15,2017-09-14,1.512618,2017-09-14,1.0
5946,157776,288407,7446,E6330362,2017-10-12,2017-10-07,1.844106,2017-10-07,5.0


In [7]:
# CFR reports for which the (mrn, cfr_study_date) pair found no partner in Rahul's table.
df_nomatch = pet2[pet2['rahul_echo_date'].isna()]
print(df_nomatch.shape)
df_nomatch.head()

(392, 9)


Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days)
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232,NaT,
6,100873,7633498,11,EVS0266850,2010-01-29,2010-09-24,1.776545,NaT,
21,97226,16466641,29,EVS0204489,2009-08-04,2009-09-03,1.527831,NaT,
35,74181,3186582,33,1201-0003,2007-02-01,2006-12-01,1.281909,NaT,
49,104202,3567948,46,0709-0006,2010-07-02,2010-07-09,0.659384,NaT,


#### NPY ECHO FILES ####
This is a data frame that contains the filenames, MRNs and dates of all echo studies

In [8]:
echo_files_name = 'echo_npyFiles_BWH_'+cfr_meta_date+'.parquet'

# Load the npy file table, split the acquisition datetime into date and time,
# and make the merge keys (mrn, date) use the same dtypes as the PET tables.
echo = (
    pd.read_parquet(os.path.join(meta_dir, echo_files_name))
    .assign(echo_study_date=lambda df: df['datetime'].dt.date,
            echo_study_time=lambda df: df['datetime'].dt.time)
    .astype({'mrn': 'int64', 'echo_study_date': 'datetime64[ns]'})
)

print('Number of npy files: {}'.format(len(echo['filename'].unique())))
print('Unique echo studies: {}'.format(len(echo['study'].unique())))
print('Unique patients:     {}'.format(len(echo['mrn'].unique())))
print()

# Date matching with PET works on study level, so one row per
# (study, mrn, date) is enough and shrinks the frame considerably.
echo = (
    echo[['study', 'mrn', 'echo_study_date']]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by='mrn', ascending=True)
)

Number of npy files: 1056913
Unique echo studies: 21045
Unique patients:     12520



#### PET STUDIES + NPY ECHO DATA ####

In [9]:
# Pair every CFR report row with every echo study of the same patient
# (left join: reports of patients without echo studies keep a missing `study`).
pet_echo = pd.merge(pet2, echo, how='left', on='mrn')
pet_echo.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days),study,echo_study_date
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232,NaT,,,NaT
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579,2008-11-11,0.0,,NaT
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a44b32edba11d3b5854898ef,2008-01-29
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a444c30c185baf6a7befa860,2007-04-30
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29


In [10]:
# Split the merged PET reports by whether an echo study was found:
# rows with a missing `study` have no .npy files and are set aside.
petecho = pet_echo[pet_echo['study'].notna()].reset_index(drop=True)
pet_echo_nofiles = pet_echo[pet_echo['study'].isna()].reset_index(drop=True)

In [11]:
# Summary of the PET reports that could be paired with at least one echo study.
print('PET with ECHO:')
print('Unique echo studies: {}'.format(len(petecho['study'].unique())))
print('Unique patients:     {}'.format(len(petecho['mrn'].unique())))

PET with ECHO:
Unique echo studies: 3912
Unique patients:     1960


In [12]:
# Summary of the PET reports for which no echo study was found.
print('PET with out ECHO:')
print('Unique CFR studies: {}'.format(len(pet_echo_nofiles['reportID'].unique())))
print('Unique patients:     {}'.format(len(pet_echo_nofiles['mrn'].unique())))

PET with out ECHO:
Unique CFR studies: 1166
Unique patients:     1112


### Match ECHO study with PET reportID  ###
Goal:
Find a CFR label for each echo video taken from the closest PET measurement.
Requirement: For each echo study, we pick the CFR measurement that is closest in time and keep it only if it lies within 365 days of the echo (before or after).
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [13]:
# Preview the PET reports that have at least one echo study (one row per report/echo combination).
petecho.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days),study,echo_study_date
0,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a44b32edba11d3b5854898ef,2008-01-29
1,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a444c30c185baf6a7befa860,2007-04-30
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2008-01-29,90.0,4b7351bee96d1016_4903a44b32edba11d3b5854898ef,2008-01-29
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2008-01-29,90.0,4b7351bee96d1016_4903a444c30c185baf6a7befa860,2007-04-30


In [14]:
# Pick a patient. A fixed MRN keeps the walkthrough below reproducible; a seeded random draw
# (np.random.seed(112); np.random.choice(petecho.mrn.unique(), size = 1,)[0]) is the alternative.
mrn = 15467822
print('MRN {}'.format(mrn))

# All report/echo combinations of this patient, re-indexed from zero.
petecho_mrn = petecho.loc[petecho.mrn == mrn].reset_index(drop=True)
print(petecho_mrn.iloc[8:12][['mrn', 'reportID', 'cfr_study_date', 'echo_study_date', 'study', 'cfr']])

MRN 15467822
         mrn  reportID cfr_study_date echo_study_date                                          study       cfr
8   15467822    119959     2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932  0.875204
9   15467822    119959     2012-10-11      2014-08-29  4a1347300a20f7e7_4903a581a723bbe47f2a139a23e4  0.875204
10  15467822    126435     2013-09-05      2012-09-06  4a1347300a20f7e7_4903a58793be6eaf2db65cba1044  1.182464
11  15467822    126435     2013-09-05      2014-10-28  4a1347300a20f7e7_4903a581a681209d89d1fed6737d  1.182464


In [15]:
# Worked example for a single echo study of the selected patient.
# Pick a study (the third distinct study ID of this patient) and look up its date.
echo_study = petecho_mrn.study.unique()[2]
echo_study_date = petecho_mrn[petecho_mrn.study == echo_study].echo_study_date.values[0]
print(echo_study)
print(echo_study_date)

# Now, list the reportIDs with the corresponding dates
# `days` is cfr_study_date minus echo date, i.e. positive when the PET study came after the echo.
# Note that get_earliest_cfr_reports below uses the opposite sign (echo date minus PET date).
pet_dates = petecho_mrn[['reportID', 'cfr_study_date']].groupby('reportID').first().reset_index(drop=False)
pet_dates = pet_dates.assign(days=(pet_dates.cfr_study_date-echo_study_date).dt.days)
pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days))
print()
print(pet_dates)

# Now, we can find the PET study closest in time to the echo (smallest absolute day difference)
# and the corresponding reportID. The printed label says "earliest", but the selection is by
# absolute distance, not by date order.
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]
print('ReportID of earliest PET study {}'.format(ser.reportID))
print('cfr_days_after_echo {}'.format(ser.days))

4a1347300a20f7e7_4903a581a72ff2f12e988b7e2f8d
2014-04-24T00:00:00.000000000

   reportID cfr_study_date  days  days_abs
0    119959     2012-10-11  -560       560
1    126435     2013-09-05  -231       231
2    131307     2014-04-29     5         5
3    133986     2014-09-02   131       131
ReportID of earliest PET study 131307
cfr_days_after_echo 5


In [16]:
# Now, we can come up with a function that determines, for every echo study of every patient,
# the reportID of the CFR measurement closest in time and the day difference to it.

def get_earliest_cfr_reports(df_cfr_echo):
    """
    Match every echo study to the CFR report that is closest in time.

    Despite the function name, "earliest" means the report with the smallest
    absolute day difference between cfr_study_date and the echo study date;
    the PET study may lie before or after the echo. No maximum distance is
    applied here, so callers have to filter on days_post_cfr themselves.
    Ties are resolved in favour of the smallest reportID.

    args: df_cfr_echo: pd.DataFrame()
          with columns: mrn, study, reportID, cfr_study_date, echo_study_date
          Expected to hold, per patient, the combinations of CFR reports and
          echo studies. Rows with a missing study are ignored.
    returns: cfr_echo_days: pd.DataFrame()
          with columns: mrn, study, echo_study_date, reportID, days_post_cfr
          One row per (mrn, study), ordered by mrn. days_post_cfr is the echo
          date minus the CFR study date in days (negative: echo before PET).
          An empty input yields an empty frame with these columns.
    """
    out_columns = ['mrn', 'study', 'echo_study_date', 'reportID', 'days_post_cfr']

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic in the row count.
    records = []

    # groupby sorts the MRNs ascending, matching the previous sorted() order.
    patients = df_cfr_echo.groupby('mrn', sort=True)
    n_patients = patients.ngroups

    for m, (mrn, df_mrn) in enumerate(patients):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, n_patients))

        # The PET report dates of a patient do not depend on the echo study,
        # so this table is built once per patient (first date per reportID).
        pet_dates = df_mrn[['reportID', 'cfr_study_date']].groupby('reportID').first().reset_index(drop=False)

        # One row per study in order of first appearance; its date is the one
        # found in the first row of that study.
        studies = df_mrn.dropna(subset=['study']).drop_duplicates(subset='study')

        for study, echo_study_date in zip(studies['study'].values, studies['echo_study_date'].values):
            days_post_cfr = (echo_study_date - pet_dates.cfr_study_date).dt.days
            # Label of the report with the smallest absolute day difference.
            closest = days_post_cfr.abs().idxmin()

            records.append({'mrn': mrn,
                            'study': study,
                            'echo_study_date': echo_study_date,
                            'reportID': pet_dates.at[closest, 'reportID'],
                            'days_post_cfr': days_post_cfr.at[closest]})

    return pd.DataFrame(records, columns=out_columns)

In [17]:
# Find, for every echo study in petecho, the CFR report with the smallest absolute day difference.
cfr_echo_days = get_earliest_cfr_reports(petecho)

Processing MRN 200 of 1960
Processing MRN 400 of 1960
Processing MRN 600 of 1960
Processing MRN 800 of 1960
Processing MRN 1000 of 1960
Processing MRN 1200 of 1960
Processing MRN 1400 of 1960
Processing MRN 1600 of 1960
Processing MRN 1800 of 1960


In [18]:
# Preview the matches; days_post_cfr is the echo date minus the CFR study date in days.
cfr_echo_days.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr
0,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-26,100820,-2
1,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64
2,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2013-06-03,127002,-121
3,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-08-31,141524,-17
4,88435,4b7b46323562dafe_4903a58c1bccabff29a2958b368c,2019-03-07,137391,1476


In [19]:
# Reminder of the PET report table (pet2) that the matches are merged back into next.
pet2.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days)
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232,NaT,
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579,2008-11-11,0.0
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-04-30,184.0
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2008-01-29,90.0
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,2007-10-29,2.0


In [20]:
# Bring the CFR values and report details of the matched reportID back in.
# pet2 can hold several rows per reportID, so duplicates are expected here.
pet_cfr_echo_days = pd.merge(cfr_echo_days, pet2, how='left', on=['reportID', 'mrn'])

In [21]:
# Keep only echo studies within 365 days of the matched CFR study (before or after),
# then sort by study so that we can see how well the dates were matched.
pet_cfr_echo_days = (
    pet_cfr_echo_days
    .loc[lambda df: df['days_post_cfr'].abs() <= 365]
    .sort_values(by='study')
)

In [22]:
# Note: the shape printed is that of cfr_echo_days (all matched studies, before the 365-day filter),
# while the preview below shows the filtered and merged pet_cfr_echo_days frame.
print(cfr_echo_days.shape)
pet_cfr_echo_days.head(4)

(3912, 5)


Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,rahul_echo_date,difference(days)
9989,34940536,48b1a5aeaf91d4d2_4903a582edf18da48d51c09a959b,2017-10-20,155419,116,7337,E6256840,2017-06-26,2017-10-20,3.483159,2017-10-20,116.0
9987,34672006,48b1aaae1d6494d9_4903a582ec78ee7c84541cef7bc4,2017-09-28,153280,184,7257,E5055970,2017-03-28,2017-05-19,1.65,2017-09-28,184.0
9986,34672006,48b1aaae1d6494d9_4903a582ec78ee7c84541cef7bc4,2017-09-28,153280,184,7257,E5055970,2017-03-28,2017-05-19,1.65,2017-05-19,52.0
9988,34672006,48b1aaae1d6494d9_4903a582ec78ee7c84541cef7bc4,2017-09-28,153280,184,7257,E5055970,2017-03-28,2017-05-19,1.65,2017-09-29,185.0


In [23]:
# For matching with file names a single row per study is enough:
# drop Rahul's columns and collapse the rows that then become identical.
pet_cfr_echo_days_study = (
    pet_cfr_echo_days
    .drop(columns=['rahul_echo_date', 'difference(days)'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [24]:
# Shape and preview of the de-duplicated study-level match table.
print(pet_cfr_echo_days_study.shape)
pet_cfr_echo_days_study.head()

(2960, 10)


Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,34940536,48b1a5aeaf91d4d2_4903a582edf18da48d51c09a959b,2017-10-20,155419,116,7337,E6256840,2017-06-26,2017-10-20,3.483159
1,34672006,48b1aaae1d6494d9_4903a582ec78ee7c84541cef7bc4,2017-09-28,153280,184,7257,E5055970,2017-03-28,2017-05-19,1.65
2,34048454,48b1acc7b7842a39_4903a58359a27de07d6d9d93506d,2016-10-31,149905,5,6934,E3456497,2016-10-26,2016-10-31,1.591283
3,31685753,48b4c0c64fde033c_4903a583583692ec8f2755c07930,2016-01-06,143670,-1,6644,E1665507,2016-01-07,2016-01-06,1.399824
4,31693849,48b4c0c7a4a58759_4903a583583e2e79c02e50f4ec23,2016-09-30,149377,0,6913,E3183443,2016-09-30,2016-09-30,4.139535


In [25]:
# Compare the number of studies kept after the 365-day filter with all studies in petecho.
print('Number of echo studies within 365 days of cfr {}'.format(len(pet_cfr_echo_days_study['study'].unique())))
print('Total number of studies in petecho df {}'.format(len(petecho['study'].unique())))
print('Unique patients:     {}'.format(len(pet_cfr_echo_days_study['mrn'].unique())))

Number of echo studies within 365 days of cfr 2960
Total number of studies in petecho df 3912
Unique patients:     1831


In [26]:
# Persist the echo studies with their matched CFR measurements in two formats
# (parquet and .csv), both in the dated metadata directory.
match_filename = 'BWH_2020-Feb-4_pet_match365_diff_'+cfr_meta_date+'.parquet'
match_filename_csv = 'BWH_2020-Feb-4_pet_match365_diff_'+cfr_meta_date+'.csv'

pet_cfr_echo_days_study.to_parquet(os.path.join(meta_dir, match_filename))
pet_cfr_echo_days_study.to_csv(os.path.join(meta_dir, match_filename_csv))

In [27]:
# Preview of the table that was just written to disk.
pet_cfr_echo_days_study.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,34940536,48b1a5aeaf91d4d2_4903a582edf18da48d51c09a959b,2017-10-20,155419,116,7337,E6256840,2017-06-26,2017-10-20,3.483159
1,34672006,48b1aaae1d6494d9_4903a582ec78ee7c84541cef7bc4,2017-09-28,153280,184,7257,E5055970,2017-03-28,2017-05-19,1.65
2,34048454,48b1acc7b7842a39_4903a58359a27de07d6d9d93506d,2016-10-31,149905,5,6934,E3456497,2016-10-26,2016-10-31,1.591283
3,31685753,48b4c0c64fde033c_4903a583583692ec8f2755c07930,2016-01-06,143670,-1,6644,E1665507,2016-01-07,2016-01-06,1.399824
4,31693849,48b4c0c7a4a58759_4903a583583e2e79c02e50f4ec23,2016-09-30,149377,0,6913,E3183443,2016-09-30,2016-09-30,4.139535


#### Filter the large echo_BWH_npy_feather_files dataframe for easier metadata collection ####

In [28]:
# With the names of the matched echo studies, we can now get the file names
# of the meta files that we will need

meta_files_name = 'echo_BWH_npy_feather_files_'+cfr_meta_date+'.parquet'
meta_files_cfr_name = 'echo_BWH_npy_feather_files_cfr_'+cfr_meta_date+'.parquet'
meta_files_df = pd.read_parquet(os.path.join(meta_dir, meta_files_name))

# Keep only the file rows that belong to an echo study with a matched CFR report.
meta_files_df_matched = meta_files_df[meta_files_df.study.isin(pet_cfr_echo_days_study.study.unique())]
print(meta_files_df_matched.shape)
print('Unique studies in meta data: {}'.format(len(meta_files_df_matched.study.unique())))

# The former mid-cell `meta_files_df_matched.head(2)` was dropped: it was not the last
# expression of the cell, so its result was discarded and never displayed.
meta_files_df_matched.to_parquet(os.path.join(meta_dir, meta_files_cfr_name))

(875240, 8)
Unique studies in meta data: 2960
