### Match ECHO videos with PET CFR labels ###
Master PET-ECHO match list: All data available on FS. Need to be filtered to create data sets.

In [7]:
import os
import pickle
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 2000)

In [8]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200416'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200416


#### PET DATA ####

In [28]:
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/pets_with_echos_all.parquet')
pet_cols = ['mrn', 'pet_date', 'petmrn_identifier']
pet = pd.read_parquet(pet_echo_file)
pet = pet[pet_cols].dropna(how='all').drop_duplicates()
pet = pet.astype({'pet_date': 'datetime64[ns]'})
print(f'Unique PET measurements {len(pet.petmrn_identifier.unique())}')
pet.head()

Unique PET measurements 4136


Unnamed: 0,mrn,pet_date,petmrn_identifier
0,7924277,2008-08-15,7924277_2008-08-15
1,12853099,2006-01-25,12853099_2006-01-25
2,20710471,2006-01-23,20710471_2006-01-23
3,12627030,2006-02-10,12627030_2006-02-10
4,15324312,2006-02-23,15324312_2006-02-23


#### PET ECHO PAIRS ####

In [38]:
match_list_file = 'bwh_2020-Feb-4_pet_365_diff.txt'
rdf = pd.read_csv(os.path.join(cfr_data_root, match_list_file), sep = '\t')
rdf = rdf.astype({'echo_date': 'datetime64[ns]',
                  'pet_date': 'datetime64[ns]'}).\
        rename(columns={'MRN': 'mrn'})
rdf.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days)
0,17666439,2017-03-14,2017-07-20,128
1,12287447,2019-05-16,2020-01-23,252
2,12287447,2020-01-23,2020-01-23,0
3,31953938,2017-11-09,2018-08-20,284
4,31953938,2018-08-13,2018-08-20,7


In [61]:
# Join PET and PET-ECHO tables
pet_echo = rdf.merge(pet, on=['mrn', 'pet_date'], how='left')
pet_without_pair = pet_echo.loc[pet_echo.petmrn_identifier.isnull()]
pet2 = pet_with_pair = pet_echo.loc[~pet_echo.petmrn_identifier.isnull()]
print(f'PET-ECHO pairs without PET data: {pet_without_pair.shape[0]} ' \
      f'from {len(pet_without_pair.mrn.unique())} mrns.')
print(f'PET-ECHO pairs with PET data: {pet_with_pair.shape[0]} ' \
      f'from {len(pet_with_pair.mrn.unique())} mrns.')
print(f'PET-ECHO pairs from match list: {rdf.shape[0]} from {len(rdf.mrn.unique())} mrns.')

PET-ECHO pairs without PET data: 196 from 95 mrns.
PET-ECHO pairs with PET data: 7227 from 3498 mrns.
PET-ECHO pairs from match list: 7423 from 3567 mrns.


In [62]:
pet2.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier
0,17666439,2017-03-14,2017-07-20,128,17666439_2017-07-20
1,12287447,2019-05-16,2020-01-23,252,12287447_2020-01-23
2,12287447,2020-01-23,2020-01-23,0,12287447_2020-01-23
3,31953938,2017-11-09,2018-08-20,284,31953938_2018-08-20
4,31953938,2018-08-13,2018-08-20,7,31953938_2018-08-20


#### NPY ECHO FILES ####
This is a data frame that contains the filenames, MRNs and dates of all echo studies

In [63]:
echo_files_name = 'echo_npyFiles_BWH_'+cfr_meta_date+'.parquet'
echo = pd.read_parquet(os.path.join(meta_dir, echo_files_name))
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

echo = echo.astype({'mrn': 'int64',
                    'echo_study_date': 'datetime64[ns]'})

print(echo_files_name)
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()

# For matching dates with PET, we only need the studies
# So we can greatly reduce this data frame
echo = echo[['study', 'mrn', 'echo_study_date']].drop_duplicates().reset_index(drop = True).\
    sort_values(by = 'mrn', ascending = True)

echo_npyFiles_BWH_200416.parquet
Number of npy files: 1366266
Unique echo studies: 27251
Unique patients:     16373



In [64]:
echo.head()

Unnamed: 0,study,mrn,echo_study_date
3912,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,9241,2010-01-26
3917,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,9241,2010-04-02
4000,4b7b463e0550e9de_4903a58051d59d426fe452739a29,41723,2015-11-03
3923,4b7b463fc2f49a36_4903a58dd5465407d53d20ba3790,51599,2018-05-22
3932,4b7b463fc2f49a36_4903a58793b43bb4bb1078d62f4e,51599,2012-03-08


#### PET STUDIES + NPY ECHO DATA merged on mrn only ####

In [78]:
pet_study = pet2.merge(right = echo, on = 'mrn', how = 'left')
pet_study = pet_study.loc[~pet_study.study.isnull()].sort_values(by='mrn')
print(f'PET-ECHO pairs with PET data: {pet2.shape[0]} ' \
      f'from {len(pet2.mrn.unique())} mrns.')
print(f'PET-ECHO pairs with PET data and echo study: {pet_study.shape[0]} ' \
      f'from {len(pet_study.mrn.unique())} mrns.')

PET-ECHO pairs with PET data: 7227 from 3498 mrns.
PET-ECHO pairs with PET data and echo study: 18338 from 2880 mrns.


In [79]:
pet_study[pet_study.mrn==39835152]

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,study,echo_study_date
1154,39835152,2019-07-29,2019-09-10,43,39835152_2019-09-10,48bca16ebb9f79e9_4903a58c1bc99380baa33591b299,2019-06-14
1153,39835152,2019-07-29,2019-09-10,43,39835152_2019-09-10,48bca16ebb9f79e9_4903a58c1bc8b8b4edc2ff652898,2019-07-29
1152,39835152,2019-06-14,2019-09-10,88,39835152_2019-09-10,48bca16ebb9f79e9_4903a58c1bc99380baa33591b299,2019-06-14
1151,39835152,2019-06-14,2019-09-10,88,39835152_2019-09-10,48bca16ebb9f79e9_4903a58c1bc8b8b4edc2ff652898,2019-07-29


### Match ECHO study with PET reportID  ###
Goal:
Find a PET study for each echo video taken from the closest PET measurement.
Requirement: We want to pick the most recent CFR measurments for each echo that is not older than 365 days.
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [113]:
df = pet_study.copy()
mrn = 92742
df_mrn = df[df.mrn==mrn]
echo_study_list = list(df_mrn.echo_study_date.unique())
echo_study_date = echo_date_list[0]
pet_dates = df_mrn[['petmrn_identifier', 'pet_date']].groupby('petmrn_identifier').first().reset_index(drop=False)
pet_dates = pet_dates.assign(days_post_cfr=(echo_study_date-pet_dates.pet_date).dt.days)
pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days_post_cfr))
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]

In [154]:
def match_pet_echo(df):
    """ 
    Determine the earliest PET measurement for all studies
    """

    # Loop through all MRNs in df_pet_echo
    mrn_list = sorted(list(df.mrn.unique()))
    cfr_echo_days = pd.DataFrame()
    for m, mrn in enumerate(mrn_list):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, len(mrn_list)))

        df_mrn = df[df.mrn == mrn]
        echo_study_list = list(df_mrn.study.unique())

        # Loop through all echo studies for this MRN
        for study in echo_study_list:
            echo_study_date = df_mrn[df_mrn.study == study].echo_study_date.values[0]

            # Find all pet dates for this patient and add the difference to the echo dates
            pet_dates = df_mrn[['petmrn_identifier', 'pet_date']].groupby('petmrn_identifier').first().reset_index(drop=False)
            pet_dates = pet_dates.assign(days_post_pet=(echo_study_date-pet_dates.pet_date).dt.days)
            pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days_post_pet))
            ser = pet_dates.loc[pet_dates.days_abs.idxmin()]

            # We can return the reportID and the days of the earliest pet study
            study_dict = {'mrn': [mrn],
                          'study': [study],
                          'pet_date': [ser.pet_date],
                          'echo_date': [echo_study_date],
                          'petmrn_identifier': [ser.petmrn_identifier],
                          'days_post_pet': [ser.days_post_pet]}
            cfr_echo_days = pd.concat([cfr_echo_days, pd.DataFrame(study_dict)], ignore_index=True)
    
    return cfr_echo_days

In [155]:
pet_echo_days = match_pet_echo(pet_study)

Processing MRN 200 of 2880
Processing MRN 400 of 2880
Processing MRN 600 of 2880
Processing MRN 800 of 2880
Processing MRN 1000 of 2880
Processing MRN 1200 of 2880
Processing MRN 1400 of 2880
Processing MRN 1600 of 2880
Processing MRN 1800 of 2880
Processing MRN 2000 of 2880
Processing MRN 2200 of 2880
Processing MRN 2400 of 2880
Processing MRN 2600 of 2880
Processing MRN 2800 of 2880


Here, we have picked the closest PET date for all echo studies for this mrn. And the closest PET date was the same for all echos. Therefore, PET scans that occurred earlier that do not have an echo that is closer, will disappear.

In [145]:
pet_echo_days.head()

Unnamed: 0,mrn,study,pet_date,echo_study_date,petmrn_identifier,days_post_cfr
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-01-28,2010-04-02,9241_2010-01-28,64
1,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-28,2010-01-26,9241_2010-01-28,-2
2,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2015-09-17,2013-06-03,59691_2015-09-17,-836
3,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-09-17,2015-08-31,59691_2015-09-17,-17
4,88435,4b7b46323562dafe_4903a58c1bccabff29a2958b368c,2015-02-20,2019-03-07,88435_2015-02-20,1476


In [151]:
rdf.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days)
0,17666439,2017-03-14,2017-07-20,128
1,12287447,2019-05-16,2020-01-23,252
2,12287447,2020-01-23,2020-01-23,0
3,31953938,2017-11-09,2018-08-20,284
4,31953938,2018-08-13,2018-08-20,7


In [153]:
# Join back Rahul's PET-ECHO pairs
pet_echo_days = pet_echo_days.merge(rdf, on = ['mrn', 'echo_date', 'pet_date'], how = 'left')

KeyError: 'echo_date'

In [147]:
# After the match, do we have the same number of patients
print('Total patients after match:{}'.format(len(pet_echo_days.mrn.unique())))
print('Echo studies after match:  {}'.format(len(pet_echo_days.study.unique())))
print('PET studies after match:   {}'.format(len(pet_echo_days.petmrn_identifier.unique())))
print('Original PET studies:      {}'.format(len(pet.petmrn_identifier.unique())))

Total patients after match:2880
Echo studies after match:  5546
PET studies after match:   3001
Original PET studies:      4136


In [34]:
# Next, we will filter those echo studies that occurred earlier or later than 365 days after CFR
pet_echo_days_365 = pet_echo_days.assign(abs_pet_days = np.abs(pet_echo_days.days_post_cfr))
pet_cfr_echo_days_365 = pet_cfr_echo_days_365[pet_cfr_echo_days_365.abs_cfr_days<=365].drop(columns = ['abs_cfr_days'])
# Sort this by study so that we know how well we matched the dates
pet_cfr_echo_days_365 = pet_cfr_echo_days_365.sort_values(by = 'study')

In [36]:
# After the match, do we have the same number of patients
print('Patients after 365 filter     {}'.format(len(pet_cfr_echo_days_365.mrn.unique())))
print('Echo studies after 365 filter {}'.format(len(pet_cfr_echo_days_365.study.unique())))

Patients after 365 filter     1577
Echo studies after 365 filter 2292


In [54]:
# Patients who dropped out because of the 365 time limit
mrn_before_filter = set(list(cfr_echo_days.mrn.unique()))
mrn_after_filter = set(list(pet_cfr_echo_days_365.mrn.unique()))
mrn_365_dropped = list(mrn_before_filter.difference(mrn_after_filter))
print('MRNs lost after 365 filter {}'.format(len(mrn_365_dropped)))
mrn_dropped_file = os.path.join(meta_dir, 'mrn_365_dropped.pkl')
with open(mrn_dropped_file, 'wb') as fl:
    pickle.dump(mrn_365_dropped, fl)

MRNs lost after 365 filter 85


In [55]:
# To match with filenames, we really only need one study per row. 
# Remove colums where some rows have missing values
pet_cfr_echo_days_noNA = pet_cfr_echo_days_365.dropna(axis=1, how='any') 
pet_cfr_echo_days_study = pet_cfr_echo_days_noNA.groupby('study').first().reset_index(drop=False)

print('Original shape: {} now: {}'.format(cfr_echo_days.shape,
                                          pet_cfr_echo_days_365.shape))

print('Original number of studies: {} now: {}'.format(len(cfr_echo_days.study.unique()),
                                                      len(pet_cfr_echo_days_study.study.unique())))

Original shape: (2954, 5) now: (2297, 116)
Original number of studies: 2954 now: 2292


In [56]:
d = pet_cfr_echo_days_study[['mrn', 'study', 'cfr_study_date', 'echo_study_date', 'days_post_cfr']]
d.head(10)

Unnamed: 0,mrn,study,cfr_study_date,echo_study_date,days_post_cfr
0,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-04-25,2018-11-02,191
1,35014216,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,2018-05-08,2019-04-25,352
2,35029545,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,2017-07-24,2017-05-16,-69
3,35583509,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,2018-08-08,2017-09-06,-336
4,35401116,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,2019-04-11,2019-04-09,-2
5,35401116,48b095533cbfdf01_4903a58dd5441181fcd696f7d50c,2019-04-11,2018-07-25,-260
6,35647072,48b097cb6874ce59_4903a582edf18da17812c4897c25,2017-11-01,2017-10-25,-7
7,35995307,48b0984d74745d55_4903a582edf18da3b9aa6d4547eb,2017-10-27,2017-10-27,0
8,34940536,48b1a5aeaf91d4d2_4903a582edf18da48d51c09a959b,2017-06-26,2017-10-20,116
9,34530097,48b1a9c06ce2b304_4903a582ec7226d2e7bd522f25ce,2017-04-18,2017-03-31,-18


In [57]:
pet_cfr_echo_days_study.head(2)

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661


In [58]:
print(pet_cfr_echo_days_study.shape)
pet_cfr_echo_days_study.head()

(2292, 16)


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2617,-69,2017-07-24,35029545_2017-07-24,0,abnormal,2.698341,2.554037,0.599508,1.531167,5,0.663,1.789
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2620,-336,2018-08-08,35583509_2018-08-08,0,abnormal,1.99466,2.02646,0.544319,1.103042,3,0.749,1.494
4,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,35401116,2019-04-09,2619,-2,2019-04-11,35401116_2019-04-11,1,normal,1.11,1.11,0.79,0.88,17,0.79,0.88


In [59]:
print('Number of echo studies within 365 days of cfr {}'.format(len(pet_cfr_echo_days_study.study.unique())))
print('Number of PET studies {}'.format(len(pet_cfr_echo_days_study.mrnstudyid.unique())))
print('Total number of studies in petecho df {}'.format(len(petecho.study.unique())))
print('Unique patients:     {}'.format(len(pet_cfr_echo_days_study.mrn.unique())))

Number of echo studies within 365 days of cfr 2292
Number of PET studies 1638
Total number of studies in petecho df 2954
Unique patients:     1577


In [60]:
# We can now save the data frame with the echo studies and the matched CFR measurements
match_filename = 'pet_match365_diff_'+cfr_meta_date+'.parquet'
pet_cfr_echo_days_study.to_parquet(os.path.join(meta_dir, match_filename))

# Save also as .csv file
match_filename_csv = 'pet_match365_diff_'+cfr_meta_date+'.csv'
pet_cfr_echo_days_study.to_csv(os.path.join(meta_dir, match_filename_csv))

In [61]:
pet_cfr_echo_days_study.head()

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584
1,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,35014216,2019-04-25,2616,352,2018-05-08,35014216_2018-05-08,0,abnormal,2.644905,2.876778,0.63689,1.832192,8,0.628,1.661
2,48b091b714b41ffe_4903a582ec746dce5e9b6c1f4070,35029545,2017-05-16,2617,-69,2017-07-24,35029545_2017-07-24,0,abnormal,2.698341,2.554037,0.599508,1.531167,5,0.663,1.789
3,48b09412e8754782_4903a582ec78ec0366e9db9e17cd,35583509,2017-09-06,2620,-336,2018-08-08,35583509_2018-08-08,0,abnormal,1.99466,2.02646,0.544319,1.103042,3,0.749,1.494
4,48b095533cbfdf01_4903a58c1bcb2198682a98b7503a,35401116,2019-04-09,2619,-2,2019-04-11,35401116_2019-04-11,1,normal,1.11,1.11,0.79,0.88,17,0.79,0.88


#### Filter the large echo_BWH_npy_feather_files dataframe for easier metadata collection ####

In [65]:
# With the names of the matched echo studies, we can now get the file names
# of the meta files that we will need

meta_files_name = 'echo_BWH_npy_feather_files_'+cfr_meta_date+'.parquet'
meta_files_cfr_name = 'echo_BWH_npy_feather_files_cfr_'+cfr_meta_date+'.parquet'

meta_files_df = pd.read_parquet(os.path.join(meta_dir, meta_files_name))
meta_files_df_matched = meta_files_df[meta_files_df.study.isin(pet_cfr_echo_days_study.study.unique())]
print(meta_files_df_matched.shape)
print('Unique studies in meta data: {}'.format(len(meta_files_df_matched.study.unique())))
meta_files_df_matched.head(2)
meta_files_df_matched.to_parquet(os.path.join(meta_dir, meta_files_cfr_name))

(603984, 8)
Unique studies in meta data: 2291
