### Match ECHO videos with PET CFR labels ###
Master PET-ECHO match list: All data available on FS. Need to be filtered to create data sets.

In [2]:
import os
import pickle
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 2000)

In [3]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200519'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200519


#### PET DATA ####

In [4]:
pet_echo_file = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr/pets_with_echos_all.parquet')
pet_cols = ['mrn', 'pet_date', 'petmrn_identifier']
pet = pd.read_parquet(pet_echo_file)
pet = pet[pet_cols].dropna(how='all').drop_duplicates()
pet = pet.astype({'pet_date': 'datetime64[ns]'})
print(f'mrn-pet_date pairs {len(pet.petmrn_identifier.unique())}')
pet.head(2)

# We need to indicate that we have PET data for these patients
pet = pet.assign(pet_measurement=1).astype({'pet_measurement': 'int'})

mrn-pet_date pairs 4136


#### PET ECHO PAIRS (Rahul's list) ####

In [6]:
match_list_file = 'bwh_2020-Feb-4_pet_365_diff.txt'
rdf = pd.read_csv(os.path.join(cfr_data_root, match_list_file), sep = '\t')
rdf = rdf.astype({'echo_date': 'datetime64[ns]',
                  'pet_date': 'datetime64[ns]'}).\
        rename(columns={'MRN': 'mrn'}).sort_values('mrn')

# Add petmrn_identifier
rdf = rdf.assign(petmrn_identifier=rdf.mrn.astype(str)+'_'+rdf.pet_date.astype(str))
print(f'PET echo pairs: {len(rdf.petmrn_identifier.unique())}')

PET echo pairs: 3891


In [7]:
r = rdf[rdf.mrn==59691]
j = pet[pet.mrn==59691]

In [8]:
r

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier
564,59691,2019-11-18,2019-01-17,305,59691_2019-01-17
566,59691,2016-09-22,2016-09-22,0,59691_2016-09-22
565,59691,2015-08-31,2015-09-17,17,59691_2015-09-17
563,59691,2018-11-19,2019-01-17,59,59691_2019-01-17


In [9]:
j

Unnamed: 0,mrn,pet_date,petmrn_identifier,pet_measurement
2606,59691,2013-10-02,59691_2013-10-02,1
2785,59691,2014-06-27,59691_2014-06-27,1
3065,59691,2015-09-17,59691_2015-09-17,1


In [10]:
m = r.merge(j, on=['mrn', 'pet_date', 'petmrn_identifier'], how='outer')

In [11]:
m

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
0,59691,2019-11-18,2019-01-17,305.0,59691_2019-01-17,
1,59691,2018-11-19,2019-01-17,59.0,59691_2019-01-17,
2,59691,2016-09-22,2016-09-22,0.0,59691_2016-09-22,
3,59691,2015-08-31,2015-09-17,17.0,59691_2015-09-17,1.0
4,59691,NaT,2013-10-02,,59691_2013-10-02,1.0
5,59691,NaT,2014-06-27,,59691_2014-06-27,1.0


In [12]:
# Join PET and PET-ECHO PAIRS tables
pet_echo = rdf.merge(pet, on=['mrn', 'pet_date', 'petmrn_identifier'], how='outer')
pet_echo.head(2)

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
0,9241,2010-01-26,2010-01-28,2.0,9241_2010-01-28,1.0
1,9241,2010-04-02,2010-01-28,64.0,9241_2010-01-28,1.0


In [13]:
# MRN-PET combinations from Rahul's match list that are not in Jenine's PET table:
missing_pet_file = 'missing_pet.parquet'
missing_pet = pet_echo.loc[pet_echo.pet_measurement.isnull()]
missing_pet.to_parquet(os.path.join(meta_dir, missing_pet_file))
print(f'mrn-pet_date pairs in Rahuls list: {len(rdf.petmrn_identifier.unique())}')
print(f'mrn-pet_date pairs without PET measurement: {len(missing_pet.petmrn_identifier.unique())}')
print(f'mrns that are missing PET measurements: {len(missing_pet.mrn.unique())}')
missing_pet.head()

mrn-pet_date pairs in Rahuls list: 3891
mrn-pet_date pairs without PET measurement: 100
mrns that are missing PET measurements: 95


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
2,59691,2019-11-18,2019-01-17,305.0,59691_2019-01-17,
3,59691,2018-11-19,2019-01-17,59.0,59691_2019-01-17,
4,59691,2016-09-22,2016-09-22,0.0,59691_2016-09-22,
321,1816339,2015-12-09,2015-12-10,1.0,1816339_2015-12-10,
322,1816339,2016-01-15,2015-12-10,36.0,1816339_2015-12-10,


In [14]:
# MRN-PET combinations in Jenine's table that were not identified by Rahul
missing_echo_match_file = 'missing_echo_pet_pairs.parquet'
missing_echo_match = pet_echo.loc[(pet_echo.pet_measurement==1) & 
                                  (pet_echo.echo_date.isnull())]
missing_echo_match.to_parquet(os.path.join(meta_dir, missing_echo_match_file))

print(f'mrn-pet_date pairs in Jenines list: {len(pet.petmrn_identifier.unique())}')
print(f'mrn-pet_date pairs without echo date: {len(missing_echo_match.petmrn_identifier.unique())}')
print(f'mrns with pet and no echos: {len(missing_echo_match.mrn.unique())}')
missing_echo_match.head()

mrn-pet_date pairs in Jenines list: 4136
mrn-pet_date pairs without echo date: 345
mrns with pet and no echos: 289


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
7423,12627030,NaT,2006-02-10,,12627030_2006-02-10,1.0
7424,15314750,NaT,2006-02-23,,15314750_2006-02-23,1.0
7425,1764927,NaT,2006-11-13,,1764927_2006-11-13,1.0
7426,6040208,NaT,2006-11-14,,6040208_2006-11-14,1.0
7427,14135529,NaT,2007-05-09,,14135529_2007-05-09,1.0


In [15]:
# We will continue with our list of pet_measurements, even if they are not on Rahul's list
# Because we may find echo data from mrns on disk
pet2 = pet_echo.loc[~pet_echo.pet_measurement.isnull()]
print(f'mrn-pet_date pairs {len(pet2.petmrn_identifier.unique())}')
pet2.head()

mrn-pet_date pairs 4136


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
0,9241,2010-01-26,2010-01-28,2.0,9241_2010-01-28,1.0
1,9241,2010-04-02,2010-01-28,64.0,9241_2010-01-28,1.0
5,59691,2015-08-31,2015-09-17,17.0,59691_2015-09-17,1.0
6,68189,2013-10-22,2013-10-22,0.0,68189_2013-10-22,1.0
7,81182,2008-01-04,2007-08-17,140.0,81182_2007-08-17,1.0


#### NPY ECHO FILES ####
This is a data frame that contains the filenames, MRNs and dates of all echo studies

In [16]:
echo_files_name = 'echo_npyFiles_BWH_'+cfr_meta_date+'.parquet'
echo = pd.read_parquet(os.path.join(meta_dir, echo_files_name))
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

echo = echo.astype({'mrn': 'int64',
                    'echo_study_date': 'datetime64[ns]'})

print(echo_files_name)
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()

# For matching dates with PET, we only need the studies
# So we can greatly reduce this data frame
echo = echo[['study', 'mrn', 'echo_study_date']].drop_duplicates().reset_index(drop = True).\
    sort_values(by = 'mrn', ascending = True)
echo.head()

echo_npyFiles_BWH_200519.parquet
Number of npy files: 2039672
Unique echo studies: 41003
Unique patients:     23803



Unnamed: 0,study,mrn,echo_study_date
5689,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,9241,2010-04-02
5679,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,9241,2010-01-26
5823,4b7b463e0550e9de_4903a58051d59d426fe452739a29,41723,2015-11-03
5715,4b7b463fc2f49a36_4903a58793b43bb4bb1078d62f4e,51599,2012-03-08
5702,4b7b463fc2f49a36_4903a58dd5465407d53d20ba3790,51599,2018-05-22


#### PET STUDIES + NPY ECHO DATA merged on mrn only ####

In [17]:
pet_study = pet2.merge(right = echo, on = 'mrn', how = 'left')

In [18]:
print(f'total mrn-pet_date pairs:  {len(pet2.petmrn_identifier.unique())}')
print(f'mrn-pet_date pairs merged: {len(pet_study.petmrn_identifier.unique())}')
pet_study.loc[pet_study.study.isnull()]
# Some of these mrns did not have an echo on file
pet_study_missing_echo = pet_study.loc[pet_study.study.isnull()]
print(f'mrn-pet_date paris without echo on disk: {len(pet_study_missing_echo.petmrn_identifier.unique())}')
print(f'mrns without echo on file: {len(pet_study_missing_echo.petmrn_identifier.unique())}')

mrn_pet_date_missing_echo_file = 'mrn_pet_missing_echo_file.parquet'
pet_study_missing_echo.to_parquet(os.path.join(meta_dir, mrn_pet_date_missing_echo_file))

total mrn-pet_date pairs:  4136
mrn-pet_date pairs merged: 4136
mrn-pet_date paris without echo on disk: 245
mrns without echo on file: 245


In [20]:
pet_study_missing_echo.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement,study,echo_study_date
76,371260,2010-08-05,2010-01-27,190.0,371260_2010-01-27,1.0,,NaT
296,813071,2008-03-05,2008-04-09,35.0,813071_2008-04-09,1.0,,NaT
528,1145168,2011-09-08,2012-01-23,137.0,1145168_2012-01-23,1.0,,NaT
1168,1797034,2011-07-13,2011-06-10,33.0,1797034_2011-06-10,1.0,,NaT
1199,1822717,2014-05-27,2014-06-16,20.0,1822717_2014-06-16,1.0,,NaT


In [21]:
# OK let's drop all rows where we dont have echo studies
pet_study = pet_study.loc[~pet_study.study.isnull()]
print(f'mrn-pet_date pairs with PET and echo:    {len(pet_study.petmrn_identifier.unique())}')
# Now we need to match pet_date and echo_dates (we have two echo dates from the merge with .npy files)
pet_study[pet_study.mrn==39835152]

mrn-pet_date pairs with PET and echo:    3891


Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement,study,echo_study_date
28074,39835152,2019-06-14,2019-09-10,88.0,39835152_2019-09-10,1.0,48bca16ebb9f79e9_4903a58c1bc8b8b4edc2ff652898,2019-07-29
28075,39835152,2019-06-14,2019-09-10,88.0,39835152_2019-09-10,1.0,48bca16ebb9f79e9_4903a58c1bc99380baa33591b299,2019-06-14
28076,39835152,2019-07-29,2019-09-10,43.0,39835152_2019-09-10,1.0,48bca16ebb9f79e9_4903a58c1bc8b8b4edc2ff652898,2019-07-29
28077,39835152,2019-07-29,2019-09-10,43.0,39835152_2019-09-10,1.0,48bca16ebb9f79e9_4903a58c1bc99380baa33591b299,2019-06-14


### Match ECHO study with PET reportID  ###
Goal:
Find a PET study for each echo video taken from the closest PET measurement.
Requirement: We want to pick the most recent CFR measurments for each echo that is not older than 365 days.
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [22]:
def match_pet_echo(df):
    """ 
    Determine the earliest PET measurement for all studies
    """

    # Loop through all MRNs in df_pet_echo
    mrn_list = sorted(list(df.mrn.unique()))
    cfr_echo_days = pd.DataFrame()
    for m, mrn in enumerate(mrn_list):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, len(mrn_list)))

        df_mrn = df[df.mrn == mrn]
        echo_study_list = list(df_mrn.study.unique())

        # Loop through all echo studies for this MRN
        for study in echo_study_list:
            echo_study_date = df_mrn[df_mrn.study == study].echo_study_date.values[0]

            # Find all pet dates for this patient and add the difference to the echo dates
            pet_dates = df_mrn[['petmrn_identifier', 'pet_date']].groupby('petmrn_identifier').first().reset_index(drop=False)
            pet_dates = pet_dates.assign(days_post_pet=(echo_study_date-pet_dates.pet_date).dt.days)
            pet_dates = pet_dates.assign(days_abs = np.abs(pet_dates.days_post_pet))
            ser = pet_dates.loc[pet_dates.days_abs.idxmin()]

            # We can return the reportID and the days of the earliest pet study
            study_dict = {'mrn': [mrn],
                          'study': [study],
                          'pet_date': [ser.pet_date],
                          'echo_date': [echo_study_date],
                          'petmrn_identifier': [ser.petmrn_identifier],
                          'days_post_pet': [ser.days_post_pet]}
            cfr_echo_days = pd.concat([cfr_echo_days, pd.DataFrame(study_dict)], ignore_index=True)
    
    return cfr_echo_days

In [23]:
pet_echo_days = match_pet_echo(pet_study)

Processing MRN 200 of 3285
Processing MRN 400 of 3285
Processing MRN 600 of 3285
Processing MRN 800 of 3285
Processing MRN 1000 of 3285
Processing MRN 1200 of 3285
Processing MRN 1400 of 3285
Processing MRN 1600 of 3285
Processing MRN 1800 of 3285
Processing MRN 2000 of 3285
Processing MRN 2200 of 3285
Processing MRN 2400 of 3285
Processing MRN 2600 of 3285
Processing MRN 2800 of 3285
Processing MRN 3000 of 3285
Processing MRN 3200 of 3285


Here, we have picked the closest PET date for all echo studies for this mrn. And the closest PET date was the same for all echos. Therefore, PET scans that occurred earlier that do not have an echo that is closer, will disappear.

In [24]:
pet2.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),petmrn_identifier,pet_measurement
0,9241,2010-01-26,2010-01-28,2.0,9241_2010-01-28,1.0
1,9241,2010-04-02,2010-01-28,64.0,9241_2010-01-28,1.0
5,59691,2015-08-31,2015-09-17,17.0,59691_2015-09-17,1.0
6,68189,2013-10-22,2013-10-22,0.0,68189_2013-10-22,1.0
7,81182,2008-01-04,2007-08-17,140.0,81182_2007-08-17,1.0


In [25]:
# I still want to know how this match compares to Rahu's original match list
pet_echo_matched = pet_echo_days.merge(pet2, on = ['mrn', 'echo_date', 'pet_date', 'petmrn_identifier'], how='left')
pet_echo_matched = pet_echo_matched.sort_values(by='mrn').reset_index(drop=True)
pet_echo_matched.head(20)
print(f'PET studies before match with echo: {len(pet2.petmrn_identifier.unique())}')
print(f'PET studies after match with echo:  {len(pet_echo_matched.petmrn_identifier.unique())}')
# After match there are fewer PET studies because we only picked the closest PET measurment for each echo
print(f'mrns in original PET file:          {len(pet.mrn.unique())}')
print(f'mrns in  match file:                {len(pet2.mrn.unique())}')
print(f'mrns with PET and echo after match: {len(pet_echo_matched.mrn.unique())}')

PET studies before match with echo: 4136
PET studies after match with echo:  3643
mrns in original PET file:          3516
mrns in  match file:                3516
mrns with PET and echo after match: 3285


In [26]:
# Filter echo studies that do not have a PET within 365 days
pet_echo_matched_365 = pet_echo_matched.assign(abs_pet_days = np.abs(pet_echo_matched.days_post_pet))
pet_echo_matched_365 = pet_echo_matched_365[pet_echo_matched_365.abs_pet_days<=365].drop(columns = ['abs_pet_days'])
pet_echo_matched_365 = pet_echo_matched_365.sort_values(by = 'study')

In [27]:
# After the match, do we have the same number of patients
print(f'mrns before 365 filter:        {len(pet_echo_matched.mrn.unique())}')
print(f'mrns after 365 filter:         {len(pet_echo_matched_365.mrn.unique())}')
print()
print(f'echo studies before 365 filter:{len(pet_echo_matched.study.unique())}')
print(f'echo studies after 365 filter: {len(pet_echo_matched_365.study.unique())}')

mrns before 365 filter:        3285
mrns after 365 filter:         3257

echo studies before 365 filter:8103
echo studies after 365 filter: 6443


In [28]:
len(pet_echo_matched_365.petmrn_identifier.unique())

3590

In [29]:
len(pet_echo_matched_365.mrn.unique())

3257

In [30]:
len(pet_echo_matched_365.study.unique())

6443

In [31]:
pet_echo_matched_365.head(20)

Unnamed: 0,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),pet_measurement
7910,35156678,48b09010a2219aad_4903a582edf3bd118ffb3386065b,2018-10-15,2017-12-06,35156678_2018-10-15,-313,313.0,1.0
7911,35156678,48b09010a2219aad_4903a58dd44a9e3cbfca18de5a4a,2018-10-15,2018-10-12,35156678_2018-10-15,-3,3.0,1.0
7912,35169325,48b09013185d7d6b_4903a582ec746dc05201b6bc42b3,2017-05-22,2017-05-18,35169325_2017-05-22,-4,4.0,1.0
7913,35169325,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113,2017-05-22,2017-06-21,35169325_2017-05-22,30,30.0,1.0
7908,35133560,48b09016b25a7f36_4903a582ec746dce5e9b6e469ed0,2018-04-25,2017-05-16,35133560_2018-04-25,-344,344.0,1.0
7905,35133560,48b09016b25a7f36_4903a582edf01a92b1053e7601dc,2018-04-25,2017-11-18,35133560_2018-04-25,-158,158.0,1.0
7907,35133560,48b09016b25a7f36_4903a582edf18da17815dee81bd5,2018-04-25,2017-10-25,35133560_2018-04-25,-182,182.0,1.0
7906,35133560,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,2018-04-25,2018-11-02,35133560_2018-04-25,191,191.0,1.0
7904,35133560,48b09016b25a7f36_4903a58dd547fecc6765cc3a9cba,2018-04-25,2018-04-24,35133560_2018-04-25,-1,1.0,1.0
7895,35014216,48b091b44637c987_4903a58c1bcb23767fc2818c6e63,2018-05-08,2019-04-25,35014216_2018-05-08,352,352.0,1.0


In [33]:
# We can now save the data frame with the echo studies and the matched CFR measurements
match_filename = 'pet_match365_diff_'+cfr_meta_date+'.parquet'
pet_echo_matched_365.to_parquet(os.path.join(meta_dir, match_filename))

# Save also as .csv file
match_filename_csv = 'pet_match365_diff_'+cfr_meta_date+'.csv'
pet_echo_matched_365.to_csv(os.path.join(meta_dir, match_filename_csv))
print(pet_echo_matched_365.shape)

(6443, 8)
