### Match ECHO videos with PET CFR labels ###
We need this to assign the CFR labels to the videos. The goal is to have a data set with PET CFR values for each video that we will use to train.

In [1]:
import os
import numpy as np
import pandas as pd

# Widen the pandas display limits so that large frames are shown without truncation
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.width = 2000

#### CFR REPORTS ####

In [117]:
# PET REPORTS
# Read the spreadsheet with the PET reports
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
pet_df = pd.read_excel(pet_reports_file)

# Keep the identifier, date and cfr columns, convert both date columns to
# datetime64 and prefix their names with 'cfr_'
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
pet = (
    pet_df[keep_cols]
    .astype({'report_date': 'datetime64[ns]', 'study_date': 'datetime64[ns]'})
    .rename(columns={'report_date': 'cfr_report_date', 'study_date': 'cfr_study_date'})
)

# Distinct-value counts (a missing value, if present, counts as one value)
print('Number of reports: {}'.format(pet.report_number.nunique(dropna=False)))
print('Unique patients:   {}'.format(pet.mrn.nunique(dropna=False)))
print('Unique cfr values: {}'.format(pet.cfr.nunique(dropna=False)))

Number of reports: 3350
Unique patients:   3072
Unique cfr values: 3414


In [118]:
# Check the column types, in particular that both date columns are datetime64[ns]
pet.dtypes

reportID                    int64
mrn                         int64
subjectid                   int64
report_number              object
cfr_study_date     datetime64[ns]
cfr_report_date    datetime64[ns]
cfr                       float64
dtype: object

#### NPY ECHO FILES ####
This is a data frame that contains the filenames, MRNs and dates of all echo studies

In [119]:
# Load the table of echo npy files and split its `datetime` column into a
# date column (converted to datetime64) and a time column
echo_dir = os.path.join(cfr_data_root, 'metadata_200131')
echo_files_name = 'echo_npyFiles_BWH_200131.parquet'
echo = (
    pd.read_parquet(os.path.join(echo_dir, echo_files_name))
    .assign(echo_study_date=lambda df: df.datetime.dt.date,
            echo_study_time=lambda df: df.datetime.dt.time)
    .astype({'mrn': 'int64', 'echo_study_date': 'datetime64[ns]'})
)

print('Number of npy files: {}'.format(echo.filename.nunique(dropna=False)))
print('Unique echo studies: {}'.format(echo.study.nunique(dropna=False)))
print('Unique patients:     {}'.format(echo.mrn.nunique(dropna=False)))
print()

# For matching dates with PET, we only need the studies, so the frame is
# reduced to the distinct (study, mrn, echo_study_date) rows, ordered by mrn
study_cols = ['study', 'mrn', 'echo_study_date']
echo = (
    echo[study_cols]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by='mrn', ascending=True)
)

Number of npy files: 1029179
Unique echo studies: 20525
Unique patients:     12000



#### PET STUDIES + NPY ECHO DATA ####

In [131]:
# Left join on mrn: every PET report is kept, and reports of patients without
# any echo study get a missing `study` and `echo_study_date`
pet_echo = pd.merge(pet, echo, how='left', on='mrn')
pet_echo.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,study,echo_study_date
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232,,NaT
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579,,NaT
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c30c185baf6a7befa860,2007-04-30
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a44b32edba11d3b5854898ef,2008-01-29


In [132]:
# Split the merged frame on whether an echo study was found for the PET report
has_echo = pet_echo.study.notnull()
pet_echo_nofiles = pet_echo.loc[~has_echo].reset_index(drop=True)
petecho = pet_echo.loc[has_echo].reset_index(drop=True)

# Save both parts in the metadata directory
meta_dir = os.path.join(cfr_data_root, 'metadata_200131')
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH_200131.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH_200131.parquet'

pet_echo_nofiles.to_parquet(os.path.join(meta_dir, pet_echo_nofiles_name))
petecho.to_parquet(os.path.join(meta_dir, pet_echo_files_name))

In [133]:
print('PET with ECHO:')
# No npy file count here: the echo frame was reduced to one row per study,
# so petecho has no `filename` column
print('Unique echo studies: {}'.format(petecho.study.nunique(dropna=False)))
print('Unique patients:     {}'.format(petecho.mrn.nunique(dropna=False)))

PET with ECHO:
Unique echo studies: 3865
Unique patients:     1913


### Match ECHO study with PET reportID  ###
Goal:
Find a CFR label for each echo video, taken from the closest PET measurement.
Requirement: For each echo study we want to pick the CFR measurement that is closest in time (before or after the echo), and keep the match only if the two are at most 365 days apart.
For this, we need a data frame that contains for a given MRN:

- All echo studies for this patient
- A date for each echo study
- All pet reports for that patient
- Date for each pet report

In [134]:
# Preview the merged frame: one row per combination of a PET report and an echo study with the same mrn
petecho.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,study,echo_study_date
0,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29
1,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c30c185baf6a7befa860,2007-04-30
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a44b32edba11d3b5854898ef,2008-01-29
3,100873,7633498,11,EVS0266850,2010-01-29,2010-09-24,1.776545,4b7c286f1fbab5f7_4903a5858427dadef2c7c049a6e0,2010-09-24
4,103871,24935165,14,EVS0249881,2010-06-16,2010-06-07,1.383662,4907b0f8e5711d90_4903a58584283aa9efdd537dee9e,2010-06-07


In [135]:
# Pick a patient. The MRN is fixed for this walkthrough; a random one could be
# drawn with np.random.choice(petecho.mrn.unique(), size = 1,)[0] instead.
mrn = 15467822
print('MRN {}'.format(mrn))

# All PET report / echo study combinations of this patient
petecho_mrn = petecho.loc[petecho.mrn == mrn].reset_index(drop=True)
preview_cols = ['mrn', 'reportID', 'cfr_study_date', 'echo_study_date', 'study', 'cfr']
print(petecho_mrn.iloc[8:12][preview_cols])

MRN 15467822
         mrn  reportID cfr_study_date echo_study_date                                          study       cfr
8   15467822    126435     2013-09-05      2014-10-28  4a1347300a20f7e7_4903a581a681209d89d1fed6737d  1.182464
9   15467822    126435     2013-09-05      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932  1.182464
10  15467822    131307     2014-04-29      2012-09-06  4a1347300a20f7e7_4903a58793be6eaf2db65cba1044  0.909972
11  15467822    131307     2014-04-29      2014-08-29  4a1347300a20f7e7_4903a581a723bbe47f2a139a23e4  0.909972


In [136]:
# Pick one echo study of this patient and look up its date
echo_study = petecho_mrn.study.unique()[2]
echo_study_date = petecho_mrn.loc[petecho_mrn.study == echo_study, 'echo_study_date'].values[0]
print(echo_study)
print(echo_study_date)

# One row per PET report with its study date, plus the signed and the absolute
# distance in days to the chosen echo study (positive: PET study after the echo)
pet_dates = petecho_mrn[['reportID', 'cfr_study_date']].groupby('reportID', as_index=False).first()
day_offsets = (pet_dates.cfr_study_date - echo_study_date).dt.days
pet_dates = pet_dates.assign(days=day_offsets, days_abs=day_offsets.abs())
print()
print(pet_dates)

# The PET report closest in time is the row with the smallest absolute distance
# (the printed label says 'earliest', but the selection is made on days_abs)
ser = pet_dates.loc[pet_dates.days_abs.idxmin()]
print('ReportID of earliest PET study {}'.format(ser.reportID))
print('cfr_days_after_echo {}'.format(ser.days))

4a1347300a20f7e7_4903a581a72ff2f12e988b7e2f8d
2014-04-24T00:00:00.000000000

   reportID cfr_study_date  days  days_abs
0    119959     2012-10-11  -560       560
1    126435     2013-09-05  -231       231
2    131307     2014-04-29     5         5
3    133986     2014-09-02   131       131
ReportID of earliest PET study 131307
cfr_days_after_echo 5


In [164]:
# Now we can write a function that determines, for every echo study of every patient,
# the reportID of the CFR measurement closest in time and its offset in days.

def get_earliest_cfr_reports(df_cfr_echo):
    """
    Match every echo study with the CFR (PET) report that is closest in time.

    Despite the function name, the selected report is not the chronologically
    earliest one: for each echo study, the report of the same MRN with the
    smallest absolute difference in days between cfr_study_date and
    echo_study_date is chosen. If several reports are equally close, the one
    with the smallest reportID is taken.

    args: df_cfr_echo: pd.DataFrame()
          with columns: mrn, study, reportID, cfr_study_date, echo_study_date
    returns: cfr_echo_days: pd.DataFrame()
          with columns: mrn, study, echo_study_date, reportID, cfr_days_after_echo
          One row per mrn and study, MRNs in ascending order.
          cfr_days_after_echo is cfr_study_date minus echo_study_date in days
          (negative if the CFR study took place before the echo study).
          An input without rows returns an empty frame with these columns.
    """
    columns = ['mrn', 'study', 'echo_study_date', 'reportID', 'cfr_days_after_echo']
    n_mrn = df_cfr_echo.mrn.nunique()

    # Rows are collected in a list and converted once at the end; growing a
    # data frame with pd.concat inside the loop copies it on every iteration
    records = []

    # Loop through all MRNs (groupby yields them in ascending order)
    for m, (mrn, df_mrn) in enumerate(df_cfr_echo.groupby('mrn')):

        if (m+1)%200==0:
            print('Processing MRN {} of {}'.format(m+1, n_mrn))

        # The PET dates (one row per reportID, ordered by reportID) depend only
        # on the patient, so they are determined once per MRN, not once per study
        pet_dates = df_mrn[['reportID', 'cfr_study_date']].groupby('reportID').first().reset_index(drop=False)

        # Loop through all studies for this MRN in their order of first appearance
        for study, df_study in df_mrn.groupby('study', sort=False):
            echo_study_date = df_study.echo_study_date.values[0]

            # Signed distance of every PET study to this echo study; idxmin
            # returns the first row with the smallest absolute distance
            days = (pet_dates.cfr_study_date - echo_study_date).dt.days
            closest = days.abs().idxmin()

            records.append({'mrn': mrn,
                            'study': study,
                            'echo_study_date': echo_study_date,
                            'reportID': pet_dates.reportID.loc[closest],
                            'cfr_days_after_echo': days.loc[closest]})

    return pd.DataFrame(records, columns=columns)

In [165]:
# Match every echo study in petecho with a PET report (progress is printed every 200 MRNs)
cfr_echo_days = get_earliest_cfr_reports(petecho)

Processing MRN 200 of 1913
Processing MRN 400 of 1913
Processing MRN 600 of 1913
Processing MRN 800 of 1913
Processing MRN 1000 of 1913
Processing MRN 1200 of 1913
Processing MRN 1400 of 1913
Processing MRN 1600 of 1913
Processing MRN 1800 of 1913


In [166]:
# Preview the matches: one row per echo study with the matched reportID and the signed offset in days
cfr_echo_days.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,cfr_days_after_echo
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64
1,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-26,100820,2
2,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-08-31,141524,17
3,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2013-06-03,127002,121
4,88435,4b7b46323562dafe_4903a58c1bccabff29a2958b368c,2019-03-07,137391,-1476


In [167]:
# Preview the PET report columns (including the cfr value)
pet.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606
3,73048,16480857,10,0728923B,2007-11-26,2007-10-29,1.1075
4,100873,7633498,11,EVS0266850,2010-01-29,2010-09-24,1.776545


In [174]:
# Attach the columns of the PET report table to each matched echo study
# (left join on reportID and mrn keeps every matched study)
pet_cfr_echo_days = pd.merge(cfr_echo_days, pet, how='left', on=['reportID', 'mrn'])
pet_cfr_echo_days.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,cfr_days_after_echo,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683
1,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-26,100820,2,121,EVS0228610,2010-01-28,2010-01-26,1.133683
2,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-08-31,141524,17,5313,E1238046,2015-09-17,2015-08-31,2.277429
3,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2013-06-03,127002,121,5313,EVS0419689,2013-10-02,2013-06-03,2.012465
4,88435,4b7b46323562dafe_4903a58c1bccabff29a2958b368c,2019-03-07,137391,-1476,6095,E1789200,2015-02-20,2016-02-18,1.956129


In [176]:
# Next, keep only the matches where the echo study and the CFR study are
# at most 365 days apart, in either direction
within_365_days = pet_cfr_echo_days.cfr_days_after_echo.abs() <= 365
pet_cfr_echo_days = pet_cfr_echo_days[within_365_days]

In [177]:
# Preview the filtered matches; index labels of the removed rows are missing because the index is not reset
pet_cfr_echo_days.head(20)

Unnamed: 0,mrn,study,echo_study_date,reportID,cfr_days_after_echo,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683
1,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-26,100820,2,121,EVS0228610,2010-01-28,2010-01-26,1.133683
2,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-08-31,141524,17,5313,E1238046,2015-09-17,2015-08-31,2.277429
3,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2013-06-03,127002,121,5313,EVS0419689,2013-10-02,2013-06-03,2.012465
5,92742,4b7b4633cc96d8c2_4903a5858427dadde17abcd03be6,2010-09-27,107029,63,1646,EVS0276708,2010-11-29,2010-11-26,1.097118
6,92742,4b7b4633cc96d8c2_4903a585855f5450c473b5bd5d51,2010-11-26,107029,3,1646,EVS0276708,2010-11-29,2010-11-26,1.097118
7,92742,4b7b4633cc96d8c2_4903a584a1befac2ae418ace04c3,2011-04-13,107029,-135,1646,EVS0276708,2010-11-29,2010-11-26,1.097118
8,146456,4b7b475df0c06b82_4903a585842609311ccba48ea0c6,2010-08-23,105164,0,3677,EVS0262148,2010-08-23,2010-08-23,1.529131
9,146456,4b7b475df0c06b82_4903a58584260812ef8076acd261,2010-08-31,105164,-8,3677,EVS0262148,2010-08-23,2010-08-23,1.529131
10,181784,4b7b47510e0ec780_4903a582ec793db535b5ca68ed3a,2017-08-15,148971,-336,3679,E2093485,2016-09-13,2016-05-10,1.40754


In [178]:
# Compare the number of matched echo studies with all echo studies that have a PET report
n_matched_studies = pet_cfr_echo_days.study.nunique(dropna=False)
n_all_studies = petecho.study.nunique(dropna=False)
print('Number of echo studies within 365 days of cfr {}'.format(n_matched_studies))
print('Total number of studies in petecho df {}'.format(n_all_studies))

Number of echo studies within 365 days of cfr 2921
Total number of studies in petecho df 3865


In [180]:
# Save the echo studies together with their matched CFR measurements
match_filename = '210_getStressTest_match365_study_BWH_200131.parquet'
match_path = os.path.join(cfr_data_root, 'metadata_200131', match_filename)
pet_cfr_echo_days.to_parquet(match_path)