### Match ECHO videos with PET CFR labels ###
This notebook matches echo videos to PET reports so that each video can be assigned a CFR label. The goal is a data set with a PET CFR value for each video, which we will use for training.

In [47]:
import os
import numpy as np
import pandas as pd

# Raise the pandas display limits (rows, columns, line width) used when
# frames are printed in this notebook.
display_options = {
    'display.max_rows': 1000,
    'display.max_columns': 100,
    'display.width': 2000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### PET REPORTS ####

In [111]:
# PET REPORTS
# Read the PET stress-test report spreadsheet from the CFR data root.
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
pet_df = pd.read_excel(pet_reports_file)

# Keep the identifier, date and cfr columns, and store both date columns
# as datetime64[ns].
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
date_dtypes = {date_col: 'datetime64[ns]' for date_col in ('report_date', 'study_date')}
pet = pet_df[keep_cols].astype(date_dtypes)

# Distinct-value counts; dropna=False keeps a missing value counted as one
# distinct entry, the same as len(series.unique()).
n_reports = pet.report_number.nunique(dropna = False)
n_patients = pet.mrn.nunique(dropna = False)
n_cfr_values = pet.cfr.nunique(dropna = False)

print('Number of reports: {}'.format(n_reports))
print('Unique patients:   {}'.format(n_patients))
print('Unique cfr values: {}'.format(n_cfr_values))

Number of reports: 3350
Unique patients:   3072
Unique cfr values: 3414


#### NPY ECHO FILES ####

In [124]:
# Load the table of echo npy files (one row per file).
echo_dir = os.path.join(cfr_data_root, 'metadata_200130')
echo_files_name = 'echo_npyFiles_BWH.parquet'
echo_path = os.path.join(echo_dir, echo_files_name)
echo = pd.read_parquet(echo_path)

# Split the datetime column into a separate date and time-of-day column.
echo_timestamp = echo.datetime.dt
echo = echo.assign(echo_study_date = echo_timestamp.date,
                   echo_study_time = echo_timestamp.time)

# Integer mrn and a datetime64 study date.
echo_dtypes = {'mrn': 'int64',
               'echo_study_date': 'datetime64[ns]'}
echo = echo.astype(echo_dtypes)

# Distinct-value counts; dropna=False matches len(series.unique()).
n_npy_files = echo.filename.nunique(dropna = False)
n_echo_studies = echo.study.nunique(dropna = False)
n_echo_patients = echo.mrn.nunique(dropna = False)

print('Number of npy files: {}'.format(n_npy_files))
print('Unique echo studies: {}'.format(n_echo_studies))
print('Unique patients:     {}'.format(n_echo_patients))
print()

# For matching with PET, we only need the dates for each study,
# so this data frame could be greatly reduced.
# NOTE(review): no reduction happens here; echo_studies is just another
# name for the full echo frame.
echo_studies = echo

Number of npy files: 1021868
Unique echo studies: 20371
Unique patients:     11846



In [144]:
# Display the column names of the echo frame.
echo.columns

Index(['filename', 'dir', 'study', 'mrn', 'datetime', 'echo_study_date', 'echo_study_time'], dtype='object')

#### PET STUDIES + NPY ECHO DATA ####

In [125]:
# Left join on mrn: every PET row is kept and is repeated once per echo npy
# file of the same patient; the echo columns stay empty where no file matches.
pet_echo = pd.merge(pet, echo, how = 'left', on = 'mrn')
pet_echo.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,72946,4474599,3,0904-0007,2007-09-28,2007-09-04,1.428232,,,,NaT,NaT,
1,91480,9774688,6,EVS0154093,2008-11-11,2008-11-11,2.015579,,,,NaT,NaT,
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21


In [126]:
# Split the merged frame by whether an echo npy file was found for the PET row.
has_npy_file = pet_echo.filename.notnull()
pet_echo_nofiles = pet_echo.loc[~has_npy_file].reset_index(drop = True)
petecho = pet_echo.loc[has_npy_file].reset_index(drop = True)

# Save both subsets as parquet files in the 'temp' folder under the data root.
temp_dir = os.path.join(cfr_data_root, 'temp')
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH_200130.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH_200130.parquet'

pet_echo_nofiles.to_parquet(os.path.join(temp_dir, pet_echo_nofiles_name))
petecho.to_parquet(os.path.join(temp_dir, pet_echo_files_name))

In [127]:
# Summary counts for the PET rows that have at least one echo npy file.
# dropna=False matches len(series.unique()).
n_matched_files = petecho.filename.nunique(dropna = False)
n_matched_studies = petecho.study.nunique(dropna = False)
n_matched_patients = petecho.mrn.nunique(dropna = False)

print('PET with ECHO:')
print('Number of npy files: {}'.format(n_matched_files))
print('Unique echo studies: {}'.format(n_matched_studies))
print('Unique patients:     {}'.format(n_matched_patients))

PET with ECHO:
Number of npy files: 170158
Unique echo studies: 3711
Unique patients:     1759


In [128]:
# Preview the first rows of the PET rows that have echo npy files.
petecho.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
1,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
2,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
3,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
4,78543,8880684,7,0802806B,2007-10-31,2008-01-29,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21


### Match PET with ECHO videos ###
Goal:
Find a CFR label for each echo video taken from the closest PET measurement.
Requirement: For each echo, we want to pick the most recent CFR measurement that is not older than 365 days.

In [129]:
# Pick a patient
# A random pick is left commented out below; a fixed MRN is used instead.
#np.random.seed(112)
#mrn = np.random.choice(petecho.mrn.unique(), size = 1,)[0]
mrn = 15467822
print('MRN {}'.format(mrn))

# All PET/echo rows of this patient.
is_selected_patient = petecho.mrn == mrn
petecho_mrn = petecho.loc[is_selected_patient]

# Print a ten-row slice (positions 150-159) of the columns relevant for matching.
preview_cols = ['mrn', 'reportID', 'study_date', 'echo_study_date', 'filename', 'cfr']
print(petecho_mrn.loc[:, preview_cols].iloc[150:160])

MRN 15467822
             mrn  reportID study_date echo_study_date                                           filename       cfr
129018  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129019  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129020  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129021  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129022  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129023  15467822    119959 2012-10-11      2014-05-01  4a1347300a20f7e7_4903a581a72e1c3a0014f8f71932_...  0.875204
129024  15467822    126435 2013-09-05      2014-04-24  4a1347300a20f7e7_4903a581a72ff2f12e988b7e2f8d_...  1.182464
129025  15467822    126435 2013-09-05      2014-04-24  4a1347300a20

In [130]:
# Count PET reports, echo videos and rows for the selected patient, and check
# that every (reportID, filename) pair occurs exactly once.
pet_report_ids = petecho_mrn.reportID.unique()
n_videos = len(petecho_mrn.filename.unique())
n_expected_rows = len(pet_report_ids) * n_videos
n_rows = petecho_mrn.shape[0]

# PET studies for this patient
print('PET studies for this mrn {}'.format(pet_report_ids))
# Echo videos for this patient
print('Videos for this patient {}'.format(n_videos))
print('Number of PET studies x number of echos = {}'.format(n_expected_rows))
print('Total number of rows for this patient: {}'.format(n_rows))

print()
# Only claim one row per (reportID, filename) pair when the data supports it:
# no pair may repeat, and the row count must equal reports x videos.
has_duplicate_pairs = petecho_mrn.duplicated(subset = ['reportID', 'filename']).any()
if n_rows == n_expected_rows and not has_duplicate_pairs:
    print('There is exactly one combination of reportID and filename for this patient.')
else:
    print('WARNING: rows are not exactly one per reportID/filename combination '
          'for this patient ({} rows, {} expected).'.format(n_rows, n_expected_rows))

PET studies for this mrn [119959 126435 131307 133986]
Videos for this patient 156
Number of PET studies x number of echos = 624
Total number of rows for this patient: 624

There is exactly one combination of reportID and filename for this patient.


In [139]:
# Collect the distinct PET and echo study dates for this patient
# (in order of first appearance, not sorted).
pet_dates = petecho_mrn.study_date.dt.date.unique().tolist()
echo_dates = petecho_mrn.echo_study_date.dt.date.unique().tolist()

# Print one date per line, PET first, then echo.
print('PET dates:')
print('\n'.join(map(str, pet_dates)))
print()
print('ECHO dates:')
print('\n'.join(map(str, echo_dates)))

PET dates:
2012-10-11
2013-09-05
2014-04-29
2014-09-02

ECHO dates:
2014-04-24
2014-08-29
2014-10-28
2012-09-06
2014-05-01


5
5
