### Compare MRNs in PET and ECHO reports ###
This notebook just compares PET studies and ECHO videos on the file system. 

In [3]:
import os
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

#### PET REPORTS ####

In [4]:
# PET REPORTS
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
pet_df = pd.read_excel(pet_reports_file)
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
pet = pet_df[keep_cols]
pet.loc[pet.study_date.index].study_date = pd.to_datetime(pet.study_date)
pet.loc[pet.report_date.index].study_date = pd.to_datetime(pet.report_date)

print('Number of reports: {}'.format(len(pet.report_number.unique())))
print('Unique patients:   {}'.format(len(pet.mrn.unique())))
print('Unique cfr values: {}'.format(len(pet.cfr.unique())))

# Lets keep only a few columns 
pet.head()

Number of reports: 3350
Unique patients:   3072
Unique cfr values: 3414


Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579
2,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606
3,73048,16480857,10,0728923B,26-Nov-07,29-Oct-07,1.1075
4,100873,7633498,11,EVS0266850,29-Jan-10,24-Sep-10,1.776545


#### NPY ECHO DATA ####

In [5]:
echo_files_name = 'echo_npyFiles_BWH.parquet'
echo = pd.read_parquet(os.path.join(cfr_data_root, echo_files_name))
echo = echo.astype({'mrn': 'int64'})
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

In [6]:
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()
echo.head()

Number of npy files: 1021868
Unique echo studies: 20371
Unique patients:     11846



Unnamed: 0,filename,dir,study,mrn,datetime,echo_study_date,echo_study_time
0,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
1,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
2,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
3,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
4,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24


#### PET STUDIES + NPY ECHO DATA ####

In [7]:
pet_echo = pet.merge(right = echo, on = 'mrn', how = 'left')
pet_echo.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232,,,,NaT,,
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579,,,,NaT,,
2,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
3,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
4,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21


In [8]:
# Filter pet studies with current echo
pet_echo_nofiles = pet_echo.loc[pet_echo.filename.isnull()].reset_index(drop = True)
pet_echo_files = pet_echo.loc[~pet_echo.filename.isnull()].reset_index(drop = True)

# Save those
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH.parquet'

pet_echo_nofiles.to_parquet(os.path.join(cfr_data_root, pet_echo_nofiles_name))
pet_echo_files.to_parquet(os.path.join(cfr_data_root, pet_echo_files_name))

In [9]:
print('PET with ECHO:')
print('Number of npy files: {}'.format(len(pet_echo_files.filename.unique())))
print('Unique echo studies: {}'.format(len(pet_echo_files.study.unique())))
print('Unique patients:     {}'.format(len(pet_echo_files.mrn.unique())))

PET with ECHO:
Number of npy files: 170158
Unique echo studies: 3711
Unique patients:     1759


In [10]:
print('PET without ECHO:')
print('Number of reports:   {}'.format(len(pet_echo_nofiles.reportID.unique())))
print('Unique patients:     {}'.format(len(pet_echo_nofiles.mrn.unique())))
print('Total PET patients:  {}'.format(len(pet.mrn.unique())))

PET without ECHO:
Number of reports:   1384
Unique patients:     1313
Total PET patients:  3072


In [11]:
pet_echo_nofiles.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232,,,,NaT,,
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579,,,,NaT,,
2,73048,16480857,10,0728923B,26-Nov-07,29-Oct-07,1.1075,,,,NaT,,
3,104144,25001462,13,EVS0310491,30-Jun-10,29-Jun-11,1.603968,,,,NaT,,
4,103368,3584661,17,EVS0246472,21-May-10,11-May-10,2.457241,,,,NaT,,


#### ECHO STUDIES ON DCM BUT NOT CONVERTED TO NPY ####

In [16]:
dcm_studies_file = 'echo_deIdentifyedEcho_BWH_dcm.parquet'
dcm = pd.read_parquet(os.path.join(cfr_data_root, dcm_studies_file))
dcm = dcm.astype({'mrn': 'int64'})
dcm.head()

Unnamed: 0,filename,dir,study,mrn,datetime
0,Image-4.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
1,Image-33.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
2,Image-34.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
3,Image-3.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
4,Image-48.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07


In [23]:
rm_cols = list(echo.columns)
rm_cols.remove('mrn')
pet_echo_nofiles_x = pet_echo_nofiles.drop(columns = rm_cols)
pet_echo_nofiles_x.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579
2,73048,16480857,10,0728923B,26-Nov-07,29-Oct-07,1.1075
3,104144,25001462,13,EVS0310491,30-Jun-10,29-Jun-11,1.603968
4,103368,3584661,17,EVS0246472,21-May-10,11-May-10,2.457241


In [28]:
pet_echo_nofiles_dcm = pet_echo_nofiles_x.merge(right = dcm, on = 'mrn', how = 'left')
pet_echo_nofiles_dcm = pet_echo_nofiles_dcm.groupby('mrn').first().reset_index(drop = False)

In [29]:
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,reportID,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime
0,68189,127394,5580,EVS0446758,22-Oct-13,22-Oct-13,1.504065,,,,NaT
1,81182,86436,3149,0721092P,17-Aug-07,13-Aug-07,1.720486,,,,NaT
2,137091,94942,1777,EVS0182113,28-Apr-09,27-Apr-09,0.790368,,,,NaT
3,142232,90848,434,0827928A,12-Sep-08,12-Sep-08,1.443478,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
4,240762,86216,751,0629-0010,14-May-07,29-Jun-06,1.405966,,,,NaT


In [39]:
#pet_echo_nofiles = pet_echo.loc[pet_echo.filename.isnull()].reset_index(drop = True)
df_pet_echo_nodcm = pet_echo_nofiles_dcm.loc[pet_echo_nofiles_dcm.filename.isnull()].reset_index(drop=True)
df_pet_echo_dcm = pet_echo_nofiles_dcm.loc[~pet_echo_nofiles_dcm.filename.isnull()].reset_index(drop=True)

# Save those
df_pet_echo_nodcm_file = 'echo_deIdentifyedEcho_BWH_nodcm.parquet'
df_pet_echo_dcm_file = 'echo_deIdentifyedEcho_BWH_dcm.parquet'
df_pet_echo_nodcm.to_parquet(os.path.join(cfr_data_root, df_pet_echo_nodcm_file))
df_pet_echo_dcm.to_parquet(os.path.join(cfr_data_root, df_pet_echo_dcm_file))

In [41]:
df_pet_echo_nodcm.head()

Unnamed: 0,mrn,reportID,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime
0,68189,127394,5580,EVS0446758,22-Oct-13,22-Oct-13,1.504065,,,,NaT
1,81182,86436,3149,0721092P,17-Aug-07,13-Aug-07,1.720486,,,,NaT
2,137091,94942,1777,EVS0182113,28-Apr-09,27-Apr-09,0.790368,,,,NaT
3,240762,86216,751,0629-0010,14-May-07,29-Jun-06,1.405966,,,,NaT
4,288407,157776,7446,E6330362,12-Oct-17,07-Oct-17,1.844106,,,,NaT


In [43]:
print('MRNs that are not available as .dcm files: {}'.format(len(df_pet_echo_nodcm.mrn.unique())))

MRNs that are not available as .dcm files: 949


In [44]:
df_pet_echo_dcm.head()

Unnamed: 0,mrn,reportID,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime
0,142232,90848,434,0827928A,12-Sep-08,12-Sep-08,1.443478,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,506337,79286,840,0710196C,09-Jan-07,30-Apr-07,1.671347,Image-37.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b434352d29b77_4903a444c30c1b222bd352daa3c9,2007-04-05 10:28:38
2,614214,103254,274,EVS0245861,17-May-10,17-May-10,2.249077,Image-31.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4022cd822c37_4903a585855e2e2fd9dcf31b41ab,2010-10-05 10:12:54
3,717272,84800,2734,0801598S,17-Jan-08,17-Jan-08,1.921582,Image-45.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41ad12e52603_4903a44b32edb90a5078013b5d7a,2008-01-17 13:38:10
4,794941,72359,2605,EVS0182499,11-Jul-08,28-Apr-09,1.615031,Image-37.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41a5f60a4ebe_4903a44ab12fcc860f4f20cd0ef8,2009-04-28 13:05:07


In [47]:
print('Studies that are yet available as .dcm files: {}'.format(len(df_pet_echo_dcm.study.unique())))
print('MRNs that are available as .dcm files: {}'.format(len(df_pet_echo_dcm.mrn.unique())))

Studies that are yet available as .dcm files: 364
MRNs that are available as .dcm files: 364
