### Compare MRNs in PET and ECHO reports ###
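This notebook compares the MRNs in the PET study reports against the ECHO videos available on the file system (as `.npy` and `.dcm` files) and saves the matched and unmatched subsets as parquet files.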

In [1]:
import os
import pandas as pd

# Widen the pandas display limits so the long/wide frames below are not truncated.
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 50),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [2]:
# Root of the CFR data and the dated metadata snapshot directory beneath it.
cfr_meta_date = '200208'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
meta_dir = os.path.join(cfr_data_root, 'metadata_{}'.format(cfr_meta_date))
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200208


#### PET REPORTS ####

In [3]:
# Rahul's new study lists: tab-separated text files under the CFR data root.
echo_all_file, pet_all_file, pet_echo_file = (
    os.path.join(cfr_data_root, file_name)
    for file_name in ('echo_studies_all_2020-Feb-4.txt',
                      'pet_studies_all_2020-Feb-4.txt',
                      'bwh_2020-Feb-4_pet_365_diff.txt')
)
echo_df, pet_df, pet_echo_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (echo_all_file, pet_all_file, pet_echo_file)
)

# pet_echo_df contains all PET MRNs, so it is the frame used from here on.
# Its 'MRN' column is renamed to the lowercase 'mrn' used as the merge key below.
pet = pet_echo_df.rename(columns={'MRN': 'mrn'}).reset_index(drop=True)

n_pet_patients = len(pet.mrn.unique())
print('Unique patients:   {}'.format(n_pet_patients))

Unique patients:   3567


In [4]:
# Sanity check: the two PET tables should list exactly the same MRNs,
# in which case the symmetric difference printed below is empty.
pet1 = set(pet_df.mrn.unique())
pet2 = set(pet_echo_df.MRN.unique())
print('Symmetric difference {}'.format(pet1 ^ pet2))

Symmetric difference set()


In [5]:
# Preview the first rows of the PET table (one row per PET/echo date pairing).
pet.head(n=5)

Unnamed: 0,mrn,echo_date,pet_date,difference(days)
0,17666439,2017-3-14,2017-07-20,128
1,12287447,2019-5-16,2020-01-23,252
2,12287447,2020-1-23,2020-01-23,0
3,31953938,2017-11-9,2018-08-20,284
4,31953938,2018-8-13,2018-08-20,7


#### NPY ECHO DATA ####

In [6]:
# Load the listing of .npy echo files for this metadata snapshot.
echo_files_name = 'echo_npyFiles_BWH_{}.parquet'.format(cfr_meta_date)
echo_files_path = os.path.join(meta_dir, echo_files_name)
echo = pd.read_parquet(echo_files_path).astype({'mrn': 'int64'})

# Split the 'datetime' column into separate date and time columns.
echo = echo.assign(echo_study_date=lambda frame: frame.datetime.dt.date,
                   echo_study_time=lambda frame: frame.datetime.dt.time)
print('Echo file: {}'.format(echo_files_name))

Echo file: echo_npyFiles_BWH_200208.parquet


In [7]:
# Distinct counts at file, study and patient level for the .npy echo listing.
echo_summary = (('Number of npy files: {}', 'filename'),
                ('Unique echo studies: {}', 'study'),
                ('Unique patients:     {}', 'mrn'))
for template, column in echo_summary:
    print(template.format(len(echo[column].unique())))
print()
echo.head()

Number of npy files: 1056913
Unique echo studies: 21045
Unique patients:     12520



Unnamed: 0,filename,dir,study,mrn,datetime,echo_study_date,echo_study_time
0,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
1,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
2,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
3,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
4,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24


#### PET STUDIES + NPY ECHO DATA ####

In [8]:
# Attach every .npy echo file to the PET rows of the same patient (join on 'mrn').
# The left join keeps PET rows without any echo file; their echo columns are null.
pet_echo = pd.merge(pet, echo, how='left', on='mrn')
pet_echo.tail()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),filename,dir,study,datetime,echo_study_date,echo_study_time
717303,33433715,2016-8-10,2016-08-22,12,48b60f555feba794_4903a583583f6c11bb254ec9db72_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b6/48b60f555...,48b60f555feba794_4903a583583f6c11bb254ec9db72,2016-08-10 09:44:10,2016-08-10,09:44:10
717304,33433715,2016-8-10,2016-08-22,12,48b60f555feba794_4903a583583f6c11bb254ec9db72_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b6/48b60f555...,48b60f555feba794_4903a583583f6c11bb254ec9db72,2016-08-10 09:44:10,2016-08-10,09:44:10
717305,33433715,2016-8-10,2016-08-22,12,48b60f555feba794_4903a583583f6c11bb254ec9db72_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b6/48b60f555...,48b60f555feba794_4903a583583f6c11bb254ec9db72,2016-08-10 09:44:10,2016-08-10,09:44:10
717306,33433715,2016-8-10,2016-08-22,12,48b60f555feba794_4903a583583f6c11bb254ec9db72_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b6/48b60f555...,48b60f555feba794_4903a583583f6c11bb254ec9db72,2016-08-10 09:44:10,2016-08-10,09:44:10
717307,33433715,2016-8-10,2016-08-22,12,48b60f555feba794_4903a583583f6c11bb254ec9db72_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b6/48b60f555...,48b60f555feba794_4903a583583f6c11bb254ec9db72,2016-08-10 09:44:10,2016-08-10,09:44:10


In [9]:
# Split the merged table into PET rows that matched at least one .npy echo
# file and PET rows that matched none.
# Both subsets are derived from ONE mask so they are exact complements of each
# other. Previously one subset was selected on `filename` and the other on
# `study`, which does not guarantee that every row lands in exactly one subset.
has_npy_file = pet_echo.filename.notnull()
petecho = pet_echo.loc[has_npy_file].reset_index(drop = True)
pet_echo_nofiles = pet_echo.loc[~has_npy_file].reset_index(drop = True)

# Save both subsets next to the other metadata of this snapshot.
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH_'+cfr_meta_date+'.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH_'+cfr_meta_date+'.parquet'

pet_echo_nofiles.to_parquet(os.path.join(meta_dir, pet_echo_nofiles_name))
petecho.to_parquet(os.path.join(meta_dir, pet_echo_files_name))

In [10]:
# Last rows of the PET entries that have no .npy echo file (echo columns are null).
pet_echo_nofiles.tail(n=5)

Unnamed: 0,mrn,echo_date,pet_date,difference(days),filename,dir,study,datetime,echo_study_date,echo_study_time
1586,30444061,2015-5-15,2015-05-11,4,,,,NaT,,
1587,30530000,2015-6-3,2015-06-04,1,,,,NaT,,
1588,31029101,2015-8-8,2015-08-13,5,,,,NaT,,
1589,31759749,2016-4-19,2015-12-30,111,,,,NaT,,
1590,32299455,2016-8-20,2016-08-19,1,,,,NaT,,


In [11]:
# Distinct counts for the PET rows that have at least one .npy echo file.
print('PET with ECHO:')
petecho_summary = (('Number of npy files: {}', 'filename'),
                   ('Unique echo studies: {}', 'study'),
                   ('Unique patients:     {}', 'mrn'))
for template, column in petecho_summary:
    print(template.format(len(petecho[column].unique())))

PET with ECHO:
Number of npy files: 223752
Unique echo studies: 4736
Unique patients:     2541


In [12]:
# Patients without any .npy echo file, next to the overall PET patient count.
n_without_echo = len(pet_echo_nofiles.mrn.unique())
n_pet_total = len(pet.mrn.unique())
print('PET without ECHO:')
print('Unique patients:     {}'.format(n_without_echo))
print('Total PET patients:  {}'.format(n_pet_total))

PET without ECHO:
Unique patients:     1026
Total PET patients:  3567


In [13]:
# First rows of the PET entries that have no .npy echo file.
pet_echo_nofiles.head(n=5)

Unnamed: 0,mrn,echo_date,pet_date,difference(days),filename,dir,study,datetime,echo_study_date,echo_study_time
0,12224283,2019-11-22,2020-01-17,56,,,,NaT,,
1,28694107,2019-12-4,2019-12-26,22,,,,NaT,,
2,17845447,2019-5-3,2019-12-18,229,,,,NaT,,
3,24425142,2019-11-29,2019-12-12,13,,,,NaT,,
4,8911976,2019-11-7,2019-11-08,1,,,,NaT,,


#### ECHO STUDIES AVAILABLE AS DCM BUT NOT CONVERTED TO NPY ####

In [14]:
# Load the listing of .dcm echo files for this metadata snapshot.
dcm_studies_file = 'echo_deIdentifyedEcho_BWH_dcm_{}.parquet'.format(cfr_meta_date)
dcm_studies_path = os.path.join(meta_dir, dcm_studies_file)
# Cast 'mrn' to int64 so it can be joined against the PET table's 'mrn' column.
dcm = pd.read_parquet(dcm_studies_path).astype({'mrn': 'int64'})
print(dcm.shape)
dcm.head()

(2556099, 5)


Unnamed: 0,filename,dir,study,mrn,datetime
0,Image-4.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
1,Image-33.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
2,Image-34.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
3,Image-3.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
4,Image-48.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07


In [15]:
# Strip the (all-null) echo columns again, keeping 'mrn' as the join key,
# so that only the PET columns remain.
rm_cols = [column for column in echo.columns if column != 'mrn']
pet_echo_nofiles_x = pet_echo_nofiles.drop(columns=rm_cols)

# These are CFR studies without .npy files
pet_echo_nofiles_x.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days)
0,12224283,2019-11-22,2020-01-17,56
1,28694107,2019-12-4,2019-12-26,22
2,17845447,2019-5-3,2019-12-18,229
3,24425142,2019-11-29,2019-12-12,13
4,8911976,2019-11-7,2019-11-08,1


In [16]:
# Join the CFR studies that lack .npy files against the .dcm file listing.
# Left join on 'mrn': patients without any .dcm file keep a row with null dcm columns.
pet_echo_nofiles_dcm = pd.merge(pet_echo_nofiles_x, dcm, how='left', on='mrn')
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),filename,dir,study,datetime
0,12224283,2019-11-22,2020-01-17,56,Image-4.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a14/4...,4a14f8a600bc4641_4903a58c1a3846ef33df7ef1086b,2019-11-22 09:52:04
1,12224283,2019-11-22,2020-01-17,56,Image-6.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a14/4...,4a14f8a600bc4641_4903a58c1a3846ef33df7ef1086b,2019-11-22 09:52:04
2,12224283,2019-11-22,2020-01-17,56,Image-1.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a14/4...,4a14f8a600bc4641_4903a58c1a3846ef33df7ef1086b,2019-11-22 09:52:04
3,12224283,2019-11-22,2020-01-17,56,Image-9.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4a14/4...,4a14f8a600bc4641_4903a58c1a3846ef33df7ef1086b,2019-11-22 09:52:04
4,28694107,2019-12-4,2019-12-26,22,Image-32.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/490b/4...,490b1cfefef39b31_4903a5864f021be00b576a00d335,2013-10-28 13:49:39


In [17]:
# We have many files for each mrn, but we only want to know for which MRNs we
# do / do not have .dcm data, so keep a single representative row per MRN.
#
# groupby('mrn').first() is avoided on purpose: it returns the first NON-NULL
# value of every column independently, so the resulting row can be stitched
# together from different source rows. drop_duplicates keeps one real row.
# The stable sort reproduces the ascending-MRN order that groupby produced
# while still keeping the first-encountered row of each MRN.
pet_echo_nofiles_dcm = (
    pet_echo_nofiles_dcm
    .sort_values('mrn', kind='mergesort')
    .drop_duplicates(subset='mrn', keep='first')
    .reset_index(drop=True)
)
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,echo_date,pet_date,difference(days),filename,dir,study,datetime
0,68189,2013-10-22,2013-10-22,0,,,,NaT
1,81182,2008-1-4,2007-08-17,140,,,,NaT
2,137091,2009-4-27,2009-04-28,1,,,,NaT
3,142232,2008-9-12,2008-09-12,0,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
4,240762,2007-5-12,2007-05-14,2,,,,NaT


In [18]:
# Split the per-MRN table by whether a .dcm file was found for the patient.
dcm_missing = pet_echo_nofiles_dcm.filename.isnull()

# PET CFR studies without any .dcm file: only the PET columns are kept.
df_pet_echo_nodcm = (
    pet_echo_nofiles_dcm.loc[dcm_missing]
    .drop(columns=['filename', 'dir', 'study', 'datetime'])
    .reset_index(drop=True)
)

# PET CFR studies where we have .dcm files, but not yet converted to .npy
df_pet_echo_dcm = (
    pet_echo_nofiles_dcm.loc[~dcm_missing]
    .drop(columns='filename')
    .reset_index(drop=True)
)

# Save those
df_pet_echo_nodcm_file = 'pet_echo_deIdentifyedEcho_BWH_nodcm_{}.parquet'.format(cfr_meta_date)
df_pet_echo_dcm_file = 'pet_echo_deIdentifyedEcho_BWH_dcm_{}.parquet'.format(cfr_meta_date)
for frame, file_name in ((df_pet_echo_nodcm, df_pet_echo_nodcm_file),
                         (df_pet_echo_dcm, df_pet_echo_dcm_file)):
    frame.to_parquet(os.path.join(meta_dir, file_name))

In [19]:
# Number of distinct .dcm studies among the PET patients lacking .npy files.
n_dcm_studies = len(df_pet_echo_dcm.study.unique())
print(n_dcm_studies)
df_pet_echo_dcm.head()

236


Unnamed: 0,mrn,echo_date,pet_date,difference(days),dir,study,datetime
0,142232,2008-9-12,2008-09-12,0,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,794941,2008-7-1,2008-07-11,10,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41a5f60a4ebe_4903a44ab12fcc860f4f20cd0ef8,2009-04-28 13:05:07
2,813071,2008-3-5,2008-04-09,35,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20
3,1145762,2010-5-4,2010-05-03,1,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a8652c32ed64d_4903a585842b96fd35619b87b41a,2010-05-04 08:40:03
4,1248228,2006-5-12,2006-11-27,199,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a857faa65b583_4903a44516de260f69be388d152d,2006-05-12 15:25:09


In [27]:
# (rows, columns) of the table of PET patients without any .dcm file.
nodcm_shape = df_pet_echo_nodcm.shape
print(nodcm_shape)

(790, 4)


In [20]:
# Number of distinct PET patients for whom no .dcm file exists.
n_nodcm_mrns = len(df_pet_echo_nodcm.mrn.unique())
print(n_nodcm_mrns)
df_pet_echo_nodcm.head()

790


Unnamed: 0,mrn,echo_date,pet_date,difference(days)
0,68189,2013-10-22,2013-10-22,0
1,81182,2008-1-4,2007-08-17,140
2,137091,2009-4-27,2009-04-28,1
3,240762,2007-5-12,2007-05-14,2
4,337014,2015-1-13,2015-01-13,0


In [21]:
# Report how many PET patients have neither .npy nor .dcm echo data.
n_mrns_without_dcm = len(df_pet_echo_nodcm.mrn.unique())
print('MRNs that are not available as .dcm files: {}'.format(n_mrns_without_dcm))

MRNs that are not available as .dcm files: 790


In [22]:
# Preview the PET patients that have .dcm files still awaiting .npy conversion.
df_pet_echo_dcm.head(n=5)

Unnamed: 0,mrn,echo_date,pet_date,difference(days),dir,study,datetime
0,142232,2008-9-12,2008-09-12,0,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,794941,2008-7-1,2008-07-11,10,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41a5f60a4ebe_4903a44ab12fcc860f4f20cd0ef8,2009-04-28 13:05:07
2,813071,2008-3-5,2008-04-09,35,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20
3,1145762,2010-5-4,2010-05-03,1,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a8652c32ed64d_4903a585842b96fd35619b87b41a,2010-05-04 08:40:03
4,1248228,2006-5-12,2006-11-27,199,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7a/4...,4b7a857faa65b583_4903a44516de260f69be388d152d,2006-05-12 15:25:09


In [23]:
# NOTE: df_pet_echo_dcm holds a single row per MRN (the table was collapsed
# per MRN before the split), so the study count below cannot exceed the MRN
# count and is not the total number of .dcm studies for these patients.
dcm_summary = (('Studies that are yet available as .dcm files: {}', 'study'),
               ('MRNs that are available as .dcm files: {}', 'mrn'))
for template, column in dcm_summary:
    print(template.format(len(df_pet_echo_dcm[column].unique())))

Studies that are yet available as .dcm files: 236
MRNs that are available as .dcm files: 236
