### Compare MRNs in PET and ECHO reports ###
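This notebook compares the MRNs in the PET reports with the echo files available on the file system (converted `.npy` files and de-identified `.dcm` files) and saves the matched and unmatched PET studies as parquet files.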

In [1]:
import os
import pandas as pd

# Widen the pandas display limits so the wide frames below render in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50
pd.options.display.width = 1000

In [2]:
# Root of the CFR data and the dated metadata folder that the cells below
# read from and write to
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200202'
meta_dir = os.path.join(cfr_data_root, 'metadata_{}'.format(cfr_meta_date))
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200202


#### PET REPORTS ####

In [3]:
# PET REPORTS
# Load the PET stress-test report export.
pet_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
pet_df = pd.read_excel(pet_reports_file)

# Let's keep only a few columns and parse both date columns.
# .assign() builds a new frame, so the parsed dates really end up in `pet`.
# The earlier chained form (`pet.loc[...].study_date = ...`) wrote to a
# temporary copy and left `pet` untouched; its second line also stored the
# parsed report_date under study_date.
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
pet = pet_df[keep_cols].assign(study_date = pd.to_datetime(pet_df.study_date),
                               report_date = pd.to_datetime(pet_df.report_date))

print('Number of reports: {}'.format(len(pet.report_number.unique())))
print('Unique patients:   {}'.format(len(pet.mrn.unique())))
print('Unique cfr values: {}'.format(len(pet.cfr.unique())))

# Additional PET studies MRNs
pet_studies_needed = os.path.join(cfr_data_root, 'pet_studies_needed.txt')
pet_df_needed = pd.read_csv(pet_studies_needed, sep = '\t', header = None)
pet_df_needed.columns = ['mrn', 'study_date']

# The outer merge keeps every row from both sources.
# NOTE(review): the key study_date is used as read from each file (pet_df is
# not date-parsed). If the two files store dates in different formats, equal
# dates will not match and the same study appears twice - confirm whether the
# keys should be normalized before merging.
pet_df_2 = pet_df.merge(right = pet_df_needed, on = ['mrn', 'study_date'], how = 'outer')

Number of reports: 3350
Unique patients:   3072
Unique cfr values: 3414


In [4]:
# Collapse to one row per MRN.
# Note: GroupBy.first() skips missing values by default, so each column holds
# the first non-null entry of the group, not necessarily values from one row.
pet_mrn_cols = ['mrn', 'study_date', 'report_number']
pet_df_3 = pet_df_2[pet_mrn_cols].groupby('mrn').first().reset_index()

In [5]:
# Number of distinct MRNs before and after the per-MRN collapse
for pet_frame in (pet_df_2, pet_df_3):
    print(len(pet_frame.mrn.unique()))

3755
3755


In [6]:
# Preview the per-MRN PET table (head() shows five rows by default)
pet_df_3.head()

Unnamed: 0,mrn,study_date,report_number
0,9241,28-Jan-10,EVS0228610
1,59691,02-Oct-13,EVS0419689
2,68189,22-Oct-13,EVS0446758
3,81182,17-Aug-07,0721092P
4,88435,20-Feb-15,E1789200


#### NPY ECHO DATA ####

In [7]:
# Load the index of echo .npy files for this metadata date; MRNs become int64
echo_files_name = 'echo_npyFiles_BWH_{}.parquet'.format(cfr_meta_date)
echo_files_path = os.path.join(meta_dir, echo_files_name)
echo = pd.read_parquet(echo_files_path).astype({'mrn': 'int64'})

# Split the timestamp into separate date and time columns
echo_timestamp = echo.datetime.dt
echo = echo.assign(echo_study_date = echo_timestamp.date,
                   echo_study_time = echo_timestamp.time)

In [8]:
# Summary counts for the echo .npy index, followed by a preview
echo_counts = [('Number of npy files: {}', echo.filename),
               ('Unique echo studies: {}', echo.study),
               ('Unique patients:     {}', echo.mrn)]
for count_label, count_column in echo_counts:
    print(count_label.format(len(count_column.unique())))
print()
echo.head()

Number of npy files: 1029179
Unique echo studies: 20525
Unique patients:     12000



Unnamed: 0,filename,dir,study,mrn,datetime,echo_study_date,echo_study_time
0,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
1,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
2,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
3,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
4,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24


#### PET STUDIES + NPY ECHO DATA ####

In [9]:
# Left join on MRN: every PET MRN is kept, and the echo columns stay null
# for MRNs that have no row in the echo index
pet_echo = pd.merge(pet_df_3, echo, how = 'left', on = 'mrn')
pet_echo.tail()

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime,echo_study_date,echo_study_time
200290,40604613,2019-11-20,,,,,NaT,,
200291,40611006,2019-10-17,,,,,NaT,,
200292,40768871,2019-12-11,,,,,NaT,,
200293,40902504,2019-11-06,,,,,NaT,,
200294,41128695,2020-01-10,,,,,NaT,,


In [12]:
# Split the PET/echo join into rows without an .npy file and rows with an echo study.
# NOTE(review): the two subsets test different columns (filename vs. study);
# presumably both are null together for unmatched MRNs - confirm.
rows_without_npy = pet_echo.filename.isnull()
rows_with_study = pet_echo.study.notnull()
pet_echo_nofiles = pet_echo.loc[rows_without_npy].reset_index(drop = True)
petecho = pet_echo.loc[rows_with_study].reset_index(drop = True)

# Save both subsets next to the other metadata files
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH_{}.parquet'.format(cfr_meta_date)
pet_echo_files_name = 'pet_npyFiles_BWH_{}.parquet'.format(cfr_meta_date)

pet_echo_nofiles.to_parquet(os.path.join(meta_dir, pet_echo_nofiles_name))
petecho.to_parquet(os.path.join(meta_dir, pet_echo_files_name))

In [13]:
# Last five PET rows that have no .npy file
pet_echo_nofiles.tail(5)

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime,echo_study_date,echo_study_time
1683,40604613,2019-11-20,,,,,NaT,,
1684,40611006,2019-10-17,,,,,NaT,,
1685,40768871,2019-12-11,,,,,NaT,,
1686,40902504,2019-11-06,,,,,NaT,,
1687,41128695,2020-01-10,,,,,NaT,,


In [14]:
# Summary counts for the PET rows that have an echo study
n_npy_files = len(petecho.filename.unique())
n_echo_studies = len(petecho.study.unique())
n_echo_patients = len(petecho.mrn.unique())

print('PET with ECHO:')
print('Number of npy files: {}'.format(n_npy_files))
print('Unique echo studies: {}'.format(n_echo_studies))
print('Unique patients:     {}'.format(n_echo_patients))

PET with ECHO:
Number of npy files: 198607
Unique echo studies: 4274
Unique patients:     2067


In [15]:
# PET MRNs without any .npy file, compared with the total number of PET MRNs
n_patients_no_echo = len(pet_echo_nofiles.mrn.unique())
n_patients_total = len(pet_df_3.mrn.unique())

print('PET without ECHO:')
print('Unique patients:     {}'.format(n_patients_no_echo))
print('Total PET patients:  {}'.format(n_patients_total))

PET without ECHO:
Unique patients:     1688
Total PET patients:  3755


In [16]:
# First five PET rows that have no .npy file
pet_echo_nofiles.head(5)

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime,echo_study_date,echo_study_time
0,68189,22-Oct-13,EVS0446758,,,,NaT,,
1,81182,17-Aug-07,0721092P,,,,NaT,,
2,137091,28-Apr-09,EVS0182113,,,,NaT,,
3,142232,12-Sep-08,0827928A,,,,NaT,,
4,178046,2019-01-10,,,,,NaT,,


#### ECHO STUDIES AVAILABLE AS DCM BUT NOT CONVERTED TO NPY ####

In [18]:
# Load the index of de-identified .dcm files for this metadata date; MRNs become int64
dcm_studies_file = 'echo_deIdentifyedEcho_BWH_dcm_{}.parquet'.format(cfr_meta_date)
dcm_studies_path = os.path.join(meta_dir, dcm_studies_file)
dcm = pd.read_parquet(dcm_studies_path).astype({'mrn': 'int64'})
print(dcm.shape)
dcm.head()

(2489078, 5)


Unnamed: 0,filename,dir,study,mrn,datetime
0,Image-4.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
1,Image-33.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
2,Image-34.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
3,Image-3.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07
4,Image-48.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/48b3/4...,48b3dd6feca73d8c_4903a58dd540e605fed753883266,36876217,2018-03-05 09:00:07


In [19]:
# Drop every echo column except the MRN from the rows without .npy files
rm_cols = echo.columns.tolist()
rm_cols.remove('mrn')
pet_echo_nofiles_x = pet_echo_nofiles.drop(rm_cols, axis = 'columns')

# These are CFR studies without .npy files
pet_echo_nofiles_x.head()

Unnamed: 0,mrn,study_date,report_number
0,68189,22-Oct-13,EVS0446758
1,81182,17-Aug-07,0721092P
2,137091,28-Apr-09,EVS0182113
3,142232,12-Sep-08,0827928A
4,178046,2019-01-10,


In [20]:
# Left join the CFR studies that have no .npy files with the .dcm index on MRN;
# MRNs missing from the .dcm index keep null dcm columns
pet_echo_nofiles_dcm = pd.merge(pet_echo_nofiles_x, dcm, how = 'left', on = 'mrn')
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime
0,68189,22-Oct-13,EVS0446758,,,,NaT
1,81182,17-Aug-07,0721092P,,,,NaT
2,137091,28-Apr-09,EVS0182113,,,,NaT
3,142232,12-Sep-08,0827928A,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
4,142232,12-Sep-08,0827928A,Image-11.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49


In [21]:
# The merge yields many .dcm files per MRN, but we only need to know which
# MRNs have .dcm data and which do not, so keep one row per MRN.
# Note: GroupBy.first() skips missing values by default, taking the first
# non-null entry of each column within the MRN group.
dcm_by_mrn = pet_echo_nofiles_dcm.groupby('mrn')
pet_echo_nofiles_dcm = dcm_by_mrn.first().reset_index()
pet_echo_nofiles_dcm.head()

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime
0,68189,22-Oct-13,EVS0446758,,,,NaT
1,81182,17-Aug-07,0721092P,,,,NaT
2,137091,28-Apr-09,EVS0182113,,,,NaT
3,142232,12-Sep-08,0827928A,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
4,178046,2019-01-10,,,,,NaT


In [35]:
# Split the per-MRN table by whether a .dcm filename was found for the MRN
dcm_missing = pet_echo_nofiles_dcm.filename.isnull()

# PET CFR studies without .dcm files: drop all columns that came from the dcm index
df_pet_echo_nodcm = (pet_echo_nofiles_dcm.loc[dcm_missing]
                     .drop(columns = ['filename', 'dir', 'study', 'datetime'])
                     .reset_index(drop = True))

# PET CFR studies where we have .dcm files, but not yet converted to .npy
df_pet_echo_dcm = (pet_echo_nofiles_dcm.loc[~dcm_missing]
                   .drop(columns = 'filename')
                   .reset_index(drop = True))

# Save both tables next to the other metadata files
df_pet_echo_nodcm_file = 'pet_echo_deIdentifyedEcho_BWH_nodcm_{}.parquet'.format(cfr_meta_date)
df_pet_echo_dcm_file = 'pet_echo_deIdentifyedEcho_BWH_dcm_{}.parquet'.format(cfr_meta_date)
df_pet_echo_nodcm.to_parquet(os.path.join(meta_dir, df_pet_echo_nodcm_file))
df_pet_echo_dcm.to_parquet(os.path.join(meta_dir, df_pet_echo_dcm_file))

In [38]:
# Number of distinct .dcm studies, followed by a preview
n_dcm_studies = len(df_pet_echo_dcm.study.unique())
print(n_dcm_studies)
df_pet_echo_dcm.head()

474


Unnamed: 0,mrn,study_date,report_number,dir,study,datetime
0,142232,12-Sep-08,0827928A,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,288407,12-Oct-17,E6330362,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b44d7d346b8b3_4903a582edf18fdf650ef98a8e6d,2017-10-07 10:30:38
2,634790,2017-11-01,,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b40204d1f9f46_4903a582ec793c1e3079bea28cf6,2017-08-08 09:41:07
3,794941,11-Jul-08,EVS0182499,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41a5f60a4ebe_4903a44ab12fcc860f4f20cd0ef8,2009-04-28 13:05:07
4,813071,09-Apr-08,0806735A,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20


In [39]:
# Number of distinct MRNs without .dcm files, followed by a preview
n_nodcm_mrns = len(df_pet_echo_nodcm.mrn.unique())
print(n_nodcm_mrns)
df_pet_echo_nodcm.head()

1214


Unnamed: 0,mrn,study_date,report_number
0,68189,22-Oct-13,EVS0446758
1,81182,17-Aug-07,0721092P
2,137091,28-Apr-09,EVS0182113
3,178046,2019-01-10,
4,240762,14-May-07,0629-0010


In [30]:
# Report how many PET MRNs have no .dcm files
nodcm_mrn_count = len(df_pet_echo_nodcm.mrn.unique())
print('MRNs that are not available as .dcm files: {}'.format(nodcm_mrn_count))

MRNs that are not available as .dcm files: 1214


In [31]:
# First five PET studies that have .dcm files
df_pet_echo_dcm.head(5)

Unnamed: 0,mrn,study_date,report_number,filename,dir,study,datetime
0,142232,12-Sep-08,0827928A,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b475df4b17b04_4903a44b32e5a539f80e6819b98b,2008-09-12 14:45:49
1,288407,12-Oct-17,E6330362,Image-9.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b44d7d346b8b3_4903a582edf18fdf650ef98a8e6d,2017-10-07 10:30:38
2,634790,2017-11-01,,Image-82.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b40204d1f9f46_4903a582ec793c1e3079bea28cf6,2017-08-08 09:41:07
3,794941,11-Jul-08,EVS0182499,Image-37.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b41a5f60a4ebe_4903a44ab12fcc860f4f20cd0ef8,2009-04-28 13:05:07
4,813071,09-Apr-08,0806735A,Image-16.dcm,/mnt/obi0/phi/echo/deIdentifyedEcho/BWH/4b7b/4...,4b7b4edb2e8d49d5_4903a44b32efc00c83a4de086041,2008-03-05 13:13:20


In [32]:
# Report the number of distinct studies and MRNs that have .dcm files
dcm_study_count = len(df_pet_echo_dcm.study.unique())
dcm_mrn_count = len(df_pet_echo_dcm.mrn.unique())
print('Studies that are yet available as .dcm files: {}'.format(dcm_study_count))
print('MRNs that are available as .dcm files: {}'.format(dcm_mrn_count))

Studies that are yet available as .dcm files: 474
MRNs that are available as .dcm files: 474
